Merge pull request #14316 from Yu-QX/main

fix: Handle special cases of LLM output not following markdown syntax
2025-06-26 18:26:48 +00:00 · 2025-05-26 19:30:48 +04:00 · 2025-05-26 19:30:48 +04:00 · 040f29d058
commit 040f29d058
parent 4da75a9e78 bb62bd9c1f
1 changed files with 64 additions and 0 deletions
--- a/src/lib/utils/index.ts
+++ b/src/lib/utils/index.ts
@ -90,9 +90,72 @@ export const sanitizeResponseContent = (content: string) => {
 };

 export const processResponseContent = (content: string) => {
+	// This function is used to process the response content before the response content is rendered.
+	const lines = content.split('\n');
+	const processedLines = lines.map((line) => {
+		if (/[\u4e00-\u9fa5]/.test(line)) {
+			line = processChineseContent(line);
+		}
+		return line;
+	});
+	content = processedLines.join('\n');
 	return content.trim();
 };

+// Tackle "Model output issue not following the standard Markdown/LaTeX format" in Chinese.
+function processChineseContent(line: string): string {
+	// Problems caused by Chinese parentheses
+	/* Discription:
+	 *   When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style.
+	 *   - e.g. `**中文名（English）**中文内容` will be parsed directly,
+	 *          instead of `<strong>中文名（English）</strong>中文内容`.
+	 * Solution:
+	 *   Adding a `space` before and after the bold/italic part can solve the problem.
+	 *   - e.g. `**中文名（English）**中文内容` -> ` **中文名（English）** 中文内容`
+	 * Note:
+	 *   Similar problem was found with English parentheses and other full delimiters,
+	 *   but they are not handled here because they are less likely to appear in LLM output.
+	 *   Change the behavior in future if needed.
+	 */
+	if (line.includes('*')) {
+		// Handle **bold** with Chinese parentheses
+		line = processChineseParentheses(line, '**', '（', '）');
+		// Handle *italic* with Chinese parentheses
+		line = processChineseParentheses(line, '*', '（', '）');
+	}
+	return line;
+}
+
+function isChineseChar(char: string): boolean {
+	return /\p{Script=Han}/u.test(char);
+}
+
+// Helper function for `processChineseContent`
+function processChineseParentheses(
+	line: string,
+	symbol: string,
+	leftSymbol: string,
+	rightSymbol: string
+): string {
+	// NOTE: If needed, with a little modification, this function can be applied to more cases.
+	const escapedSymbol = escapeRegExp(symbol);
+	const regex = new RegExp(
+		`(.?)(?<!${escapedSymbol})(${escapedSymbol})([^${escapedSymbol}]+)(${escapedSymbol})(?!${escapedSymbol})(.)`,
+		'g'
+	);
+	return line.replace(regex, (match, l, left, content, right, r) => {
+		const result =
+			(content.startsWith(leftSymbol) && l && l.length > 0 && isChineseChar(l[l.length - 1])) ||
+			(content.endsWith(rightSymbol) && r && r.length > 0 && isChineseChar(r[0]));
+
+		if (result) {
+			return `${l} ${left}${content}${right} ${r}`;
+		} else {
+			return match;
+		}
+	});
+}
+
 export function unescapeHtml(html: string) {
 	const doc = new DOMParser().parseFromString(html, 'text/html');
 	return doc.documentElement.textContent;
@ -324,6 +387,7 @@ export const copyToClipboard = async (text, formatted = false) => {
 		};
 		marked.use(markedKatexExtension(options));
 		marked.use(markedExtension(options));
+		// DEVELOPER NOTE: Go to `$lib/components/chat/Messages/Markdown.svelte` to add extra markdown extensions for rendering.

 		const htmlContent = marked.parse(text);