fix: Handle special cases of LLM output not following markdown syntax (20250525, formatted). This commit introduces a utility function, `specialCases`, to address rendering issues in LLM output, particularly for Chinese characters and parentheses. The function modifies the original text as little as possible while fixing markdown parsing problems. Changes include: - Added `special-cases.ts` in `src/lib/utils/processResponseContent/` for handling specific cases. - Updated `processResponseContent` in `src/lib/utils/index.ts` to incorporate the new utility. The fix ensures proper rendering of bold/italic text containing Chinese parentheses, improving readability for non-English content.

2025-06-26 18:26:48 +00:00 · 2025-05-25 19:37:43 +08:00
parent 49fe137553
commit 8ef7938c96
2 changed files with 90 additions and 83 deletions
--- a/src/lib/utils/index.ts
+++ b/src/lib/utils/index.ts
@@ -20,7 +20,7 @@ import markedExtension from '$lib/utils/marked/extension';
 import markedKatexExtension from '$lib/utils/marked/katex-extension';
 import hljs from 'highlight.js';

-import { specialCases } from '$lib/utils/processResponseContent/special-cases'
+import { specialCases } from '$lib/utils/processResponseContent/special-cases';

 //////////////////////////
 // Helper functions
@@ -92,7 +92,7 @@ export const sanitizeResponseContent = (content: string) => {
 };

 export const processResponseContent = (content: string) => {
-	// This function is used to process the response content 
+	// This function is used to process the response content
 	// before the response content is rendered.
 	content = specialCases(content);
 	return content.trim();
--- a/src/lib/utils/processResponseContent/special-cases.ts
+++ b/src/lib/utils/processResponseContent/special-cases.ts
@@ -1,81 +1,88 @@
-/* Expliantion: 
- *   This file handles special cases of LLM output not following markdown syntax.
- *   It obeys the rule of modifying original text as **LITTLE** as possible.
- *   Detailed documentation of rendering problems is provided in comments.
- *   More special cases can be added in future.
- * Note:
- *   It should NOT handle the case unless there is clear evidence that it occurs.
- *   It only deals with special cases, especially with non-English characters, not general ones.
- *   Other general issues found, new files shall be added to folder `'$lib/utils/processResponseContent/`,
- *   and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly.
- */
-
-export const specialCases = (src: string): string => {
-    const lines = src.split('\n'); // Process from line to line.
-    const processedLines = lines.map(line => {
-
-        // 1. 中文 (Chinese, CN)
-        if (/[\u4e00-\u9fa5]/.test(line)) { // Only execute if there are Chinese characters.
-
-            // 1.1. Problems caused by Chinese parentheses
-            /* Discription:
-             *   When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style.
-             *   - e.g. `**中文名（English）**中文内容` will be parsed directly, 
-             *          instead of `<strong>中文名（English）</strong>中文内容`.
-             * Solution:
-             *   Adding a `space` before and after the bold/italic part can solve the problem.
-             *   - e.g. `**中文名（English）**中文内容` -> ` **中文名（English）** 中文内容`
-             * Note:
-             *   Similar problem was found with English parentheses and other full delimiters,
-             *   but they are not handled here because they are less likely to appear in LLM output.
-             *   Change the behavior in future if needed.
-             */
-            
-            if (line.includes('*')) { // Only execute if `*` is found in line.
-            // 1.1.1. Handle **bold** with Chinese parentheses
-            line = processCN_01(line, '**', '（', '）');
-            // 1.1.2. Handle *italic* with Chinese parentheses
-            line = processCN_01(line, '*', '（', '）');
-            }
-
-        }
-        return line;
-    });
-    const result = processedLines.join('\n');
-    return result;
-}
-
-//////////////////////////
-// Helper functions
-//////////////////////////
-
-function isChineseChar(char: string): boolean {
-    return /\p{Script=Han}/u.test(char);
-}
-
-function escapeRegExp(string: string): string {
-	return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
-}
-
-
-//////////////////////////
-// Main functions
-////////////////////////// 
-
-// Handle case `1.1.1` and `1.1.2`
-function processCN_01(line: string, symbol: string, leftSymbol: string, rightSymbol: string): string {
-    const escapedSymbol = escapeRegExp(symbol);
-    const regex = new RegExp(`(.*?)(?<!${escapedSymbol})(${escapedSymbol})([^${escapedSymbol}]+)(${escapedSymbol})(?!${escapedSymbol})(.*?)`, 'g');
-    return line.replace(regex, (match, l, left, content, right, r) => {
-        const result = (
-            (content.startsWith(leftSymbol) || content.endsWith(rightSymbol)) &&
-            (!l || (l && l.length > 0 && isChineseChar(l[l.length - 1]))) &&
-            (!r || (r && r.length > 0 && isChineseChar(r[0])))
-        )
-        if (result) {
-            return ` ${left}${content}${right} `;
-        } else {
-            return match;
-        }
-    });
-}
+/* Explanation:
+ *   This file handles special cases of LLM output not following markdown syntax.
+ *   It obeys the rule of modifying the original text as **LITTLE** as possible.
+ *   Detailed documentation of the rendering problems is provided in comments.
+ *   More special cases can be added in the future.
+ * Note:
+ *   A case should NOT be handled unless there is clear evidence that it occurs.
+ *   This file only deals with special cases, especially those involving non-English characters, not general ones.
+ *   For other, general issues, new files should be added to the folder `$lib/utils/processResponseContent/`,
+ *   and the function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly.
+ */
+
+export const specialCases = (src: string): string => {
+	const lines = src.split('\n'); // Process line by line; each line is rewritten independently and re-joined with '\n' at the end.
+	const processedLines = lines.map((line) => {
+		// 1. Chinese (CN)
+		if (/[\u4e00-\u9fa5]/.test(line)) {
+			// Only run when the line contains at least one character in the range U+4E00–U+9FA5.
+
+			// 1.1. Problems caused by Chinese parentheses
+			/* Description:
+			 *   When the text inside `*` markers begins or ends with Chinese parentheses,
+			 *   the markdown parser ignores the bold or italic style.
+			 *   - e.g. `**中文名（English）**中文内容` is rendered literally,
+			 *          instead of as `<strong>中文名（English）</strong>中文内容`.
+			 * Solution:
+			 *   Adding a space before and after the bold/italic part solves the problem.
+			 *   - e.g. `**中文名（English）**中文内容` -> ` **中文名（English）** 中文内容`
+			 * Note:
+			 *   A similar problem was found with English parentheses and other full-width delimiters,
+			 *   but they are not handled here because they are less likely to appear in LLM output.
+			 *   Change this behavior in the future if needed.
+			 */
+
+			if (line.includes('*')) {
+				// Only run when `*` is found in the line.
+
+				// 1.1.1. Handle **bold** with Chinese parentheses
+				line = processCN_01(line, '**', '（', '）');
+				// 1.1.2. Handle *italic* with Chinese parentheses (runs on the output of the bold pass)
+				line = processCN_01(line, '*', '（', '）');
+			}
+		}
+		return line;
+	});
+	const result = processedLines.join('\n');
+	return result;
+};
+
+//////////////////////////
+// Helper functions
+//////////////////////////
+
+function isChineseChar(char: string): boolean {
+	return /\p{Script=Han}/u.test(char); // True when `char` contains at least one Han-script code point (the test is unanchored).
+}
+
+function escapeRegExp(string: string): string {
+	return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); // Prefix each of - / \ ^ $ * + ? . ( ) | [ ] { } with a backslash so the text matches literally inside a RegExp.
+}
+
+//////////////////////////
+// Main functions
+//////////////////////////
+
+/* Handle cases `1.1.1` and `1.1.2`.
+ *
+ * Finds every emphasis span `symbol…symbol` in `line` (the content may not contain any
+ * character of `symbol`) and pads it with one space on each side when both hold:
+ *   - the span's content starts with `leftSymbol` or ends with `rightSymbol`;
+ *   - the span starts the line or directly follows the previous span, or the character
+ *     right before it is a Han character.
+ * The character after the span is not inspected. Text in front of the span is always
+ * preserved, and spans that do not qualify are returned unchanged.
+ *
+ * @param line        A single line of text (the caller splits on `\n`).
+ * @param symbol      Emphasis marker; the caller passes `**` or `*`.
+ * @param leftSymbol  Opening delimiter to look for at the start of the content, e.g. `（`.
+ * @param rightSymbol Closing delimiter to look for at the end of the content, e.g. `）`.
+ * @returns The line with every qualifying span padded by spaces.
+ */
+function processCN_01(
+	line: string,
+	symbol: string,
+	leftSymbol: string,
+	rightSymbol: string
+): string {
+	const escapedSymbol = escapeRegExp(symbol);
+	// Group 1 lazily captures the text between the previous match and this span; the
+	// look-arounds keep a `*` pass from matching the halves of a `**` marker.
+	const regex = new RegExp(
+		`(.*?)(?<!${escapedSymbol})(${escapedSymbol})([^${escapedSymbol}]+)(${escapedSymbol})(?!${escapedSymbol})`,
+		'g'
+	);
+	return line.replace(
+		regex,
+		(match: string, l: string, left: string, content: string, right: string) => {
+			const hasEdgeDelimiter = content.startsWith(leftSymbol) || content.endsWith(rightSymbol);
+			const precedingIsOk = l.length === 0 || isChineseChar(l[l.length - 1]);
+			if (hasEdgeDelimiter && precedingIsOk) {
+				// `l` is part of the match, so it must be re-emitted or the text before the span is lost.
+				return `${l} ${left}${content}${right} `;
+			}
+			return match;
+		}
+	);
+}