From 254dd4246ec6f664c818532eb03e50f76b81c425 Mon Sep 17 00:00:00 2001
From: Yu QX <bigheadcai@sina.com>
Date: Mon, 26 May 2025 19:35:28 +0800
Subject: [PATCH] Integrate file into `index.ts`.

---
 src/lib/utils/index.ts                        | 80 ++++++++++++++++--
 .../processResponseContent/special-cases.ts   | 84 -------------------
 2 files changed, 75 insertions(+), 89 deletions(-)
 delete mode 100644 src/lib/utils/processResponseContent/special-cases.ts
diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts
index 602a02687..7f9fb859f 100644
--- a/src/lib/utils/index.ts
+++ b/src/lib/utils/index.ts
@@ -20,8 +20,6 @@ import markedExtension from '$lib/utils/marked/extension';
 import markedKatexExtension from '$lib/utils/marked/katex-extension';
 import hljs from 'highlight.js';
 
-import { specialCases } from '$lib/utils/processResponseContent/special-cases';
-
 //////////////////////////
 // Helper functions
 //////////////////////////
@@ -92,12 +90,84 @@ export const sanitizeResponseContent = (content: string) => {
 };
 
 export const processResponseContent = (content: string) => {
-	// This function is used to process the response content
-	// before the response content is rendered.
-	content = specialCases(content);
+	// This function is used to process the response content before the response content is rendered.
+	/* Description:
+	 *   In future development, it is recommended to separate `line to line` processes and `whole content` processes.
+	 *   To improve maintainability, the contents here are numbered with indexes to indicate their function,
+	 *   because the solutions to problems under the same category might be scattered between `line to line` and `whole content`.
+	 *
+	 * Index:
+	 *   1. Tackle "Model output issue not following the standard Markdown/LaTeX format".
+	 *      - This part obeys the rule of modifying original text as **LITTLE** as possible.
+	 *      - Detailed documentation of rendering problems must be provided in comments.
+	 *   1.1. Special cases
+	 *   1.1.1. 中文 (Chinese, CN)
+	 *   1.1.1.1. Handle **bold** with Chinese parentheses
+	 *   1.1.1.2. Handle *italic* with Chinese parentheses
+	 */
+
+	// Process from line to line.
+	const lines = content.split('\n');
+	const processedLines = lines.map((line) => {
+		// 1.1.1. 中文 (Chinese, CN)
+		if (/[\u4e00-\u9fa5]/.test(line)) {
+			// 1.1.1.x Problems caused by Chinese parentheses
+			/* Description:
+			 *   When `*` has Chinese parentheses on the inside, the markdown parser ignores the bold or italic style.
+			 *   - e.g. `**中文名（English）**中文内容` will be parsed directly,
+			 *          instead of `<strong>中文名（English）</strong>中文内容`.
+			 * Solution:
+			 *   Adding a `space` before and after the bold/italic part can solve the problem.
+			 *   - e.g. `**中文名（English）**中文内容` -> ` **中文名（English）** 中文内容`
+			 * Note:
+			 *   A similar problem was found with English parentheses and other full-width delimiters,
+			 *   but they are not handled here because they are less likely to appear in LLM output.
+			 *   Change this behavior in the future if needed.
+			 */
+			if (line.includes('*')) {
+				// 1.1.1.1. Handle **bold** with Chinese parentheses
+				line = processResponseContent_CN_ParenthesesRelated(line, '**', '（', '）');
+				// 1.1.1.2. Handle *italic* with Chinese parentheses
+				line = processResponseContent_CN_ParenthesesRelated(line, '*', '（', '）');
+			}
+		}
+		return line;
+	});
+	content = processedLines.join('\n');
+
 	return content.trim();
 };
 
+function isChineseChar(char: string): boolean {
+	return /\p{Script=Han}/u.test(char);
+}
+
+// Helper function for `processResponseContent` case `1.1.1.1` and `1.1.1.2`
+function processResponseContent_CN_ParenthesesRelated(
+	line: string,
+	symbol: string,
+	leftSymbol: string,
+	rightSymbol: string
+): string {
+	// NOTE: If needed, with a little modification, this function can be applied to more cases.
+	const escapedSymbol = escapeRegExp(symbol);
+	const regex = new RegExp(
+		`(.?)(?<!${escapedSymbol})(${escapedSymbol})([^${escapedSymbol}]+)(${escapedSymbol})(?!${escapedSymbol})(.)`,
+		'g'
+	);
+	return line.replace(regex, (match, l, left, content, right, r) => {
+		const result =
+			(content.startsWith(leftSymbol) && l && l.length > 0 && isChineseChar(l[l.length - 1])) ||
+			(content.endsWith(rightSymbol) && r && r.length > 0 && isChineseChar(r[0]));
+
+		if (result) {
+			return `${l} ${left}${content}${right} ${r}`;
+		} else {
+			return match;
+		}
+	});
+}
+
 export function unescapeHtml(html: string) {
 	const doc = new DOMParser().parseFromString(html, 'text/html');
 	return doc.documentElement.textContent;
diff --git a/src/lib/utils/processResponseContent/special-cases.ts b/src/lib/utils/processResponseContent/special-cases.ts
deleted file mode 100644
index 771430554..000000000
--- a/src/lib/utils/processResponseContent/special-cases.ts
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Expliantion:
- *   This file handles special cases of LLM output not following markdown syntax.
- *   It obeys the rule of modifying original text as **LITTLE** as possible.
- *   Detailed documentation of rendering problems is provided in comments.
- *   More special cases can be added in future.
- * Note:
- *   It should NOT handle the case unless there is clear evidence that it occurs.
- *   It only deals with special cases, especially with non-English characters, not general ones.
- *   Other general issues found, new files shall be added to folder `'$lib/utils/processResponseContent/`,
- *   and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly.
- */
-
-export const specialCases = (src: string): string => {
-	const lines = src.split('\n'); // Process from line to line.
-	const processedLines = lines.map((line) => {
-		// 1. 中文 (Chinese, CN)
-		if (/[\u4e00-\u9fa5]/.test(line)) {
-			// 1.1. Problems caused by Chinese parentheses
-			/* Discription:
-			 *   When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style.
-			 *   - e.g. `**中文名（English）**中文内容` will be parsed directly,
-			 *          instead of `<strong>中文名（English）</strong>中文内容`.
-			 * Solution:
-			 *   Adding a `space` before and after the bold/italic part can solve the problem.
-			 *   - e.g. `**中文名（English）**中文内容` -> ` **中文名（English）** 中文内容`
-			 * Note:
-			 *   Similar problem was found with English parentheses and other full delimiters,
-			 *   but they are not handled here because they are less likely to appear in LLM output.
-			 *   Change the behavior in future if needed.
-			 */
-
-			if (line.includes('*')) {
-				// 1.1.1. Handle **bold** with Chinese parentheses
-				line = processCN_01(line, '**', '（', '）');
-				// 1.1.2. Handle *italic* with Chinese parentheses
-				line = processCN_01(line, '*', '（', '）');
-			}
-		}
-		return line;
-	});
-	const result = processedLines.join('\n');
-	return result;
-};
-
-//////////////////////////
-// Helper functions
-//////////////////////////
-
-function isChineseChar(char: string): boolean {
-	return /\p{Script=Han}/u.test(char);
-}
-
-function escapeRegExp(string: string): string {
-	return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
-}
-
-//////////////////////////
-// Main functions
-//////////////////////////
-
-// Handle case `1.1.1` and `1.1.2`
-function processCN_01(
-	line: string,
-	symbol: string,
-	leftSymbol: string,
-	rightSymbol: string
-): string {
-	const escapedSymbol = escapeRegExp(symbol);
-	const regex = new RegExp(
-		`(.?)(?<!${escapedSymbol})(${escapedSymbol})([^${escapedSymbol}]+)(${escapedSymbol})(?!${escapedSymbol})(.)`,
-		'g'
-	);
-	return line.replace(regex, (match, l, left, content, right, r) => {
-		const result =
-			(content.startsWith(leftSymbol) && l && l.length > 0 && isChineseChar(l[l.length - 1])) ||
-			(content.endsWith(rightSymbol) && r && r.length > 0 && isChineseChar(r[0]));
-
-		if (result) {
-			return `${l} ${left}${content}${right} ${r}`;
-		} else {
-			return match;
-		}
-	});
-}