From 254dd4246ec6f664c818532eb03e50f76b81c425 Mon Sep 17 00:00:00 2001 From: Yu QX Date: Mon, 26 May 2025 19:35:28 +0800 Subject: [PATCH] Integrate file into `index.ts`. --- src/lib/utils/index.ts | 80 ++++++++++++++++-- .../processResponseContent/special-cases.ts | 84 ------------------- 2 files changed, 75 insertions(+), 89 deletions(-) delete mode 100644 src/lib/utils/processResponseContent/special-cases.ts diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts index 602a02687..7f9fb859f 100644 --- a/src/lib/utils/index.ts +++ b/src/lib/utils/index.ts @@ -20,8 +20,6 @@ import markedExtension from '$lib/utils/marked/extension'; import markedKatexExtension from '$lib/utils/marked/katex-extension'; import hljs from 'highlight.js'; -import { specialCases } from '$lib/utils/processResponseContent/special-cases'; - ////////////////////////// // Helper functions ////////////////////////// @@ -92,12 +90,84 @@ export const sanitizeResponseContent = (content: string) => { }; export const processResponseContent = (content: string) => { - // This function is used to process the response content - // before the response content is rendered. - content = specialCases(content); + // This function is used to process the response content before the response content is rendered. + /* Description: + * In future development, it is recommended to separate `line to line` processes and `whole content` processes. + * To improve the maintainability, contents here are numbered with indexes to indicate their function, + * because the solutions to problems under the same category might be scattered between `line to line` and `whole content`. + * + * Index: + * 1. Tackle "Model output issue not following the standard Markdown/LaTeX format". + * - This part obeys the rule of modifying original text as **LITTLE** as possible. + * - Detailed documentation of rendering problems must be provided in comments. + * 1.1. Special cases + * 1.1.1. 中文 (Chinese, CN) + * 1.1.1.1. 
Handle **bold** with Chinese parentheses + * 1.1.1.2. Handle *italic* with Chinese parentheses + */ + + // Process from line to line. + const lines = content.split('\n'); + const processedLines = lines.map((line) => { + // 1.1.1. 中文 (Chinese, CN) + if (/[\u4e00-\u9fa5]/.test(line)) { + // 1.1.1.x Problems caused by Chinese parentheses + /* Description: + * When `*` has Chinese parentheses on the inside, the markdown parser ignores the bold or italic style. + * - e.g. `**中文名(English)**中文内容` will be parsed directly, + * instead of `中文名(English)中文内容`. + * Solution: + * Adding a `space` before and after the bold/italic part can solve the problem. + * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` + * Note: + * Similar problem was found with English parentheses and other full delimiters, + * but they are not handled here because they are less likely to appear in LLM output. + * Change the behavior in future if needed. + */ + if (line.includes('*')) { + // 1.1.1.1. Handle **bold** with Chinese parentheses + line = processResponseContent_CN_ParenthesesRelated(line, '**', '(', ')'); + // 1.1.1.2. Handle *italic* with Chinese parentheses + line = processResponseContent_CN_ParenthesesRelated(line, '*', '(', ')'); + } + } + return line; + }); + content = processedLines.join('\n'); + return content.trim(); }; +function isChineseChar(char: string): boolean { + return /\p{Script=Han}/u.test(char); +} + +// Helper function for `processResponseContent` case `1.1.1.1` and `1.1.1.2` +function processResponseContent_CN_ParenthesesRelated( + line: string, + symbol: string, + leftSymbol: string, + rightSymbol: string +): string { + // NOTE: If needed, with a little modification, this function can be applied to more cases. + const escapedSymbol = escapeRegExp(symbol); + const regex = new RegExp( + `(.?)(? 
{ + const result = + (content.startsWith(leftSymbol) && l && l.length > 0 && isChineseChar(l[l.length - 1])) || + (content.endsWith(rightSymbol) && r && r.length > 0 && isChineseChar(r[0])); + + if (result) { + return `${l} ${left}${content}${right} ${r}`; + } else { + return match; + } + }); +} + export function unescapeHtml(html: string) { const doc = new DOMParser().parseFromString(html, 'text/html'); return doc.documentElement.textContent; diff --git a/src/lib/utils/processResponseContent/special-cases.ts b/src/lib/utils/processResponseContent/special-cases.ts deleted file mode 100644 index 771430554..000000000 --- a/src/lib/utils/processResponseContent/special-cases.ts +++ /dev/null @@ -1,84 +0,0 @@ -/* Expliantion: - * This file handles special cases of LLM output not following markdown syntax. - * It obeys the rule of modifying original text as **LITTLE** as possible. - * Detailed documentation of rendering problems is provided in comments. - * More special cases can be added in future. - * Note: - * It should NOT handle the case unless there is clear evidence that it occurs. - * It only deals with special cases, especially with non-English characters, not general ones. - * Other general issues found, new files shall be added to folder `'$lib/utils/processResponseContent/`, - * and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly. - */ - -export const specialCases = (src: string): string => { - const lines = src.split('\n'); // Process from line to line. - const processedLines = lines.map((line) => { - // 1. 中文 (Chinese, CN) - if (/[\u4e00-\u9fa5]/.test(line)) { - // 1.1. Problems caused by Chinese parentheses - /* Discription: - * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. - * - e.g. `**中文名(English)**中文内容` will be parsed directly, - * instead of `中文名(English)中文内容`. - * Solution: - * Adding a `space` before and after the bold/italic part can solve the problem. 
- * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` - * Note: - * Similar problem was found with English parentheses and other full delimiters, - * but they are not handled here because they are less likely to appear in LLM output. - * Change the behavior in future if needed. - */ - - if (line.includes('*')) { - // 1.1.1. Handle **bold** with Chinese parentheses - line = processCN_01(line, '**', '(', ')'); - // 1.1.2. Handle *italic* with Chinese parentheses - line = processCN_01(line, '*', '(', ')'); - } - } - return line; - }); - const result = processedLines.join('\n'); - return result; -}; - -////////////////////////// -// Helper functions -////////////////////////// - -function isChineseChar(char: string): boolean { - return /\p{Script=Han}/u.test(char); -} - -function escapeRegExp(string: string): string { - return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); -} - -////////////////////////// -// Main functions -////////////////////////// - -// Handle case `1.1.1` and `1.1.2` -function processCN_01( - line: string, - symbol: string, - leftSymbol: string, - rightSymbol: string -): string { - const escapedSymbol = escapeRegExp(symbol); - const regex = new RegExp( - `(.?)(? { - const result = - (content.startsWith(leftSymbol) && l && l.length > 0 && isChineseChar(l[l.length - 1])) || - (content.endsWith(rightSymbol) && r && r.length > 0 && isChineseChar(r[0])); - - if (result) { - return `${l} ${left}${content}${right} ${r}`; - } else { - return match; - } - }); -}