From 49fe137553c6a47be388fac9e6e5e89fd95aee26 Mon Sep 17 00:00:00 2001 From: YuQX Date: Sun, 25 May 2025 13:58:32 +0800 Subject: [PATCH 1/6] fix: Handle special cases of LLM output not following markdown syntax 20250525 This commit introduces a utility function to address rendering issues in LLM output, particularly for Chinese characters and parentheses. The function ensures minimal modification of the original text while fixing markdown parsing problems. Changes include: - Added in for handling specific cases. - Updated in to incorporate the new utility. The fix ensures proper rendering of bold/italic text containing Chinese parentheses, improving readability for non-English content. --- src/lib/utils/index.ts | 6 ++ .../processResponseContent/special-cases.ts | 81 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 src/lib/utils/processResponseContent/special-cases.ts diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts index acdcbe1c8..525e36b2f 100644 --- a/src/lib/utils/index.ts +++ b/src/lib/utils/index.ts @@ -20,6 +20,8 @@ import markedExtension from '$lib/utils/marked/extension'; import markedKatexExtension from '$lib/utils/marked/katex-extension'; import hljs from 'highlight.js'; +import { specialCases } from '$lib/utils/processResponseContent/special-cases' + ////////////////////////// // Helper functions ////////////////////////// @@ -90,6 +92,9 @@ export const sanitizeResponseContent = (content: string) => { }; export const processResponseContent = (content: string) => { + // This function is used to process the response content + // before the response content is rendered. + content = specialCases(content); return content.trim(); }; @@ -324,6 +329,7 @@ export const copyToClipboard = async (text, formatted = false) => { }; marked.use(markedKatexExtension(options)); marked.use(markedExtension(options)); + // DEVELOPER NOTE: Go to `$lib/components/chat/Messages/Markdown.svelte` to add extra markdown extensions for rendering. const htmlContent = marked.parse(text); diff --git a/src/lib/utils/processResponseContent/special-cases.ts b/src/lib/utils/processResponseContent/special-cases.ts new file mode 100644 index 000000000..37fcf93a6 --- /dev/null +++ b/src/lib/utils/processResponseContent/special-cases.ts @@ -0,0 +1,81 @@ +/* Expliantion: + * This file handles special cases of LLM output not following markdown syntax. + * It obeys the rule of modifying original text as **LITTLE** as possible. + * Detailed documentation of rendering problems is provided in comments. + * More special cases can be added in future. + * Note: + * It should NOT handle the case unless there is clear evidence that it occurs. + * It only deals with special cases, especially with non-English characters, not general ones. + * Other general issues found, new files shall be added to folder `'$lib/utils/processResponseContent/`, + * and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly. + */ + +export const specialCases = (src: string): string => { + const lines = src.split('\n'); // Process from line to line. + const processedLines = lines.map(line => { + + // 1. 中文 (Chinese, CN) + if (/[\u4e00-\u9fa5]/.test(line)) { // Only execute if there are Chinese characters. + + // 1.1. Problems caused by Chinese parentheses + /* Discription: + * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. + * - e.g. `**中文名(English)**中文内容` will be parsed directly, + * instead of `中文名(English)中文内容`. + * Solution: + * Adding a `space` before and after the bold/italic part can solve the problem. + * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` + * Note: + * Similar problem was found with English parentheses and other full delimiters, + * but they are not handled here because they are less likely to appear in LLM output. + * Change the behavior in future if needed. + */ + + if (line.includes('*')) { // Only execute if `*` is found in line. + // 1.1.1. Handle **bold** with Chinese parentheses + line = processCN_01(line, '**', '(', ')'); + // 1.1.2. Handle *italic* with Chinese parentheses + line = processCN_01(line, '*', '(', ')'); + } + + } + return line; + }); + const result = processedLines.join('\n'); + return result; +} + +////////////////////////// +// Helper functions +////////////////////////// + +function isChineseChar(char: string): boolean { + return /\p{Script=Han}/u.test(char); +} + +function escapeRegExp(string: string): string { + return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); +} + + +////////////////////////// +// Main functions +////////////////////////// + +// Handle case `1.1.1` and `1.1.2` +function processCN_01(line: string, symbol: string, leftSymbol: string, rightSymbol: string): string { + const escapedSymbol = escapeRegExp(symbol); + const regex = new RegExp(`(.*?)(? { + const result = ( + (content.startsWith(leftSymbol) || content.endsWith(rightSymbol)) && + (!l || (l && l.length > 0 && isChineseChar(l[l.length - 1]))) && + (!r || (r && r.length > 0 && isChineseChar(r[0]))) + ) + if (result) { + return ` ${left}${content}${right} `; + } else { + return match; + } + }); +} From 8ef7938c961235b0c3c8c81335d869fa8a8c6c20 Mon Sep 17 00:00:00 2001 From: YuQX Date: Sun, 25 May 2025 19:37:43 +0800 Subject: [PATCH 2/6] fix: Handle special cases of LLM output not following markdown syntax (20250525, formated) This commit introduces a utility function to address rendering issues in LLM output, particularly for Chinese characters and parentheses. The function ensures minimal modification of the original text while fixing markdown parsing problems. Changes include: - Added in for handling specific cases. - Updated in to incorporate the new utility. The fix ensures proper rendering of bold/italic text containing Chinese parentheses, improving readability for non-English content. --- src/lib/utils/index.ts | 4 +- .../processResponseContent/special-cases.ts | 169 +++++++++--------- 2 files changed, 90 insertions(+), 83 deletions(-) diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts index 525e36b2f..602a02687 100644 --- a/src/lib/utils/index.ts +++ b/src/lib/utils/index.ts @@ -20,7 +20,7 @@ import markedExtension from '$lib/utils/marked/extension'; import markedKatexExtension from '$lib/utils/marked/katex-extension'; import hljs from 'highlight.js'; -import { specialCases } from '$lib/utils/processResponseContent/special-cases' +import { specialCases } from '$lib/utils/processResponseContent/special-cases'; ////////////////////////// // Helper functions @@ -92,7 +92,7 @@ export const sanitizeResponseContent = (content: string) => { }; export const processResponseContent = (content: string) => { - // This function is used to process the response content + // This function is used to process the response content // before the response content is rendered. content = specialCases(content); return content.trim(); diff --git a/src/lib/utils/processResponseContent/special-cases.ts b/src/lib/utils/processResponseContent/special-cases.ts index 37fcf93a6..2267d90ad 100644 --- a/src/lib/utils/processResponseContent/special-cases.ts +++ b/src/lib/utils/processResponseContent/special-cases.ts @@ -1,81 +1,88 @@ -/* Expliantion: - * This file handles special cases of LLM output not following markdown syntax. - * It obeys the rule of modifying original text as **LITTLE** as possible. - * Detailed documentation of rendering problems is provided in comments. - * More special cases can be added in future. - * Note: - * It should NOT handle the case unless there is clear evidence that it occurs. - * It only deals with special cases, especially with non-English characters, not general ones. - * Other general issues found, new files shall be added to folder `'$lib/utils/processResponseContent/`, - * and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly. - */ - -export const specialCases = (src: string): string => { - const lines = src.split('\n'); // Process from line to line. - const processedLines = lines.map(line => { - - // 1. 中文 (Chinese, CN) - if (/[\u4e00-\u9fa5]/.test(line)) { // Only execute if there are Chinese characters. - - // 1.1. Problems caused by Chinese parentheses - /* Discription: - * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. - * - e.g. `**中文名(English)**中文内容` will be parsed directly, - * instead of `中文名(English)中文内容`. - * Solution: - * Adding a `space` before and after the bold/italic part can solve the problem. - * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` - * Note: - * Similar problem was found with English parentheses and other full delimiters, - * but they are not handled here because they are less likely to appear in LLM output. - * Change the behavior in future if needed. - */ - - if (line.includes('*')) { // Only execute if `*` is found in line. - // 1.1.1. Handle **bold** with Chinese parentheses - line = processCN_01(line, '**', '(', ')'); - // 1.1.2. Handle *italic* with Chinese parentheses - line = processCN_01(line, '*', '(', ')'); - } - - } - return line; - }); - const result = processedLines.join('\n'); - return result; -} - -////////////////////////// -// Helper functions -////////////////////////// - -function isChineseChar(char: string): boolean { - return /\p{Script=Han}/u.test(char); -} - -function escapeRegExp(string: string): string { - return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); -} - - -////////////////////////// -// Main functions -////////////////////////// - -// Handle case `1.1.1` and `1.1.2` -function processCN_01(line: string, symbol: string, leftSymbol: string, rightSymbol: string): string { - const escapedSymbol = escapeRegExp(symbol); - const regex = new RegExp(`(.*?)(? { - const result = ( - (content.startsWith(leftSymbol) || content.endsWith(rightSymbol)) && - (!l || (l && l.length > 0 && isChineseChar(l[l.length - 1]))) && - (!r || (r && r.length > 0 && isChineseChar(r[0]))) - ) - if (result) { - return ` ${left}${content}${right} `; - } else { - return match; - } - }); -} +/* Expliantion: + * This file handles special cases of LLM output not following markdown syntax. + * It obeys the rule of modifying original text as **LITTLE** as possible. + * Detailed documentation of rendering problems is provided in comments. + * More special cases can be added in future. + * Note: + * It should NOT handle the case unless there is clear evidence that it occurs. + * It only deals with special cases, especially with non-English characters, not general ones. + * Other general issues found, new files shall be added to folder `'$lib/utils/processResponseContent/`, + * and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly. + */ + +export const specialCases = (src: string): string => { + const lines = src.split('\n'); // Process from line to line. + const processedLines = lines.map((line) => { + // 1. 中文 (Chinese, CN) + if (/[\u4e00-\u9fa5]/.test(line)) { + // Only execute if there are Chinese characters. + + // 1.1. Problems caused by Chinese parentheses + /* Discription: + * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. + * - e.g. `**中文名(English)**中文内容` will be parsed directly, + * instead of `中文名(English)中文内容`. + * Solution: + * Adding a `space` before and after the bold/italic part can solve the problem. + * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` + * Note: + * Similar problem was found with English parentheses and other full delimiters, + * but they are not handled here because they are less likely to appear in LLM output. + * Change the behavior in future if needed. + */ + + if (line.includes('*')) { + // Only execute if `*` is found in line. + + // 1.1.1. Handle **bold** with Chinese parentheses + line = processCN_01(line, '**', '(', ')'); + // 1.1.2. Handle *italic* with Chinese parentheses + line = processCN_01(line, '*', '(', ')'); + } + } + return line; + }); + const result = processedLines.join('\n'); + return result; +}; + +////////////////////////// +// Helper functions +////////////////////////// + +function isChineseChar(char: string): boolean { + return /\p{Script=Han}/u.test(char); +} + +function escapeRegExp(string: string): string { + return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); +} + +////////////////////////// +// Main functions +////////////////////////// + +// Handle case `1.1.1` and `1.1.2` +function processCN_01( + line: string, + symbol: string, + leftSymbol: string, + rightSymbol: string +): string { + const escapedSymbol = escapeRegExp(symbol); + const regex = new RegExp( + `(.*?)(? { + const result = + (content.startsWith(leftSymbol) || content.endsWith(rightSymbol)) && + (!l || (l && l.length > 0 && isChineseChar(l[l.length - 1]))) && + (!r || (r && r.length > 0 && isChineseChar(r[0]))); + if (result) { + return ` ${left}${content}${right} `; + } else { + return match; + } + }); +} From 4483fa95b8a0cdc5c71960efdab3a58b4c78c4bf Mon Sep 17 00:00:00 2001 From: Yu QX Date: Mon, 26 May 2025 17:25:33 +0800 Subject: [PATCH 3/6] Perfect the logic of case 1.1.x --- .../utils/processResponseContent/special-cases.ts | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/lib/utils/processResponseContent/special-cases.ts b/src/lib/utils/processResponseContent/special-cases.ts index 2267d90ad..771430554 100644 --- a/src/lib/utils/processResponseContent/special-cases.ts +++ b/src/lib/utils/processResponseContent/special-cases.ts @@ -15,8 +15,6 @@ export const specialCases = (src: string): string => { const processedLines = lines.map((line) => { // 1. 中文 (Chinese, CN) if (/[\u4e00-\u9fa5]/.test(line)) { - // Only execute if there are Chinese characters. - // 1.1. Problems caused by Chinese parentheses /* Discription: * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. @@ -32,8 +30,6 @@ export const specialCases = (src: string): string => { */ if (line.includes('*')) { - // Only execute if `*` is found in line. - // 1.1.1. Handle **bold** with Chinese parentheses line = processCN_01(line, '**', '(', ')'); // 1.1.2. Handle *italic* with Chinese parentheses @@ -71,16 +67,16 @@ function processCN_01( ): string { const escapedSymbol = escapeRegExp(symbol); const regex = new RegExp( - `(.*?)(? { const result = - (content.startsWith(leftSymbol) || content.endsWith(rightSymbol)) && - (!l || (l && l.length > 0 && isChineseChar(l[l.length - 1]))) && - (!r || (r && r.length > 0 && isChineseChar(r[0]))); + (content.startsWith(leftSymbol) && l && l.length > 0 && isChineseChar(l[l.length - 1])) || + (content.endsWith(rightSymbol) && r && r.length > 0 && isChineseChar(r[0])); + if (result) { - return ` ${left}${content}${right} `; + return `${l} ${left}${content}${right} ${r}`; } else { return match; } From 254dd4246ec6f664c818532eb03e50f76b81c425 Mon Sep 17 00:00:00 2001 From: Yu QX Date: Mon, 26 May 2025 19:35:28 +0800 Subject: [PATCH 4/6] Integrate file into `index.ts`. --- src/lib/utils/index.ts | 80 ++++++++++++++++-- .../processResponseContent/special-cases.ts | 84 ------------------- 2 files changed, 75 insertions(+), 89 deletions(-) delete mode 100644 src/lib/utils/processResponseContent/special-cases.ts diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts index 602a02687..7f9fb859f 100644 --- a/src/lib/utils/index.ts +++ b/src/lib/utils/index.ts @@ -20,8 +20,6 @@ import markedExtension from '$lib/utils/marked/extension'; import markedKatexExtension from '$lib/utils/marked/katex-extension'; import hljs from 'highlight.js'; -import { specialCases } from '$lib/utils/processResponseContent/special-cases'; - ////////////////////////// // Helper functions ////////////////////////// @@ -92,12 +90,84 @@ export const sanitizeResponseContent = (content: string) => { }; export const processResponseContent = (content: string) => { - // This function is used to process the response content - // before the response content is rendered. - content = specialCases(content); + // This function is used to process the response content before the response content is rendered. + /* Discription: + * In future development, it is recommended to seperate `line to line` processes and `whole content` processes. + * To improve the maintainability, contents here are numbered with indexes to indicate their function, + * because the solution to problems under same category might be scattered between `line to line` and `whole content`. + * + * Index: + * 1. Tackle "Model output issue not following the standard Markdown/LaTeX format". + * - This part obeys the rule of modifying original text as **LITTLE** as possible. + * - Detailed documentation of rendering problems must be provided in comments. + * 1.1. Special cases + * 1.1.1. 中文 (Chinese, CN) + * 1.1.1.1. Handle **bold** with Chinese parentheses + * 1.1.1.2. Handle *italic* with Chinese parentheses + */ + + // Process from line to line. + const lines = content.split('\n'); + const processedLines = lines.map((line) => { + // 1.1.1. 中文 (Chinese, CN) + if (/[\u4e00-\u9fa5]/.test(line)) { + // 1.1.1.x Problems caused by Chinese parentheses + /* Discription: + * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. + * - e.g. `**中文名(English)**中文内容` will be parsed directly, + * instead of `中文名(English)中文内容`. + * Solution: + * Adding a `space` before and after the bold/italic part can solve the problem. + * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` + * Note: + * Similar problem was found with English parentheses and other full delimiters, + * but they are not handled here because they are less likely to appear in LLM output. + * Change the behavior in future if needed. + */ + if (line.includes('*')) { + // 1.1.1.1. Handle **bold** with Chinese parentheses + line = processResponseContent_CN_ParenthesesRelated(line, '**', '(', ')'); + // 1.1.1.2. Handle *italic* with Chinese parentheses + line = processResponseContent_CN_ParenthesesRelated(line, '*', '(', ')'); + } + } + return line; + }); + content = processedLines.join('\n'); + return content.trim(); }; +function isChineseChar(char: string): boolean { + return /\p{Script=Han}/u.test(char); +} + +// Helper function for `processResponseContent` case `1.1.1.1` and `1.1.1.2` +function processResponseContent_CN_ParenthesesRelated( + line: string, + symbol: string, + leftSymbol: string, + rightSymbol: string +): string { + // NOTE: If needed, with a little modification, this function can be applied to more cases. + const escapedSymbol = escapeRegExp(symbol); + const regex = new RegExp( + `(.?)(? { + const result = + (content.startsWith(leftSymbol) && l && l.length > 0 && isChineseChar(l[l.length - 1])) || + (content.endsWith(rightSymbol) && r && r.length > 0 && isChineseChar(r[0])); + + if (result) { + return `${l} ${left}${content}${right} ${r}`; + } else { + return match; + } + }); +} + export function unescapeHtml(html: string) { const doc = new DOMParser().parseFromString(html, 'text/html'); return doc.documentElement.textContent; diff --git a/src/lib/utils/processResponseContent/special-cases.ts b/src/lib/utils/processResponseContent/special-cases.ts deleted file mode 100644 index 771430554..000000000 --- a/src/lib/utils/processResponseContent/special-cases.ts +++ /dev/null @@ -1,84 +0,0 @@ -/* Expliantion: - * This file handles special cases of LLM output not following markdown syntax. - * It obeys the rule of modifying original text as **LITTLE** as possible. - * Detailed documentation of rendering problems is provided in comments. - * More special cases can be added in future. - * Note: - * It should NOT handle the case unless there is clear evidence that it occurs. - * It only deals with special cases, especially with non-English characters, not general ones. - * Other general issues found, new files shall be added to folder `'$lib/utils/processResponseContent/`, - * and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly. - */ - -export const specialCases = (src: string): string => { - const lines = src.split('\n'); // Process from line to line. - const processedLines = lines.map((line) => { - // 1. 中文 (Chinese, CN) - if (/[\u4e00-\u9fa5]/.test(line)) { - // 1.1. Problems caused by Chinese parentheses - /* Discription: - * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. - * - e.g. `**中文名(English)**中文内容` will be parsed directly, - * instead of `中文名(English)中文内容`. - * Solution: - * Adding a `space` before and after the bold/italic part can solve the problem. - * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` - * Note: - * Similar problem was found with English parentheses and other full delimiters, - * but they are not handled here because they are less likely to appear in LLM output. - * Change the behavior in future if needed. - */ - - if (line.includes('*')) { - // 1.1.1. Handle **bold** with Chinese parentheses - line = processCN_01(line, '**', '(', ')'); - // 1.1.2. Handle *italic* with Chinese parentheses - line = processCN_01(line, '*', '(', ')'); - } - } - return line; - }); - const result = processedLines.join('\n'); - return result; -}; - -////////////////////////// -// Helper functions -////////////////////////// - -function isChineseChar(char: string): boolean { - return /\p{Script=Han}/u.test(char); -} - -function escapeRegExp(string: string): string { - return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); -} - -////////////////////////// -// Main functions -////////////////////////// - -// Handle case `1.1.1` and `1.1.2` -function processCN_01( - line: string, - symbol: string, - leftSymbol: string, - rightSymbol: string -): string { - const escapedSymbol = escapeRegExp(symbol); - const regex = new RegExp( - `(.?)(? { - const result = - (content.startsWith(leftSymbol) && l && l.length > 0 && isChineseChar(l[l.length - 1])) || - (content.endsWith(rightSymbol) && r && r.length > 0 && isChineseChar(r[0])); - - if (result) { - return `${l} ${left}${content}${right} ${r}`; - } else { - return match; - } - }); -} From b9f74a66b247c5737850ae28c5570042e741a70f Mon Sep 17 00:00:00 2001 From: Yu QX Date: Mon, 26 May 2025 20:43:21 +0800 Subject: [PATCH 5/6] Seperate into `processChineseContent` --- src/lib/utils/index.ts | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts index 7f9fb859f..9c4a5c7ef 100644 --- a/src/lib/utils/index.ts +++ b/src/lib/utils/index.ts @@ -91,27 +91,16 @@ export const sanitizeResponseContent = (content: string) => { export const processResponseContent = (content: string) => { // This function is used to process the response content before the response content is rendered. - /* Discription: - * In future development, it is recommended to seperate `line to line` processes and `whole content` processes. - * To improve the maintainability, contents here are numbered with indexes to indicate their function, - * because the solution to problems under same category might be scattered between `line to line` and `whole content`. - * - * Index: - * 1. Tackle "Model output issue not following the standard Markdown/LaTeX format". - * - This part obeys the rule of modifying original text as **LITTLE** as possible. - * - Detailed documentation of rendering problems must be provided in comments. - * 1.1. Special cases - * 1.1.1. 中文 (Chinese, CN) - * 1.1.1.1. Handle **bold** with Chinese parentheses - * 1.1.1.2. Handle *italic* with Chinese parentheses - */ + content = processChineseContent(content); + return content.trim(); +}; - // Process from line to line. +function processChineseContent(content: string): string { + // Tackle "Model output issue not following the standard Markdown/LaTeX format" in Chinese. const lines = content.split('\n'); const processedLines = lines.map((line) => { - // 1.1.1. 中文 (Chinese, CN) if (/[\u4e00-\u9fa5]/.test(line)) { - // 1.1.1.x Problems caused by Chinese parentheses + // Problems caused by Chinese parentheses /* Discription: * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. * - e.g. `**中文名(English)**中文内容` will be parsed directly, @@ -125,25 +114,23 @@ export const processResponseContent = (content: string) => { * Change the behavior in future if needed. */ if (line.includes('*')) { - // 1.1.1.1. Handle **bold** with Chinese parentheses - line = processResponseContent_CN_ParenthesesRelated(line, '**', '(', ')'); - // 1.1.1.2. Handle *italic* with Chinese parentheses - line = processResponseContent_CN_ParenthesesRelated(line, '*', '(', ')'); + // Handle **bold** with Chinese parentheses + line = processChineseContent_ParenthesesRelated(line, '**', '(', ')'); + // Handle *italic* with Chinese parentheses + line = processChineseContent_ParenthesesRelated(line, '*', '(', ')'); } } return line; }); - content = processedLines.join('\n'); - - return content.trim(); -}; + return processedLines.join('\n'); +} function isChineseChar(char: string): boolean { return /\p{Script=Han}/u.test(char); } -// Helper function for `processResponseContent` case `1.1.1.1` and `1.1.1.2` -function processResponseContent_CN_ParenthesesRelated( +// Helper function for `processChineseContent` +function processChineseContent_ParenthesesRelated( line: string, symbol: string, leftSymbol: string, From bb62bd9c1f5366f771ff8410e4f498eae24564f1 Mon Sep 17 00:00:00 2001 From: Yu QX Date: Mon, 26 May 2025 21:04:08 +0800 Subject: [PATCH 6/6] Extracting the traversal. --- src/lib/utils/index.ts | 55 +++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts index 9c4a5c7ef..58f16c6e9 100644 --- a/src/lib/utils/index.ts +++ b/src/lib/utils/index.ts @@ -91,38 +91,39 @@ export const sanitizeResponseContent = (content: string) => { export const processResponseContent = (content: string) => { // This function is used to process the response content before the response content is rendered. - content = processChineseContent(content); - return content.trim(); -}; - -function processChineseContent(content: string): string { - // Tackle "Model output issue not following the standard Markdown/LaTeX format" in Chinese. const lines = content.split('\n'); const processedLines = lines.map((line) => { if (/[\u4e00-\u9fa5]/.test(line)) { - // Problems caused by Chinese parentheses - /* Discription: - * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. - * - e.g. `**中文名(English)**中文内容` will be parsed directly, - * instead of `中文名(English)中文内容`. - * Solution: - * Adding a `space` before and after the bold/italic part can solve the problem. - * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` - * Note: - * Similar problem was found with English parentheses and other full delimiters, - * but they are not handled here because they are less likely to appear in LLM output. - * Change the behavior in future if needed. - */ - if (line.includes('*')) { - // Handle **bold** with Chinese parentheses - line = processChineseContent_ParenthesesRelated(line, '**', '(', ')'); - // Handle *italic* with Chinese parentheses - line = processChineseContent_ParenthesesRelated(line, '*', '(', ')'); - } + line = processChineseContent(line); } return line; }); - return processedLines.join('\n'); + content = processedLines.join('\n'); + return content.trim(); +}; + +// Tackle "Model output issue not following the standard Markdown/LaTeX format" in Chinese. +function processChineseContent(line: string): string { + // Problems caused by Chinese parentheses + /* Discription: + * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. + * - e.g. `**中文名(English)**中文内容` will be parsed directly, + * instead of `中文名(English)中文内容`. + * Solution: + * Adding a `space` before and after the bold/italic part can solve the problem. + * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` + * Note: + * Similar problem was found with English parentheses and other full delimiters, + * but they are not handled here because they are less likely to appear in LLM output. + * Change the behavior in future if needed. + */ + if (line.includes('*')) { + // Handle **bold** with Chinese parentheses + line = processChineseParentheses(line, '**', '(', ')'); + // Handle *italic* with Chinese parentheses + line = processChineseParentheses(line, '*', '(', ')'); + } + return line; } function isChineseChar(char: string): boolean { @@ -130,7 +131,7 @@ function isChineseChar(char: string): boolean { } // Helper function for `processChineseContent` -function processChineseContent_ParenthesesRelated( +function processChineseParentheses( line: string, symbol: string, leftSymbol: string,