diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts index 525e36b2f..602a02687 100644 --- a/src/lib/utils/index.ts +++ b/src/lib/utils/index.ts @@ -20,7 +20,7 @@ import markedExtension from '$lib/utils/marked/extension'; import markedKatexExtension from '$lib/utils/marked/katex-extension'; import hljs from 'highlight.js'; -import { specialCases } from '$lib/utils/processResponseContent/special-cases' +import { specialCases } from '$lib/utils/processResponseContent/special-cases'; ////////////////////////// // Helper functions @@ -92,7 +92,7 @@ export const sanitizeResponseContent = (content: string) => { }; export const processResponseContent = (content: string) => { - // This function is used to process the response content + // This function is used to process the response content // before the response content is rendered. content = specialCases(content); return content.trim(); diff --git a/src/lib/utils/processResponseContent/special-cases.ts b/src/lib/utils/processResponseContent/special-cases.ts index 37fcf93a6..2267d90ad 100644 --- a/src/lib/utils/processResponseContent/special-cases.ts +++ b/src/lib/utils/processResponseContent/special-cases.ts @@ -1,81 +1,88 @@ -/* Expliantion: - * This file handles special cases of LLM output not following markdown syntax. - * It obeys the rule of modifying original text as **LITTLE** as possible. - * Detailed documentation of rendering problems is provided in comments. - * More special cases can be added in future. - * Note: - * It should NOT handle the case unless there is clear evidence that it occurs. - * It only deals with special cases, especially with non-English characters, not general ones. - * Other general issues found, new files shall be added to folder `'$lib/utils/processResponseContent/`, - * and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly. 
- */ - -export const specialCases = (src: string): string => { - const lines = src.split('\n'); // Process from line to line. - const processedLines = lines.map(line => { - - // 1. 中文 (Chinese, CN) - if (/[\u4e00-\u9fa5]/.test(line)) { // Only execute if there are Chinese characters. - - // 1.1. Problems caused by Chinese parentheses - /* Discription: - * When `*` has Chinese parentheses on the inside, markdown parser ignore bold or italic style. - * - e.g. `**中文名(English)**中文内容` will be parsed directly, - * instead of `中文名(English)中文内容`. - * Solution: - * Adding a `space` before and after the bold/italic part can solve the problem. - * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` - * Note: - * Similar problem was found with English parentheses and other full delimiters, - * but they are not handled here because they are less likely to appear in LLM output. - * Change the behavior in future if needed. - */ - - if (line.includes('*')) { // Only execute if `*` is found in line. - // 1.1.1. Handle **bold** with Chinese parentheses - line = processCN_01(line, '**', '(', ')'); - // 1.1.2. Handle *italic* with Chinese parentheses - line = processCN_01(line, '*', '(', ')'); - } - - } - return line; - }); - const result = processedLines.join('\n'); - return result; -} - -////////////////////////// -// Helper functions -////////////////////////// - -function isChineseChar(char: string): boolean { - return /\p{Script=Han}/u.test(char); -} - -function escapeRegExp(string: string): string { - return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); -} - - -////////////////////////// -// Main functions -////////////////////////// - -// Handle case `1.1.1` and `1.1.2` -function processCN_01(line: string, symbol: string, leftSymbol: string, rightSymbol: string): string { - const escapedSymbol = escapeRegExp(symbol); - const regex = new RegExp(`(.*?)(? 
{ - const result = ( - (content.startsWith(leftSymbol) || content.endsWith(rightSymbol)) && - (!l || (l && l.length > 0 && isChineseChar(l[l.length - 1]))) && - (!r || (r && r.length > 0 && isChineseChar(r[0]))) - ) - if (result) { - return ` ${left}${content}${right} `; - } else { - return match; - } - }); -} +/* Explanation: + * This file handles special cases of LLM output not following markdown syntax. + * It obeys the rule of modifying original text as **LITTLE** as possible. + * Detailed documentation of rendering problems is provided in comments. + * More special cases can be added in future. + * Note: + * It should NOT handle the case unless there is clear evidence that it occurs. + * It only deals with special cases, especially with non-English characters, not general ones. + * If other general issues are found, new files shall be added to folder `$lib/utils/processResponseContent/`, + * and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly. + */ + +export const specialCases = (src: string): string => { + const lines = src.split('\n'); // Process from line to line. + const processedLines = lines.map((line) => { + // 1. 中文 (Chinese, CN) + if (/[\u4e00-\u9fa5]/.test(line)) { + // Only execute if there are Chinese characters. + + // 1.1. Problems caused by Chinese parentheses + /* Description: + * When `*` has Chinese parentheses on the inside, the markdown parser ignores the bold or italic style. + * - e.g. `**中文名(English)**中文内容` will be parsed directly, + * instead of `中文名(English)中文内容`. + * Solution: + * Adding a `space` before and after the bold/italic part can solve the problem. + * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` + * Note: + * Similar problem was found with English parentheses and other full delimiters, + * but they are not handled here because they are less likely to appear in LLM output. + * Change the behavior in future if needed. 
+ */ + + if (line.includes('*')) { + // Only execute if `*` is found in line. + + // 1.1.1. Handle **bold** with Chinese parentheses + line = processCN_01(line, '**', '(', ')'); + // 1.1.2. Handle *italic* with Chinese parentheses + line = processCN_01(line, '*', '(', ')'); + } + } + return line; + }); + const result = processedLines.join('\n'); + return result; +}; + +////////////////////////// +// Helper functions +////////////////////////// + +function isChineseChar(char: string): boolean { + return /\p{Script=Han}/u.test(char); +} + +function escapeRegExp(string: string): string { + return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); +} + +////////////////////////// +// Main functions +////////////////////////// + +// Handle case `1.1.1` and `1.1.2` +function processCN_01( + line: string, + symbol: string, + leftSymbol: string, + rightSymbol: string +): string { + const escapedSymbol = escapeRegExp(symbol); + const regex = new RegExp( + `(.*?)(? { + const result = + (content.startsWith(leftSymbol) || content.endsWith(rightSymbol)) && + (!l || (l && l.length > 0 && isChineseChar(l[l.length - 1]))) && + (!r || (r && r.length > 0 && isChineseChar(r[0]))); + if (result) { + return ` ${left}${content}${right} `; + } else { + return match; + } + }); +}