diff --git a/src/lib/utils/index.ts b/src/lib/utils/index.ts index acdcbe1c8..525e36b2f 100644 --- a/src/lib/utils/index.ts +++ b/src/lib/utils/index.ts @@ -20,6 +20,8 @@ import markedExtension from '$lib/utils/marked/extension'; import markedKatexExtension from '$lib/utils/marked/katex-extension'; import hljs from 'highlight.js'; +import { specialCases } from '$lib/utils/processResponseContent/special-cases' + ////////////////////////// // Helper functions ////////////////////////// @@ -90,6 +92,9 @@ export const sanitizeResponseContent = (content: string) => { }; export const processResponseContent = (content: string) => { + // This function is used to process the response content + // before the response content is rendered. + content = specialCases(content); return content.trim(); }; @@ -324,6 +329,7 @@ export const copyToClipboard = async (text, formatted = false) => { }; marked.use(markedKatexExtension(options)); marked.use(markedExtension(options)); + // DEVELOPER NOTE: Go to `$lib/components/chat/Messages/Markdown.svelte` to add extra markdown extensions for rendering. const htmlContent = marked.parse(text); diff --git a/src/lib/utils/processResponseContent/special-cases.ts b/src/lib/utils/processResponseContent/special-cases.ts new file mode 100644 index 000000000..37fcf93a6 --- /dev/null +++ b/src/lib/utils/processResponseContent/special-cases.ts @@ -0,0 +1,81 @@ +/* Explanation: + * This file handles special cases of LLM output not following markdown syntax. + * It obeys the rule of modifying original text as **LITTLE** as possible. + * Detailed documentation of rendering problems is provided in comments. + * More special cases can be added in future. + * Note: + * It should NOT handle the case unless there is clear evidence that it occurs. + * It only deals with special cases, especially with non-English characters, not general ones. 
+ * If other general issues are found, new files should be added to the folder `$lib/utils/processResponseContent/`, + * and function `processResponseContent` in `$lib/utils/index.ts` should be updated accordingly. + */ + +export const specialCases = (src: string): string => { + const lines = src.split('\n'); // Process from line to line. + const processedLines = lines.map(line => { + + // 1. 中文 (Chinese, CN) + if (/[\u4e00-\u9fa5]/.test(line)) { // Only execute if there are Chinese characters. + + // 1.1. Problems caused by Chinese parentheses + /* Description: + * When `*` has Chinese parentheses on the inside, the markdown parser ignores the bold or italic style. + * - e.g. `**中文名(English)**中文内容` will be rendered literally, asterisks included, + * instead of rendering `中文名(English)` in bold followed by `中文内容`. + * Solution: + * Adding a `space` before and after the bold/italic part can solve the problem. + * - e.g. `**中文名(English)**中文内容` -> ` **中文名(English)** 中文内容` + * Note: + * A similar problem was found with English parentheses and other full delimiters, + * but they are not handled here because they are less likely to appear in LLM output. + * Change the behavior in future if needed. + */ + + if (line.includes('*')) { // Only execute if `*` is found in line. + // 1.1.1. Handle **bold** with Chinese parentheses + line = processCN_01(line, '**', '(', ')'); + // 1.1.2. 
Handle *italic* with Chinese parentheses + line = processCN_01(line, '*', '(', ')'); + } + + } + return line; + }); + const result = processedLines.join('\n'); + return result; +} + +////////////////////////// +// Helper functions +////////////////////////// + +function isChineseChar(char: string): boolean { + return /\p{Script=Han}/u.test(char); +} + +function escapeRegExp(string: string): string { + return string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); +} + + +////////////////////////// +// Main functions +////////////////////////// + +// Handle case `1.1.1` and `1.1.2` +function processCN_01(line: string, symbol: string, leftSymbol: string, rightSymbol: string): string { + const escapedSymbol = escapeRegExp(symbol); + const regex = new RegExp(`(.*?)(? { + const result = ( + (content.startsWith(leftSymbol) || content.endsWith(rightSymbol)) && + (!l || (l && l.length > 0 && isChineseChar(l[l.length - 1]))) && + (!r || (r && r.length > 0 && isChineseChar(r[0]))) + ) + if (result) { + return ` ${left}${content}${right} `; + } else { + return match; + } + }); +}