// Logger double used by the suites. `warn` is stubbed because the slot
// extractors call `logger.warn` when an entity has no values.
const mockLoggerService = {
  log: jest.fn(),
  error: jest.fn(),
  warn: jest.fn(),
} as unknown as LoggerService;

// The suites below belong inside the enclosing `describe('BaseNlpHelper', ...)`
// block, which provides the `helper` instance under test.

/** Casts a loosely-typed fixture to the entity type expected by the helper. */
const asEntity = (fixture: unknown): NlpEntityFull => fixture as NlpEntityFull;

/** Builds an expected slot; every deterministic match carries confidence 1. */
const expectedSlot = (
  entity: string,
  value: string,
  start: number,
  end: number,
) => ({ entity, value, start, end, confidence: 1 });

describe('extractKeywordBasedSlots', () => {
  it('should return matches for exact keywords and synonyms', () => {
    const colors = asEntity({
      name: 'color',
      values: [
        { value: 'blue', expressions: ['azure', 'navy'] },
        { value: 'green', expressions: ['emerald', 'lime'] },
      ],
    });

    const slots = helper.extractKeywordBasedSlots(
      'The sky is azure and emerald',
      colors,
    );

    // Synonyms resolve to their canonical value; offsets cover the synonym.
    expect(slots).toEqual([
      expectedSlot('color', 'blue', 11, 16),
      expectedSlot('color', 'green', 21, 28),
    ]);
  });

  it('should return empty array if no values present', () => {
    const valueless = asEntity({ name: 'empty', values: [] });

    const slots = helper.extractKeywordBasedSlots('anything', valueless);

    expect(slots).toEqual([]);
  });
});

describe('extractPatternBasedSlots', () => {
  it('should match using a valid regex pattern', () => {
    const numbers = asEntity({
      name: 'number',
      values: [
        {
          value: 'number',
          metadata: { pattern: '\\d+', wordBoundary: true },
        },
      ],
    });

    const slots = helper.extractPatternBasedSlots(
      'Order 123 and 456 now!',
      numbers,
    );

    expect(slots).toEqual([
      expectedSlot('number', '123', 6, 9),
      expectedSlot('number', '456', 14, 17),
    ]);
  });

  it('should respect metadata like toLowerCase and removeSpaces', () => {
    const codes = asEntity({
      name: 'code',
      values: [
        {
          value: 'Code',
          metadata: {
            pattern: 'HEX BOT',
            toLowerCase: true,
            removeSpaces: true,
          },
        },
      ],
    });

    const slots = helper.extractPatternBasedSlots('My CODE is HEX BOT!', codes);

    // The value is normalised, while the offsets still span the raw match.
    expect(slots).toEqual([expectedSlot('code', 'hexbot', 11, 18)]);
  });

  it('should return empty array if no values', () => {
    const valueless = asEntity({ name: 'noop', values: [] });

    const slots = helper.extractPatternBasedSlots('test', valueless);

    expect(slots).toEqual([]);
  });

  it('should handle invalid regex pattern gracefully', () => {
    const broken = asEntity({
      name: 'fail',
      values: [
        {
          value: 'Invalid',
          metadata: { pattern: '[a-', wordBoundary: true },
        },
      ],
    });

    const slots = helper.extractPatternBasedSlots('test', broken);

    expect(slots).toEqual([]);
  });
});

describe('runDeterministicSlotFilling', () => {
  it('should call keyword-based extractor for keyword lookup strategy', () => {
    const keywordSpy = jest.spyOn(helper, 'extractKeywordBasedSlots');
    const patternSpy = jest.spyOn(helper, 'extractPatternBasedSlots');
    const entities: NlpEntityFull[] = [
      asEntity({
        name: 'product',
        lookups: ['keywords'],
        values: [
          { value: 'tshirt', expressions: [] },
          { value: 'pizza', expressions: [] },
        ],
      }),
    ];

    const slots = helper.runDeterministicSlotFilling('order pizza', entities);

    expect(keywordSpy).toHaveBeenCalledTimes(1);
    expect(patternSpy).not.toHaveBeenCalled();
    expect(slots).toHaveLength(1);
    expect(slots[0].entity).toBe('product');
  });

  it('should call pattern-based extractor for pattern lookup strategy', () => {
    const keywordSpy = jest.spyOn(helper, 'extractKeywordBasedSlots');
    const patternSpy = jest.spyOn(helper, 'extractPatternBasedSlots');
    const entities: NlpEntityFull[] = [
      asEntity({
        name: 'number',
        lookups: ['pattern'],
        values: [
          {
            value: 'phone',
            metadata: { pattern: '\\d+' },
            expressions: [],
          },
        ],
      }),
    ];

    const slots = helper.runDeterministicSlotFilling(
      'call me at 1234567890',
      entities,
    );

    expect(patternSpy).toHaveBeenCalledTimes(1);
    expect(keywordSpy).not.toHaveBeenCalled();
    expect(slots).toHaveLength(1);
    expect(slots[0].entity).toBe('number');
  });

  it('should skip entities that do not support the selected lookup strategy', () => {
    const keywordSpy = jest.spyOn(helper, 'extractKeywordBasedSlots');
    const patternSpy = jest.spyOn(helper, 'extractPatternBasedSlots');
    const entities: NlpEntityFull[] = [
      asEntity({
        name: 'irrelevant',
        lookups: ['trait'],
        values: [],
      }),
    ];

    const slots = helper.runDeterministicSlotFilling('any text', entities);

    expect(keywordSpy).not.toHaveBeenCalled();
    expect(patternSpy).not.toHaveBeenCalled();
    expect(slots).toHaveLength(0);
  });
});
/**
 * Finds keyword-based slots for a single entity in the given text.
 *
 * Each entity value is searched together with its synonyms (`expressions`),
 * as a whole word and case-sensitively. At most one slot is produced per
 * entity value: the first occurrence of the first term (canonical value
 * first, then synonyms in declaration order) that appears in the text.
 *
 * Terms are matched literally: regex metacharacters are escaped and empty
 * terms are ignored.
 *
 * @param text - The input text to search.
 * @param entity - The entity whose values and synonyms are looked up.
 *
 * @returns One slot per matched entity value, carrying the canonical `value`,
 * the `start`/`end` offsets of the matched term and a confidence of 1.
 */
public extractKeywordBasedSlots(
  text: string,
  entity: NlpEntityFull,
): NLU.ParseEntity[] {
  if (!entity.values?.length) {
    this.logger.warn('NLP entity has no values');
    return [];
  }

  const slots: NLU.ParseEntity[] = [];

  for (const { value, expressions } of entity.values) {
    // `expressions` may be absent on loosely-built values; treat it as empty
    // instead of throwing while spreading `undefined`.
    const terms = [value, ...(expressions ?? [])];

    for (const term of terms) {
      // An empty term would compile to `\b\b` and match at every word
      // boundary, yielding a meaningless zero-length slot.
      if (!term) {
        continue;
      }

      // Escape metacharacters so that terms such as "a.b" or "(promo)" are
      // matched literally instead of altering or breaking the expression.
      // NOTE(review): `\b` only matches next to a word character, so a term
      // that starts or ends with punctuation (e.g. "c++") may never match.
      const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      const match = new RegExp(`\\b${escapedTerm}\\b`).exec(text);

      if (match) {
        slots.push({
          entity: entity.name,
          value,
          start: match.index,
          end: match.index + term.length,
          confidence: 1,
        });
        // Only the first hit is reported for each entity value.
        break;
      }
    }
  }

  return slots;
}

/**
 * Finds pattern-based slots for a single entity in the given text.
 *
 * Every entity value supplies a regular expression through
 * `metadata.pattern`. The expression is compiled with the global and
 * case-insensitive flags and, when `metadata.wordBoundary` is set, wrapped in
 * `\b...\b`. Every non-empty match is reported.
 *
 * The matched substring becomes the slot value after the optional
 * normalisations requested in the metadata, applied in this order:
 * `removeSpaces`, `toLowerCase`, `stripDiacritics`. The `start`/`end` offsets
 * always refer to the raw match in the original text.
 *
 * Values with a missing or invalid pattern are skipped and an error is logged.
 *
 * NOTE(review): the pattern is compiled verbatim. The value form changed
 * alongside this code appears to seed `pattern` with "//"; confirm that
 * delimiters are stripped before the pattern reaches this method.
 *
 * @param text - Input text to evaluate.
 * @param entity - The entity whose values carry the regex patterns and the
 * optional normalisation flags.
 *
 * @returns The matched slots with their value, position and a confidence of 1.
 */
public extractPatternBasedSlots(
  text: string,
  entity: NlpEntityFull,
): NLU.ParseEntity[] {
  if (!entity.values?.length) {
    this.logger.warn('NLP entity has no values');
    return [];
  }

  return entity.values.flatMap((patternValue): NLU.ParseEntity[] => {
    const metadata = patternValue.metadata;
    const pattern = metadata?.pattern;

    if (!pattern) {
      this.logger.error('Missing NLP regex pattern');
      return [];
    }

    let regex: RegExp;
    try {
      regex = new RegExp(
        metadata?.wordBoundary ? `\\b${pattern}\\b` : pattern,
        'gi',
      );
    } catch {
      // The pattern is user-provided; a malformed one must not abort the
      // extraction for the remaining values.
      this.logger.error('Invalid NLP regex pattern');
      return [];
    }

    return (
      [...text.matchAll(regex)]
        // Patterns that can match the empty string (e.g. `\d*`) would
        // otherwise emit an empty slot at every position.
        .filter((match) => match[0].length > 0)
        .map((match) => {
          let value = match[0];

          if (metadata?.removeSpaces) {
            value = value.replace(/\s+/g, '');
          }

          if (metadata?.toLowerCase) {
            value = value.toLowerCase();
          }

          if (metadata?.stripDiacritics) {
            // Normalise the matched value itself, not the whole input text:
            // NFD splits accented letters into base letter + combining mark,
            // then the combining marks are dropped.
            value = value.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
          }

          // `matchAll` always sets `index` on the matches it yields.
          const start = match.index!;

          return {
            entity: entity.name,
            value,
            start,
            end: start + match[0].length,
            confidence: 1,
          };
        })
    );
  });
}
/**
 * Runs deterministic (non-statistical) slot filling over a list of entities.
 *
 * The strategy is chosen per entity from its own `lookups`:
 * - `keywords`: delegates to `extractKeywordBasedSlots`.
 * - `pattern`: delegates to `extractPatternBasedSlots`.
 * - anything else: the entity contributes no slots.
 *
 * `keywords` is checked first, so an entity declaring both lookups is only
 * processed with the keyword strategy.
 *
 * @param text - The input text from which to extract slot values.
 * @param entities - The entities to look up, each with its values and lookups.
 *
 * @returns The slots found for all entities, concatenated in entity order.
 */
public runDeterministicSlotFilling(
  text: string,
  entities: NlpEntityFull[],
): NLU.ParseEntity[] {
  const slots: NLU.ParseEntity[] = [];

  for (const candidate of entities) {
    if (candidate.lookups.includes('keywords')) {
      slots.push(...this.extractKeywordBasedSlots(text, candidate));
    } else if (candidate.lookups.includes('pattern')) {
      slots.push(...this.extractPatternBasedSlots(text, candidate));
    }
    // Entities using any other lookup are intentionally skipped.
  }

  return slots;
}
b/frontend/src/components/nlp/components/NlpValueForm.tsx @@ -36,6 +36,7 @@ const getDefaultNlpMetadata = ( if (nlpEntity?.lookups.includes(LookupStrategy.pattern)) { return { pattern: "//", + wordBoundary: true, removeSpaces: false, toLowerCase: false, stripDiacritics: false, @@ -160,6 +161,18 @@ export const NlpValueForm: FC> = ({ flags={["i"]} /> + + ( + } + label={t("label.word_boundary")} + /> + )} + /> +