feat: add deterministic slot filling (keyword and pattern lookup strategies)

2025-06-26 18:27:28 +00:00 · 2025-05-13 12:12:40 +01:00 · 2025-05-13 12:12:40 +01:00 · 6e89192f88
commit 6e89192f88
parent 946d940370
7 changed files with 373 additions and 0 deletions
--- a/api/src/helper/lib/test/base-nlp-helper.spec.ts
+++ b/api/src/helper/lib/test/base-nlp-helper.spec.ts
@ -30,6 +30,7 @@ import BaseNlpHelper from '../base-nlp-helper';
 const mockLoggerService = {
  log: jest.fn(),
  error: jest.fn(),
+  warn: jest.fn(),
 } as unknown as LoggerService;

 const mockSettingService = {
@ -218,4 +219,218 @@ describe('BaseNlpHelper', () => {
      );
    });
  });
+
+  describe('extractKeywordBasedSlots', () => { // keyword lookup: values and their synonyms
+    it('should return matches for exact keywords and synonyms', () => {
+      const entity: NlpEntityFull = {
+        name: 'color',
+        values: [
+          { value: 'blue', expressions: ['azure', 'navy'] },
+          { value: 'green', expressions: ['emerald', 'lime'] },
+        ],
+      } as any; // partial fixture: only the fields read by the extractor
+
+      const result = helper.extractKeywordBasedSlots(
+        'The sky is azure and emerald',
+        entity,
+      );
+      expect(result).toEqual([
+        {
+          entity: 'color',
+          value: 'blue', // canonical value, although the synonym 'azure' is what matched
+          start: 11, // offset of 'azure' in the input text
+          end: 16,
+          confidence: 1,
+        },
+        {
+          entity: 'color',
+          value: 'green', // matched through the synonym 'emerald'
+          start: 21,
+          end: 28,
+          confidence: 1,
+        },
+      ]);
+    });
+
+    it('should return empty array if no values present', () => {
+      const result = helper.extractKeywordBasedSlots('anything', {
+        name: 'empty',
+        values: [], // nothing to look up
+      } as any);
+
+      expect(result).toEqual([]);
+    });
+  });
+
+  describe('extractPatternBasedSlots', () => { // pattern lookup: regex taken from metadata.pattern
+    it('should match using a valid regex pattern', () => {
+      const entity: NlpEntityFull = {
+        name: 'number',
+        values: [
+          {
+            value: 'number',
+            metadata: { pattern: '\\d+', wordBoundary: true },
+          },
+        ],
+      } as any; // partial fixture: only the fields read by the extractor
+
+      const result = helper.extractPatternBasedSlots(
+        'Order 123 and 456 now!',
+        entity,
+      );
+      expect(result).toEqual([
+        {
+          entity: 'number',
+          value: '123', // pattern slots carry the matched text, not the value name
+          start: 6,
+          end: 9,
+          confidence: 1,
+        },
+        {
+          entity: 'number',
+          value: '456',
+          start: 14,
+          end: 17,
+          confidence: 1,
+        },
+      ]);
+    });
+
+    it('should respect metadata like toLowerCase and removeSpaces', () => {
+      const entity: NlpEntityFull = {
+        name: 'code',
+        values: [
+          {
+            value: 'Code',
+            metadata: {
+              pattern: 'HEX BOT',
+              toLowerCase: true,
+              removeSpaces: true,
+            },
+          },
+        ],
+      } as any;
+
+      const result = helper.extractPatternBasedSlots(
+        'My CODE is HEX BOT!',
+        entity,
+      );
+      expect(result).toEqual([
+        {
+          entity: 'code',
+          value: 'hexbot', // 'HEX BOT' lowercased with its space removed
+          start: 11,
+          end: 18, // offsets still span the 7 original characters of 'HEX BOT'
+          confidence: 1,
+        },
+      ]);
+    });
+
+    it('should return empty array if no values', () => {
+      const result = helper.extractPatternBasedSlots('test', {
+        name: 'noop',
+        values: [],
+      } as any);
+
+      expect(result).toEqual([]);
+    });
+
+    it('should handle invalid regex pattern gracefully', () => {
+      const entity: NlpEntityFull = {
+        name: 'fail',
+        values: [
+          {
+            value: 'Invalid',
+            metadata: { pattern: '[a-', wordBoundary: true }, // unterminated character class
+          },
+        ],
+      } as any;
+
+      const result = helper.extractPatternBasedSlots('test', entity);
+      expect(result).toEqual([]); // the invalid pattern is skipped instead of throwing
+    });
+  });
+
+  describe('runDeterministicSlotFilling', () => { // dispatch on each entity's `lookups`
+    it('should call keyword-based extractor for keyword lookup strategy', () => {
+      const mockEntities: NlpEntityFull[] = [
+        {
+          name: 'product',
+          lookups: ['keywords'],
+          values: [
+            {
+              value: 'tshirt',
+              expressions: [],
+            },
+            {
+              value: 'pizza',
+              expressions: [],
+            },
+          ],
+        } as unknown as NlpEntityFull,
+      ];
+      jest.spyOn(helper, 'extractKeywordBasedSlots'); // spy only: no mock implementation is set
+      jest.spyOn(helper, 'extractPatternBasedSlots');
+
+      const result = helper.runDeterministicSlotFilling(
+        'order pizza',
+        mockEntities,
+      );
+
+      expect(helper.extractKeywordBasedSlots).toHaveBeenCalledTimes(1);
+      expect(helper.extractPatternBasedSlots).not.toHaveBeenCalled(); // NOTE(review): assumes spies are reset between tests — confirm
+      expect(result).toHaveLength(1); // only 'pizza' occurs in 'order pizza'
+      expect(result[0].entity).toBe('product');
+    });
+
+    it('should call pattern-based extractor for pattern lookup strategy', () => {
+      const mockEntities: NlpEntityFull[] = [
+        {
+          name: 'number',
+          lookups: ['pattern'],
+          values: [
+            {
+              value: 'phone',
+              metadata: { pattern: '\\d+' },
+              expressions: [],
+            },
+          ],
+        } as unknown as NlpEntityFull,
+      ];
+
+      jest.spyOn(helper, 'extractKeywordBasedSlots');
+      jest.spyOn(helper, 'extractPatternBasedSlots');
+
+      const result = helper.runDeterministicSlotFilling(
+        'call me at 1234567890',
+        mockEntities,
+      );
+
+      expect(helper.extractPatternBasedSlots).toHaveBeenCalledTimes(1);
+      expect(helper.extractKeywordBasedSlots).not.toHaveBeenCalled(); // NOTE(review): assumes spies are reset between tests — confirm
+      expect(result).toHaveLength(1); // the digit run is a single match
+      expect(result[0].entity).toBe('number');
+    });
+
+    it('should skip entities that do not support the selected lookup strategy', () => {
+      const mockEntities: NlpEntityFull[] = [
+        {
+          name: 'irrelevant',
+          lookups: ['trait'], // neither 'keywords' nor 'pattern'
+          values: [],
+        } as unknown as NlpEntityFull,
+      ];
+      jest.spyOn(helper, 'extractKeywordBasedSlots');
+      jest.spyOn(helper, 'extractPatternBasedSlots');
+
+      const result = helper.runDeterministicSlotFilling(
+        'any text',
+        mockEntities,
+      );
+
+      expect(helper.extractKeywordBasedSlots).not.toHaveBeenCalled();
+      expect(helper.extractPatternBasedSlots).not.toHaveBeenCalled();
+      expect(result).toHaveLength(0);
+    });
+  });
 });
--- a/api/src/helper/lib/base-nlp-helper.ts
+++ b/api/src/helper/lib/base-nlp-helper.ts
@ -225,4 +225,145 @@ export default abstract class BaseNlpHelper<
    threshold?: boolean,
    project?: string,
  ): Promise<NLU.ParseEntities>;
+
+  /**
+   * Finds keyword-based slots in a text using an entity's values and synonyms.
+   *
+   * For every value of the entity, the canonical `value` is tried first and then
+   * each of its `expressions` (synonyms). Terms are matched literally, as whole
+   * words (`\b…\b`) and case-sensitively. At most one slot is produced per value:
+   * the first occurrence of the first term that matches.
+   *
+   * @param text - The input text to search.
+   * @param entity - The entity whose `values` (and their `expressions`) are looked up.
+   *
+   * @returns The slots found, each holding the entity name, the canonical `value`,
+   * the `start`/`end` offsets of the matched term in `text` and a `confidence` of 1.
+   */
+  public extractKeywordBasedSlots(
+    text: string,
+    entity: NlpEntityFull,
+  ): NLU.ParseEntity[] {
+    if (!entity.values?.length) {
+      this.logger.warn('NLP entity has no values');
+      return [];
+    }
+
+    const slots: NLU.ParseEntity[] = [];
+
+    for (const { value, expressions } of entity.values) {
+      // Canonical value first, then synonyms. Empty terms are dropped because
+      // `\b\b` would match at every word boundary and yield zero-length slots.
+      const terms = [value, ...(expressions ?? [])].filter((term) => !!term);
+
+      for (const term of terms) {
+        // Escape regex metacharacters so a keyword such as "c++" or "a.b" is
+        // matched literally instead of throwing or acting as a pattern.
+        const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+        const match = new RegExp(`\\b${escaped}\\b`).exec(text);
+
+        if (match) {
+          slots.push({
+            entity: entity.name,
+            value,
+            start: match.index,
+            end: match.index + match[0].length,
+            confidence: 1,
+          });
+          // Only the first hit is reported for a given value.
+          break;
+        }
+      }
+    }
+
+    return slots;
+  }
+
+  /**
+   * Finds pattern-based slots in a text using the regex stored in each value's
+   * `metadata.pattern`.
+   *
+   * Patterns are applied globally and case-insensitively (`gi`), and are wrapped
+   * in word boundaries when `metadata.wordBoundary` is set. Values with a missing
+   * or invalid pattern are logged and skipped. The matched text can be
+   * post-processed through `removeSpaces`, `toLowerCase` and `stripDiacritics`;
+   * `start`/`end` always refer to the untouched match in `text`.
+   *
+   * @param text - Input text to evaluate.
+   * @param entity - The entity whose values carry a regex in `metadata.pattern`.
+   * @returns One slot per non-empty match, with the (post-processed) matched text
+   * as `value`, its position in `text` and a `confidence` of 1.
+   */
+  public extractPatternBasedSlots(
+    text: string,
+    entity: NlpEntityFull,
+  ): NLU.ParseEntity[] {
+    if (!entity.values?.length) {
+      this.logger.warn('NLP entity has no values');
+      return [];
+    }
+
+    const slots: NLU.ParseEntity[] = [];
+
+    for (const { metadata } of entity.values) {
+      const pattern = metadata?.pattern;
+
+      if (!pattern) {
+        this.logger.error('Missing NLP regex pattern');
+        continue;
+      }
+
+      let regex: RegExp;
+      try {
+        regex = new RegExp(
+          metadata?.wordBoundary ? `\\b${pattern}\\b` : pattern,
+          'gi',
+        );
+      } catch {
+        // A malformed user-supplied pattern must not break the other values.
+        this.logger.error('Invalid NLP regex pattern');
+        continue;
+      }
+
+      for (const match of text.matchAll(regex)) {
+        const matched = match[0];
+
+        // Patterns such as `\d*` also match the empty string; those hits carry
+        // no slot value, so they are ignored.
+        if (!matched) {
+          continue;
+        }
+
+        let value = matched;
+
+        // Apply the optional post-processing to the matched text only.
+        if (metadata?.removeSpaces) {
+          value = value.replace(/\s+/g, '');
+        }
+
+        if (metadata?.toLowerCase) {
+          value = value.toLowerCase();
+        }
+
+        if (metadata?.stripDiacritics) {
+          // Decompose accented characters, then drop the combining marks.
+          // This must operate on `value`: using `text` here would replace the
+          // slot value with the whole input.
+          value = value.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
+        }
+
+        // `index` is always set on `matchAll` results; the fallback only
+        // satisfies the type checker.
+        const start = match.index ?? 0;
+
+        slots.push({
+          entity: entity.name,
+          value,
+          start,
+          end: start + matched.length,
+          confidence: 1,
+        });
+      }
+    }
+
+    return slots;
+  }
+
+  /**
+   * Runs deterministic slot filling over a list of entities.
+   *
+   * Each entity is dispatched according to its `lookups`:
+   * - `keywords`: handled by `extractKeywordBasedSlots` (checked first, so it
+   *   wins when an entity lists both strategies).
+   * - `pattern`: handled by `extractPatternBasedSlots`.
+   * - anything else: the entity is ignored.
+   *
+   * @param text - The input text from which to extract slot values.
+   * @param entities - The entities to look up, each carrying its `lookups` and values.
+   *
+   * @returns The slots found for all entities, concatenated in entity order.
+   */
+  public runDeterministicSlotFilling(
+    text: string,
+    entities: NlpEntityFull[],
+  ): NLU.ParseEntity[] {
+    const slots: NLU.ParseEntity[] = [];
+
+    for (const entity of entities) {
+      const { lookups } = entity;
+
+      if (lookups.includes('keywords')) {
+        slots.push(...this.extractKeywordBasedSlots(text, entity));
+        continue;
+      }
+
+      if (lookups.includes('pattern')) {
+        slots.push(...this.extractPatternBasedSlots(text, entity));
+      }
+    }
+
+    return slots;
+  }
 }
--- a/api/src/nlp/schemas/types.ts
+++ b/api/src/nlp/schemas/types.ts
@ -40,6 +40,7 @@ export type NlpCacheMap = Map<string, NlpEntityFull>;
 export type NlpMetadata = {
  // Required when lookups is "pattern"
  pattern?: string;
+  wordBoundary?: boolean;
  removeSpaces?: boolean;
  toLowerCase?: boolean;
  stripDiacritics?: boolean;
--- a/frontend/public/locales/en/translation.json
+++ b/frontend/public/locales/en/translation.json
@ -351,6 +351,7 @@
    "doc": "Documentation",
    "builtin": "Built-in?",
    "weight": "Weight",
+    "word_boundary": "Word boundary",
    "remove_spaces": "Remove spaces",
    "to_lower_case": "Lowercase",
    "strip_diacritics": "Strip diacritics",
--- a/frontend/public/locales/fr/translation.json
+++ b/frontend/public/locales/fr/translation.json
@ -350,6 +350,7 @@
    "synonyms": "Synonymes",
    "doc": "Documentation",
    "weight": "Poids",
+    "word_boundary": "Délimiter (Mot)",
    "remove_spaces": "Supprimer les espaces",
    "to_lower_case": "Mettre en minucules",
    "strip_diacritics": "Supprimer les accents",
--- a/frontend/src/components/nlp/components/NlpValueForm.tsx
+++ b/frontend/src/components/nlp/components/NlpValueForm.tsx
@ -36,6 +36,7 @@ const getDefaultNlpMetadata = (
  if (nlpEntity?.lookups.includes(LookupStrategy.pattern)) {
    return {
      pattern: "//",
+      wordBoundary: true,
      removeSpaces: false,
      toLowerCase: false,
      stripDiacritics: false,
@ -160,6 +161,18 @@ export const NlpValueForm: FC<ComponentFormProps<INlpValue, INlpEntity>> = ({
                  flags={["i"]}
                />
              </ContentItem>
+              <ContentItem>
+                <Controller
+                  name="metadata.wordBoundary"
+                  control={control}
+                  render={({ field }) => (
+                    <FormControlLabel
+                      control={<Switch {...field} checked={field.value} />}
+                      label={t("label.word_boundary")}
+                    />
+                  )}
+                />
+              </ContentItem>
              <ContentItem>
                <Controller
                  name="metadata.removeSpaces"
--- a/frontend/src/types/nlp-entity.types.ts
+++ b/frontend/src/types/nlp-entity.types.ts
@ -23,6 +23,7 @@ export type Lookup = `${LookupStrategy}`;
 export interface INlpMetadata {
  // Required when lookups is "pattern"
  pattern?: string;
+  wordBoundary?: boolean;
  removeSpaces?: boolean;
  toLowerCase?: boolean;
  stripDiacritics?: boolean;