feat: add deterministic slot filling (keyword and pattern lookup strategies)

This commit is contained in:
Mohamed Marrouchi 2025-05-13 12:12:40 +01:00
parent 946d940370
commit 6e89192f88
7 changed files with 373 additions and 0 deletions

View File

@ -30,6 +30,7 @@ import BaseNlpHelper from '../base-nlp-helper';
// Logger stand-in for the specs: only `log`, `error` and `warn` are stubbed
// (each as a bare jest mock), hence the double cast to LoggerService.
const mockLoggerService = {
  log: jest.fn(),
  error: jest.fn(),
  warn: jest.fn(),
} as unknown as LoggerService;
const mockSettingService = {
@ -218,4 +219,218 @@ describe('BaseNlpHelper', () => {
);
});
});
describe('extractKeywordBasedSlots', () => {
  it('should return matches for exact keywords and synonyms', () => {
    // Neither canonical value occurs in the sentence; one synonym of each does.
    const colorEntity = {
      name: 'color',
      values: [
        { value: 'blue', expressions: ['azure', 'navy'] },
        { value: 'green', expressions: ['emerald', 'lime'] },
      ],
    } as unknown as NlpEntityFull;
    const sentence = 'The sky is azure and emerald';
    // Offsets are 0-based with an exclusive end:
    // "azure" spans 11..16 and "emerald" spans 21..28.
    const expectedSlots = [
      { entity: 'color', value: 'blue', start: 11, end: 16, confidence: 1 },
      { entity: 'color', value: 'green', start: 21, end: 28, confidence: 1 },
    ];

    const slots = helper.extractKeywordBasedSlots(sentence, colorEntity);

    expect(slots).toEqual(expectedSlots);
  });

  it('should return empty array if no values present', () => {
    const emptyEntity = {
      name: 'empty',
      values: [],
    } as unknown as NlpEntityFull;

    const slots = helper.extractKeywordBasedSlots('anything', emptyEntity);

    expect(slots).toEqual([]);
  });
});
describe('extractPatternBasedSlots', () => {
  it('should match using a valid regex pattern', () => {
    const numberEntity = {
      name: 'number',
      values: [
        {
          value: 'number',
          metadata: { pattern: '\\d+', wordBoundary: true },
        },
      ],
    } as unknown as NlpEntityFull;
    // "123" spans 6..9 and "456" spans 14..17 (0-based, exclusive end).
    const expectedSlots = [
      { entity: 'number', value: '123', start: 6, end: 9, confidence: 1 },
      { entity: 'number', value: '456', start: 14, end: 17, confidence: 1 },
    ];

    const slots = helper.extractPatternBasedSlots(
      'Order 123 and 456 now!',
      numberEntity,
    );

    expect(slots).toEqual(expectedSlots);
  });

  it('should respect metadata like toLowerCase and removeSpaces', () => {
    const codeEntity = {
      name: 'code',
      values: [
        {
          value: 'Code',
          metadata: {
            pattern: 'HEX BOT',
            toLowerCase: true,
            removeSpaces: true,
          },
        },
      ],
    } as unknown as NlpEntityFull;
    // The reported value is normalized ("hexbot"), while start/end still
    // delimit the raw 7-character match "HEX BOT" in the input (11..18).
    const expectedSlots = [
      { entity: 'code', value: 'hexbot', start: 11, end: 18, confidence: 1 },
    ];

    const slots = helper.extractPatternBasedSlots(
      'My CODE is HEX BOT!',
      codeEntity,
    );

    expect(slots).toEqual(expectedSlots);
  });

  it('should return empty array if no values', () => {
    const noopEntity = {
      name: 'noop',
      values: [],
    } as unknown as NlpEntityFull;

    const slots = helper.extractPatternBasedSlots('test', noopEntity);

    expect(slots).toEqual([]);
  });

  it('should handle invalid regex pattern gracefully', () => {
    // '[a-' is an unterminated character class, so the RegExp cannot be built.
    const brokenEntity = {
      name: 'fail',
      values: [
        {
          value: 'Invalid',
          metadata: { pattern: '[a-', wordBoundary: true },
        },
      ],
    } as unknown as NlpEntityFull;

    const slots = helper.extractPatternBasedSlots('test', brokenEntity);

    expect(slots).toEqual([]);
  });
});
describe('runDeterministicSlotFilling', () => {
  it('should call keyword-based extractor for keyword lookup strategy', () => {
    const productEntity = {
      name: 'product',
      lookups: ['keywords'],
      values: [
        { value: 'tshirt', expressions: [] },
        { value: 'pizza', expressions: [] },
      ],
    } as unknown as NlpEntityFull;
    const keywordSpy = jest.spyOn(helper, 'extractKeywordBasedSlots');
    const patternSpy = jest.spyOn(helper, 'extractPatternBasedSlots');

    const slots = helper.runDeterministicSlotFilling('order pizza', [
      productEntity,
    ]);

    expect(keywordSpy).toHaveBeenCalledTimes(1);
    expect(patternSpy).not.toHaveBeenCalled();
    // Only "pizza" occurs in the text, so a single slot is expected.
    expect(slots).toHaveLength(1);
    expect(slots[0].entity).toBe('product');
  });

  it('should call pattern-based extractor for pattern lookup strategy', () => {
    const numberEntity = {
      name: 'number',
      lookups: ['pattern'],
      values: [
        {
          value: 'phone',
          metadata: { pattern: '\\d+' },
          expressions: [],
        },
      ],
    } as unknown as NlpEntityFull;
    const keywordSpy = jest.spyOn(helper, 'extractKeywordBasedSlots');
    const patternSpy = jest.spyOn(helper, 'extractPatternBasedSlots');

    const slots = helper.runDeterministicSlotFilling('call me at 1234567890', [
      numberEntity,
    ]);

    expect(patternSpy).toHaveBeenCalledTimes(1);
    expect(keywordSpy).not.toHaveBeenCalled();
    expect(slots).toHaveLength(1);
    expect(slots[0].entity).toBe('number');
  });

  it('should skip entities that do not support the selected lookup strategy', () => {
    // 'trait' is neither 'keywords' nor 'pattern', so no extractor may run.
    const traitEntity = {
      name: 'irrelevant',
      lookups: ['trait'],
      values: [],
    } as unknown as NlpEntityFull;
    const keywordSpy = jest.spyOn(helper, 'extractKeywordBasedSlots');
    const patternSpy = jest.spyOn(helper, 'extractPatternBasedSlots');

    const slots = helper.runDeterministicSlotFilling('any text', [traitEntity]);

    expect(keywordSpy).not.toHaveBeenCalled();
    expect(patternSpy).not.toHaveBeenCalled();
    expect(slots).toHaveLength(0);
  });
});
});

View File

@ -225,4 +225,145 @@ export default abstract class BaseNlpHelper<
threshold?: boolean,
project?: string,
): Promise<NLU.ParseEntities>;
/**
 * Finds keyword-based slots for a single entity in the given text.
 *
 * Each value of the entity is looked up through its canonical `value` first and then
 * through each of its `expressions` (synonyms). Terms are matched as whole words
 * (`\b…\b`) and case-sensitively. At most one slot is produced per value: the first
 * occurrence of the first term found in the text.
 *
 * Terms are matched literally: regex metacharacters are escaped, so a keyword such as
 * `a.b` cannot match unrelated text and a keyword such as `c++` cannot raise a syntax
 * error. Because `\b` is used, a term that starts or ends with a non-word character only
 * matches when that edge is adjacent to a word character.
 *
 * @param text - The input text to search.
 * @param entity - The entity whose values and synonyms are looked up.
 *
 * @returns One slot per matched value, carrying the canonical `value`, the `start`
 * (inclusive) and `end` (exclusive) offsets of the matched term, and a confidence of 1.
 */
public extractKeywordBasedSlots(
  text: string,
  entity: NlpEntityFull,
): NLU.ParseEntity[] {
  if (!entity.values?.length) {
    this.logger.warn('NLP entity has no values');
    return [];
  }

  const slots: NLU.ParseEntity[] = [];
  for (const { value, expressions } of entity.values) {
    // A value may come without synonyms; fall back to the canonical value only.
    const terms = [value, ...(expressions ?? [])];
    for (const term of terms) {
      if (!term) {
        // An empty term would turn into `\b\b` and match at any word boundary.
        continue;
      }
      // Escape regex metacharacters so the keyword is matched verbatim.
      const literal = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      const match = new RegExp(`\\b${literal}\\b`).exec(text);
      if (match) {
        slots.push({
          entity: entity.name,
          value,
          start: match.index,
          end: match.index + match[0].length,
          confidence: 1,
        });
        // Only the first matching term is reported for a given value.
        break;
      }
    }
  }
  return slots;
}
/**
 * Finds pattern-based slots for a single entity in the given text.
 *
 * Each value of the entity contributes the regular expression stored in its
 * `metadata.pattern`. The pattern is applied globally and case-insensitively, and is
 * wrapped in word boundaries when `metadata.wordBoundary` is set. Values with a missing
 * or invalid pattern are logged and skipped.
 *
 * The reported slot value is the matched text, post-processed in this order according to
 * the metadata flags: `removeSpaces`, `toLowerCase`, `stripDiacritics`. The `start`/`end`
 * offsets always refer to the raw match in the original text.
 *
 * @param text - Input text to evaluate.
 * @param entity - The entity whose values carry the regex patterns and optional metadata.
 * @returns One slot per non-empty match, with its value, position, and a confidence of 1.
 */
public extractPatternBasedSlots(
  text: string,
  entity: NlpEntityFull,
): NLU.ParseEntity[] {
  if (!entity.values?.length) {
    this.logger.warn('NLP entity has no values');
    return [];
  }

  return entity.values.flatMap(({ metadata }): NLU.ParseEntity[] => {
    const pattern = metadata?.pattern;
    if (!pattern) {
      this.logger.error('Missing NLP regex pattern');
      return [];
    }

    let regex: RegExp;
    try {
      // Group the pattern so the boundaries apply to the whole expression;
      // otherwise `foo|bar` would be read as `\bfoo` OR `bar\b`.
      regex = new RegExp(
        metadata?.wordBoundary ? `\\b(?:${pattern})\\b` : pattern,
        'gi',
      );
    } catch {
      this.logger.error('Invalid NLP regex pattern');
      return [];
    }

    return (
      [...text.matchAll(regex)]
        // Patterns such as `\d*` also match the empty string; those carry no slot value.
        .filter((match) => match[0].length > 0)
        .map((match) => {
          const start = match.index ?? 0;
          let value = match[0];
          if (metadata?.removeSpaces) {
            value = value.replace(/\s+/g, '');
          }
          if (metadata?.toLowerCase) {
            value = value.toLowerCase();
          }
          if (metadata?.stripDiacritics) {
            // Decompose accented characters, then drop the combining marks
            // from the matched value (not from the whole input text).
            value = value.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
          }
          return {
            entity: entity.name,
            value,
            start,
            end: start + match[0].length,
            confidence: 1,
          };
        })
    );
  });
}
/**
 * Runs deterministic slot filling over a list of entities.
 *
 * The extractor is chosen per entity from its `lookups`:
 *
 * - `keywords`: delegates to `extractKeywordBasedSlots`.
 * - `pattern`: delegates to `extractPatternBasedSlots`.
 *
 * `keywords` takes precedence when an entity declares both strategies. Entities declaring
 * neither are skipped.
 *
 * @param text - The input text from which to extract slot values.
 * @param entities - The entities to look up in the text.
 *
 * @returns The slots found for all entities, concatenated in the order of `entities`.
 */
public runDeterministicSlotFilling(
  text: string,
  entities: NlpEntityFull[],
): NLU.ParseEntity[] {
  const slots: NLU.ParseEntity[] = [];
  for (const candidate of entities) {
    const { lookups } = candidate;
    if (lookups.includes('keywords')) {
      slots.push(...this.extractKeywordBasedSlots(text, candidate));
    } else if (lookups.includes('pattern')) {
      slots.push(...this.extractPatternBasedSlots(text, candidate));
    }
  }
  return slots;
}
}

View File

@ -40,6 +40,7 @@ export type NlpCacheMap = Map<string, NlpEntityFull>;
export type NlpMetadata = {
// Required when lookups is "pattern"
pattern?: string;
wordBoundary?: boolean;
removeSpaces?: boolean;
toLowerCase?: boolean;
stripDiacritics?: boolean;

View File

@ -351,6 +351,7 @@
"doc": "Documentation",
"builtin": "Built-in?",
"weight": "Weight",
"word_boundary": "Word boundary",
"remove_spaces": "Remove spaces",
"to_lower_case": "Lowercase",
"strip_diacritics": "Strip diacritics",

View File

@ -350,6 +350,7 @@
"synonyms": "Synonymes",
"doc": "Documentation",
"weight": "Poids",
"word_boundary": "Délimiter (Mot)",
"remove_spaces": "Supprimer les espaces",
"to_lower_case": "Mettre en minuscules",
"strip_diacritics": "Supprimer les accents",

View File

@ -36,6 +36,7 @@ const getDefaultNlpMetadata = (
if (nlpEntity?.lookups.includes(LookupStrategy.pattern)) {
return {
pattern: "//",
wordBoundary: true,
removeSpaces: false,
toLowerCase: false,
stripDiacritics: false,
@ -160,6 +161,18 @@ export const NlpValueForm: FC<ComponentFormProps<INlpValue, INlpEntity>> = ({
flags={["i"]}
/>
</ContentItem>
<ContentItem>
<Controller
name="metadata.wordBoundary"
control={control}
render={({ field }) => (
<FormControlLabel
control={<Switch {...field} checked={field.value} />}
label={t("label.word_boundary")}
/>
)}
/>
</ContentItem>
<ContentItem>
<Controller
name="metadata.removeSpaces"

View File

@ -23,6 +23,7 @@ export type Lookup = `${LookupStrategy}`;
export interface INlpMetadata {
// Required when lookups is "pattern"
pattern?: string;
wordBoundary?: boolean;
removeSpaces?: boolean;
toLowerCase?: boolean;
stripDiacritics?: boolean;