feat: add deterministic slot filling (keyword and pattern lookup strategies)
This commit is contained in:
parent 946d940370
commit 6e89192f88
@@ -30,6 +30,7 @@ import BaseNlpHelper from '../base-nlp-helper';
const mockLoggerService = {
  log: jest.fn(),
  error: jest.fn(),
  warn: jest.fn(),
} as unknown as LoggerService;

const mockSettingService = {
@@ -218,4 +219,218 @@ describe('BaseNlpHelper', () => {
      );
    });
  });

  describe('extractKeywordBasedSlots', () => {
    it('should return matches for exact keywords and synonyms', () => {
      const entity: NlpEntityFull = {
        name: 'color',
        values: [
          { value: 'blue', expressions: ['azure', 'navy'] },
          { value: 'green', expressions: ['emerald', 'lime'] },
        ],
      } as any;

      const result = helper.extractKeywordBasedSlots(
        'The sky is azure and emerald',
        entity,
      );
      expect(result).toEqual([
        {
          entity: 'color',
          value: 'blue',
          start: 11,
          end: 16,
          confidence: 1,
        },
        {
          entity: 'color',
          value: 'green',
          start: 21,
          end: 28,
          confidence: 1,
        },
      ]);
    });

    it('should return empty array if no values present', () => {
      const result = helper.extractKeywordBasedSlots('anything', {
        name: 'empty',
        values: [],
      } as any);

      expect(result).toEqual([]);
    });
  });

  describe('extractPatternBasedSlots', () => {
    it('should match using a valid regex pattern', () => {
      const entity: NlpEntityFull = {
        name: 'number',
        values: [
          {
            value: 'number',
            metadata: { pattern: '\\d+', wordBoundary: true },
          },
        ],
      } as any;

      const result = helper.extractPatternBasedSlots(
        'Order 123 and 456 now!',
        entity,
      );
      expect(result).toEqual([
        {
          entity: 'number',
          value: '123',
          start: 6,
          end: 9,
          confidence: 1,
        },
        {
          entity: 'number',
          value: '456',
          start: 14,
          end: 17,
          confidence: 1,
        },
      ]);
    });

    it('should respect metadata like toLowerCase and removeSpaces', () => {
      const entity: NlpEntityFull = {
        name: 'code',
        values: [
          {
            value: 'Code',
            metadata: {
              pattern: 'HEX BOT',
              toLowerCase: true,
              removeSpaces: true,
            },
          },
        ],
      } as any;

      const result = helper.extractPatternBasedSlots(
        'My CODE is HEX BOT!',
        entity,
      );
      expect(result).toEqual([
        {
          entity: 'code',
          value: 'hexbot',
          start: 11,
          end: 18,
          confidence: 1,
        },
      ]);
    });

    it('should return empty array if no values', () => {
      const result = helper.extractPatternBasedSlots('test', {
        name: 'noop',
        values: [],
      } as any);

      expect(result).toEqual([]);
    });

    it('should handle invalid regex pattern gracefully', () => {
      const entity: NlpEntityFull = {
        name: 'fail',
        values: [
          {
            value: 'Invalid',
            metadata: { pattern: '[a-', wordBoundary: true },
          },
        ],
      } as any;

      const result = helper.extractPatternBasedSlots('test', entity);
      expect(result).toEqual([]);
    });
  });

  describe('runDeterministicSlotFilling', () => {
    it('should call keyword-based extractor for keyword lookup strategy', () => {
      const mockEntities: NlpEntityFull[] = [
        {
          name: 'product',
          lookups: ['keywords'],
          values: [
            {
              value: 'tshirt',
              expressions: [],
            },
            {
              value: 'pizza',
              expressions: [],
            },
          ],
        } as unknown as NlpEntityFull,
      ];
      jest.spyOn(helper, 'extractKeywordBasedSlots');
      jest.spyOn(helper, 'extractPatternBasedSlots');

      const result = helper.runDeterministicSlotFilling(
        'order pizza',
        mockEntities,
      );

      expect(helper.extractKeywordBasedSlots).toHaveBeenCalledTimes(1);
      expect(helper.extractPatternBasedSlots).not.toHaveBeenCalled();
      expect(result).toHaveLength(1);
      expect(result[0].entity).toBe('product');
    });

    it('should call pattern-based extractor for pattern lookup strategy', () => {
      const mockEntities: NlpEntityFull[] = [
        {
          name: 'number',
          lookups: ['pattern'],
          values: [
            {
              value: 'phone',
              metadata: { pattern: '\\d+' },
              expressions: [],
            },
          ],
        } as unknown as NlpEntityFull,
      ];

      jest.spyOn(helper, 'extractKeywordBasedSlots');
      jest.spyOn(helper, 'extractPatternBasedSlots');

      const result = helper.runDeterministicSlotFilling(
        'call me at 1234567890',
        mockEntities,
      );

      expect(helper.extractPatternBasedSlots).toHaveBeenCalledTimes(1);
      expect(helper.extractKeywordBasedSlots).not.toHaveBeenCalled();
      expect(result).toHaveLength(1);
      expect(result[0].entity).toBe('number');
    });

    it('should skip entities that do not support the selected lookup strategy', () => {
      const mockEntities: NlpEntityFull[] = [
        {
          name: 'irrelevant',
          lookups: ['trait'],
          values: [],
        } as unknown as NlpEntityFull,
      ];
      jest.spyOn(helper, 'extractKeywordBasedSlots');
      jest.spyOn(helper, 'extractPatternBasedSlots');

      const result = helper.runDeterministicSlotFilling(
        'any text',
        mockEntities,
      );

      expect(helper.extractKeywordBasedSlots).not.toHaveBeenCalled();
      expect(helper.extractPatternBasedSlots).not.toHaveBeenCalled();
      expect(result).toHaveLength(0);
    });
  });
});
@@ -225,4 +225,145 @@ export default abstract class BaseNlpHelper<
    threshold?: boolean,
    project?: string,
  ): Promise<NLU.ParseEntities>;

  /**
   * Finds entities in a given text based on their values and synonyms.
   *
   * This function takes a string of text and an entity whose values each contain a canonical
   * value and a list of synonym expressions. It returns an array of objects, each representing
   * a match found in the text along with its start and end positions.
   *
   * @param text - The input text to search for entities.
   * @param entity - The entity to search for, whose values each contain a `value` and a list of `expressions`.
   *
   * @returns An array of objects representing the found entities, with their `value`, `start`, and `end` positions.
   */
  public extractKeywordBasedSlots(
    text: string,
    entity: NlpEntityFull,
  ): NLU.ParseEntity[] {
    if (!entity.values?.length) {
      this.logger.warn('NLP entity has no values');
      return [];
    }

    return (entity.values
      .flatMap(({ value, expressions }) => {
        const allValues = [value, ...expressions];

        // Filter the terms that are found in the text
        return allValues
          .flatMap((term) => {
            const regex = new RegExp(`\\b${term}\\b`, 'g');
            const matches = [...text.matchAll(regex)];

            // Map matches to the ParseEntity format
            return matches.map((match) => ({
              entity: entity.name,
              value,
              start: match.index!,
              end: match.index! + term.length,
              confidence: 1,
            }));
          })
          .shift();
      })
      .filter((v) => !!v) || []) as NLU.ParseEntity[];
  }

  /**
   * Finds entities in a given text based on regex patterns (stored in `metadata.pattern`).
   *
   * @param text - Input text to evaluate.
   * @param entity - NlpEntityFull whose values carry a regex in `metadata.pattern` and optional preprocessing flags.
   * @returns An array of matched entities with value, position, and confidence.
   */
  public extractPatternBasedSlots(
    text: string,
    entity: NlpEntityFull,
  ): NLU.ParseEntity[] {
    if (!entity.values?.length) {
      this.logger.warn('NLP entity has no values');
      return [];
    }

    return (entity.values
      .flatMap((patternValue) => {
        const processedText = text;
        const pattern = patternValue.metadata?.pattern;

        if (!pattern) {
          this.logger.error('Missing NLP regex pattern');
          return [];
        }

        let regex: RegExp;
        try {
          const shouldWrap = patternValue.metadata?.wordBoundary;
          regex = new RegExp(shouldWrap ? `\\b${pattern}\\b` : pattern, 'gi');
        } catch {
          this.logger.error('Invalid NLP regex pattern');
          return [];
        }

        const matches = [...processedText.matchAll(regex)];

        return matches.map((match) => {
          let value = match[0];

          // Apply preprocessing if needed
          if (patternValue.metadata?.removeSpaces) {
            value = value.replace(/\s+/g, '');
          }

          if (patternValue.metadata?.toLowerCase) {
            value = value.toLowerCase();
          }

          if (patternValue.metadata?.stripDiacritics) {
            value = value.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
          }

          return {
            entity: entity.name,
            value,
            start: match.index!,
            end: match.index! + match[0].length,
            confidence: 1,
          };
        });
      })
      .filter((v) => !!v) || []) as NLU.ParseEntity[];
  }

  /**
   * Extracts slot values from text based on each entity's lookup strategy.
   *
   * This function supports deterministic slot filling by scanning the input text using either
   * keyword-based or pattern-based entity recognition, depending on each entity's lookup strategy.
   *
   * - For `keywords`: it uses exact term and synonym matching with word boundaries.
   * - For `pattern`: it uses the regular expression defined in each entity value (stored in `metadata.pattern`),
   *   optionally applying preprocessing such as `removeSpaces`, `toLowerCase`, and `stripDiacritics`.
   *
   * @param text - The input text from which to extract slot values.
   * @param entities - An array of NlpEntityFull objects, each containing slot values and metadata.
   *
   * @returns An array of `ParseEntity` objects containing the entity name, matched value, position, and confidence.
   */
  public runDeterministicSlotFilling(
    text: string,
    entities: NlpEntityFull[],
  ): NLU.ParseEntity[] {
    return entities.flatMap((e) => {
      if (e.lookups.includes('keywords')) {
        return this.extractKeywordBasedSlots(text, e);
      } else if (e.lookups.includes('pattern')) {
        return this.extractPatternBasedSlots(text, e);
      } else {
        return [];
      }
    });
  }
}
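A minimal usage sketch of the new methods (illustrative, not part of the commit: `helper` stands for any concrete BaseNlpHelper subclass, and the entity literal only mirrors the NlpEntityFull fields used in the tests above):

// Hypothetical example — the entity data and input text are made up for illustration.
const colorEntity = {
  name: 'color',
  lookups: ['keywords'],
  values: [{ value: 'blue', expressions: ['azure', 'navy'] }],
} as unknown as NlpEntityFull;

const slots = helper.runDeterministicSlotFilling('The sky is azure', [colorEntity]);
// => [{ entity: 'color', value: 'blue', start: 11, end: 16, confidence: 1 }]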
@@ -40,6 +40,7 @@ export type NlpCacheMap = Map<string, NlpEntityFull>;
export type NlpMetadata = {
  // Required when lookups is "pattern"
  pattern?: string;
  wordBoundary?: boolean;
  removeSpaces?: boolean;
  toLowerCase?: boolean;
  stripDiacritics?: boolean;
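For pattern-based lookups, an entity value would carry this metadata alongside its regex. A hedged example follows (the field names match the NlpMetadata type above; the pattern and flag values are assumptions for illustration only):

// Hypothetical value for an entity using the "pattern" lookup strategy.
const phoneValue = {
  value: 'phone',
  expressions: [],
  metadata: {
    pattern: '\\d{10}',   // regex applied to the input text
    wordBoundary: true,   // wrap the pattern in \b ... \b
    removeSpaces: false,  // preprocessing applied to the matched value
    toLowerCase: false,
    stripDiacritics: false,
  },
};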
@@ -351,6 +351,7 @@
  "doc": "Documentation",
  "builtin": "Built-in?",
  "weight": "Weight",
  "word_boundary": "Word boundary",
  "remove_spaces": "Remove spaces",
  "to_lower_case": "Lowercase",
  "strip_diacritics": "Strip diacritics",
@@ -350,6 +350,7 @@
  "synonyms": "Synonymes",
  "doc": "Documentation",
  "weight": "Poids",
  "word_boundary": "Délimiter (Mot)",
  "remove_spaces": "Supprimer les espaces",
  "to_lower_case": "Mettre en minuscules",
  "strip_diacritics": "Supprimer les accents",
@@ -36,6 +36,7 @@ const getDefaultNlpMetadata = (
  if (nlpEntity?.lookups.includes(LookupStrategy.pattern)) {
    return {
      pattern: "//",
      wordBoundary: true,
      removeSpaces: false,
      toLowerCase: false,
      stripDiacritics: false,
@@ -160,6 +161,18 @@ export const NlpValueForm: FC<ComponentFormProps<INlpValue, INlpEntity>> = ({
            flags={["i"]}
          />
        </ContentItem>
        <ContentItem>
          <Controller
            name="metadata.wordBoundary"
            control={control}
            render={({ field }) => (
              <FormControlLabel
                control={<Switch {...field} checked={field.value} />}
                label={t("label.word_boundary")}
              />
            )}
          />
        </ContentItem>
        <ContentItem>
          <Controller
            name="metadata.removeSpaces"
@@ -23,6 +23,7 @@ export type Lookup = `${LookupStrategy}`;
export interface INlpMetadata {
  // Required when lookups is "pattern"
  pattern?: string;
  wordBoundary?: boolean;
  removeSpaces?: boolean;
  toLowerCase?: boolean;
  stripDiacritics?: boolean;