From 34628a256ca66466c19e5e3bc134b4d33b32dbe3 Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 19:56:28 +0100 Subject: [PATCH 01/10] fix: fix inference, retrieve synonym map & fix slot names --- nlu/data_loaders/jisfdl.py | 12 +++++++++++- nlu/models/slot_filler.py | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py index ce497918..b9ae7397 100644 --- a/nlu/data_loaders/jisfdl.py +++ b/nlu/data_loaders/jisfdl.py @@ -116,7 +116,17 @@ class JISFDL(tfbp.DataLoader): # Filter out language entities slot_entities = filter( lambda e: e["entity"] != "language", entities) - slots = {e["entity"]: e["value"] for e in slot_entities} + slots = {} + for e in slot_entities: + # Create slots with entity values and resolve synonyms + if "start" in e and "end" in e and isinstance(e["start"], int) and isinstance(e["end"], int): + original_value = text[e["start"]:e["end"]] + entity_value = e["value"] + if entity_value != original_value: + entity_value = original_value.lower() + slots[e["entity"]] = entity_value + else: + continue positions = [[e.get("start", -1), e.get("end", -1)] for e in slot_entities] diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index b1929a3d..e6445c78 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -1,6 +1,8 @@ import os import functools import json +import re +from utils.json_helper import JsonHelper from transformers import TFBertModel, AutoTokenizer from keras.layers import Dropout, Dense from sys import platform @@ -179,6 +181,19 @@ class SlotFiller(tfbp.Model): # Optionally, provide a way to exit the loop if input("Try again? (y/n): ").lower() != 'y': break + + def get_synonym_map(self): + helper = JsonHelper() + + helper.read_dataset_json_file('train.json') + data = helper.read_dataset_json_file('train.json') + synonyms = data["entity_synonyms"] + synonym_map = {} + for entry in data["entity_synonyms"]: + value = entry["value"] + for synonym in entry["synonyms"]: + synonym_map[synonym] = value + return synonym_map def get_slots_prediction(self, text: str, inputs, slot_probas): @@ -202,6 +217,10 @@ class SlotFiller(tfbp.Model): token = tokens[idx] slot_id = slot_ids[idx] + # Skip special tokens + # if token in special_tokens: + # idx += 1 + # continue # Get slot name slot_name = self.extra_params["slot_names"][slot_id] @@ -243,13 +262,24 @@ class SlotFiller(tfbp.Model): # Convert tokens to string slot_value = self.tokenizer.convert_tokens_to_string(slot_tokens).strip() + # slot_value = re.sub(r'\s+', '', slot_value) + + # Ensure the slot value exists in the text (avoid -1 for start index) + start_idx = text.find(slot_value) + if start_idx == -1: + print(f"Skipping entity for '{slot_name}' because '{slot_value}' was not found in text.") + continue # Skip this entity if not found in text + + #Post Processing + synonym_map = self.get_synonym_map() + final_slot_value = synonym_map.get(slot_value) # Calculate entity start and end indices entity = { "entity": slot_name, - "value": slot_value, - "start": text.find(slot_value), - "end": text.find(slot_value) + len(slot_value), + "value": final_slot_value, + "start": start_idx, + "end": start_idx + len(slot_value), "confidence": 0, } From 0d8a4ce764f384ae5866c8d312f8546007f206fb Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:05:02 +0100 Subject: [PATCH 02/10] fix: remove typo --- nlu/models/slot_filler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index e6445c78..20ae5df8 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -184,8 +184,6 @@ class SlotFiller(tfbp.Model): def get_synonym_map(self): helper = JsonHelper() - - helper.read_dataset_json_file('train.json') data = helper.read_dataset_json_file('train.json') synonyms = data["entity_synonyms"] synonym_map = {} From b075c293d8246f87a4df3650f965eee2df91e7b6 Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:05:32 +0100 Subject: [PATCH 03/10] fix: remove typo --- nlu/models/slot_filler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index 20ae5df8..a1622b2d 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -187,7 +187,7 @@ class SlotFiller(tfbp.Model): data = helper.read_dataset_json_file('train.json') synonyms = data["entity_synonyms"] synonym_map = {} - for entry in data["entity_synonyms"]: + for entry in synonyms: value = entry["value"] for synonym in entry["synonyms"]: synonym_map[synonym] = value From 18705c6fb50affbd9c6b1404003f4ba3c56df905 Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:08:21 +0100 Subject: [PATCH 04/10] fix: restore regex --- nlu/models/slot_filler.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index a1622b2d..5789ee56 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -215,11 +215,6 @@ class SlotFiller(tfbp.Model): token = tokens[idx] slot_id = slot_ids[idx] - # Skip special tokens - # if token in special_tokens: - # idx += 1 - # continue - # Get slot name slot_name = self.extra_params["slot_names"][slot_id] if slot_name == "": @@ -260,7 +255,7 @@ class SlotFiller(tfbp.Model): # Convert tokens to string slot_value = self.tokenizer.convert_tokens_to_string(slot_tokens).strip() - # slot_value = re.sub(r'\s+', '', slot_value) + slot_value = re.sub(r'\s+', '', slot_value) # Ensure the slot value exists in the text (avoid -1 for start index) start_idx = text.find(slot_value) From 05ecd719789cd419fcbc0ed082c2119c1e9bc517 Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:13:47 +0100 Subject: [PATCH 05/10] fix: revert to default slot value --- nlu/models/slot_filler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index 5789ee56..917fa883 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -266,6 +266,8 @@ class SlotFiller(tfbp.Model): #Post Processing synonym_map = self.get_synonym_map() final_slot_value = synonym_map.get(slot_value) + if final_slot_value is None: + final_slot_value = slot_value # Calculate entity start and end indices entity = { From 68cb90534fca0efa23f37bb572161df1f8863138 Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:15:22 +0100 Subject: [PATCH 06/10] fix: identation in comments --- nlu/models/slot_filler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index 917fa883..b83df0fc 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -263,7 +263,7 @@ class SlotFiller(tfbp.Model): print(f"Skipping entity for '{slot_name}' because '{slot_value}' was not found in text.") continue # Skip this entity if not found in text - #Post Processing + # Post Processing synonym_map = self.get_synonym_map() final_slot_value = synonym_map.get(slot_value) if final_slot_value is None: From 476dc510ea2246d69c1d7e52057d7d3b90dc100f Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:17:33 +0100 Subject: [PATCH 07/10] fix: uncase input text --- nlu/models/slot_filler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index b83df0fc..c18a4840 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -172,7 +172,7 @@ class SlotFiller(tfbp.Model): def predict(self): while True: text = input("Provide text: ") - info = self.get_prediction(text) + info = self.get_prediction(text.lower()) print(self.summary()) print("Text : " + text) From d39bd145b63f0ac02e60bc5b39ec447af81fb4c2 Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:18:42 +0100 Subject: [PATCH 08/10] fix: uncase input text for training dataset --- nlu/data_loaders/jisfdl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py index b9ae7397..2025c961 100644 --- a/nlu/data_loaders/jisfdl.py +++ b/nlu/data_loaders/jisfdl.py @@ -109,7 +109,7 @@ class JISFDL(tfbp.DataLoader): # Parse raw data for exp in examples: - text = exp["text"] + text = exp["text"].lower() intent = exp["intent"] entities = exp["entities"] From 896c54600056743bb02d5071bfb1036302e6ddff Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:48:01 +0100 Subject: [PATCH 09/10] fix: extra refactoring --- nlu/data_loaders/jisfdl.py | 12 ++++++++++++ nlu/models/slot_filler.py | 18 ++---------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py index 2025c961..babec3b1 100644 --- a/nlu/data_loaders/jisfdl.py +++ b/nlu/data_loaders/jisfdl.py @@ -93,6 +93,18 @@ class JISFDL(tfbp.DataLoader): return encoded_slots + def get_synonym_map(self): + helper = JsonHelper() + helper.read_dataset_json_file('train.json') + data = helper.read_dataset_json_file('train.json') + synonyms = data["entity_synonyms"] + synonym_map = {} + for entry in synonyms: + value = entry["value"] + for synonym in entry["synonyms"]: + synonym_map[synonym] = value + return synonym_map + def parse_dataset_intents(self, data): intents = [] diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index c18a4840..d7f4dfb5 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -2,7 +2,6 @@ import os import functools import json import re -from utils.json_helper import JsonHelper from transformers import TFBertModel, AutoTokenizer from keras.layers import Dropout, Dense from sys import platform @@ -125,8 +124,7 @@ class SlotFiller(tfbp.Model): # Persist the model self.extra_params["slot_names"] = slot_names - - self.save() + self.extra_params["synonym_map"] = self.data_loader.get_synonym_map() @tfbp.runnable def evaluate(self): @@ -181,18 +179,6 @@ class SlotFiller(tfbp.Model): # Optionally, provide a way to exit the loop if input("Try again? (y/n): ").lower() != 'y': break - - def get_synonym_map(self): - helper = JsonHelper() - data = helper.read_dataset_json_file('train.json') - synonyms = data["entity_synonyms"] - synonym_map = {} - for entry in synonyms: - value = entry["value"] - for synonym in entry["synonyms"]: - synonym_map[synonym] = value - return synonym_map - def get_slots_prediction(self, text: str, inputs, slot_probas): slot_probas_np = slot_probas.numpy() @@ -264,7 +250,7 @@ class SlotFiller(tfbp.Model): continue # Skip this entity if not found in text # Post Processing - synonym_map = self.get_synonym_map() + synonym_map = self.extra_params["synonym_map"] final_slot_value = synonym_map.get(slot_value) if final_slot_value is None: final_slot_value = slot_value From 3f5b98cc72b34f434aed9e8ad0dae77696e027e2 Mon Sep 17 00:00:00 2001 From: hexastack Date: Thu, 28 Nov 2024 20:51:36 +0100 Subject: [PATCH 10/10] fix: restore save call --- nlu/models/slot_filler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index d7f4dfb5..60fbfee6 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -125,6 +125,7 @@ class SlotFiller(tfbp.Model): # Persist the model self.extra_params["slot_names"] = slot_names self.extra_params["synonym_map"] = self.data_loader.get_synonym_map() + self.save() @tfbp.runnable def evaluate(self):