From 34628a256ca66466c19e5e3bc134b4d33b32dbe3 Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 19:56:28 +0100
Subject: [PATCH 01/10] fix: fix inference, retrieve synonym map & fix slot
 names

---
 nlu/data_loaders/jisfdl.py | 12 +++++++++++-
 nlu/models/slot_filler.py  | 36 +++++++++++++++++++++++++++++++++---
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py
index ce497918..b9ae7397 100644
--- a/nlu/data_loaders/jisfdl.py
+++ b/nlu/data_loaders/jisfdl.py
@@ -116,7 +116,17 @@ class JISFDL(tfbp.DataLoader):
             # Filter out language entities
             slot_entities = filter(
                 lambda e: e["entity"] != "language", entities)
-            slots = {e["entity"]: e["value"] for e in slot_entities}
+            slots = {}
+            for e in slot_entities: 
+            # Create slots with entity values and resolve synonyms
+                if "start" in e and "end" in e and isinstance(e["start"], int) and isinstance(e["end"], int):
+                    original_value = text[e["start"]:e["end"]]
+                    entity_value = e["value"]
+                    if entity_value != original_value:
+                        entity_value = original_value.lower()
+                    slots[e["entity"]] = entity_value
+                else:
+                    continue
             positions = [[e.get("start", -1), e.get("end", -1)]
                          for e in slot_entities]
 
diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index b1929a3d..e6445c78 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -1,6 +1,8 @@
 import os
 import functools
 import json
+import re
+from utils.json_helper import JsonHelper
 from transformers import TFBertModel, AutoTokenizer
 from keras.layers import Dropout, Dense
 from sys import platform
@@ -179,6 +181,19 @@ class SlotFiller(tfbp.Model):
             # Optionally, provide a way to exit the loop
             if input("Try again? (y/n): ").lower() != 'y':
                 break
+    
+    def get_synonym_map(self):
+        helper = JsonHelper()
+
+        helper.read_dataset_json_file('train.json')
+        data = helper.read_dataset_json_file('train.json')
+        synonyms = data["entity_synonyms"]
+        synonym_map = {}
+        for entry in data["entity_synonyms"]:
+            value = entry["value"]
+            for synonym in entry["synonyms"]:
+                synonym_map[synonym] = value    
+        return synonym_map 
 
 
     def get_slots_prediction(self, text: str, inputs, slot_probas):
@@ -202,6 +217,10 @@ class SlotFiller(tfbp.Model):
             token = tokens[idx]
             slot_id = slot_ids[idx]
 
+            # Skip special tokens
+            # if token in special_tokens:
+            #     idx += 1
+            #     continue
 
             # Get slot name
             slot_name = self.extra_params["slot_names"][slot_id]
@@ -243,13 +262,24 @@ class SlotFiller(tfbp.Model):
 
             # Convert tokens to string
             slot_value = self.tokenizer.convert_tokens_to_string(slot_tokens).strip()
+            # slot_value = re.sub(r'\s+', '', slot_value)            
+
+            # Ensure the slot value exists in the text (avoid -1 for start index)
+            start_idx = text.find(slot_value)
+            if start_idx == -1:
+                print(f"Skipping entity for '{slot_name}' because '{slot_value}' was not found in text.")
+                continue  # Skip this entity if not found in text
+
+            #Post Processing 
+            synonym_map = self.get_synonym_map()
+            final_slot_value = synonym_map.get(slot_value)
 
             # Calculate entity start and end indices
             entity = {
                 "entity": slot_name,
-                "value": slot_value,
-                "start": text.find(slot_value),
-                "end": text.find(slot_value) + len(slot_value),
+                "value": final_slot_value,
+                "start": start_idx,
+                "end": start_idx + len(slot_value),
                 "confidence": 0,
             }
 

From 0d8a4ce764f384ae5866c8d312f8546007f206fb Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:05:02 +0100
Subject: [PATCH 02/10] fix: remove typo

---
 nlu/models/slot_filler.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index e6445c78..20ae5df8 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -184,8 +184,6 @@ class SlotFiller(tfbp.Model):
     
     def get_synonym_map(self):
         helper = JsonHelper()
-
-        helper.read_dataset_json_file('train.json')
         data = helper.read_dataset_json_file('train.json')
         synonyms = data["entity_synonyms"]
         synonym_map = {}

From b075c293d8246f87a4df3650f965eee2df91e7b6 Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:05:32 +0100
Subject: [PATCH 03/10] fix: remove typo

---
 nlu/models/slot_filler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index 20ae5df8..a1622b2d 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -187,7 +187,7 @@ class SlotFiller(tfbp.Model):
         data = helper.read_dataset_json_file('train.json')
         synonyms = data["entity_synonyms"]
         synonym_map = {}
-        for entry in data["entity_synonyms"]:
+        for entry in synonyms:
             value = entry["value"]
             for synonym in entry["synonyms"]:
                 synonym_map[synonym] = value    

From 18705c6fb50affbd9c6b1404003f4ba3c56df905 Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:08:21 +0100
Subject: [PATCH 04/10] fix: restore regex

---
 nlu/models/slot_filler.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index a1622b2d..5789ee56 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -215,11 +215,6 @@ class SlotFiller(tfbp.Model):
             token = tokens[idx]
             slot_id = slot_ids[idx]
 
-            # Skip special tokens
-            # if token in special_tokens:
-            #     idx += 1
-            #     continue
-
             # Get slot name
             slot_name = self.extra_params["slot_names"][slot_id]
             if slot_name == "<PAD>":
@@ -260,7 +255,7 @@ class SlotFiller(tfbp.Model):
 
             # Convert tokens to string
             slot_value = self.tokenizer.convert_tokens_to_string(slot_tokens).strip()
-            # slot_value = re.sub(r'\s+', '', slot_value)            
+            slot_value = re.sub(r'\s+', '', slot_value)            
 
             # Ensure the slot value exists in the text (avoid -1 for start index)
             start_idx = text.find(slot_value)

From 05ecd719789cd419fcbc0ed082c2119c1e9bc517 Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:13:47 +0100
Subject: [PATCH 05/10] fix: revert to default slot value

---
 nlu/models/slot_filler.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index 5789ee56..917fa883 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -266,6 +266,8 @@ class SlotFiller(tfbp.Model):
             #Post Processing 
             synonym_map = self.get_synonym_map()
             final_slot_value = synonym_map.get(slot_value)
+            if final_slot_value is None: 
+                final_slot_value = slot_value
 
             # Calculate entity start and end indices
             entity = {

From 68cb90534fca0efa23f37bb572161df1f8863138 Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:15:22 +0100
Subject: [PATCH 06/10] fix: indentation in comments

---
 nlu/models/slot_filler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index 917fa883..b83df0fc 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -263,7 +263,7 @@ class SlotFiller(tfbp.Model):
                 print(f"Skipping entity for '{slot_name}' because '{slot_value}' was not found in text.")
                 continue  # Skip this entity if not found in text
 
-            #Post Processing 
+            # Post Processing 
             synonym_map = self.get_synonym_map()
             final_slot_value = synonym_map.get(slot_value)
             if final_slot_value is None: 

From 476dc510ea2246d69c1d7e52057d7d3b90dc100f Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:17:33 +0100
Subject: [PATCH 07/10] fix: uncase input text

---
 nlu/models/slot_filler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index b83df0fc..c18a4840 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -172,7 +172,7 @@ class SlotFiller(tfbp.Model):
     def predict(self):
         while True:
             text = input("Provide text: ")
-            info = self.get_prediction(text)
+            info = self.get_prediction(text.lower())
 
             print(self.summary())
             print("Text : " + text)

From d39bd145b63f0ac02e60bc5b39ec447af81fb4c2 Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:18:42 +0100
Subject: [PATCH 08/10] fix: uncase input text for training dataset

---
 nlu/data_loaders/jisfdl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py
index b9ae7397..2025c961 100644
--- a/nlu/data_loaders/jisfdl.py
+++ b/nlu/data_loaders/jisfdl.py
@@ -109,7 +109,7 @@ class JISFDL(tfbp.DataLoader):
 
         # Parse raw data
         for exp in examples:
-            text = exp["text"]
+            text = exp["text"].lower()
             intent = exp["intent"]
             entities = exp["entities"]
 

From 896c54600056743bb02d5071bfb1036302e6ddff Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:48:01 +0100
Subject: [PATCH 09/10] fix: extra refactoring

---
 nlu/data_loaders/jisfdl.py | 12 ++++++++++++
 nlu/models/slot_filler.py  | 18 ++----------------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py
index 2025c961..babec3b1 100644
--- a/nlu/data_loaders/jisfdl.py
+++ b/nlu/data_loaders/jisfdl.py
@@ -93,6 +93,18 @@ class JISFDL(tfbp.DataLoader):
 
         return encoded_slots
 
+    def get_synonym_map(self):
+        """Return a dict mapping each synonym to its canonical entity value.
+
+        Entries come from the "entity_synonyms" list of the 'train.json' dataset.
+        """
+        data = JsonHelper().read_dataset_json_file('train.json')
+        return {
+            synonym: entry["value"]
+            for entry in data["entity_synonyms"]
+            for synonym in entry["synonyms"]
+        }
+    
     def parse_dataset_intents(self, data):
 
         intents = []
diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index c18a4840..d7f4dfb5 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -2,7 +2,6 @@ import os
 import functools
 import json
 import re
-from utils.json_helper import JsonHelper
 from transformers import TFBertModel, AutoTokenizer
 from keras.layers import Dropout, Dense
 from sys import platform
@@ -125,8 +124,7 @@ class SlotFiller(tfbp.Model):
 
         # Persist the model
         self.extra_params["slot_names"] = slot_names
-
-        self.save()
+        self.extra_params["synonym_map"] = self.data_loader.get_synonym_map()
 
     @tfbp.runnable
     def evaluate(self):
@@ -181,18 +179,6 @@ class SlotFiller(tfbp.Model):
             # Optionally, provide a way to exit the loop
             if input("Try again? (y/n): ").lower() != 'y':
                 break
-    
-    def get_synonym_map(self):
-        helper = JsonHelper()
-        data = helper.read_dataset_json_file('train.json')
-        synonyms = data["entity_synonyms"]
-        synonym_map = {}
-        for entry in synonyms:
-            value = entry["value"]
-            for synonym in entry["synonyms"]:
-                synonym_map[synonym] = value    
-        return synonym_map 
-
 
     def get_slots_prediction(self, text: str, inputs, slot_probas):
         slot_probas_np = slot_probas.numpy()
@@ -264,7 +250,7 @@ class SlotFiller(tfbp.Model):
                 continue  # Skip this entity if not found in text
 
             # Post Processing 
-            synonym_map = self.get_synonym_map()
+            synonym_map = self.extra_params["synonym_map"]
             final_slot_value = synonym_map.get(slot_value)
             if final_slot_value is None: 
                 final_slot_value = slot_value

From 3f5b98cc72b34f434aed9e8ad0dae77696e027e2 Mon Sep 17 00:00:00 2001
From: hexastack <mohamedali.bouhaouala@ensi-uma.tn>
Date: Thu, 28 Nov 2024 20:51:36 +0100
Subject: [PATCH 10/10] fix: restore save call

---
 nlu/models/slot_filler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index d7f4dfb5..60fbfee6 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -125,6 +125,7 @@ class SlotFiller(tfbp.Model):
         # Persist the model
         self.extra_params["slot_names"] = slot_names
         self.extra_params["synonym_map"] = self.data_loader.get_synonym_map()
+        self.save()
 
     @tfbp.runnable
     def evaluate(self):