fix: restore data loader

hexastack 2024-10-22 13:46:03 +01:00
parent 9b61e36c88
commit 13df530881
3 changed files with 28 additions and 27 deletions

@@ -99,28 +99,28 @@ class JISFDL(tfbp.DataLoader):
         k = 0
         # Filter examples by language
-        # lang = self.hparams.language
-        # all_examples = data["common_examples"]
-        #
-        # if not bool(lang):
-        #     examples = all_examples
-        # else:
-        #     examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
+        lang = self.hparams.language
+        all_examples = data["common_examples"]
+        if not bool(lang):
+            examples = all_examples
+        else:
+            examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
         # Parse raw data
-        for exp in data:
+        for exp in examples:
             text = exp["text"]
             intent = exp["intent"]
-            # entities = exp["entities"]
+            entities = exp["entities"]
             # Filter out language entities
-            # slot_entities = filter(
-            #     lambda e: e["entity"] != "language", entities)
-            # slots = {e["entity"]: e["value"] for e in slot_entities}
-            # positions = [[e.get("start", -1), e.get("end", -1)]
-            #              for e in slot_entities]
+            slot_entities = filter(
+                lambda e: e["entity"] != "language", entities)
+            slots = {e["entity"]: e["value"] for e in slot_entities}
+            positions = [[e.get("start", -1), e.get("end", -1)]
+                         for e in slot_entities]
-            temp = JointRawData(k, intent, None, None, text)
+            temp = JointRawData(k, intent, positions, slots, text)
             k += 1
             intents.append(temp)
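
One caveat in the restored block: `filter()` returns a single-pass iterator, so once the `slots` comprehension consumes `slot_entities`, the `positions` comprehension sees an exhausted iterator and yields an empty list. A minimal sketch of the same parsing step with the filter materialized as a list (the example record here is made up):

# Sketch: materialize the filter with list(...) so slot_entities can be
# iterated twice (once for slots, once for positions).
exp = {
    "text": "book a table for 4",
    "intent": "reserve",
    "entities": [
        {"entity": "language", "value": "en"},
        {"entity": "guests", "value": "4", "start": 17, "end": 18},
    ],
}

entities = exp["entities"]
slot_entities = list(filter(lambda e: e["entity"] != "language", entities))
slots = {e["entity"]: e["value"] for e in slot_entities}
positions = [[e.get("start", -1), e.get("end", -1)] for e in slot_entities]

print(slots)      # {'guests': '4'}
print(positions)  # [[17, 18]]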
@@ -133,7 +133,7 @@ class JISFDL(tfbp.DataLoader):
         helper = JsonHelper()
         if self.method in ["fit", "train"]:
-            dataset = helper.read_dataset_json_file('english.json')
+            dataset = helper.read_dataset_json_file('train.json')
             train_data = self.parse_dataset_intents(dataset)
             return self._transform_dataset(train_data, tokenizer)
         elif self.method in ["evaluate"]:
@@ -154,14 +154,14 @@ class JISFDL(tfbp.DataLoader):
             intent_names = list(set(intents))
             # Map slots, load from the model (evaluate), recompute from dataset otherwise (train)
             slot_names = set()
-            # for td in dataset:
-            #     slots = td.slots
-            #     for slot in slots:
-            #         slot_names.add(slot)
-            # slot_names = list(slot_names)
-            # # To pad all the texts to the same length, the tokenizer will use special characters.
-            # # To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.
-            # slot_names.insert(0, "<PAD>")
+            for td in dataset:
+                slots = td.slots
+                for slot in slots:
+                    slot_names.add(slot)
+            slot_names = list(slot_names)
+            # To pad all the texts to the same length, the tokenizer will use special characters.
+            # To handle those we need to add <PAD> to slot_names. It can be some other symbol as well.
+            slot_names.insert(0, "<PAD>")
         else:
             if "intent_names" in model_params:
                 intent_names = model_params["intent_names"]
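
The restored comment carries the rationale: the tokenizer pads every text to a common length, and those padding positions need a slot label too, so `<PAD>` is reserved at index 0 of `slot_names`. A tiny sketch of the idea, with made-up labels and an assumed convention that padded positions map to the `<PAD>` id (the project's exact labeling scheme may differ):

# Sketch: index 0 is reserved for the padding label, so every padded
# position gets a valid class id when label sequences are aligned.
slot_names = ["<PAD>", "guests", "time"]
slot2id = {name: i for i, name in enumerate(slot_names)}

labels = ["guests", "time"]   # per-token slot labels for the real tokens
max_length = 5                # length every sequence is padded to

ids = [slot2id[l] for l in labels]
ids += [slot2id["<PAD>"]] * (max_length - len(ids))
print(ids)  # [1, 2, 0, 0, 0]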

@@ -43,6 +43,8 @@ class IntentClassifier(tfbp.Model):
         "num_epochs": 2,
         "dropout_prob": 0.1,
         "intent_num_labels": 7,
+        "gamma": 2,
+        "k": 3
     }

     data_loader: JISFDL
@@ -129,7 +131,7 @@ class IntentClassifier(tfbp.Model):
         # Hyperparams, Optimizer and Loss function
         opt = Adam(learning_rate=3e-5, epsilon=1e-08)
-        losses = SparseCategoricalFocalLoss(gamma=2.5)
+        losses = SparseCategoricalFocalLoss(gamma=self.hparams.gamma)
         metrics = [SparseCategoricalAccuracy("accuracy")]
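
Replacing the hard-coded `gamma=2.5` with `self.hparams.gamma` makes the focal-loss focusing strength tunable per run. As a reminder of what `gamma` does, here is a small NumPy illustration of the focal term `(1 - p)^gamma * -log(p)` (an illustration, not the focal_loss package itself):

import numpy as np

def focal_term(p_true, gamma):
    # Focal loss for one example: well-classified examples (high p) are
    # down-weighted by (1 - p)^gamma; gamma=0 is plain cross-entropy.
    return (1.0 - p_true) ** gamma * -np.log(p_true)

for gamma in (0, 2, 3):
    easy, hard = focal_term(0.9, gamma), focal_term(0.3, gamma)
    print(f"gamma={gamma}: easy={easy:.4f} hard={hard:.4f}")
# The easy example's loss shrinks ~100x from gamma=0 to gamma=2,
# while the hard example's loss shrinks only ~2x.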
@@ -203,7 +205,7 @@ class IntentClassifier(tfbp.Model):
     def compute_normalized_confidence_margin(self, probs):
         highest_proba = np.max(probs[0])
-        sum_of_probas = self.compute_top_k_confidence(probs)
+        sum_of_probas = self.compute_top_k_confidence(probs, self.hparams.k)
         # Normalized margin
         normalized_margin = highest_proba / sum_of_probas
         return normalized_margin
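
`compute_top_k_confidence` is not shown in this diff; assuming it sums the `k` largest class probabilities, the normalized margin is the top probability over that sum, approaching 1 when one intent dominates its nearest rivals. A hedged stand-alone sketch:

import numpy as np

def compute_top_k_confidence(probs, k):
    # Assumption: sum of the k largest probabilities in the first row.
    return np.sort(probs[0])[-k:].sum()

def compute_normalized_confidence_margin(probs, k=3):
    highest_proba = np.max(probs[0])
    sum_of_probas = compute_top_k_confidence(probs, k)
    return highest_proba / sum_of_probas

probs = np.array([[0.70, 0.15, 0.10, 0.05]])
print(compute_normalized_confidence_margin(probs))  # 0.70 / 0.95 ≈ 0.737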

@@ -6,7 +6,6 @@ class JsonHelper:
     def __init__(self, model:str = "intent_classifier"):
         self.data_folder=os.path.join("data",model)
-        # self.data_folder = os.path.join(os.path.dirname(__file__), '..', 'data', model)

     def read_dataset_json_file(self, filename):
         file_path = os.path.join(self.data_folder, filename)
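
With the `__file__`-relative variant commented out, `data_folder` resolves against the current working directory, so scripts must run from the repo root. A minimal sketch of the helper under that assumption (the `json.load` body is inferred, since the diff truncates after `file_path`):

import json
import os

class JsonHelper:
    def __init__(self, model: str = "intent_classifier"):
        # Relative path: resolved from the current working directory.
        self.data_folder = os.path.join("data", model)

    def read_dataset_json_file(self, filename):
        file_path = os.path.join(self.data_folder, filename)
        with open(file_path, "r") as f:  # assumed body; diff cuts off here
            return json.load(f)

# e.g. JsonHelper().read_dataset_json_file("train.json")
# reads data/intent_classifier/train.json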