From c58a080ebf10709d7361d5c62d05a579bdcc5ac6 Mon Sep 17 00:00:00 2001
From: Mohamed Marrouchi <marrouchi.mohamed@gmail.com>
Date: Thu, 19 Sep 2024 09:14:17 +0100
Subject: [PATCH 1/7] feat: break NLU JISF apart

---
 nlu/.env.dev                                 |   3 +-
 nlu/.env.example                             |   4 +-
 nlu/README.md                                |   4 +-
 nlu/data_loaders/jisfdl.py                   |  50 ++--
 nlu/data_loaders/tflcdl.py                   |   2 +-
 nlu/main.py                                  |  43 +++-
 nlu/models/{jisf.py => intent_classifier.py} | 129 +---------
 nlu/models/slot_filler.py                    | 250 +++++++++++++++++++
 nlu/utils/json_helper.py                     |   2 +-
 9 files changed, 337 insertions(+), 150 deletions(-)
 rename nlu/models/{jisf.py => intent_classifier.py} (59%)
 create mode 100644 nlu/models/slot_filler.py
diff --git a/nlu/.env.dev b/nlu/.env.dev
index d5023df..6fac306 100644
--- a/nlu/.env.dev
+++ b/nlu/.env.dev
@@ -2,4 +2,5 @@ AUTH_TOKEN=123
 LANGUAGE_CLASSIFIER=language-classifier
 INTENT_CLASSIFIERS=ar,fr,tn
 TFLC_REPO_ID=Hexastack/tflc
-JISF_REPO_ID=Hexastack/jisf
+INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
+SLOT_FILLER_REPO_ID=Hexastack/slot-filler
diff --git a/nlu/.env.example b/nlu/.env.example
index 52370c7..a863e43 100644
--- a/nlu/.env.example
+++ b/nlu/.env.example
@@ -1,5 +1,5 @@
 AUTH_TOKEN=
 LANGUAGE_CLASSIFIER=
 INTENT_CLASSIFIERS=
-TFLC_REPO_ID=
-JISF_REPO_ID=
\ No newline at end of file
+INTENT_CLASSIFIER_REPO_ID=
+SLOT_FILLER_REPO_ID=
\ No newline at end of file
diff --git a/nlu/README.md b/nlu/README.md
index ff5ba60..dd8a00b 100644
--- a/nlu/README.md
+++ b/nlu/README.md
@@ -40,7 +40,7 @@ pip install -r requirements.txt
 You should run `source env.sh` on each new shell session. This activates the virtualenv and creates a nice alias for `run.py`:
 ```bash
 $ cat env.sh
-source env/bin/activate
+source venv/bin/activate
 alias run='python run.py'
 ```
 
@@ -53,7 +53,7 @@ run fit myexperiment1 mlp mnist --batch_size=32 --learning_rate=0.1
 Examples :
 ```bash
 # Intent classification
-run fit intent-classifier-en-30072024 jisf  --intent_num_labels=88 --slot_num_labels=17 --language=en
+run fit intent-classifier-en-30072024 intent_classifier  --intent_num_labels=88 --slot_num_labels=17 --language=en
 run predict intent-classifier-fr-30072024  --intent_num_labels=7 --slot_num_labels=2 --language=fr
 
 # Language classification
diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py
index 75ae949..7c4096e 100644
--- a/nlu/data_loaders/jisfdl.py
+++ b/nlu/data_loaders/jisfdl.py
@@ -4,8 +4,8 @@ import json
 import numpy as np
 from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer
 
+
 import boilerplate as tfbp
-from utils.jisf_data_mapper import JisfDataMapper
 from utils.json_helper import JsonHelper
 
 
@@ -101,8 +101,11 @@ class JISFDL(tfbp.DataLoader):
         # Filter examples by language
         lang = self.hparams.language
         all_examples = data["common_examples"]
-        examples = filter(lambda exp: any(
-            e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
+
+        if not lang:
+            examples = all_examples
+        else:
+            examples = filter(lambda exp: any(not lang or (e['entity'] == 'language' and e['value'] == lang) for e in exp['entities']), all_examples)
 
         # Parse raw data
         for exp in examples:
@@ -145,7 +148,6 @@ class JISFDL(tfbp.DataLoader):
         # the classifier.
         texts = [d.text for d in dataset]
         encoded_texts = self.encode_texts(texts, tokenizer)
-
         # Map intents, load from the model (evaluate), recompute from dataset otherwise (train)
         intents = [d.intent for d in dataset]
         if not model_params:
@@ -161,19 +163,35 @@ class JISFDL(tfbp.DataLoader):
             # To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.
             slot_names.insert(0, "<PAD>")
         else:
-            intent_names = model_params.intent_names
-            slot_names = model_params.slot_names
+            if "intent_names" in model_params:
+                intent_names = model_params["intent_names"]
+            else:
+                intent_names = None
+            
+            if "slot_names" in model_params:
+                slot_names = model_params["slot_names"]
+            else:
+                slot_names = None
 
-        intent_map = dict()  # Dict : intent -> index
-        for idx, ui in enumerate(intent_names):
-            intent_map[ui] = idx
+        if intent_names:
+            intent_map = dict()  # Dict : intent -> index
+            for idx, ui in enumerate(intent_names):
+                intent_map[ui] = idx
+        else:
+            intent_map = None
 
         # Encode intents
-        encoded_intents = self.encode_intents(intents, intent_map)
+        if intent_map:
+            encoded_intents = self.encode_intents(intents, intent_map)
+        else:
+            encoded_intents = None
 
-        slot_map: Dict[str, int] = dict()  # slot -> index
-        for idx, us in enumerate(slot_names):
-            slot_map[us] = idx
+        if slot_names:
+            slot_map: Dict[str, int] = dict()  # slot -> index
+            for idx, us in enumerate(slot_names):
+                slot_map[us] = idx
+        else:
+            slot_map = None
 
         # Encode slots
         # Text : Add a tune to my elrow Guest List
@@ -183,8 +201,12 @@ class JISFDL(tfbp.DataLoader):
         max_len = len(encoded_texts["input_ids"][0])  # type: ignore
         all_slots = [td.slots for td in dataset]
         all_texts = [td.text for td in dataset]
-        encoded_slots = self.encode_slots(tokenizer,
+        
+        if slot_map:
+            encoded_slots = self.encode_slots(tokenizer,
                                           all_slots, all_texts, slot_map, max_len)
+        else:
+            encoded_slots = None
 
         return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names
 
diff --git a/nlu/data_loaders/tflcdl.py b/nlu/data_loaders/tflcdl.py
index 09a23c0..b765f78 100644
--- a/nlu/data_loaders/tflcdl.py
+++ b/nlu/data_loaders/tflcdl.py
@@ -29,7 +29,7 @@ class TFLCDL(tfbp.DataLoader):
 
         self.json_helper = JsonHelper("tflc")
         self._save_dir = save_dir
-        print(hparams)
+
         # We will opt for a TF-IDF representation of the data as the frequency of word
         # roots should give us a good idea about which language we're dealing with.
         if method == "fit":
diff --git a/nlu/main.py b/nlu/main.py
index b85ce11..f7e4f8b 100644
--- a/nlu/main.py
+++ b/nlu/main.py
@@ -15,8 +15,8 @@ AUTH_TOKEN = os.getenv("AUTH_TOKEN", "TOKEN_MUST_BE_DEFINED")
 
 AVAILABLE_LANGUAGES = os.getenv("AVAILABLE_LANGUAGES", "en,fr").split(',')
 TFLC_REPO_ID = os.getenv("TFLC_REPO_ID")
-JISF_REPO_ID = os.getenv("JISF_REPO_ID")
-
+INTENT_CLASSIFIER_REPO_ID = os.getenv("INTENT_CLASSIFIER_REPO_ID")
+SLOT_FILLER_REPO_ID = os.getenv("SLOT_FILLER_REPO_ID")
 
 def load_language_classifier():
     # Init language classifier model
@@ -27,21 +27,31 @@ def load_language_classifier():
     logging.info(f'Successfully loaded the language classifier model')
     return model
 
-
 def load_intent_classifiers():
-    Model = tfbp.get_model("jisf")
-    models = {}
+    Model = tfbp.get_model("intent_classifier")
+    intent_classifiers = {}  # language code -> loaded intent classifier model
     for language in AVAILABLE_LANGUAGES:
         kwargs = {}
-        models[language] = Model(save_dir=language, method="predict", repo_id=JISF_REPO_ID, **kwargs)
-        models[language].load_model()
+        intent_classifiers[language] = Model(save_dir=language, method="predict", repo_id=INTENT_CLASSIFIER_REPO_ID, **kwargs)
+        intent_classifiers[language].load_model()
         logging.info(f'Successfully loaded the intent classifier {language} model')
-    return models
+    return intent_classifiers
+
+def load_slot_classifiers():
+    """Load one slot filler model per language in AVAILABLE_LANGUAGES, keyed by language code."""
+    Model = tfbp.get_model("slot_filler")  # NOTE(review): assumes lookup by module name (models/slot_filler.py), as for "intent_classifier"
+    slot_fillers = {}
+    for language in AVAILABLE_LANGUAGES:
+        slot_fillers[language] = Model(save_dir=language, method="predict", repo_id=SLOT_FILLER_REPO_ID)
+        slot_fillers[language].load_model()
+        logging.info(f'Successfully loaded the slot filler {language} model')
+    return slot_fillers
 
 
 def load_models():
     app.language_classifier = load_language_classifier()  # type: ignore
     app.intent_classifiers = load_intent_classifiers()  # type: ignore
+    app.slot_fillers = load_slot_classifiers()  # type: ignore  # slot fillers, not a second copy of the intent classifiers
 
 app = FastAPI()
 
@@ -74,13 +84,20 @@ async def check_health():
 
 @app.post("/parse")
 def parse(input: ParseInput, is_authenticated: Annotated[str, Depends(authenticate)]):
-    if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers'):
+    if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers') or not hasattr(app, 'slot_fillers'):
         headers = {"Retry-After": "120"}  # Suggest retrying after 2 minutes
-        return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are loading, please retry later."}, headers=headers)
+        return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are still loading, please retry later."}, headers=headers)
     
     language = app.language_classifier.get_prediction(input.q)  # type: ignore
     lang = language.get("value")
-    prediction = app.intent_classifiers[lang].get_prediction(
+    intent_prediction = app.intent_classifiers[lang].get_prediction(
         input.q)  # type: ignore
-    prediction.get("entities").append(language)
-    return prediction
+    slot_prediction = app.slot_fillers[lang].get_prediction(
+        input.q)  # type: ignore
+    slot_prediction.get("entities").append(language)
+
+    return {
+        "text": input.q,
+        "intent": intent_prediction.get("intent"),
+        "entities": slot_prediction.get("entities"),
+    }
diff --git a/nlu/models/jisf.py b/nlu/models/intent_classifier.py
similarity index 59%
rename from nlu/models/jisf.py
rename to nlu/models/intent_classifier.py
index 71c14ef..2d2f27c 100644
--- a/nlu/models/jisf.py
+++ b/nlu/models/intent_classifier.py
@@ -1,4 +1,3 @@
-import functools
 import json
 import math
 from typing import Tuple, Dict, List
@@ -22,8 +21,8 @@ from data_loaders.jisfdl import JISFDL
 import boilerplate as tfbp
 
 ##
-# JISF : Joint Intent Classification and Slot filling with BERT
-# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
+# Intent Classification with BERT
+# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
 # https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
 #
 # Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
@@ -33,19 +32,16 @@ import boilerplate as tfbp
 BERT_MODEL_BY_LANGUAGE = {
     'en': "bert-base-cased",
     'fr': "dbmdz/bert-base-french-europeana-cased",
-    'ar': 'asafaya/bert-base-arabic',
-    'tn': 'dbmdz/bert-base-french-europeana-cased'
 }
 
 
 @tfbp.default_export
-class JISF(tfbp.Model):
+class IntentClassifier(tfbp.Model):
     default_hparams = {
-        "language": "fr",
+        "language": None,
         "num_epochs": 2,
         "dropout_prob": 0.1,
         "intent_num_labels": 7,
-        "slot_num_labels": 40
     }
     data_loader: JISFDL
 
@@ -57,8 +53,8 @@ class JISF(tfbp.Model):
 
         # Load Tokenizer from transformers
         # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier.
-        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language]
-        # bert_model_name = typing.cast(str, self.hparams.bert_model_name)
+        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
+
         self.tokenizer = AutoTokenizer.from_pretrained(
             bert_model_name, use_fast=False)
         self.bert = TFBertModel.from_pretrained(bert_model_name)
@@ -66,27 +62,18 @@ class JISF(tfbp.Model):
         self.dropout = Dropout(self.hparams.dropout_prob)
         self.intent_classifier = Dense(self.hparams.intent_num_labels,
                                        name="intent_classifier", activation="softmax")
-        self.slot_classifier = Dense(self.hparams.slot_num_labels,
-                                     name="slot_classifier", activation="softmax")
 
 
     def call(self, inputs, **kwargs):
-        # two outputs from BERT
         trained_bert = self.bert(inputs, **kwargs)
         pooled_output = trained_bert.pooler_output
-        sequence_output = trained_bert.last_hidden_state
-
-        # sequence_output will be used for slot_filling / classification
-        sequence_output = self.dropout(sequence_output,
-                                       training=kwargs.get("training", False))
-        slot_probas = self.slot_classifier(sequence_output)
-
+        
         # pooled_output for intent classification
         pooled_output = self.dropout(pooled_output,
                                      training=kwargs.get("training", False))
         intent_probas = self.intent_classifier(pooled_output)
 
-        return slot_probas, intent_probas
+        return intent_probas
 
     def load_data(self, data_loader) -> Tuple[BatchEncoding, tf.Tensor, ndarray, int, int]:
         return data_loader(self.tokenizer)
@@ -137,18 +124,11 @@ class JISF(tfbp.Model):
             raise ValueError(
                 f"Hyperparam intent_num_labels mismatch, should be : {len(intent_names)}"
             )
-        if self.hparams.slot_num_labels != len(slot_names):
-            raise ValueError(
-                f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
-            )
 
         # Hyperparams, Optimizer and Loss function
         opt = Adam(learning_rate=3e-5, epsilon=1e-08)
 
-        # two outputs, one for slots, another for intents
-        # we have to fine tune for both
-        losses = [SparseCategoricalCrossentropy(),
-                  SparseCategoricalCrossentropy()]
+        losses = SparseCategoricalCrossentropy()
 
         metrics = [SparseCategoricalAccuracy("accuracy")]
 
@@ -159,11 +139,10 @@ class JISF(tfbp.Model):
              "attention_mask": encoded_texts["attention_mask"]}
 
         super().fit(
-            x, (encoded_slots, encoded_intents), epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
+            x, encoded_intents, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
 
         # Persist the model
         self.extra_params["intent_names"] = intent_names
-        self.extra_params["slot_names"] = slot_names
 
         self.save()
 
@@ -175,8 +154,8 @@ class JISF(tfbp.Model):
         metrics = [SparseCategoricalAccuracy("accuracy")]
         self.compile(metrics=metrics)
 
-        _, intent_probas = self(encoded_texts)  # type: ignore
-
+        intent_probas = self(encoded_texts)  # type: ignore
+        
         scores = self.get_metrics_by_intent(intent_probas, encoded_intents)
 
         overall_score = {}
@@ -204,85 +183,10 @@ class JISF(tfbp.Model):
         print(json.dumps(info, indent=2))
 
         return json.dumps(info, indent=2)
-    
-    def get_slots_prediction(self, text: str, inputs, slot_probas):
-        slot_probas_np = slot_probas.numpy()
-        # Get the indices of the maximum values
-        slot_ids = slot_probas_np.argmax(axis=-1)[0, :]
-
-        # get all slot names and add to out_dict as keys
-        out_dict = {}
-        predicted_slots = set([self.extra_params["slot_names"][s]
-                            for s in slot_ids if s != 0])
-        for ps in predicted_slots:
-            out_dict[ps] = []
-        
-        # retrieving the tokenization that was used in the predictions
-        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
-
-        # We'd like to eliminate all special tokens from our output
-        special_tokens = self.tokenizer.special_tokens_map.values()
-
-        for token, slot_id in zip(tokens, slot_ids):
-            if token in special_tokens:
-                continue
-            # add all to out_dict
-            slot_name = self.extra_params["slot_names"][slot_id]
-
-            if slot_name == "<PAD>":
-                continue
-
-            # collect tokens
-            collected_tokens = [token]
-            idx = tokens.index(token)
-
-            # see if it starts with ##
-            # then it belongs to the previous token
-            if token.startswith("##"):
-                # check if the token already exists or not
-                if tokens[idx - 1] not in out_dict[slot_name]:
-                    collected_tokens.insert(0, tokens[idx - 1])
-
-            # add collected tokens to slots
-            out_dict[slot_name].extend(collected_tokens)
-
-        slot_names_to_ids = {value: key for key, value in enumerate(
-            self.extra_params["slot_names"])}
-
-        entities = []
-        # process out_dict
-        for slot_name in out_dict:
-            slot_id = slot_names_to_ids[slot_name]
-            slot_tokens = out_dict[slot_name]
-
-            slot_value = self.tokenizer.convert_tokens_to_string(
-                slot_tokens).strip()
-
-            entity = {
-                "entity": slot_name,
-                "value": slot_value,
-                "start": text.find(slot_value),
-                "end":  text.find(slot_value) + len(slot_value),
-                "confidence": 0,
-            }
-
-            # The confidence of a slot is the average confidence of tokens in that slot.
-            indices = [tokens.index(token) for token in slot_tokens]
-            if len(slot_tokens) > 0:
-                total = functools.reduce(
-                    lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0)
-                entity["confidence"] = total / len(slot_tokens)
-            else:
-                entity["confidence"] = 0
-
-            entities.append(entity)
-
-        return entities
-
 
     def get_prediction(self, text: str):
         inputs = self.data_loader.encode_text(text, self.tokenizer)
-        slot_probas, intent_probas = self(inputs)  # type: ignore
+        intent_probas = self(inputs)  # type: ignore
 
         intent_probas_np = intent_probas.numpy()
         
@@ -292,15 +196,8 @@ class JISF(tfbp.Model):
         # get the confidences for each intent
         intent_confidences = intent_probas_np[0]
 
-
-        entities = []
-        if slot_probas is not None:
-            entities = self.get_slots_prediction(text, inputs, slot_probas)
-
         return {
             "text": text,
             "intent": {"name": self.extra_params["intent_names"][intent_id],
                        "confidence": float(intent_confidences[intent_id])},
-            "entities": entities,
         }
-
diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
new file mode 100644
index 0000000..81eb54a
--- /dev/null
+++ b/nlu/models/slot_filler.py
@@ -0,0 +1,250 @@
+import functools
+import json
+from transformers import TFBertModel, AutoTokenizer
+from keras.layers import Dropout, Dense
+from sys import platform
+
+if platform == "darwin":
+    from keras.optimizers.legacy import Adam
+else:
+    from keras.optimizers import Adam
+
+from keras.losses import SparseCategoricalCrossentropy
+from keras.metrics import SparseCategoricalAccuracy
+import numpy as np
+
+from data_loaders.jisfdl import JISFDL
+
+from sklearn.metrics import classification_report
+
+
+import boilerplate as tfbp
+
+##
+# Slot filling with BERT
+# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
+# https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
+#
+# Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
+# the paper with the original dataset.
+##
+
+BERT_MODEL_BY_LANGUAGE = {
+    'en': "bert-base-cased",
+    'fr': "dbmdz/bert-base-french-europeana-cased",
+}
+
+
+@tfbp.default_export
+class SlotFiller(tfbp.Model):
+    default_hparams = {
+        "language": None,
+        "num_epochs": 2,
+        "dropout_prob": 0.1,
+        "slot_num_labels": 40
+    }
+    data_loader: JISFDL  # encodes the texts and slot labels consumed by the methods below
+
+    def __init__(self, *args, **kwargs):
+        """Set up the data loader, the BERT tokenizer/encoder and the slot classification head."""
+        super().__init__(*args, **kwargs)
+
+        # Init data loader
+        self.data_loader = JISFDL(**kwargs)
+
+        # The tokenizer and the encoder come from the same pretrained BERT checkpoint;
+        # the "en" checkpoint is used when no language hyperparameter is set.
+        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            bert_model_name, use_fast=False)
+        self.bert = TFBertModel.from_pretrained(bert_model_name)
+
+        self.dropout = Dropout(self.hparams.dropout_prob)
+        self.slot_classifier = Dense(self.hparams.slot_num_labels,
+                                     name="slot_classifier", activation="softmax")
+
+    def call(self, inputs, **kwargs):
+        """Return the slot classifier's softmax output over BERT's last hidden state for `inputs`."""
+        trained_bert = self.bert(inputs, **kwargs)
+        sequence_output = trained_bert.last_hidden_state
+
+        # sequence_output will be used for slot_filling
+        sequence_output = self.dropout(sequence_output,
+                                       training=kwargs.get("training", False))
+        slot_probas = self.slot_classifier(sequence_output)
+
+        return slot_probas
+
+    @tfbp.runnable
+    def fit(self):
+        """Fine-tune the model on the data returned by the data loader, then save it.
+
+        Raises:
+            ValueError: If `slot_num_labels` differs from the number of slot names in the data.
+        """
+        encoded_texts, _, encoded_slots, _, slot_names = self.data_loader(
+            self.tokenizer)
+
+        if self.hparams.slot_num_labels != len(slot_names):
+            raise ValueError(
+                f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
+            )
+
+        # Hyperparams, Optimizer and Loss function
+        opt = Adam(learning_rate=3e-5, epsilon=1e-08)
+
+        # The model has a single output (slot probabilities), hence a single loss
+        losses = SparseCategoricalCrossentropy()
+
+        metrics = [SparseCategoricalAccuracy("accuracy")]
+
+        # Compile model
+        self.compile(optimizer=opt, loss=losses, metrics=metrics)
+
+        x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],
+             "attention_mask": encoded_texts["attention_mask"]}
+
+        super().fit(
+            x, encoded_slots, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
+
+        # Persist the slot names with the model: predictions map label ids back to names
+        self.extra_params["slot_names"] = slot_names
+
+        self.save()
+
+    @tfbp.runnable
+    def evaluate(self):
+        """Print and return the text report of `classification_report`, one row per slot.
+
+        Positions whose true label is padding (label id 0) are not scored.
+        """
+        # The persisted extra params are handed to the data loader along with the tokenizer
+        encoded_texts, _, encoded_slots, _, _ = self.data_loader(
+            self.tokenizer, self.extra_params)
+
+        # Get predictions
+        predictions = self(encoded_texts)
+        predicted_slot_ids = np.argmax(predictions, axis=-1)  # Shape: (batch_size, sequence_length)
+
+        true_labels = encoded_slots.flatten()
+        pred_labels = predicted_slot_ids.flatten()
+
+        # Filter out padding tokens (assuming padding label id is 0)
+        mask = true_labels != 0
+
+        # Shift the ids down by one so that they index the padding-free name list;
+        # a padding prediction becomes -1 and is not part of the reported labels.
+        filtered_true_labels = true_labels[mask] - 1
+        filtered_pred_labels = pred_labels[mask] - 1
+
+        # Get slot names excluding padding
+        slot_names_no_pad = self.extra_params["slot_names"][1:]  # Exclude padding label
+
+        # Passing `labels` keeps the report aligned with `target_names` even when
+        # some slots never show up in the evaluated data.
+        report = classification_report(
+            filtered_true_labels,
+            filtered_pred_labels,
+            labels=list(range(len(slot_names_no_pad))),
+            target_names=slot_names_no_pad,
+            zero_division=0
+        )
+
+        print(report)
+
+        return report
+
+    @tfbp.runnable
+    def predict(self):
+        """Run slot filling on the data loader's prediction text; print and return the JSON result."""
+        text = self.data_loader.get_prediction_data()
+
+        info = self.get_prediction(text)
+
+        print(self.summary())
+        print("Text : " + text)
+        print(json.dumps(info, indent=2))
+
+        return json.dumps(info, indent=2)
+
+    def get_slots_prediction(self, text: str, inputs, slot_probas):
+        """Turn slot probabilities into a list of entities.
+
+        Only the first sequence of the batch is decoded.
+
+        Args:
+            text: The raw text, used to locate each slot value.
+            inputs: The encoded text; `inputs["input_ids"][0]` gives the tokens.
+            slot_probas: The model output for `inputs`.
+
+        Returns:
+            One dict per predicted slot with the keys "entity", "value", "start",
+            "end" and "confidence" ("start" is -1 when the value is not found in `text`).
+        """
+        slot_names = self.extra_params["slot_names"]
+        slot_probas_np = slot_probas.numpy()
+        # Get the indices of the maximum values
+        slot_ids = slot_probas_np.argmax(axis=-1)[0, :]
+
+        # retrieving the tokenization that was used in the predictions
+        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+
+        # We'd like to eliminate all special tokens from our output
+        special_tokens = self.tokenizer.special_tokens_map.values()
+
+        # slot id -> positions (indices into `tokens`) collected for that slot.
+        # Positions are tracked instead of token strings so that a token occurring
+        # several times in the text keeps its own neighbour and its own probability.
+        slot_positions = {int(s): [] for s in slot_ids if s != 0}
+
+        for idx, (token, slot_id) in enumerate(zip(tokens, slot_ids)):
+            if token in special_tokens:
+                continue
+            slot_id = int(slot_id)
+            if slot_names[slot_id] == "<PAD>":
+                continue
+
+            positions = slot_positions[slot_id]
+            # A "##" piece belongs to the previous token: collect that token
+            # too unless it has already been collected for this slot.
+            if token.startswith("##") and idx > 0 and idx - 1 not in positions:
+                positions.append(idx - 1)
+            positions.append(idx)
+
+        entities = []
+        for slot_id, positions in slot_positions.items():
+            slot_tokens = [tokens[i] for i in positions]
+            slot_value = self.tokenizer.convert_tokens_to_string(
+                slot_tokens).strip()
+            start = text.find(slot_value)
+
+            # The confidence of a slot is the average confidence of tokens in that slot.
+            confidence = 0
+            if positions:
+                confidence = float(
+                    np.mean(slot_probas_np[0, positions, slot_id], dtype=np.float64))
+
+            entities.append({
+                "entity": slot_names[slot_id],
+                "value": slot_value,
+                "start": start,
+                "end": start + len(slot_value),
+                "confidence": confidence,
+            })
+
+        return entities
+
+    def get_prediction(self, text: str):
+        """Encode `text`, run the model and return {"text": ..., "entities": [...]}."""
+        inputs = self.data_loader.encode_text(text, self.tokenizer)
+        slot_probas = self(inputs)  # type: ignore
+
+        entities = []
+        if slot_probas is not None:
+            entities = self.get_slots_prediction(text, inputs, slot_probas)
+
+        return {
+            "text": text,
+            "entities": entities,
+        }
diff --git a/nlu/utils/json_helper.py b/nlu/utils/json_helper.py
index 7292e72..c22a6e0 100644
--- a/nlu/utils/json_helper.py
+++ b/nlu/utils/json_helper.py
@@ -4,7 +4,7 @@ import json
 class JsonHelper:
     data_folder: str
 
-    def __init__(self, model:str="jisf"):
+    def __init__(self, model:str = "intent_classifier"):
         self.data_folder=os.path.join("data",model)
         
     def read_dataset_json_file(self, filename):

From dab9d9f7163c42bfc527478595749c7788ebcf89 Mon Sep 17 00:00:00 2001
From: Mohamed Marrouchi <marrouchi.mohamed@gmail.com>
Date: Thu, 19 Sep 2024 10:52:18 +0100
Subject: [PATCH 2/7] fix: model lang

---
 nlu/data_loaders/jisfdl.py      | 4 ++--
 nlu/models/intent_classifier.py | 2 +-
 nlu/models/slot_filler.py       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py
index 7c4096e..18f8a89 100644
--- a/nlu/data_loaders/jisfdl.py
+++ b/nlu/data_loaders/jisfdl.py
@@ -102,10 +102,10 @@ class JISFDL(tfbp.DataLoader):
         lang = self.hparams.language
         all_examples = data["common_examples"]
 
-        if not lang:
+        if not bool(lang):
             examples = all_examples
         else:
-            examples = filter(lambda exp: any(not lang or (e['entity'] == 'language' and e['value'] == lang) for e in exp['entities']), all_examples)
+            examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
 
         # Parse raw data
         for exp in examples:
diff --git a/nlu/models/intent_classifier.py b/nlu/models/intent_classifier.py
index 2d2f27c..5491cb8 100644
--- a/nlu/models/intent_classifier.py
+++ b/nlu/models/intent_classifier.py
@@ -38,7 +38,7 @@ BERT_MODEL_BY_LANGUAGE = {
 @tfbp.default_export
 class IntentClassifier(tfbp.Model):
     default_hparams = {
-        "language": None,
+        "language": "",
         "num_epochs": 2,
         "dropout_prob": 0.1,
         "intent_num_labels": 7,
diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py
index 81eb54a..0393fb3 100644
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@@ -38,7 +38,7 @@ BERT_MODEL_BY_LANGUAGE = {
 @tfbp.default_export
 class SlotFiller(tfbp.Model):
     default_hparams = {
-        "language": None,
+        "language": "",
         "num_epochs": 2,
         "dropout_prob": 0.1,
         "slot_num_labels": 40

From 6183bf3a46b4e7eb135118507b8ef71bd1727ad6 Mon Sep 17 00:00:00 2001
From: Mohamed Marrouchi <marrouchi.mohamed@gmail.com>
Date: Fri, 20 Sep 2024 15:35:02 +0100
Subject: [PATCH 3/7] fix: update env template

---
 docker/.env.example | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/.env.example b/docker/.env.example
index 2104dcc..c63f981 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -43,7 +43,8 @@ AUTH_TOKEN=token123
 LANGUAGE_CLASSIFIER=language-classifier
 INTENT_CLASSIFIERS=en,fr
 TFLC_REPO_ID=Hexastack/tflc
-JISF_REPO_ID=Hexastack/jisf
+INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
+SLOT_FILLER_REPO_ID=Hexastack/slot-filler
 NLP_PORT=5000
 
 # Frontend (Next.js)

From 24b8bcf1bad5f5b32e5df7f4b534ce94ed636ec5 Mon Sep 17 00:00:00 2001
From: auraofdivinity <auraofdivinity@gmail.com>
Date: Sat, 21 Sep 2024 10:41:04 +0530
Subject: [PATCH 4/7] fix: add request queue to handle concurrent zoom & offset
 requests

---
 .../components/visual-editor/v2/Diagrams.tsx  | 21 ++++++------
 frontend/src/utils/requestQueue.ts            | 33 +++++++++++++++++++
 2 files changed, 43 insertions(+), 11 deletions(-)
 create mode 100644 frontend/src/utils/requestQueue.ts

diff --git a/frontend/src/components/visual-editor/v2/Diagrams.tsx b/frontend/src/components/visual-editor/v2/Diagrams.tsx
index 37f29fc..8ea2e44 100644
--- a/frontend/src/components/visual-editor/v2/Diagrams.tsx
+++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx
@@ -51,6 +51,7 @@ import { BlockPorts } from "@/types/visual-editor.types";
 import BlockDialog from "../BlockDialog";
 import { ZOOM_LEVEL } from "../constants";
 import { useVisualEditor } from "../hooks/useVisualEditor";
+import { RequestQueue } from "@/utils/requestQueue";
 
 const Diagrams = () => {
   const { t } = useTranslation();
@@ -108,25 +109,23 @@ const Diagrams = () => {
   const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, {
     invalidate: false,
   });
+
+  const requestQueue = useRef(new RequestQueue<ICategory>());
+  const enqueueUpdate = (id: string, params: any) => {
+    requestQueue.current.enqueue(() => updateCategory({ id, params }));
+  };
+
   const debouncedZoomEvent = debounce((event) => {
     if (selectedCategoryId) {
       engine?.repaintCanvas();
-      updateCategory({
-        id: selectedCategoryId,
-        params: {
-          zoom: event.zoom,
-        },
-      });
+      enqueueUpdate(selectedCategoryId, { zoom: event.zoom });
     }
     event.stopPropagation();
   }, 200);
   const debouncedOffsetEvent = debounce((event) => {
     if (selectedCategoryId) {
-      updateCategory({
-        id: selectedCategoryId,
-        params: {
-          offset: [event.offsetX, event.offsetY],
-        },
+      enqueueUpdate(selectedCategoryId, {
+        offset: [event.offsetX, event.offsetY],
       });
     }
     event.stopPropagation();
diff --git a/frontend/src/utils/requestQueue.ts b/frontend/src/utils/requestQueue.ts
new file mode 100644
index 0000000..742e8d4
--- /dev/null
+++ b/frontend/src/utils/requestQueue.ts
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2024 Hexastack. All rights reserved.
+ *
+ * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms:
+ * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission.
+ * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file).
+ * 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited.
+ */
+
+export class RequestQueue<T> {
+  private queue: Array<() => Promise<T>> = [];
+  private isProcessing = false;
+
+  enqueue(request: () => Promise<T>) {
+    this.queue.push(request);
+    this.processQueue();
+  }
+
+  private async processQueue() {
+    if (this.isProcessing) return;
+
+    this.isProcessing = true;
+
+    while (this.queue.length > 0) {
+      const request = this.queue.shift();
+      if (request) {
+        await request();
+      }
+    }
+
+    this.isProcessing = false;
+  }
+}

From dbf1fb002f3a05c1ac3f3e701fed4371aa27f4eb Mon Sep 17 00:00:00 2001
From: auraofdivinity <auraofdivinity@gmail.com>
Date: Sun, 22 Sep 2024 18:33:01 +0530
Subject: [PATCH 5/7] fix: extracting debounced update to a custom hook

---
 .../components/visual-editor/v2/Diagrams.tsx  | 61 ++++++++++++-------
 frontend/src/hooks/useDebouncedUpdate.tsx     | 47 ++++++++++++++
 frontend/src/utils/requestQueue.ts            | 33 ----------
 3 files changed, 86 insertions(+), 55 deletions(-)
 create mode 100644 frontend/src/hooks/useDebouncedUpdate.tsx
 delete mode 100644 frontend/src/utils/requestQueue.ts

diff --git a/frontend/src/components/visual-editor/v2/Diagrams.tsx b/frontend/src/components/visual-editor/v2/Diagrams.tsx
index 8ea2e44..1b0b514 100644
--- a/frontend/src/components/visual-editor/v2/Diagrams.tsx
+++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx
@@ -32,7 +32,13 @@ import {
   DiagramModel,
   DiagramModelGenerics,
 } from "@projectstorm/react-diagrams";
-import { SyntheticEvent, useEffect, useRef, useState } from "react";
+import {
+  SyntheticEvent,
+  useCallback,
+  useEffect,
+  useRef,
+  useState,
+} from "react";
 import { useTranslation } from "react-i18next";
 
 import { DeleteDialog } from "@/app-components/dialogs";
@@ -45,13 +51,13 @@ import { getDisplayDialogs, useDialog } from "@/hooks/useDialog";
 import { useSearch } from "@/hooks/useSearch";
 import { EntityType, Format } from "@/services/types";
 import { IBlock } from "@/types/block.types";
-import { ICategory } from "@/types/category.types";
+import { ICategory, ICategoryAttributes } from "@/types/category.types";
 import { BlockPorts } from "@/types/visual-editor.types";
 
 import BlockDialog from "../BlockDialog";
 import { ZOOM_LEVEL } from "../constants";
 import { useVisualEditor } from "../hooks/useVisualEditor";
-import { RequestQueue } from "@/utils/requestQueue";
+import useDebouncedUpdate from "@/hooks/useDebouncedUpdate";
 
 const Diagrams = () => {
   const { t } = useTranslation();
@@ -110,26 +116,37 @@ const Diagrams = () => {
     invalidate: false,
   });
 
-  const requestQueue = useRef(new RequestQueue<ICategory>());
-  const enqueueUpdate = (id: string, params: any) => {
-    requestQueue.current.enqueue(() => updateCategory({ id, params }));
-  };
+  const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300);
+  const debouncedZoomEvent = useCallback(
+    (event: any) => {
+      if (selectedCategoryId) {
+        engine?.repaintCanvas();
+        debouncedUpdateCategory({
+          id: selectedCategoryId,
+          params: {
+            zoom: event.zoom,
+          },
+        });
+      }
+      event.stopPropagation();
+    },
+    [selectedCategoryId, debouncedUpdateCategory],
+  );
+  const debouncedOffsetEvent = useCallback(
+    (event: any) => {
+      if (selectedCategoryId) {
+        debouncedUpdateCategory({
+          id: selectedCategoryId,
+          params: {
+            offset: [event.offsetX, event.offsetY],
+          },
+        });
+      }
+      event.stopPropagation();
+    },
+    [selectedCategoryId, debouncedUpdateCategory],
+  );
 
-  const debouncedZoomEvent = debounce((event) => {
-    if (selectedCategoryId) {
-      engine?.repaintCanvas();
-      enqueueUpdate(selectedCategoryId, { zoom: event.zoom });
-    }
-    event.stopPropagation();
-  }, 200);
-  const debouncedOffsetEvent = debounce((event) => {
-    if (selectedCategoryId) {
-      enqueueUpdate(selectedCategoryId, {
-        offset: [event.offsetX, event.offsetY],
-      });
-    }
-    event.stopPropagation();
-  }, 200);
   const getBlockFromCache = useGetFromCache(EntityType.BLOCK);
   const updateCachedBlock = useUpdateCache(EntityType.BLOCK);
   const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK);
diff --git a/frontend/src/hooks/useDebouncedUpdate.tsx b/frontend/src/hooks/useDebouncedUpdate.tsx
new file mode 100644
index 0000000..5df92b4
--- /dev/null
+++ b/frontend/src/hooks/useDebouncedUpdate.tsx
@@ -0,0 +1,47 @@
+import { debounce } from "@mui/material";
+import { useCallback, useEffect, useRef } from "react";
+
+type DebouncedUpdateParams = {
+  id: string;
+  params: Record<string, any>;
+};
+
+function useDebouncedUpdate(
+  apiUpdate: (params: DebouncedUpdateParams) => void,
+  delay: number = 300,
+) {
+  const accumulatedUpdates = useRef<DebouncedUpdateParams | null>(null);
+
+  const processUpdates = useRef(
+    debounce(() => {
+      if (accumulatedUpdates.current) {
+        apiUpdate(accumulatedUpdates.current);
+        accumulatedUpdates.current = null;
+      }
+    }, delay),
+  ).current;
+
+  const handleUpdate = useCallback(
+    (params: DebouncedUpdateParams) => {
+      accumulatedUpdates.current = {
+        id: params.id,
+        params: {
+          ...(accumulatedUpdates.current?.params || {}),
+          ...params.params,
+        },
+      };
+      processUpdates();
+    },
+    [processUpdates],
+  );
+
+  useEffect(() => {
+    return () => {
+      processUpdates.clear();
+    };
+  }, [processUpdates]);
+
+  return handleUpdate;
+}
+
+export default useDebouncedUpdate;
diff --git a/frontend/src/utils/requestQueue.ts b/frontend/src/utils/requestQueue.ts
deleted file mode 100644
index 742e8d4..0000000
--- a/frontend/src/utils/requestQueue.ts
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright © 2024 Hexastack. All rights reserved.
- *
- * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms:
- * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission.
- * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file).
- * 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited.
- */
-
-export class RequestQueue<T> {
-  private queue: Array<() => Promise<T>> = [];
-  private isProcessing = false;
-
-  enqueue(request: () => Promise<T>) {
-    this.queue.push(request);
-    this.processQueue();
-  }
-
-  private async processQueue() {
-    if (this.isProcessing) return;
-
-    this.isProcessing = true;
-
-    while (this.queue.length > 0) {
-      const request = this.queue.shift();
-      if (request) {
-        await request();
-      }
-    }
-
-    this.isProcessing = false;
-  }
-}

From 08e5f6853bf6d361b71516198103e14b5030527f Mon Sep 17 00:00:00 2001
From: auraofdivinity <auraofdivinity@gmail.com>
Date: Sun, 22 Sep 2024 19:10:30 +0530
Subject: [PATCH 6/7] fix: fix linting errors

---
 frontend/src/components/visual-editor/v2/Diagrams.tsx | 9 +++------
 frontend/src/hooks/useDebouncedUpdate.tsx             | 2 --
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/frontend/src/components/visual-editor/v2/Diagrams.tsx b/frontend/src/components/visual-editor/v2/Diagrams.tsx
index 1b0b514..431a408 100644
--- a/frontend/src/components/visual-editor/v2/Diagrams.tsx
+++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx
@@ -22,7 +22,6 @@ import {
   Tab,
   Tabs,
   Tooltip,
-  debounce,
   tabsClasses,
 } from "@mui/material";
 import {
@@ -47,17 +46,17 @@ import { useDelete, useDeleteFromCache } from "@/hooks/crud/useDelete";
 import { useFind } from "@/hooks/crud/useFind";
 import { useGetFromCache } from "@/hooks/crud/useGet";
 import { useUpdate, useUpdateCache } from "@/hooks/crud/useUpdate";
+import useDebouncedUpdate from "@/hooks/useDebouncedUpdate";
 import { getDisplayDialogs, useDialog } from "@/hooks/useDialog";
 import { useSearch } from "@/hooks/useSearch";
 import { EntityType, Format } from "@/services/types";
 import { IBlock } from "@/types/block.types";
-import { ICategory, ICategoryAttributes } from "@/types/category.types";
+import { ICategory } from "@/types/category.types";
 import { BlockPorts } from "@/types/visual-editor.types";
 
 import BlockDialog from "../BlockDialog";
 import { ZOOM_LEVEL } from "../constants";
 import { useVisualEditor } from "../hooks/useVisualEditor";
-import useDebouncedUpdate from "@/hooks/useDebouncedUpdate";
 
 const Diagrams = () => {
   const { t } = useTranslation();
@@ -115,7 +114,6 @@ const Diagrams = () => {
   const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, {
     invalidate: false,
   });
-
   const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300);
   const debouncedZoomEvent = useCallback(
     (event: any) => {
@@ -130,7 +128,7 @@ const Diagrams = () => {
       }
       event.stopPropagation();
     },
-    [selectedCategoryId, debouncedUpdateCategory],
+    [selectedCategoryId, engine, debouncedUpdateCategory],
   );
   const debouncedOffsetEvent = useCallback(
     (event: any) => {
@@ -146,7 +144,6 @@ const Diagrams = () => {
     },
     [selectedCategoryId, debouncedUpdateCategory],
   );
-
   const getBlockFromCache = useGetFromCache(EntityType.BLOCK);
   const updateCachedBlock = useUpdateCache(EntityType.BLOCK);
   const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK);
diff --git a/frontend/src/hooks/useDebouncedUpdate.tsx b/frontend/src/hooks/useDebouncedUpdate.tsx
index 5df92b4..eecb38f 100644
--- a/frontend/src/hooks/useDebouncedUpdate.tsx
+++ b/frontend/src/hooks/useDebouncedUpdate.tsx
@@ -11,7 +11,6 @@ function useDebouncedUpdate(
   delay: number = 300,
 ) {
   const accumulatedUpdates = useRef<DebouncedUpdateParams | null>(null);
-
   const processUpdates = useRef(
     debounce(() => {
       if (accumulatedUpdates.current) {
@@ -20,7 +19,6 @@ function useDebouncedUpdate(
       }
     }, delay),
   ).current;
-
   const handleUpdate = useCallback(
     (params: DebouncedUpdateParams) => {
       accumulatedUpdates.current = {

From 95fd2cbe3ac34a7143f17ea0331eec1b93b8b29d Mon Sep 17 00:00:00 2001
From: auraofdivinity <auraofdivinity@gmail.com>
Date: Mon, 23 Sep 2024 14:30:52 +0530
Subject: [PATCH 7/7] fix: adding license details

---
 frontend/src/hooks/useDebouncedUpdate.tsx | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/frontend/src/hooks/useDebouncedUpdate.tsx b/frontend/src/hooks/useDebouncedUpdate.tsx
index eecb38f..cb36ed9 100644
--- a/frontend/src/hooks/useDebouncedUpdate.tsx
+++ b/frontend/src/hooks/useDebouncedUpdate.tsx
@@ -1,3 +1,12 @@
+/*
+ * Copyright © 2024 Hexastack. All rights reserved.
+ *
+ * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms:
+ * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission.
+ * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file).
+ * 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited.
+ */
+
 import { debounce } from "@mui/material";
 import { useCallback, useEffect, useRef } from "react";