Merge pull request #258 from MohamedAliBouhaouala/enhance/intent-classifier

Enhance/intent classifier
2025-06-26 18:27:28 +00:00 · 2024-10-22 16:15:51 +01:00 · 2024-10-22 16:15:51 +01:00 · 3f8d0a749c
commit 3f8d0a749c
parent 6e58361abf a8c6a97f6c
8 changed files with 59 additions and 40 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,3 +4,4 @@ certbot
 docker/**/data
 node_modules/
 /docker/**/db_data
 .idea/
--- a/nlu/data_loaders/jisfdl.py
+++ b/nlu/data_loaders/jisfdl.py
@ -210,10 +210,6 @@ class JISFDL(tfbp.DataLoader):
        return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names
    def get_prediction_data(self) -> str:
        """Return the ``text`` entry of the ``predict.json`` dataset file.

        The file is read through a freshly constructed ``JsonHelper`` using
        its default arguments.
        """
        payload = JsonHelper().read_dataset_json_file('predict.json')
        return payload["text"]
    def encode_text(self, text: str, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
        """Encode a single text by delegating to ``encode_texts``.

        Args:
            text: The text to encode.
            tokenizer: Tokenizer forwarded unchanged to ``encode_texts``.

        Returns:
            Whatever ``encode_texts`` returns for the one-element list
            ``[text]``.
        """
        single_item_batch = [text]
        return self.encode_texts(single_item_batch, tokenizer)
--- a/nlu/data_loaders/tflcdl.py
+++ b/nlu/data_loaders/tflcdl.py
@ -125,14 +125,6 @@ class TFLCDL(tfbp.DataLoader):
        else:
            raise ValueError("Unknown method!")
    def get_prediction_data(self):
        """Load the prediction text and encode it with ``self.tfidf``.

        Reads ``predict.json`` through ``self.json_helper``, passes its
        ``text`` value through ``self.strip_numbers`` and transforms the
        result with ``self.tfidf``.

        Returns:
            A tuple ``(texts, encoded_texts)``: a one-element array holding
            the cleaned text, and an array built from the ``toarray()``
            output of the transform.
        """
        # The predict file contains a single JSON object whose only key is text.
        payload = self.json_helper.read_dataset_json_file("predict.json")
        cleaned = self.strip_numbers(payload["text"])
        batch = [cleaned]
        matrix = self.tfidf.transform(batch).toarray()  # type: ignore
        return np.array(batch), np.array(matrix)
    def encode_text(self, text: str):
        """Encode one text with ``self.tfidf`` after ``self.strip_numbers``.

        Args:
            text: The raw text to encode.

        Returns:
            The ``toarray()`` result of transforming the one-element list
            holding the cleaned text.
        """
        vectorized = self.tfidf.transform([self.strip_numbers(text)])
        return vectorized.toarray()  # type: ignore
--- a/nlu/models/intent_classifier.py
+++ b/nlu/models/intent_classifier.py
@ -12,8 +12,8 @@ if platform == "darwin":
 else:
    from keras.optimizers import Adam
 from keras.losses import SparseCategoricalCrossentropy
 from keras.metrics import SparseCategoricalAccuracy
 from focal_loss import SparseCategoricalFocalLoss
 import numpy as np
 from data_loaders.jisfdl import JISFDL
@ -42,6 +42,8 @@ class IntentClassifier(tfbp.Model):
        "num_epochs": 2,
        "dropout_prob": 0.1,
        "intent_num_labels": 7,
        "gamma": 2,
        "k": 3
    }
    data_loader: JISFDL
@ -128,7 +130,7 @@ class IntentClassifier(tfbp.Model):
        # Hyperparams, Optimizer and Loss function
        opt = Adam(learning_rate=3e-5, epsilon=1e-08)
-        losses = SparseCategoricalCrossentropy()
+        losses = SparseCategoricalFocalLoss(gamma=self.hparams.gamma)
        metrics = [SparseCategoricalAccuracy("accuracy")]
@ -172,32 +174,48 @@ class IntentClassifier(tfbp.Model):
        return scores
    @tfbp.runnable
    def predict(self):
        """Predict on the data loader's prediction text and print the result.

        Prints the model summary, the input text and the prediction as
        indented JSON.

        Returns:
            The prediction serialized as an indented JSON string.
        """
        text = self.data_loader.get_prediction_data()
        info = self.get_prediction(text)
        print(self.summary())
        print("Text : " + text)
        # Serialize once and reuse the string for both printing and returning.
        rendered = json.dumps(info, indent=2)
        print(rendered)
        return rendered
    def get_prediction(self, text: str):
        inputs = self.data_loader.encode_text(text, self.tokenizer)
        intent_probas = self(inputs)  # type: ignore
        intent_probas_np = intent_probas.numpy()
-        
+
        # Get the indices of the maximum values
        intent_id = intent_probas_np.argmax(axis=-1)[0]
-        
+
        # get the confidences for each intent
        intent_confidences = intent_probas_np[0]
-        return {
+        margin = self.compute_normalized_confidence_margin(intent_probas_np)
        output = {
            "text": text,
            "intent": {"name": self.extra_params["intent_names"][intent_id],
                       "confidence": float(intent_confidences[intent_id])},
            "margin": margin,
        }
        return output
    def compute_top_k_confidence(self, probs, k=3):
        """Sum the ``k`` largest values found in ``probs[0]``.

        Args:
            probs: Indexable whose first element is the row of scores to rank.
            k: How many of the largest scores to add up (defaults to 3).

        Returns:
            The sum of the top ``k`` scores; when the row has fewer than ``k``
            entries, all of them are summed.
        """
        # Ascending sort viewed in reverse gives the scores largest-first.
        largest_first = np.sort(probs[0])[::-1]
        return largest_first[:k].sum()
    def compute_normalized_confidence_margin(self, probs):
        """Return the top score divided by the sum of the top-k scores.

        ``k`` is read from ``self.hparams.k`` and the top-k sum comes from
        ``self.compute_top_k_confidence``.

        Args:
            probs: Indexable whose first element is the row of scores.

        Returns:
            The ratio as a built-in ``float``.
        """
        highest_proba = np.max(probs[0])
        sum_of_probas = self.compute_top_k_confidence(probs, self.hparams.k)
        # Cast to a plain float: the value is placed in the prediction dict
        # next to "confidence" (already a float), and NumPy scalars such as
        # np.float32 are rejected by json.dumps.
        return float(highest_proba / sum_of_probas)
    @tfbp.runnable
    def predict(self):
        """Interactively classify texts typed at the prompt.

        Each round reads a text, prints the result of ``get_prediction`` for
        it, and then asks whether to go again. The loop continues only when
        the answer lower-cases to ``y``.
        """
        keep_going = True
        while keep_going:
            text = input("Provide text: ")
            print(self.get_prediction(text))
            # Any answer other than "y"/"Y" ends the session.
            keep_going = input("Try again? (y/n): ").lower() == 'y'
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@ -151,16 +151,19 @@ class SlotFiller(tfbp.Model):
    @tfbp.runnable
    def predict(self):
-        text = self.data_loader.get_prediction_data()
+        while True:
            text = input("Provide text: ")
            info = self.get_prediction(text)
-        info = self.get_prediction(text)
+            print(self.summary())
            print("Text : " + text)
            print(info)
            # Optionally, provide a way to exit the loop
            if input("Try again? (y/n): ").lower() != 'y':
                break
        print(self.summary())
        print("Text : " + text)
        print(json.dumps(info, indent=2))
        return json.dumps(info, indent=2)
    def get_slots_prediction(self, text: str, inputs, slot_probas):
        slot_probas_np = slot_probas.numpy()
        # Get the indices of the maximum values
--- a/nlu/models/tflc.py
+++ b/nlu/models/tflc.py
@ -95,19 +95,27 @@ class TFLC(tfbp.Model):
        self.calculate_metrics(y_test, y_pred, languages)
    def preprocess_text(self, text):
        """Clean ``text`` and encode it with ``self.tfidf``.

        Args:
            text: The raw input text.

        Returns:
            A tuple ``(texts, encoded_text)``: a one-element array holding
            the text after ``self.strip_numbers``, and an array built from
            the ``toarray()`` output of the transform.
        """
        # NOTE(review): strip_numbers and tfidf are looked up on self here;
        # the TFLCDL data loader uses the same names on itself - confirm
        # they are also available on this class.
        cleaned = self.strip_numbers(text)
        batch = [cleaned]
        matrix = self.tfidf.transform(batch).toarray()  # type: ignore
        return np.array(batch), np.array(matrix)
    @tfbp.runnable
    def predict(self):
        languages = list(self.extra_params['languages'])
-        texts, encoded_texts = self.data_loader.get_prediction_data()
+        input_provided = input("Provide text: ")
        text, encoded_text = self.preprocess_text(input_provided)
        # converting a one hot output to language index
-        probas = super().predict(encoded_texts)
+        probas = super().predict(encoded_text)
        predictions = np.argmax(probas, axis=1)
        results = []
        for idx, prediction in enumerate(predictions):
            print('The sentence "{}" is in {}.'.format(
-                texts[idx], languages[prediction].upper()))
+                text[idx], languages[prediction].upper()))
-            results.append({'text': texts[idx], 'language': prediction})
+            results.append({'text': text[idx], 'language': prediction})
        return results
    def get_prediction(self, text: str):
--- a/nlu/requirements.txt
+++ b/nlu/requirements.txt
@ -6,4 +6,5 @@ scikit_learn==1.2.2
 fastapi==0.100.0
 uvicorn[standard]==0.23.1
 autopep8==2.0.2
 focal-loss==0.0.7
 h5py --only-binary=h5py
--- a/nlu/utils/json_helper.py
+++ b/nlu/utils/json_helper.py
@ -6,7 +6,7 @@ class JsonHelper:
    def __init__(self, model:str = "intent_classifier"):
        self.data_folder=os.path.join("data",model)
-        
+
    def read_dataset_json_file(self, filename):
        file_path = os.path.join(self.data_folder, filename)
        if os.path.exists(file_path):