From c58a080ebf10709d7361d5c62d05a579bdcc5ac6 Mon Sep 17 00:00:00 2001 From: Mohamed Marrouchi Date: Thu, 19 Sep 2024 09:14:17 +0100 Subject: [PATCH 1/7] feat: break NLU JISF apart --- nlu/.env.dev | 3 +- nlu/.env.example | 4 +- nlu/README.md | 4 +- nlu/data_loaders/jisfdl.py | 50 ++-- nlu/data_loaders/tflcdl.py | 2 +- nlu/main.py | 43 +++- nlu/models/{jisf.py => intent_classifier.py} | 129 +--------- nlu/models/slot_filler.py | 250 +++++++++++++++++++ nlu/utils/json_helper.py | 2 +- 9 files changed, 337 insertions(+), 150 deletions(-) rename nlu/models/{jisf.py => intent_classifier.py} (59%) create mode 100644 nlu/models/slot_filler.py diff --git a/nlu/.env.dev b/nlu/.env.dev index d5023df..6fac306 100644 --- a/nlu/.env.dev +++ b/nlu/.env.dev @@ -2,4 +2,5 @@ AUTH_TOKEN=123 LANGUAGE_CLASSIFIER=language-classifier INTENT_CLASSIFIERS=ar,fr,tn TFLC_REPO_ID=Hexastack/tflc -JISF_REPO_ID=Hexastack/jisf +INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier +SLOT_FILLER_REPO_ID=Hexastack/slot-filler diff --git a/nlu/.env.example b/nlu/.env.example index 52370c7..a863e43 100644 --- a/nlu/.env.example +++ b/nlu/.env.example @@ -1,5 +1,5 @@ AUTH_TOKEN= LANGUAGE_CLASSIFIER= INTENT_CLASSIFIERS= -TFLC_REPO_ID= -JISF_REPO_ID= \ No newline at end of file +INTENT_CLASSIFIER_REPO_ID= +SLOT_FILLER_REPO_ID= \ No newline at end of file diff --git a/nlu/README.md b/nlu/README.md index ff5ba60..dd8a00b 100644 --- a/nlu/README.md +++ b/nlu/README.md @@ -40,7 +40,7 @@ pip install -r requirements.txt You should run `source env.sh` on each new shell session. This activates the virtualenv and creates a nice alias for `run.py`: ```bash $ cat env.sh -source env/bin/activate +source venv/bin/activate alias run='python run.py' ``` @@ -53,7 +53,7 @@ run fit myexperiment1 mlp mnist --batch_size=32 --learning_rate=0.1 Examples : ```bash # Intent classification -run fit intent-classifier-en-30072024 jisf --intent_num_labels=88 --slot_num_labels=17 --language=en +run fit intent-classifier-en-30072024 intent_classifier --intent_num_labels=88 --slot_num_labels=17 --language=en run predict intent-classifier-fr-30072024 --intent_num_labels=7 --slot_num_labels=2 --language=fr # Language classification diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py index 75ae949..7c4096e 100644 --- a/nlu/data_loaders/jisfdl.py +++ b/nlu/data_loaders/jisfdl.py @@ -4,8 +4,8 @@ import json import numpy as np from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer + import boilerplate as tfbp -from utils.jisf_data_mapper import JisfDataMapper from utils.json_helper import JsonHelper @@ -101,8 +101,11 @@ class JISFDL(tfbp.DataLoader): # Filter examples by language lang = self.hparams.language all_examples = data["common_examples"] - examples = filter(lambda exp: any( - e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples) + + if not lang: + examples = all_examples + else: + examples = filter(lambda exp: any(not lang or (e['entity'] == 'language' and e['value'] == lang) for e in exp['entities']), all_examples) # Parse raw data for exp in examples: @@ -145,7 +148,6 @@ class JISFDL(tfbp.DataLoader): # the classifier. texts = [d.text for d in dataset] encoded_texts = self.encode_texts(texts, tokenizer) - # Map intents, load from the model (evaluate), recompute from dataset otherwise (train) intents = [d.intent for d in dataset] if not model_params: @@ -161,19 +163,35 @@ class JISFDL(tfbp.DataLoader): # To handle those we need to add to slots_names. It can be some other symbol as well. slot_names.insert(0, "") else: - intent_names = model_params.intent_names - slot_names = model_params.slot_names + if "intent_names" in model_params: + intent_names = model_params["intent_names"] + else: + intent_names = None + + if "slot_names" in model_params: + slot_names = model_params["slot_names"] + else: + slot_names = None - intent_map = dict() # Dict : intent -> index - for idx, ui in enumerate(intent_names): - intent_map[ui] = idx + if intent_names: + intent_map = dict() # Dict : intent -> index + for idx, ui in enumerate(intent_names): + intent_map[ui] = idx + else: + intent_map = None # Encode intents - encoded_intents = self.encode_intents(intents, intent_map) + if intent_map: + encoded_intents = self.encode_intents(intents, intent_map) + else: + encoded_intents = None - slot_map: Dict[str, int] = dict() # slot -> index - for idx, us in enumerate(slot_names): - slot_map[us] = idx + if slot_names: + slot_map: Dict[str, int] = dict() # slot -> index + for idx, us in enumerate(slot_names): + slot_map[us] = idx + else: + slot_map = None # Encode slots # Text : Add a tune to my elrow Guest List @@ -183,8 +201,12 @@ class JISFDL(tfbp.DataLoader): max_len = len(encoded_texts["input_ids"][0]) # type: ignore all_slots = [td.slots for td in dataset] all_texts = [td.text for td in dataset] - encoded_slots = self.encode_slots(tokenizer, + + if slot_map: + encoded_slots = self.encode_slots(tokenizer, all_slots, all_texts, slot_map, max_len) + else: + encoded_slots = None return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names diff --git a/nlu/data_loaders/tflcdl.py b/nlu/data_loaders/tflcdl.py index 09a23c0..b765f78 100644 --- a/nlu/data_loaders/tflcdl.py +++ b/nlu/data_loaders/tflcdl.py @@ -29,7 +29,7 @@ class TFLCDL(tfbp.DataLoader): self.json_helper = JsonHelper("tflc") self._save_dir = save_dir - print(hparams) + # We will opt for a TF-IDF representation of the data as the frequency of word # roots should give us a good idea about which language we're dealing with. if method == "fit": diff --git a/nlu/main.py b/nlu/main.py index b85ce11..f7e4f8b 100644 --- a/nlu/main.py +++ b/nlu/main.py @@ -15,8 +15,8 @@ AUTH_TOKEN = os.getenv("AUTH_TOKEN", "TOKEN_MUST_BE_DEFINED") AVAILABLE_LANGUAGES = os.getenv("AVAILABLE_LANGUAGES", "en,fr").split(',') TFLC_REPO_ID = os.getenv("TFLC_REPO_ID") -JISF_REPO_ID = os.getenv("JISF_REPO_ID") - +INTENT_CLASSIFIER_REPO_ID = os.getenv("INTENT_CLASSIFIER_REPO_ID") +SLOT_FILLER_REPO_ID = os.getenv("SLOT_FILLER_REPO_ID") def load_language_classifier(): # Init language classifier model @@ -27,21 +27,31 @@ def load_language_classifier(): logging.info(f'Successfully loaded the language classifier model') return model - def load_intent_classifiers(): - Model = tfbp.get_model("jisf") - models = {} + Model = tfbp.get_model("intent_classifier") + intent_classifiers = {} for language in AVAILABLE_LANGUAGES: kwargs = {} - models[language] = Model(save_dir=language, method="predict", repo_id=JISF_REPO_ID, **kwargs) - models[language].load_model() + intent_classifiers[language] = Model(save_dir=language, method="predict", repo_id=INTENT_CLASSIFIER_REPO_ID, **kwargs) + intent_classifiers[language].load_model() logging.info(f'Successfully loaded the intent classifier {language} model') - return models + return intent_classifiers + +def load_slot_classifiers(): + Model = tfbp.get_model("slot_classifier") + slot_fillers = {} + for language in AVAILABLE_LANGUAGES: + kwargs = {} + slot_fillers[language] = Model(save_dir=language, method="predict", repo_id=SLOT_FILLER_REPO_ID, **kwargs) + slot_fillers[language].load_model() + logging.info(f'Successfully loaded the slot filler {language} model') + return slot_fillers def load_models(): app.language_classifier = load_language_classifier() # type: ignore app.intent_classifiers = load_intent_classifiers() # type: ignore + app.slot_fillers = load_intent_classifiers() # type: ignore app = FastAPI() @@ -74,13 +84,20 @@ async def check_health(): @app.post("/parse") def parse(input: ParseInput, is_authenticated: Annotated[str, Depends(authenticate)]): - if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers'): + if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers') or not hasattr(app, 'slot_fillers'): headers = {"Retry-After": "120"} # Suggest retrying after 2 minutes - return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are loading, please retry later."}, headers=headers) + return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are still loading, please retry later."}, headers=headers) language = app.language_classifier.get_prediction(input.q) # type: ignore lang = language.get("value") - prediction = app.intent_classifiers[lang].get_prediction( + intent_prediction = app.intent_classifiers[lang].get_prediction( input.q) # type: ignore - prediction.get("entities").append(language) - return prediction + slot_prediction = app.slot_fillers[lang].get_prediction( + input.q) # type: ignore + slot_prediction.get("entities").append(language) + + return { + "text": input.q, + "intent": intent_prediction.get("intent"), + "entities": slot_prediction.get("entities"), + } diff --git a/nlu/models/jisf.py b/nlu/models/intent_classifier.py similarity index 59% rename from nlu/models/jisf.py rename to nlu/models/intent_classifier.py index 71c14ef..2d2f27c 100644 --- a/nlu/models/jisf.py +++ b/nlu/models/intent_classifier.py @@ -1,4 +1,3 @@ -import functools import json import math from typing import Tuple, Dict, List @@ -22,8 +21,8 @@ from data_loaders.jisfdl import JISFDL import boilerplate as tfbp ## -# JISF : Joint Intent Classification and Slot filling with BERT -# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019), +# Intent Classification with BERT +# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019), # https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project. # # Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of @@ -33,19 +32,16 @@ import boilerplate as tfbp BERT_MODEL_BY_LANGUAGE = { 'en': "bert-base-cased", 'fr': "dbmdz/bert-base-french-europeana-cased", - 'ar': 'asafaya/bert-base-arabic', - 'tn': 'dbmdz/bert-base-french-europeana-cased' } @tfbp.default_export -class JISF(tfbp.Model): +class IntentClassifier(tfbp.Model): default_hparams = { - "language": "fr", + "language": None, "num_epochs": 2, "dropout_prob": 0.1, "intent_num_labels": 7, - "slot_num_labels": 40 } data_loader: JISFDL @@ -57,8 +53,8 @@ class JISF(tfbp.Model): # Load Tokenizer from transformers # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier. - bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language] - # bert_model_name = typing.cast(str, self.hparams.bert_model_name) + bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"] + self.tokenizer = AutoTokenizer.from_pretrained( bert_model_name, use_fast=False) self.bert = TFBertModel.from_pretrained(bert_model_name) @@ -66,27 +62,18 @@ class JISF(tfbp.Model): self.dropout = Dropout(self.hparams.dropout_prob) self.intent_classifier = Dense(self.hparams.intent_num_labels, name="intent_classifier", activation="softmax") - self.slot_classifier = Dense(self.hparams.slot_num_labels, - name="slot_classifier", activation="softmax") def call(self, inputs, **kwargs): - # two outputs from BERT trained_bert = self.bert(inputs, **kwargs) pooled_output = trained_bert.pooler_output - sequence_output = trained_bert.last_hidden_state - - # sequence_output will be used for slot_filling / classification - sequence_output = self.dropout(sequence_output, - training=kwargs.get("training", False)) - slot_probas = self.slot_classifier(sequence_output) - + # pooled_output for intent classification pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) intent_probas = self.intent_classifier(pooled_output) - return slot_probas, intent_probas + return intent_probas def load_data(self, data_loader) -> Tuple[BatchEncoding, tf.Tensor, ndarray, int, int]: return data_loader(self.tokenizer) @@ -137,18 +124,11 @@ class JISF(tfbp.Model): raise ValueError( f"Hyperparam intent_num_labels mismatch, should be : {len(intent_names)}" ) - if self.hparams.slot_num_labels != len(slot_names): - raise ValueError( - f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}" - ) # Hyperparams, Optimizer and Loss function opt = Adam(learning_rate=3e-5, epsilon=1e-08) - # two outputs, one for slots, another for intents - # we have to fine tune for both - losses = [SparseCategoricalCrossentropy(), - SparseCategoricalCrossentropy()] + losses = SparseCategoricalCrossentropy() metrics = [SparseCategoricalAccuracy("accuracy")] @@ -159,11 +139,10 @@ class JISF(tfbp.Model): "attention_mask": encoded_texts["attention_mask"]} super().fit( - x, (encoded_slots, encoded_intents), epochs=self.hparams.num_epochs, batch_size=32, shuffle=True) + x, encoded_intents, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True) # Persist the model self.extra_params["intent_names"] = intent_names - self.extra_params["slot_names"] = slot_names self.save() @@ -175,8 +154,8 @@ class JISF(tfbp.Model): metrics = [SparseCategoricalAccuracy("accuracy")] self.compile(metrics=metrics) - _, intent_probas = self(encoded_texts) # type: ignore - + intent_probas = self(encoded_texts) # type: ignore + scores = self.get_metrics_by_intent(intent_probas, encoded_intents) overall_score = {} @@ -204,85 +183,10 @@ class JISF(tfbp.Model): print(json.dumps(info, indent=2)) return json.dumps(info, indent=2) - - def get_slots_prediction(self, text: str, inputs, slot_probas): - slot_probas_np = slot_probas.numpy() - # Get the indices of the maximum values - slot_ids = slot_probas_np.argmax(axis=-1)[0, :] - - # get all slot names and add to out_dict as keys - out_dict = {} - predicted_slots = set([self.extra_params["slot_names"][s] - for s in slot_ids if s != 0]) - for ps in predicted_slots: - out_dict[ps] = [] - - # retrieving the tokenization that was used in the predictions - tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) - - # We'd like to eliminate all special tokens from our output - special_tokens = self.tokenizer.special_tokens_map.values() - - for token, slot_id in zip(tokens, slot_ids): - if token in special_tokens: - continue - # add all to out_dict - slot_name = self.extra_params["slot_names"][slot_id] - - if slot_name == "": - continue - - # collect tokens - collected_tokens = [token] - idx = tokens.index(token) - - # see if it starts with ## - # then it belongs to the previous token - if token.startswith("##"): - # check if the token already exists or not - if tokens[idx - 1] not in out_dict[slot_name]: - collected_tokens.insert(0, tokens[idx - 1]) - - # add collected tokens to slots - out_dict[slot_name].extend(collected_tokens) - - slot_names_to_ids = {value: key for key, value in enumerate( - self.extra_params["slot_names"])} - - entities = [] - # process out_dict - for slot_name in out_dict: - slot_id = slot_names_to_ids[slot_name] - slot_tokens = out_dict[slot_name] - - slot_value = self.tokenizer.convert_tokens_to_string( - slot_tokens).strip() - - entity = { - "entity": slot_name, - "value": slot_value, - "start": text.find(slot_value), - "end": text.find(slot_value) + len(slot_value), - "confidence": 0, - } - - # The confidence of a slot is the average confidence of tokens in that slot. - indices = [tokens.index(token) for token in slot_tokens] - if len(slot_tokens) > 0: - total = functools.reduce( - lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0) - entity["confidence"] = total / len(slot_tokens) - else: - entity["confidence"] = 0 - - entities.append(entity) - - return entities - def get_prediction(self, text: str): inputs = self.data_loader.encode_text(text, self.tokenizer) - slot_probas, intent_probas = self(inputs) # type: ignore + intent_probas = self(inputs) # type: ignore intent_probas_np = intent_probas.numpy() @@ -292,15 +196,8 @@ class JISF(tfbp.Model): # get the confidences for each intent intent_confidences = intent_probas_np[0] - - entities = [] - if slot_probas is not None: - entities = self.get_slots_prediction(text, inputs, slot_probas) - return { "text": text, "intent": {"name": self.extra_params["intent_names"][intent_id], "confidence": float(intent_confidences[intent_id])}, - "entities": entities, } - diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py new file mode 100644 index 0000000..81eb54a --- /dev/null +++ b/nlu/models/slot_filler.py @@ -0,0 +1,250 @@ +import functools +import json +from transformers import TFBertModel, AutoTokenizer +from keras.layers import Dropout, Dense +from sys import platform + +if platform == "darwin": + from keras.optimizers.legacy import Adam +else: + from keras.optimizers import Adam + +from keras.losses import SparseCategoricalCrossentropy +from keras.metrics import SparseCategoricalAccuracy +import numpy as np + +from data_loaders.jisfdl import JISFDL + +from sklearn.metrics import classification_report + + +import boilerplate as tfbp + +## +# Slot filling with BERT +# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019), +# https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project. +# +# Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of +# the paper with the original dataset. +## + +BERT_MODEL_BY_LANGUAGE = { + 'en': "bert-base-cased", + 'fr': "dbmdz/bert-base-french-europeana-cased", +} + + +@tfbp.default_export +class SlotFiller(tfbp.Model): + default_hparams = { + "language": None, + "num_epochs": 2, + "dropout_prob": 0.1, + "slot_num_labels": 40 + } + data_loader: JISFDL + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Init data loader + self.data_loader = JISFDL(**kwargs) + + # Load Tokenizer from transformers + # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier. + bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"] + + self.tokenizer = AutoTokenizer.from_pretrained( + bert_model_name, use_fast=False) + self.bert = TFBertModel.from_pretrained(bert_model_name) + + self.dropout = Dropout(self.hparams.dropout_prob) + self.slot_classifier = Dense(self.hparams.slot_num_labels, + name="slot_classifier", activation="softmax") + + + def call(self, inputs, **kwargs): + trained_bert = self.bert(inputs, **kwargs) + sequence_output = trained_bert.last_hidden_state + + # sequence_output will be used for slot_filling + sequence_output = self.dropout(sequence_output, + training=kwargs.get("training", False)) + slot_probas = self.slot_classifier(sequence_output) + + return slot_probas + + @tfbp.runnable + def fit(self): + """Training""" + encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names = self.data_loader( + self.tokenizer) + + if self.hparams.slot_num_labels != len(slot_names): + raise ValueError( + f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}" + ) + + # Hyperparams, Optimizer and Loss function + opt = Adam(learning_rate=3e-5, epsilon=1e-08) + + # two outputs, one for slots, another for intents + # we have to fine tune for both + losses = SparseCategoricalCrossentropy() + + metrics = [SparseCategoricalAccuracy("accuracy")] + + # Compile model + self.compile(optimizer=opt, loss=losses, metrics=metrics) + + x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"], + "attention_mask": encoded_texts["attention_mask"]} + + super().fit( + x, encoded_slots, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True) + + # Persist the model + self.extra_params["slot_names"] = slot_names + + self.save() + + @tfbp.runnable + def evaluate(self): + """Evaluation""" + # Load test data + # Assuming your data loader can return test data when mode='test' is specified + encoded_texts, _, encoded_slots, _, slot_names = self.data_loader( + self.tokenizer, self.extra_params) + + # Get predictions + predictions = self(encoded_texts) + predicted_slot_ids = np.argmax(predictions, axis=-1) # Shape: (batch_size, sequence_length) + + true_labels = encoded_slots.flatten() + pred_labels = predicted_slot_ids.flatten() + + # Filter out padding tokens (assuming padding label id is 0) + mask = true_labels != 0 + filtered_true_labels = true_labels[mask] + filtered_pred_labels = pred_labels[mask] + + # Adjust labels to start from 0 (since padding label 0 is removed) + filtered_true_labels -= 1 + filtered_pred_labels -= 1 + + # Get slot names excluding padding + slot_names_no_pad = self.extra_params["slot_names"][1:] # Exclude padding label + + + report = classification_report( + filtered_true_labels, + filtered_pred_labels, + target_names=slot_names_no_pad, + zero_division=0 + ) + + print(report) + + # Optionally, you can return the report as a string or dictionary + return report + + @tfbp.runnable + def predict(self): + text = self.data_loader.get_prediction_data() + + info = self.get_prediction(text) + + print(self.summary()) + print("Text : " + text) + print(json.dumps(info, indent=2)) + + return json.dumps(info, indent=2) + + def get_slots_prediction(self, text: str, inputs, slot_probas): + slot_probas_np = slot_probas.numpy() + # Get the indices of the maximum values + slot_ids = slot_probas_np.argmax(axis=-1)[0, :] + + # get all slot names and add to out_dict as keys + out_dict = {} + predicted_slots = set([self.extra_params["slot_names"][s] + for s in slot_ids if s != 0]) + for ps in predicted_slots: + out_dict[ps] = [] + + # retrieving the tokenization that was used in the predictions + tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) + + # We'd like to eliminate all special tokens from our output + special_tokens = self.tokenizer.special_tokens_map.values() + + for token, slot_id in zip(tokens, slot_ids): + if token in special_tokens: + continue + # add all to out_dict + slot_name = self.extra_params["slot_names"][slot_id] + + if slot_name == "": + continue + + # collect tokens + collected_tokens = [token] + idx = tokens.index(token) + + # see if it starts with ## + # then it belongs to the previous token + if token.startswith("##"): + # check if the token already exists or not + if tokens[idx - 1] not in out_dict[slot_name]: + collected_tokens.insert(0, tokens[idx - 1]) + + # add collected tokens to slots + out_dict[slot_name].extend(collected_tokens) + + slot_names_to_ids = {value: key for key, value in enumerate( + self.extra_params["slot_names"])} + + entities = [] + # process out_dict + for slot_name in out_dict: + slot_id = slot_names_to_ids[slot_name] + slot_tokens = out_dict[slot_name] + + slot_value = self.tokenizer.convert_tokens_to_string( + slot_tokens).strip() + + entity = { + "entity": slot_name, + "value": slot_value, + "start": text.find(slot_value), + "end": text.find(slot_value) + len(slot_value), + "confidence": 0, + } + + # The confidence of a slot is the average confidence of tokens in that slot. + indices = [tokens.index(token) for token in slot_tokens] + if len(slot_tokens) > 0: + total = functools.reduce( + lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0) + entity["confidence"] = total / len(slot_tokens) + else: + entity["confidence"] = 0 + + entities.append(entity) + + return entities + + + def get_prediction(self, text: str): + inputs = self.data_loader.encode_text(text, self.tokenizer) + slot_probas = self(inputs) # type: ignore + + entities = [] + if slot_probas is not None: + entities = self.get_slots_prediction(text, inputs, slot_probas) + + return { + "text": text, + "entities": entities, + } diff --git a/nlu/utils/json_helper.py b/nlu/utils/json_helper.py index 7292e72..c22a6e0 100644 --- a/nlu/utils/json_helper.py +++ b/nlu/utils/json_helper.py @@ -4,7 +4,7 @@ import json class JsonHelper: data_folder: str - def __init__(self, model:str="jisf"): + def __init__(self, model:str = "intent_classifier"): self.data_folder=os.path.join("data",model) def read_dataset_json_file(self, filename): From dab9d9f7163c42bfc527478595749c7788ebcf89 Mon Sep 17 00:00:00 2001 From: Mohamed Marrouchi Date: Thu, 19 Sep 2024 10:52:18 +0100 Subject: [PATCH 2/7] fix: model lang --- nlu/data_loaders/jisfdl.py | 4 ++-- nlu/models/intent_classifier.py | 2 +- nlu/models/slot_filler.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py index 7c4096e..18f8a89 100644 --- a/nlu/data_loaders/jisfdl.py +++ b/nlu/data_loaders/jisfdl.py @@ -102,10 +102,10 @@ class JISFDL(tfbp.DataLoader): lang = self.hparams.language all_examples = data["common_examples"] - if not lang: + if not bool(lang): examples = all_examples else: - examples = filter(lambda exp: any(not lang or (e['entity'] == 'language' and e['value'] == lang) for e in exp['entities']), all_examples) + examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples) # Parse raw data for exp in examples: diff --git a/nlu/models/intent_classifier.py b/nlu/models/intent_classifier.py index 2d2f27c..5491cb8 100644 --- a/nlu/models/intent_classifier.py +++ b/nlu/models/intent_classifier.py @@ -38,7 +38,7 @@ BERT_MODEL_BY_LANGUAGE = { @tfbp.default_export class IntentClassifier(tfbp.Model): default_hparams = { - "language": None, + "language": "", "num_epochs": 2, "dropout_prob": 0.1, "intent_num_labels": 7, diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py index 81eb54a..0393fb3 100644 --- a/nlu/models/slot_filler.py +++ b/nlu/models/slot_filler.py @@ -38,7 +38,7 @@ BERT_MODEL_BY_LANGUAGE = { @tfbp.default_export class SlotFiller(tfbp.Model): default_hparams = { - "language": None, + "language": "", "num_epochs": 2, "dropout_prob": 0.1, "slot_num_labels": 40 From 6183bf3a46b4e7eb135118507b8ef71bd1727ad6 Mon Sep 17 00:00:00 2001 From: Mohamed Marrouchi Date: Fri, 20 Sep 2024 15:35:02 +0100 Subject: [PATCH 3/7] fix: update env template --- docker/.env.example | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/.env.example b/docker/.env.example index 2104dcc..c63f981 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -43,7 +43,8 @@ AUTH_TOKEN=token123 LANGUAGE_CLASSIFIER=language-classifier INTENT_CLASSIFIERS=en,fr TFLC_REPO_ID=Hexastack/tflc -JISF_REPO_ID=Hexastack/jisf +INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier +SLOT_FILLER_REPO_ID=Hexastack/slot-filler NLP_PORT=5000 # Frontend (Next.js) From 24b8bcf1bad5f5b32e5df7f4b534ce94ed636ec5 Mon Sep 17 00:00:00 2001 From: auraofdivinity Date: Sat, 21 Sep 2024 10:41:04 +0530 Subject: [PATCH 4/7] fix: add request queue to handle concurrent zoom & offset requests --- .../components/visual-editor/v2/Diagrams.tsx | 21 ++++++------ frontend/src/utils/requestQueue.ts | 33 +++++++++++++++++++ 2 files changed, 43 insertions(+), 11 deletions(-) create mode 100644 frontend/src/utils/requestQueue.ts diff --git a/frontend/src/components/visual-editor/v2/Diagrams.tsx b/frontend/src/components/visual-editor/v2/Diagrams.tsx index 37f29fc..8ea2e44 100644 --- a/frontend/src/components/visual-editor/v2/Diagrams.tsx +++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx @@ -51,6 +51,7 @@ import { BlockPorts } from "@/types/visual-editor.types"; import BlockDialog from "../BlockDialog"; import { ZOOM_LEVEL } from "../constants"; import { useVisualEditor } from "../hooks/useVisualEditor"; +import { RequestQueue } from "@/utils/requestQueue"; const Diagrams = () => { const { t } = useTranslation(); @@ -108,25 +109,23 @@ const Diagrams = () => { const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, { invalidate: false, }); + + const requestQueue = useRef(new RequestQueue()); + const enqueueUpdate = (id: string, params: any) => { + requestQueue.current.enqueue(() => updateCategory({ id, params })); + }; + const debouncedZoomEvent = debounce((event) => { if (selectedCategoryId) { engine?.repaintCanvas(); - updateCategory({ - id: selectedCategoryId, - params: { - zoom: event.zoom, - }, - }); + enqueueUpdate(selectedCategoryId, { zoom: event.zoom }); } event.stopPropagation(); }, 200); const debouncedOffsetEvent = debounce((event) => { if (selectedCategoryId) { - updateCategory({ - id: selectedCategoryId, - params: { - offset: [event.offsetX, event.offsetY], - }, + enqueueUpdate(selectedCategoryId, { + offset: [event.offsetX, event.offsetY], }); } event.stopPropagation(); diff --git a/frontend/src/utils/requestQueue.ts b/frontend/src/utils/requestQueue.ts new file mode 100644 index 0000000..742e8d4 --- /dev/null +++ b/frontend/src/utils/requestQueue.ts @@ -0,0 +1,33 @@ +/* + * Copyright © 2024 Hexastack. All rights reserved. + * + * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms: + * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission. + * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file). + * 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited. + */ + +export class RequestQueue { + private queue: Array<() => Promise> = []; + private isProcessing = false; + + enqueue(request: () => Promise) { + this.queue.push(request); + this.processQueue(); + } + + private async processQueue() { + if (this.isProcessing) return; + + this.isProcessing = true; + + while (this.queue.length > 0) { + const request = this.queue.shift(); + if (request) { + await request(); + } + } + + this.isProcessing = false; + } +} From dbf1fb002f3a05c1ac3f3e701fed4371aa27f4eb Mon Sep 17 00:00:00 2001 From: auraofdivinity Date: Sun, 22 Sep 2024 18:33:01 +0530 Subject: [PATCH 5/7] fix: extracting debounced update to a custom hook --- .../components/visual-editor/v2/Diagrams.tsx | 61 ++++++++++++------- frontend/src/hooks/useDebouncedUpdate.tsx | 47 ++++++++++++++ frontend/src/utils/requestQueue.ts | 33 ---------- 3 files changed, 86 insertions(+), 55 deletions(-) create mode 100644 frontend/src/hooks/useDebouncedUpdate.tsx delete mode 100644 frontend/src/utils/requestQueue.ts diff --git a/frontend/src/components/visual-editor/v2/Diagrams.tsx b/frontend/src/components/visual-editor/v2/Diagrams.tsx index 8ea2e44..1b0b514 100644 --- a/frontend/src/components/visual-editor/v2/Diagrams.tsx +++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx @@ -32,7 +32,13 @@ import { DiagramModel, DiagramModelGenerics, } from "@projectstorm/react-diagrams"; -import { SyntheticEvent, useEffect, useRef, useState } from "react"; +import { + SyntheticEvent, + useCallback, + useEffect, + useRef, + useState, +} from "react"; import { useTranslation } from "react-i18next"; import { DeleteDialog } from "@/app-components/dialogs"; @@ -45,13 +51,13 @@ import { getDisplayDialogs, useDialog } from "@/hooks/useDialog"; import { useSearch } from "@/hooks/useSearch"; import { EntityType, Format } from "@/services/types"; import { IBlock } from "@/types/block.types"; -import { ICategory } from "@/types/category.types"; +import { ICategory, ICategoryAttributes } from "@/types/category.types"; import { BlockPorts } from "@/types/visual-editor.types"; import BlockDialog from "../BlockDialog"; import { ZOOM_LEVEL } from "../constants"; import { useVisualEditor } from "../hooks/useVisualEditor"; -import { RequestQueue } from "@/utils/requestQueue"; +import useDebouncedUpdate from "@/hooks/useDebouncedUpdate"; const Diagrams = () => { const { t } = useTranslation(); @@ -110,26 +116,37 @@ const Diagrams = () => { invalidate: false, }); - const requestQueue = useRef(new RequestQueue()); - const enqueueUpdate = (id: string, params: any) => { - requestQueue.current.enqueue(() => updateCategory({ id, params })); - }; + const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300); + const debouncedZoomEvent = useCallback( + (event: any) => { + if (selectedCategoryId) { + engine?.repaintCanvas(); + debouncedUpdateCategory({ + id: selectedCategoryId, + params: { + zoom: event.zoom, + }, + }); + } + event.stopPropagation(); + }, + [selectedCategoryId, debouncedUpdateCategory], + ); + const debouncedOffsetEvent = useCallback( + (event: any) => { + if (selectedCategoryId) { + debouncedUpdateCategory({ + id: selectedCategoryId, + params: { + offset: [event.offsetX, event.offsetY], + }, + }); + } + event.stopPropagation(); + }, + [selectedCategoryId, debouncedUpdateCategory], + ); - const debouncedZoomEvent = debounce((event) => { - if (selectedCategoryId) { - engine?.repaintCanvas(); - enqueueUpdate(selectedCategoryId, { zoom: event.zoom }); - } - event.stopPropagation(); - }, 200); - const debouncedOffsetEvent = debounce((event) => { - if (selectedCategoryId) { - enqueueUpdate(selectedCategoryId, { - offset: [event.offsetX, event.offsetY], - }); - } - event.stopPropagation(); - }, 200); const getBlockFromCache = useGetFromCache(EntityType.BLOCK); const updateCachedBlock = useUpdateCache(EntityType.BLOCK); const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK); diff --git a/frontend/src/hooks/useDebouncedUpdate.tsx b/frontend/src/hooks/useDebouncedUpdate.tsx new file mode 100644 index 0000000..5df92b4 --- /dev/null +++ b/frontend/src/hooks/useDebouncedUpdate.tsx @@ -0,0 +1,47 @@ +import { debounce } from "@mui/material"; +import { useCallback, useEffect, useRef } from "react"; + +type DebouncedUpdateParams = { + id: string; + params: Record; +}; + +function useDebouncedUpdate( + apiUpdate: (params: DebouncedUpdateParams) => void, + delay: number = 300, +) { + const accumulatedUpdates = useRef(null); + + const processUpdates = useRef( + debounce(() => { + if (accumulatedUpdates.current) { + apiUpdate(accumulatedUpdates.current); + accumulatedUpdates.current = null; + } + }, delay), + ).current; + + const handleUpdate = useCallback( + (params: DebouncedUpdateParams) => { + accumulatedUpdates.current = { + id: params.id, + params: { + ...(accumulatedUpdates.current?.params || {}), + ...params.params, + }, + }; + processUpdates(); + }, + [processUpdates], + ); + + useEffect(() => { + return () => { + processUpdates.clear(); + }; + }, [processUpdates]); + + return handleUpdate; +} + +export default useDebouncedUpdate; diff --git a/frontend/src/utils/requestQueue.ts b/frontend/src/utils/requestQueue.ts deleted file mode 100644 index 742e8d4..0000000 --- a/frontend/src/utils/requestQueue.ts +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright © 2024 Hexastack. All rights reserved. - * - * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms: - * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission. - * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file). - * 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited. - */ - -export class RequestQueue { - private queue: Array<() => Promise> = []; - private isProcessing = false; - - enqueue(request: () => Promise) { - this.queue.push(request); - this.processQueue(); - } - - private async processQueue() { - if (this.isProcessing) return; - - this.isProcessing = true; - - while (this.queue.length > 0) { - const request = this.queue.shift(); - if (request) { - await request(); - } - } - - this.isProcessing = false; - } -} From 08e5f6853bf6d361b71516198103e14b5030527f Mon Sep 17 00:00:00 2001 From: auraofdivinity Date: Sun, 22 Sep 2024 19:10:30 +0530 Subject: [PATCH 6/7] fix: fix linting errors --- frontend/src/components/visual-editor/v2/Diagrams.tsx | 9 +++------ frontend/src/hooks/useDebouncedUpdate.tsx | 2 -- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/frontend/src/components/visual-editor/v2/Diagrams.tsx b/frontend/src/components/visual-editor/v2/Diagrams.tsx index 1b0b514..431a408 100644 --- a/frontend/src/components/visual-editor/v2/Diagrams.tsx +++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx @@ -22,7 +22,6 @@ import { Tab, Tabs, Tooltip, - debounce, tabsClasses, } from "@mui/material"; import { @@ -47,17 +46,17 @@ import { useDelete, useDeleteFromCache } from "@/hooks/crud/useDelete"; import { useFind } from "@/hooks/crud/useFind"; import { useGetFromCache } from "@/hooks/crud/useGet"; import { useUpdate, useUpdateCache } from "@/hooks/crud/useUpdate"; +import useDebouncedUpdate from "@/hooks/useDebouncedUpdate"; import { getDisplayDialogs, useDialog } from "@/hooks/useDialog"; import { useSearch } from "@/hooks/useSearch"; import { EntityType, Format } from "@/services/types"; import { IBlock } from "@/types/block.types"; -import { ICategory, ICategoryAttributes } from "@/types/category.types"; +import { ICategory } from "@/types/category.types"; import { BlockPorts } from "@/types/visual-editor.types"; import BlockDialog from "../BlockDialog"; import { ZOOM_LEVEL } from "../constants"; import { useVisualEditor } from "../hooks/useVisualEditor"; -import useDebouncedUpdate from "@/hooks/useDebouncedUpdate"; const Diagrams = () => { const { t } = useTranslation(); @@ -115,7 +114,6 @@ const Diagrams = () => { const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, { invalidate: false, }); - const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300); const debouncedZoomEvent = useCallback( (event: any) => { @@ -130,7 +128,7 @@ const Diagrams = () => { } event.stopPropagation(); }, - [selectedCategoryId, debouncedUpdateCategory], + [selectedCategoryId, engine, debouncedUpdateCategory], ); const debouncedOffsetEvent = useCallback( (event: any) => { @@ -146,7 +144,6 @@ const Diagrams = () => { }, [selectedCategoryId, debouncedUpdateCategory], ); - const getBlockFromCache = useGetFromCache(EntityType.BLOCK); const updateCachedBlock = useUpdateCache(EntityType.BLOCK); const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK); diff --git a/frontend/src/hooks/useDebouncedUpdate.tsx b/frontend/src/hooks/useDebouncedUpdate.tsx index 5df92b4..eecb38f 100644 --- a/frontend/src/hooks/useDebouncedUpdate.tsx +++ b/frontend/src/hooks/useDebouncedUpdate.tsx @@ -11,7 +11,6 @@ function useDebouncedUpdate( delay: number = 300, ) { const accumulatedUpdates = useRef(null); - const processUpdates = useRef( debounce(() => { if (accumulatedUpdates.current) { @@ -20,7 +19,6 @@ function useDebouncedUpdate( } }, delay), ).current; - const handleUpdate = useCallback( (params: DebouncedUpdateParams) => { accumulatedUpdates.current = { From 95fd2cbe3ac34a7143f17ea0331eec1b93b8b29d Mon Sep 17 00:00:00 2001 From: auraofdivinity Date: Mon, 23 Sep 2024 14:30:52 +0530 Subject: [PATCH 7/7] fix: adding license details --- frontend/src/hooks/useDebouncedUpdate.tsx | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/frontend/src/hooks/useDebouncedUpdate.tsx b/frontend/src/hooks/useDebouncedUpdate.tsx index eecb38f..cb36ed9 100644 --- a/frontend/src/hooks/useDebouncedUpdate.tsx +++ b/frontend/src/hooks/useDebouncedUpdate.tsx @@ -1,3 +1,12 @@ +/* + * Copyright © 2024 Hexastack. All rights reserved. + * + * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms: + * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission. + * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file). + * 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited. + */ + import { debounce } from "@mui/material"; import { useCallback, useEffect, useRef } from "react";