diff --git a/docker/.env.example b/docker/.env.example index 8e4f837..a78c294 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -45,7 +45,8 @@ AUTH_TOKEN=token123 LANGUAGE_CLASSIFIER=language-classifier INTENT_CLASSIFIERS=en,fr TFLC_REPO_ID=Hexastack/tflc -JISF_REPO_ID=Hexastack/jisf +INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier +SLOT_FILLER_REPO_ID=Hexastack/slot-filler NLP_PORT=5000 # Frontend (Next.js) diff --git a/frontend/src/components/visual-editor/v2/Diagrams.tsx b/frontend/src/components/visual-editor/v2/Diagrams.tsx index 37f29fc..431a408 100644 --- a/frontend/src/components/visual-editor/v2/Diagrams.tsx +++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx @@ -22,7 +22,6 @@ import { Tab, Tabs, Tooltip, - debounce, tabsClasses, } from "@mui/material"; import { @@ -32,7 +31,13 @@ import { DiagramModel, DiagramModelGenerics, } from "@projectstorm/react-diagrams"; -import { SyntheticEvent, useEffect, useRef, useState } from "react"; +import { + SyntheticEvent, + useCallback, + useEffect, + useRef, + useState, +} from "react"; import { useTranslation } from "react-i18next"; import { DeleteDialog } from "@/app-components/dialogs"; @@ -41,6 +46,7 @@ import { useDelete, useDeleteFromCache } from "@/hooks/crud/useDelete"; import { useFind } from "@/hooks/crud/useFind"; import { useGetFromCache } from "@/hooks/crud/useGet"; import { useUpdate, useUpdateCache } from "@/hooks/crud/useUpdate"; +import useDebouncedUpdate from "@/hooks/useDebouncedUpdate"; import { getDisplayDialogs, useDialog } from "@/hooks/useDialog"; import { useSearch } from "@/hooks/useSearch"; import { EntityType, Format } from "@/services/types"; @@ -108,29 +114,36 @@ const Diagrams = () => { const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, { invalidate: false, }); - const debouncedZoomEvent = debounce((event) => { - if (selectedCategoryId) { - engine?.repaintCanvas(); - updateCategory({ - id: selectedCategoryId, - params: { - zoom: event.zoom, - }, - 
}); - } - event.stopPropagation(); - }, 200); - const debouncedOffsetEvent = debounce((event) => { - if (selectedCategoryId) { - updateCategory({ - id: selectedCategoryId, - params: { - offset: [event.offsetX, event.offsetY], - }, - }); - } - event.stopPropagation(); - }, 200); + const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300); + const debouncedZoomEvent = useCallback( + (event: any) => { + if (selectedCategoryId) { + engine?.repaintCanvas(); + debouncedUpdateCategory({ + id: selectedCategoryId, + params: { + zoom: event.zoom, + }, + }); + } + event.stopPropagation(); + }, + [selectedCategoryId, engine, debouncedUpdateCategory], + ); + const debouncedOffsetEvent = useCallback( + (event: any) => { + if (selectedCategoryId) { + debouncedUpdateCategory({ + id: selectedCategoryId, + params: { + offset: [event.offsetX, event.offsetY], + }, + }); + } + event.stopPropagation(); + }, + [selectedCategoryId, debouncedUpdateCategory], + ); const getBlockFromCache = useGetFromCache(EntityType.BLOCK); const updateCachedBlock = useUpdateCache(EntityType.BLOCK); const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK); diff --git a/frontend/src/hooks/useDebouncedUpdate.tsx b/frontend/src/hooks/useDebouncedUpdate.tsx new file mode 100644 index 0000000..cb36ed9 --- /dev/null +++ b/frontend/src/hooks/useDebouncedUpdate.tsx @@ -0,0 +1,54 @@ +/* + * Copyright © 2024 Hexastack. All rights reserved. + * + * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms: + * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission. + * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file). + * 3. 
SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited. + */ + +import { debounce } from "@mui/material"; +import { useCallback, useEffect, useRef } from "react"; + +type DebouncedUpdateParams = { + id: string; + params: Record<string, any>; +}; + +function useDebouncedUpdate( + apiUpdate: (params: DebouncedUpdateParams) => void, + delay: number = 300, +) { + const accumulatedUpdates = useRef<DebouncedUpdateParams | null>(null); + const processUpdates = useRef( + debounce(() => { + if (accumulatedUpdates.current) { + apiUpdate(accumulatedUpdates.current); + accumulatedUpdates.current = null; + } + }, delay), + ).current; + const handleUpdate = useCallback( + (params: DebouncedUpdateParams) => { + accumulatedUpdates.current = { + id: params.id, + params: { + ...(accumulatedUpdates.current?.params || {}), + ...params.params, + }, + }; + processUpdates(); + }, + [processUpdates], + ); + + useEffect(() => { + return () => { + processUpdates.clear(); + }; + }, [processUpdates]); + + return handleUpdate; +} + +export default useDebouncedUpdate; diff --git a/nlu/.env.dev b/nlu/.env.dev index d5023df..6fac306 100644 --- a/nlu/.env.dev +++ b/nlu/.env.dev @@ -2,4 +2,5 @@ AUTH_TOKEN=123 LANGUAGE_CLASSIFIER=language-classifier INTENT_CLASSIFIERS=ar,fr,tn TFLC_REPO_ID=Hexastack/tflc -JISF_REPO_ID=Hexastack/jisf +INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier +SLOT_FILLER_REPO_ID=Hexastack/slot-filler diff --git a/nlu/.env.example b/nlu/.env.example index 52370c7..a863e43 100644 --- a/nlu/.env.example +++ b/nlu/.env.example @@ -1,5 +1,5 @@ AUTH_TOKEN= LANGUAGE_CLASSIFIER= INTENT_CLASSIFIERS= -TFLC_REPO_ID= -JISF_REPO_ID= \ No newline at end of file +INTENT_CLASSIFIER_REPO_ID= +SLOT_FILLER_REPO_ID= \ No newline at end of file diff --git a/nlu/README.md b/nlu/README.md index 
ff5ba60..dd8a00b 100644 --- a/nlu/README.md +++ b/nlu/README.md @@ -40,7 +40,7 @@ pip install -r requirements.txt You should run `source env.sh` on each new shell session. This activates the virtualenv and creates a nice alias for `run.py`: ```bash $ cat env.sh -source env/bin/activate +source venv/bin/activate alias run='python run.py' ``` @@ -53,7 +53,7 @@ run fit myexperiment1 mlp mnist --batch_size=32 --learning_rate=0.1 Examples : ```bash # Intent classification -run fit intent-classifier-en-30072024 jisf --intent_num_labels=88 --slot_num_labels=17 --language=en +run fit intent-classifier-en-30072024 intent_classifier --intent_num_labels=88 --slot_num_labels=17 --language=en run predict intent-classifier-fr-30072024 --intent_num_labels=7 --slot_num_labels=2 --language=fr # Language classification diff --git a/nlu/data_loaders/jisfdl.py b/nlu/data_loaders/jisfdl.py index 75ae949..18f8a89 100644 --- a/nlu/data_loaders/jisfdl.py +++ b/nlu/data_loaders/jisfdl.py @@ -4,8 +4,8 @@ import json import numpy as np from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer + import boilerplate as tfbp -from utils.jisf_data_mapper import JisfDataMapper from utils.json_helper import JsonHelper @@ -101,8 +101,11 @@ class JISFDL(tfbp.DataLoader): # Filter examples by language lang = self.hparams.language all_examples = data["common_examples"] - examples = filter(lambda exp: any( - e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples) + + if not bool(lang): + examples = all_examples + else: + examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples) # Parse raw data for exp in examples: @@ -145,7 +148,6 @@ class JISFDL(tfbp.DataLoader): # the classifier. 
texts = [d.text for d in dataset] encoded_texts = self.encode_texts(texts, tokenizer) - # Map intents, load from the model (evaluate), recompute from dataset otherwise (train) intents = [d.intent for d in dataset] if not model_params: @@ -161,19 +163,35 @@ class JISFDL(tfbp.DataLoader): # To handle those we need to add <PAD> to slots_names. It can be some other symbol as well. slot_names.insert(0, "<PAD>") else: - intent_names = model_params.intent_names - slot_names = model_params.slot_names + if "intent_names" in model_params: + intent_names = model_params["intent_names"] + else: + intent_names = None + + if "slot_names" in model_params: + slot_names = model_params["slot_names"] + else: + slot_names = None - intent_map = dict() # Dict : intent -> index - for idx, ui in enumerate(intent_names): - intent_map[ui] = idx + if intent_names: + intent_map = dict() # Dict : intent -> index + for idx, ui in enumerate(intent_names): + intent_map[ui] = idx + else: + intent_map = None # Encode intents - encoded_intents = self.encode_intents(intents, intent_map) + if intent_map: + encoded_intents = self.encode_intents(intents, intent_map) + else: + encoded_intents = None - slot_map: Dict[str, int] = dict() # slot -> index - for idx, us in enumerate(slot_names): - slot_map[us] = idx + if slot_names: + slot_map: Dict[str, int] = dict() # slot -> index + for idx, us in enumerate(slot_names): + slot_map[us] = idx + else: + slot_map = None # Encode slots # Text : Add a tune to my elrow Guest List @@ -183,8 +201,12 @@ class JISFDL(tfbp.DataLoader): max_len = len(encoded_texts["input_ids"][0]) # type: ignore all_slots = [td.slots for td in dataset] all_texts = [td.text for td in dataset] - encoded_slots = self.encode_slots(tokenizer, + + if slot_map: + encoded_slots = self.encode_slots(tokenizer, all_slots, all_texts, slot_map, max_len) + else: + encoded_slots = None return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names diff --git a/nlu/data_loaders/tflcdl.py 
b/nlu/data_loaders/tflcdl.py index 09a23c0..b765f78 100644 --- a/nlu/data_loaders/tflcdl.py +++ b/nlu/data_loaders/tflcdl.py @@ -29,7 +29,7 @@ class TFLCDL(tfbp.DataLoader): self.json_helper = JsonHelper("tflc") self._save_dir = save_dir - print(hparams) + # We will opt for a TF-IDF representation of the data as the frequency of word # roots should give us a good idea about which language we're dealing with. if method == "fit": diff --git a/nlu/main.py b/nlu/main.py index b85ce11..f7e4f8b 100644 --- a/nlu/main.py +++ b/nlu/main.py @@ -15,8 +15,8 @@ AUTH_TOKEN = os.getenv("AUTH_TOKEN", "TOKEN_MUST_BE_DEFINED") AVAILABLE_LANGUAGES = os.getenv("AVAILABLE_LANGUAGES", "en,fr").split(',') TFLC_REPO_ID = os.getenv("TFLC_REPO_ID") -JISF_REPO_ID = os.getenv("JISF_REPO_ID") - +INTENT_CLASSIFIER_REPO_ID = os.getenv("INTENT_CLASSIFIER_REPO_ID") +SLOT_FILLER_REPO_ID = os.getenv("SLOT_FILLER_REPO_ID") def load_language_classifier(): # Init language classifier model @@ -27,21 +27,31 @@ def load_language_classifier(): logging.info(f'Successfully loaded the language classifier model') return model - def load_intent_classifiers(): - Model = tfbp.get_model("jisf") - models = {} + Model = tfbp.get_model("intent_classifier") + intent_classifiers = {} for language in AVAILABLE_LANGUAGES: kwargs = {} - models[language] = Model(save_dir=language, method="predict", repo_id=JISF_REPO_ID, **kwargs) - models[language].load_model() + intent_classifiers[language] = Model(save_dir=language, method="predict", repo_id=INTENT_CLASSIFIER_REPO_ID, **kwargs) + intent_classifiers[language].load_model() logging.info(f'Successfully loaded the intent classifier {language} model') - return models + return intent_classifiers + +def load_slot_classifiers(): + Model = tfbp.get_model("slot_filler") + slot_fillers = {} + for language in AVAILABLE_LANGUAGES: + kwargs = {} + slot_fillers[language] = Model(save_dir=language, method="predict", repo_id=SLOT_FILLER_REPO_ID, **kwargs) + 
slot_fillers[language].load_model() + logging.info(f'Successfully loaded the slot filler {language} model') + return slot_fillers def load_models(): app.language_classifier = load_language_classifier() # type: ignore app.intent_classifiers = load_intent_classifiers() # type: ignore + app.slot_fillers = load_slot_classifiers() # type: ignore app = FastAPI() @@ -74,13 +84,20 @@ async def check_health(): @app.post("/parse") def parse(input: ParseInput, is_authenticated: Annotated[str, Depends(authenticate)]): - if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers'): + if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers') or not hasattr(app, 'slot_fillers'): headers = {"Retry-After": "120"} # Suggest retrying after 2 minutes - return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are loading, please retry later."}, headers=headers) + return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are still loading, please retry later."}, headers=headers) language = app.language_classifier.get_prediction(input.q) # type: ignore lang = language.get("value") - prediction = app.intent_classifiers[lang].get_prediction( + intent_prediction = app.intent_classifiers[lang].get_prediction( input.q) # type: ignore - prediction.get("entities").append(language) - return prediction + slot_prediction = app.slot_fillers[lang].get_prediction( + input.q) # type: ignore + slot_prediction.get("entities").append(language) + + return { + "text": input.q, + "intent": intent_prediction.get("intent"), + "entities": slot_prediction.get("entities"), + } diff --git a/nlu/models/jisf.py b/nlu/models/intent_classifier.py similarity index 59% rename from nlu/models/jisf.py rename to nlu/models/intent_classifier.py index 71c14ef..5491cb8 100644 --- a/nlu/models/jisf.py +++ b/nlu/models/intent_classifier.py @@ -1,4 +1,3 @@ -import functools import json import math 
from typing import Tuple, Dict, List @@ -22,8 +21,8 @@ from data_loaders.jisfdl import JISFDL import boilerplate as tfbp ## -# JISF : Joint Intent Classification and Slot filling with BERT -# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019), +# Intent Classification with BERT +# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019), # https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project. # # Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of @@ -33,19 +32,16 @@ import boilerplate as tfbp BERT_MODEL_BY_LANGUAGE = { 'en': "bert-base-cased", 'fr': "dbmdz/bert-base-french-europeana-cased", - 'ar': 'asafaya/bert-base-arabic', - 'tn': 'dbmdz/bert-base-french-europeana-cased' } @tfbp.default_export -class JISF(tfbp.Model): +class IntentClassifier(tfbp.Model): default_hparams = { - "language": "fr", + "language": "", "num_epochs": 2, "dropout_prob": 0.1, "intent_num_labels": 7, - "slot_num_labels": 40 } data_loader: JISFDL @@ -57,8 +53,8 @@ class JISF(tfbp.Model): # Load Tokenizer from transformers # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier. 
- bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language] - # bert_model_name = typing.cast(str, self.hparams.bert_model_name) + bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"] + self.tokenizer = AutoTokenizer.from_pretrained( bert_model_name, use_fast=False) self.bert = TFBertModel.from_pretrained(bert_model_name) @@ -66,27 +62,18 @@ class JISF(tfbp.Model): self.dropout = Dropout(self.hparams.dropout_prob) self.intent_classifier = Dense(self.hparams.intent_num_labels, name="intent_classifier", activation="softmax") - self.slot_classifier = Dense(self.hparams.slot_num_labels, - name="slot_classifier", activation="softmax") def call(self, inputs, **kwargs): - # two outputs from BERT trained_bert = self.bert(inputs, **kwargs) pooled_output = trained_bert.pooler_output - sequence_output = trained_bert.last_hidden_state - - # sequence_output will be used for slot_filling / classification - sequence_output = self.dropout(sequence_output, - training=kwargs.get("training", False)) - slot_probas = self.slot_classifier(sequence_output) - + # pooled_output for intent classification pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) intent_probas = self.intent_classifier(pooled_output) - return slot_probas, intent_probas + return intent_probas def load_data(self, data_loader) -> Tuple[BatchEncoding, tf.Tensor, ndarray, int, int]: return data_loader(self.tokenizer) @@ -137,18 +124,11 @@ class JISF(tfbp.Model): raise ValueError( f"Hyperparam intent_num_labels mismatch, should be : {len(intent_names)}" ) - if self.hparams.slot_num_labels != len(slot_names): - raise ValueError( - f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}" - ) # Hyperparams, Optimizer and Loss function opt = Adam(learning_rate=3e-5, epsilon=1e-08) - # two outputs, one for slots, another for intents - # we have to fine tune for both - losses = [SparseCategoricalCrossentropy(), - SparseCategoricalCrossentropy()] + losses = 
SparseCategoricalCrossentropy() metrics = [SparseCategoricalAccuracy("accuracy")] @@ -159,11 +139,10 @@ class JISF(tfbp.Model): "attention_mask": encoded_texts["attention_mask"]} super().fit( - x, (encoded_slots, encoded_intents), epochs=self.hparams.num_epochs, batch_size=32, shuffle=True) + x, encoded_intents, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True) # Persist the model self.extra_params["intent_names"] = intent_names - self.extra_params["slot_names"] = slot_names self.save() @@ -175,8 +154,8 @@ class JISF(tfbp.Model): metrics = [SparseCategoricalAccuracy("accuracy")] self.compile(metrics=metrics) - _, intent_probas = self(encoded_texts) # type: ignore - + intent_probas = self(encoded_texts) # type: ignore + scores = self.get_metrics_by_intent(intent_probas, encoded_intents) overall_score = {} @@ -204,85 +183,10 @@ class JISF(tfbp.Model): print(json.dumps(info, indent=2)) return json.dumps(info, indent=2) - - def get_slots_prediction(self, text: str, inputs, slot_probas): - slot_probas_np = slot_probas.numpy() - # Get the indices of the maximum values - slot_ids = slot_probas_np.argmax(axis=-1)[0, :] - - # get all slot names and add to out_dict as keys - out_dict = {} - predicted_slots = set([self.extra_params["slot_names"][s] - for s in slot_ids if s != 0]) - for ps in predicted_slots: - out_dict[ps] = [] - - # retrieving the tokenization that was used in the predictions - tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) - - # We'd like to eliminate all special tokens from our output - special_tokens = self.tokenizer.special_tokens_map.values() - - for token, slot_id in zip(tokens, slot_ids): - if token in special_tokens: - continue - # add all to out_dict - slot_name = self.extra_params["slot_names"][slot_id] - - if slot_name == "<PAD>": - continue - - # collect tokens - collected_tokens = [token] - idx = tokens.index(token) - - # see if it starts with ## - # then it belongs to the previous token - if token.startswith("##"): 
- # check if the token already exists or not - if tokens[idx - 1] not in out_dict[slot_name]: - collected_tokens.insert(0, tokens[idx - 1]) - - # add collected tokens to slots - out_dict[slot_name].extend(collected_tokens) - - slot_names_to_ids = {value: key for key, value in enumerate( - self.extra_params["slot_names"])} - - entities = [] - # process out_dict - for slot_name in out_dict: - slot_id = slot_names_to_ids[slot_name] - slot_tokens = out_dict[slot_name] - - slot_value = self.tokenizer.convert_tokens_to_string( - slot_tokens).strip() - - entity = { - "entity": slot_name, - "value": slot_value, - "start": text.find(slot_value), - "end": text.find(slot_value) + len(slot_value), - "confidence": 0, - } - - # The confidence of a slot is the average confidence of tokens in that slot. - indices = [tokens.index(token) for token in slot_tokens] - if len(slot_tokens) > 0: - total = functools.reduce( - lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0) - entity["confidence"] = total / len(slot_tokens) - else: - entity["confidence"] = 0 - - entities.append(entity) - - return entities - def get_prediction(self, text: str): inputs = self.data_loader.encode_text(text, self.tokenizer) - slot_probas, intent_probas = self(inputs) # type: ignore + intent_probas = self(inputs) # type: ignore intent_probas_np = intent_probas.numpy() @@ -292,15 +196,8 @@ class JISF(tfbp.Model): # get the confidences for each intent intent_confidences = intent_probas_np[0] - - entities = [] - if slot_probas is not None: - entities = self.get_slots_prediction(text, inputs, slot_probas) - return { "text": text, "intent": {"name": self.extra_params["intent_names"][intent_id], "confidence": float(intent_confidences[intent_id])}, - "entities": entities, } - diff --git a/nlu/models/slot_filler.py b/nlu/models/slot_filler.py new file mode 100644 index 0000000..0393fb3 --- /dev/null +++ b/nlu/models/slot_filler.py @@ -0,0 +1,250 @@ +import functools +import json +from 
transformers import TFBertModel, AutoTokenizer +from keras.layers import Dropout, Dense +from sys import platform + +if platform == "darwin": + from keras.optimizers.legacy import Adam +else: + from keras.optimizers import Adam + +from keras.losses import SparseCategoricalCrossentropy +from keras.metrics import SparseCategoricalAccuracy +import numpy as np + +from data_loaders.jisfdl import JISFDL + +from sklearn.metrics import classification_report + + +import boilerplate as tfbp + +## +# Slot filling with BERT +# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019), +# https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project. +# +# Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of +# the paper with the original dataset. +## + +BERT_MODEL_BY_LANGUAGE = { + 'en': "bert-base-cased", + 'fr': "dbmdz/bert-base-french-europeana-cased", +} + + +@tfbp.default_export +class SlotFiller(tfbp.Model): + default_hparams = { + "language": "", + "num_epochs": 2, + "dropout_prob": 0.1, + "slot_num_labels": 40 + } + data_loader: JISFDL + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Init data loader + self.data_loader = JISFDL(**kwargs) + + # Load Tokenizer from transformers + # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier. 
+ bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"] + + self.tokenizer = AutoTokenizer.from_pretrained( + bert_model_name, use_fast=False) + self.bert = TFBertModel.from_pretrained(bert_model_name) + + self.dropout = Dropout(self.hparams.dropout_prob) + self.slot_classifier = Dense(self.hparams.slot_num_labels, + name="slot_classifier", activation="softmax") + + + def call(self, inputs, **kwargs): + trained_bert = self.bert(inputs, **kwargs) + sequence_output = trained_bert.last_hidden_state + + # sequence_output will be used for slot_filling + sequence_output = self.dropout(sequence_output, + training=kwargs.get("training", False)) + slot_probas = self.slot_classifier(sequence_output) + + return slot_probas + + @tfbp.runnable + def fit(self): + """Training""" + encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names = self.data_loader( + self.tokenizer) + + if self.hparams.slot_num_labels != len(slot_names): + raise ValueError( + f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}" + ) + + # Hyperparams, Optimizer and Loss function + opt = Adam(learning_rate=3e-5, epsilon=1e-08) + + # two outputs, one for slots, another for intents + # we have to fine tune for both + losses = SparseCategoricalCrossentropy() + + metrics = [SparseCategoricalAccuracy("accuracy")] + + # Compile model + self.compile(optimizer=opt, loss=losses, metrics=metrics) + + x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"], + "attention_mask": encoded_texts["attention_mask"]} + + super().fit( + x, encoded_slots, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True) + + # Persist the model + self.extra_params["slot_names"] = slot_names + + self.save() + + @tfbp.runnable + def evaluate(self): + """Evaluation""" + # Load test data + # Assuming your data loader can return test data when mode='test' is specified + encoded_texts, _, encoded_slots, _, slot_names = self.data_loader( + 
self.tokenizer, self.extra_params) + + # Get predictions + predictions = self(encoded_texts) + predicted_slot_ids = np.argmax(predictions, axis=-1) # Shape: (batch_size, sequence_length) + + true_labels = encoded_slots.flatten() + pred_labels = predicted_slot_ids.flatten() + + # Filter out padding tokens (assuming padding label id is 0) + mask = true_labels != 0 + filtered_true_labels = true_labels[mask] + filtered_pred_labels = pred_labels[mask] + + # Adjust labels to start from 0 (since padding label 0 is removed) + filtered_true_labels -= 1 + filtered_pred_labels -= 1 + + # Get slot names excluding padding + slot_names_no_pad = self.extra_params["slot_names"][1:] # Exclude padding label + + + report = classification_report( + filtered_true_labels, + filtered_pred_labels, + target_names=slot_names_no_pad, + zero_division=0 + ) + + print(report) + + # Optionally, you can return the report as a string or dictionary + return report + + @tfbp.runnable + def predict(self): + text = self.data_loader.get_prediction_data() + + info = self.get_prediction(text) + + print(self.summary()) + print("Text : " + text) + print(json.dumps(info, indent=2)) + + return json.dumps(info, indent=2) + + def get_slots_prediction(self, text: str, inputs, slot_probas): + slot_probas_np = slot_probas.numpy() + # Get the indices of the maximum values + slot_ids = slot_probas_np.argmax(axis=-1)[0, :] + + # get all slot names and add to out_dict as keys + out_dict = {} + predicted_slots = set([self.extra_params["slot_names"][s] + for s in slot_ids if s != 0]) + for ps in predicted_slots: + out_dict[ps] = [] + + # retrieving the tokenization that was used in the predictions + tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) + + # We'd like to eliminate all special tokens from our output + special_tokens = self.tokenizer.special_tokens_map.values() + + for token, slot_id in zip(tokens, slot_ids): + if token in special_tokens: + continue + # add all to out_dict + slot_name = 
self.extra_params["slot_names"][slot_id] + + if slot_name == "<PAD>": + continue + + # collect tokens + collected_tokens = [token] + idx = tokens.index(token) + + # see if it starts with ## + # then it belongs to the previous token + if token.startswith("##"): + # check if the token already exists or not + if tokens[idx - 1] not in out_dict[slot_name]: + collected_tokens.insert(0, tokens[idx - 1]) + + # add collected tokens to slots + out_dict[slot_name].extend(collected_tokens) + + slot_names_to_ids = {value: key for key, value in enumerate( + self.extra_params["slot_names"])} + + entities = [] + # process out_dict + for slot_name in out_dict: + slot_id = slot_names_to_ids[slot_name] + slot_tokens = out_dict[slot_name] + + slot_value = self.tokenizer.convert_tokens_to_string( + slot_tokens).strip() + + entity = { + "entity": slot_name, + "value": slot_value, + "start": text.find(slot_value), + "end": text.find(slot_value) + len(slot_value), + "confidence": 0, + } + + # The confidence of a slot is the average confidence of tokens in that slot. 
+ indices = [tokens.index(token) for token in slot_tokens] + if len(slot_tokens) > 0: + total = functools.reduce( + lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0) + entity["confidence"] = total / len(slot_tokens) + else: + entity["confidence"] = 0 + + entities.append(entity) + + return entities + + + def get_prediction(self, text: str): + inputs = self.data_loader.encode_text(text, self.tokenizer) + slot_probas = self(inputs) # type: ignore + + entities = [] + if slot_probas is not None: + entities = self.get_slots_prediction(text, inputs, slot_probas) + + return { + "text": text, + "entities": entities, + } diff --git a/nlu/utils/json_helper.py b/nlu/utils/json_helper.py index 7292e72..c22a6e0 100644 --- a/nlu/utils/json_helper.py +++ b/nlu/utils/json_helper.py @@ -4,7 +4,7 @@ import json class JsonHelper: data_folder: str - def __init__(self, model:str="jisf"): + def __init__(self, model:str = "intent_classifier"): self.data_folder=os.path.join("data",model) def read_dataset_json_file(self, filename):