Merge branch 'main' into 40-issue-prevent-users-from-deleting-their-own-roles

2025-02-18 18:38:42 +00:00 · 2024-09-23 11:16:20 +01:00 · 2024-09-23 11:16:20 +01:00 · a999604472
commit a999604472
parent 5936816a04 37f20f7d79
12 changed files with 431 additions and 176 deletions
--- a/docker/.env.example
+++ b/docker/.env.example
@ -45,7 +45,8 @@ AUTH_TOKEN=token123
 LANGUAGE_CLASSIFIER=language-classifier
 INTENT_CLASSIFIERS=en,fr
 TFLC_REPO_ID=Hexastack/tflc
-JISF_REPO_ID=Hexastack/jisf
+INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
 SLOT_FILLER_REPO_ID=Hexastack/slot-filler
 NLP_PORT=5000
 # Frontend (Next.js)
--- a/frontend/src/components/visual-editor/v2/Diagrams.tsx
+++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx
@ -22,7 +22,6 @@ import {
  Tab,
  Tabs,
  Tooltip,
  debounce,
  tabsClasses,
 } from "@mui/material";
 import {
@ -32,7 +31,13 @@ import {
  DiagramModel,
  DiagramModelGenerics,
 } from "@projectstorm/react-diagrams";
-import { SyntheticEvent, useEffect, useRef, useState } from "react";
+import {
  SyntheticEvent,
  useCallback,
  useEffect,
  useRef,
  useState,
 } from "react";
 import { useTranslation } from "react-i18next";
 import { DeleteDialog } from "@/app-components/dialogs";
@ -41,6 +46,7 @@ import { useDelete, useDeleteFromCache } from "@/hooks/crud/useDelete";
 import { useFind } from "@/hooks/crud/useFind";
 import { useGetFromCache } from "@/hooks/crud/useGet";
 import { useUpdate, useUpdateCache } from "@/hooks/crud/useUpdate";
 import useDebouncedUpdate from "@/hooks/useDebouncedUpdate";
 import { getDisplayDialogs, useDialog } from "@/hooks/useDialog";
 import { useSearch } from "@/hooks/useSearch";
 import { EntityType, Format } from "@/services/types";
@ -108,10 +114,12 @@ const Diagrams = () => {
  const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, {
    invalidate: false,
  });
-  const debouncedZoomEvent = debounce((event) => {
+  const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300);
  const debouncedZoomEvent = useCallback(
    (event: any) => {
      if (selectedCategoryId) {
        engine?.repaintCanvas();
-      updateCategory({
+        debouncedUpdateCategory({
          id: selectedCategoryId,
          params: {
            zoom: event.zoom,
@ -119,10 +127,13 @@ const Diagrams = () => {
        });
      }
      event.stopPropagation();
-  }, 200);
+    },
-  const debouncedOffsetEvent = debounce((event) => {
+    [selectedCategoryId, engine, debouncedUpdateCategory],
  );
  const debouncedOffsetEvent = useCallback(
    (event: any) => {
      if (selectedCategoryId) {
-      updateCategory({
+        debouncedUpdateCategory({
          id: selectedCategoryId,
          params: {
            offset: [event.offsetX, event.offsetY],
@ -130,7 +141,9 @@ const Diagrams = () => {
        });
      }
      event.stopPropagation();
-  }, 200);
+    },
    [selectedCategoryId, debouncedUpdateCategory],
  );
  const getBlockFromCache = useGetFromCache(EntityType.BLOCK);
  const updateCachedBlock = useUpdateCache(EntityType.BLOCK);
  const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK);
--- a/frontend/src/hooks/useDebouncedUpdate.tsx
+++ b/frontend/src/hooks/useDebouncedUpdate.tsx
@ -0,0 +1,54 @@
 /*
 * Copyright © 2024 Hexastack. All rights reserved.
 *
 * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms:
 * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission.
 * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file).
 * 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited.
 */
 import { debounce } from "@mui/material";
 import { useCallback, useEffect, useRef } from "react";
 type DebouncedUpdateParams = {
  id: string;
  params: Record<string, any>;
 };
 /**
  * Returns a function that batches partial updates and sends them through
  * `apiUpdate` once no new update has arrived for `delay` milliseconds.
  *
  * Updates targeting the same `id` are shallow-merged (later keys win), so a
  * burst of e.g. zoom and offset changes results in a single API call.
  * When an update for a different `id` arrives while another one is still
  * pending, the pending one is sent immediately instead of being merged into
  * the new entity's params.
  *
  * @param apiUpdate - Callback performing the actual update. The latest
  *   callback passed to the hook is always the one invoked.
  * @param delay - Debounce delay in milliseconds. Only the value from the first
  *   render is used, because the debounced function is created once.
  * @returns A stable `handleUpdate(params)` function.
  */
 function useDebouncedUpdate(
  apiUpdate: (params: DebouncedUpdateParams) => void,
  delay: number = 300,
 ) {
  const accumulatedUpdates = useRef<DebouncedUpdateParams | null>(null);
  // Keep a handle on the most recent callback: the debounced closure below is
  // created only once and would otherwise keep calling the first-render one.
  const latestApiUpdate = useRef(apiUpdate);

  useEffect(() => {
    latestApiUpdate.current = apiUpdate;
  }, [apiUpdate]);

  const processUpdates = useRef(
    debounce(() => {
      const pending = accumulatedUpdates.current;

      if (pending) {
        // Reset before calling out so a re-entrant update is not lost.
        accumulatedUpdates.current = null;
        latestApiUpdate.current(pending);
      }
    }, delay),
  ).current;
  const handleUpdate = useCallback(
    (params: DebouncedUpdateParams) => {
      const pending = accumulatedUpdates.current;

      if (pending && pending.id !== params.id) {
        // Never merge params that belong to two different entities: send the
        // previous entity's changes right away and start a fresh batch.
        accumulatedUpdates.current = null;
        latestApiUpdate.current(pending);
      }

      accumulatedUpdates.current = {
        id: params.id,
        params: {
          ...(accumulatedUpdates.current?.params || {}),
          ...params.params,
        },
      };
      processUpdates();
    },
    [processUpdates],
  );

  useEffect(() => {
    return () => {
      // Cancel the timer on unmount; a still-pending batch is not sent.
      processUpdates.clear();
    };
  }, [processUpdates]);

  return handleUpdate;
 }
 export default useDebouncedUpdate;
--- a/nlu/.env.dev
+++ b/nlu/.env.dev
@ -2,4 +2,5 @@ AUTH_TOKEN=123
 LANGUAGE_CLASSIFIER=language-classifier
 INTENT_CLASSIFIERS=ar,fr,tn
 TFLC_REPO_ID=Hexastack/tflc
-JISF_REPO_ID=Hexastack/jisf
+INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
 SLOT_FILLER_REPO_ID=Hexastack/slot-filler
--- a/nlu/.env.example
+++ b/nlu/.env.example
@ -1,5 +1,5 @@
 AUTH_TOKEN=
 LANGUAGE_CLASSIFIER=
 INTENT_CLASSIFIERS=
-TFLC_REPO_ID=
+INTENT_CLASSIFIER_REPO_ID=
-JISF_REPO_ID=
+SLOT_FILLER_REPO_ID=
--- a/nlu/README.md
+++ b/nlu/README.md
@ -40,7 +40,7 @@ pip install -r requirements.txt
 You should run `source env.sh` on each new shell session. This activates the virtualenv and creates a nice alias for `run.py`:
 ```bash
 $ cat env.sh
-source env/bin/activate
+source venv/bin/activate
 alias run='python run.py'
 ```
@ -53,7 +53,7 @@ run fit myexperiment1 mlp mnist --batch_size=32 --learning_rate=0.1
 Examples :
 ```bash
 # Intent classification
-run fit intent-classifier-en-30072024 jisf  --intent_num_labels=88 --slot_num_labels=17 --language=en
+run fit intent-classifier-en-30072024 intent_classifier  --intent_num_labels=88 --slot_num_labels=17 --language=en
 run predict intent-classifier-fr-30072024  --intent_num_labels=7 --slot_num_labels=2 --language=fr
 # Language classification
--- a/nlu/data_loaders/jisfdl.py
+++ b/nlu/data_loaders/jisfdl.py
@ -4,8 +4,8 @@ import json
 import numpy as np
 from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer
 import boilerplate as tfbp
 from utils.jisf_data_mapper import JisfDataMapper
 from utils.json_helper import JsonHelper
@ -101,8 +101,11 @@ class JISFDL(tfbp.DataLoader):
        # Filter examples by language
        lang = self.hparams.language
        all_examples = data["common_examples"]
-        examples = filter(lambda exp: any(
+
-            e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
+        if not bool(lang):
            examples = all_examples
        else:
            examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
        # Parse raw data
        for exp in examples:
@ -145,7 +148,6 @@ class JISFDL(tfbp.DataLoader):
        # the classifier.
        texts = [d.text for d in dataset]
        encoded_texts = self.encode_texts(texts, tokenizer)
        # Map intents, load from the model (evaluate), recompute from dataset otherwise (train)
        intents = [d.intent for d in dataset]
        if not model_params:
@ -161,19 +163,35 @@ class JISFDL(tfbp.DataLoader):
            # To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.
            slot_names.insert(0, "<PAD>")
        else:
-            intent_names = model_params.intent_names
+            if "intent_names" in model_params:
-            slot_names = model_params.slot_names
+                intent_names = model_params["intent_names"]
            else:
                intent_names = None
            if "slot_names" in model_params:
                slot_names = model_params["slot_names"]
            else:
                slot_names = None
        if intent_names:
            intent_map = dict()  # Dict : intent -> index
            for idx, ui in enumerate(intent_names):
                intent_map[ui] = idx
        else:
            intent_map = None
        # Encode intents
        if intent_map:
            encoded_intents = self.encode_intents(intents, intent_map)
        else:
            encoded_intents = None
        if slot_names:
            slot_map: Dict[str, int] = dict()  # slot -> index
            for idx, us in enumerate(slot_names):
                slot_map[us] = idx
        else:
            slot_map = None
        # Encode slots
        # Text : Add a tune to my elrow Guest List
@ -183,8 +201,12 @@ class JISFDL(tfbp.DataLoader):
        max_len = len(encoded_texts["input_ids"][0])  # type: ignore
        all_slots = [td.slots for td in dataset]
        all_texts = [td.text for td in dataset]
        if slot_map:
            encoded_slots = self.encode_slots(tokenizer,
                                          all_slots, all_texts, slot_map, max_len)
        else:
            encoded_slots = None
        return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names
--- a/nlu/data_loaders/tflcdl.py
+++ b/nlu/data_loaders/tflcdl.py
@ -29,7 +29,7 @@ class TFLCDL(tfbp.DataLoader):
        self.json_helper = JsonHelper("tflc")
        self._save_dir = save_dir
-        print(hparams)
+
        # We will opt for a TF-IDF representation of the data as the frequency of word
        # roots should give us a good idea about which language we're dealing with.
        if method == "fit":
--- a/nlu/main.py
+++ b/nlu/main.py
@ -15,8 +15,8 @@ AUTH_TOKEN = os.getenv("AUTH_TOKEN", "TOKEN_MUST_BE_DEFINED")
 AVAILABLE_LANGUAGES = os.getenv("AVAILABLE_LANGUAGES", "en,fr").split(',')
 TFLC_REPO_ID = os.getenv("TFLC_REPO_ID")
-JISF_REPO_ID = os.getenv("JISF_REPO_ID")
+INTENT_CLASSIFIER_REPO_ID = os.getenv("INTENT_CLASSIFIER_REPO_ID")
-
+SLOT_FILLER_REPO_ID = os.getenv("SLOT_FILLER_REPO_ID")
 def load_language_classifier():
    # Init language classifier model
@ -27,21 +27,31 @@ def load_language_classifier():
    logging.info(f'Successfully loaded the language classifier model')
    return model
 def load_intent_classifiers():
-    Model = tfbp.get_model("jisf")
+    Model = tfbp.get_model("intent_classifier")
-    models = {}
+    intent_classifiers = {}
    for language in AVAILABLE_LANGUAGES:
        kwargs = {}
-        models[language] = Model(save_dir=language, method="predict", repo_id=JISF_REPO_ID, **kwargs)
+        intent_classifiers[language] = Model(save_dir=language, method="predict", repo_id=INTENT_CLASSIFIER_REPO_ID, **kwargs)
-        models[language].load_model()
+        intent_classifiers[language].load_model()
        logging.info(f'Successfully loaded the intent classifier {language} model')
-    return models
+    return intent_classifiers
 def load_slot_classifiers():
    """Load one slot-filler model per entry of AVAILABLE_LANGUAGES.

    The model class is looked up by the name "slot_filler", matching the
    models/slot_filler.py module (the intent loader uses its module's file
    name the same way). Each model is created in "predict" mode with the
    language code as its save_dir and SLOT_FILLER_REPO_ID as its repo_id,
    then loaded.

    Returns:
        dict mapping each language code to its loaded slot-filler model.
    """
    Model = tfbp.get_model("slot_filler")
    slot_fillers = {}
    for language in AVAILABLE_LANGUAGES:
        slot_fillers[language] = Model(save_dir=language, method="predict", repo_id=SLOT_FILLER_REPO_ID)
        slot_fillers[language].load_model()
        logging.info(f'Successfully loaded the slot filler {language} model')
    return slot_fillers
 def load_models():
    """Load every model and attach it to the FastAPI app object.

    Sets app.language_classifier, app.intent_classifiers and app.slot_fillers;
    the /parse handler checks for these attributes before serving requests.
    """
    app.language_classifier = load_language_classifier()  # type: ignore
    app.intent_classifiers = load_intent_classifiers()  # type: ignore
    # Slot fillers come from their own loader, not the intent classifier one.
    app.slot_fillers = load_slot_classifiers()  # type: ignore
 app = FastAPI()
@ -74,13 +84,20 @@ async def check_health():
@app.post("/parse")
 def parse(input: ParseInput, is_authenticated: Annotated[str, Depends(authenticate)]):
-    if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers'):
+    if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers') or not hasattr(app, 'slot_fillers'):
        headers = {"Retry-After": "120"}  # Suggest retrying after 2 minutes
-        return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are loading, please retry later."}, headers=headers)
+        return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are still loading, please retry later."}, headers=headers)
    language = app.language_classifier.get_prediction(input.q)  # type: ignore
    lang = language.get("value")
-    prediction = app.intent_classifiers[lang].get_prediction(
+    intent_prediction = app.intent_classifiers[lang].get_prediction(
        input.q)  # type: ignore
-    prediction.get("entities").append(language)
+    slot_prediction = app.slot_fillers[lang].get_prediction(
-    return prediction
+        input.q)  # type: ignore
    slot_prediction.get("entities").append(language)
    return {
        "text": input.q,
        "intent": intent_prediction.get("intent"),
        "entities": slot_prediction.get("entities"),
    }
--- a/nlu/models/intent_classifier.py
+++ b/nlu/models/intent_classifier.py
@ -1,4 +1,3 @@
 import functools
 import json
 import math
 from typing import Tuple, Dict, List
@ -22,8 +21,8 @@ from data_loaders.jisfdl import JISFDL
 import boilerplate as tfbp
 ##
-# JISF : Joint Intent Classification and Slot filling with BERT
+# Intent Classification with BERT
-# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
+# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
 # https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
 #
 # Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
@ -33,19 +32,16 @@ import boilerplate as tfbp
 BERT_MODEL_BY_LANGUAGE = {
    'en': "bert-base-cased",
    'fr': "dbmdz/bert-base-french-europeana-cased",
    'ar': 'asafaya/bert-base-arabic',
    'tn': 'dbmdz/bert-base-french-europeana-cased'
 }
@tfbp.default_export
-class JISF(tfbp.Model):
+class IntentClassifier(tfbp.Model):
    default_hparams = {
-        "language": "fr",
+        "language": "",
        "num_epochs": 2,
        "dropout_prob": 0.1,
        "intent_num_labels": 7,
        "slot_num_labels": 40
    }
    data_loader: JISFDL
@ -57,8 +53,8 @@ class JISF(tfbp.Model):
        # Load Tokenizer from transformers
        # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier.
-        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language]
+        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
-        # bert_model_name = typing.cast(str, self.hparams.bert_model_name)
+
        self.tokenizer = AutoTokenizer.from_pretrained(
            bert_model_name, use_fast=False)
        self.bert = TFBertModel.from_pretrained(bert_model_name)
@ -66,27 +62,18 @@ class JISF(tfbp.Model):
        self.dropout = Dropout(self.hparams.dropout_prob)
        self.intent_classifier = Dense(self.hparams.intent_num_labels,
                                       name="intent_classifier", activation="softmax")
        self.slot_classifier = Dense(self.hparams.slot_num_labels,
                                     name="slot_classifier", activation="softmax")
    def call(self, inputs, **kwargs):
        # two outputs from BERT
        trained_bert = self.bert(inputs, **kwargs)
        pooled_output = trained_bert.pooler_output
        sequence_output = trained_bert.last_hidden_state
        # sequence_output will be used for slot_filling / classification
        sequence_output = self.dropout(sequence_output,
                                       training=kwargs.get("training", False))
        slot_probas = self.slot_classifier(sequence_output)
        # pooled_output for intent classification
        pooled_output = self.dropout(pooled_output,
                                     training=kwargs.get("training", False))
        intent_probas = self.intent_classifier(pooled_output)
-        return slot_probas, intent_probas
+        return intent_probas
    def load_data(self, data_loader) -> Tuple[BatchEncoding, tf.Tensor, ndarray, int, int]:
        return data_loader(self.tokenizer)
@ -137,18 +124,11 @@ class JISF(tfbp.Model):
            raise ValueError(
                f"Hyperparam intent_num_labels mismatch, should be : {len(intent_names)}"
            )
        if self.hparams.slot_num_labels != len(slot_names):
            raise ValueError(
                f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
            )
        # Hyperparams, Optimizer and Loss function
        opt = Adam(learning_rate=3e-5, epsilon=1e-08)
-        # two outputs, one for slots, another for intents
+        losses = SparseCategoricalCrossentropy()
        # we have to fine tune for both
        losses = [SparseCategoricalCrossentropy(),
                  SparseCategoricalCrossentropy()]
        metrics = [SparseCategoricalAccuracy("accuracy")]
@ -159,11 +139,10 @@ class JISF(tfbp.Model):
             "attention_mask": encoded_texts["attention_mask"]}
        super().fit(
-            x, (encoded_slots, encoded_intents), epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
+            x, encoded_intents, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
        # Persist the model
        self.extra_params["intent_names"] = intent_names
        self.extra_params["slot_names"] = slot_names
        self.save()
@ -175,7 +154,7 @@ class JISF(tfbp.Model):
        metrics = [SparseCategoricalAccuracy("accuracy")]
        self.compile(metrics=metrics)
-        _, intent_probas = self(encoded_texts)  # type: ignore
+        intent_probas = self(encoded_texts)  # type: ignore
        scores = self.get_metrics_by_intent(intent_probas, encoded_intents)
@ -205,84 +184,9 @@ class JISF(tfbp.Model):
        return json.dumps(info, indent=2)
    def get_slots_prediction(self, text: str, inputs, slot_probas):
        slot_probas_np = slot_probas.numpy()
        # Get the indices of the maximum values
        slot_ids = slot_probas_np.argmax(axis=-1)[0, :]
        # get all slot names and add to out_dict as keys
        out_dict = {}
        predicted_slots = set([self.extra_params["slot_names"][s]
                            for s in slot_ids if s != 0])
        for ps in predicted_slots:
            out_dict[ps] = []
        # retrieving the tokenization that was used in the predictions
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        # We'd like to eliminate all special tokens from our output
        special_tokens = self.tokenizer.special_tokens_map.values()
        for token, slot_id in zip(tokens, slot_ids):
            if token in special_tokens:
                continue
            # add all to out_dict
            slot_name = self.extra_params["slot_names"][slot_id]
            if slot_name == "<PAD>":
                continue
            # collect tokens
            collected_tokens = [token]
            idx = tokens.index(token)
            # see if it starts with ##
            # then it belongs to the previous token
            if token.startswith("##"):
                # check if the token already exists or not
                if tokens[idx - 1] not in out_dict[slot_name]:
                    collected_tokens.insert(0, tokens[idx - 1])
            # add collected tokens to slots
            out_dict[slot_name].extend(collected_tokens)
        slot_names_to_ids = {value: key for key, value in enumerate(
            self.extra_params["slot_names"])}
        entities = []
        # process out_dict
        for slot_name in out_dict:
            slot_id = slot_names_to_ids[slot_name]
            slot_tokens = out_dict[slot_name]
            slot_value = self.tokenizer.convert_tokens_to_string(
                slot_tokens).strip()
            entity = {
                "entity": slot_name,
                "value": slot_value,
                "start": text.find(slot_value),
                "end":  text.find(slot_value) + len(slot_value),
                "confidence": 0,
            }
            # The confidence of a slot is the average confidence of tokens in that slot.
            indices = [tokens.index(token) for token in slot_tokens]
            if len(slot_tokens) > 0:
                total = functools.reduce(
                    lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0)
                entity["confidence"] = total / len(slot_tokens)
            else:
                entity["confidence"] = 0
            entities.append(entity)
        return entities
    def get_prediction(self, text: str):
        inputs = self.data_loader.encode_text(text, self.tokenizer)
-        slot_probas, intent_probas = self(inputs)  # type: ignore
+        intent_probas = self(inputs)  # type: ignore
        intent_probas_np = intent_probas.numpy()
@ -292,15 +196,8 @@ class JISF(tfbp.Model):
        # get the confidences for each intent
        intent_confidences = intent_probas_np[0]
        entities = []
        if slot_probas is not None:
            entities = self.get_slots_prediction(text, inputs, slot_probas)
        return {
            "text": text,
            "intent": {"name": self.extra_params["intent_names"][intent_id],
                       "confidence": float(intent_confidences[intent_id])},
            "entities": entities,
        }
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@ -0,0 +1,250 @@
 import functools
 import json
 from transformers import TFBertModel, AutoTokenizer
 from keras.layers import Dropout, Dense
 from sys import platform
 if platform == "darwin":
    from keras.optimizers.legacy import Adam
 else:
    from keras.optimizers import Adam
 from keras.losses import SparseCategoricalCrossentropy
 from keras.metrics import SparseCategoricalAccuracy
 import numpy as np
 from data_loaders.jisfdl import JISFDL
 from sklearn.metrics import classification_report
 import boilerplate as tfbp
 ##
 # Slot filling with BERT
 # This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
 # https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
 #
 # Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
 # the paper with the original dataset.
 ##
 # Pretrained BERT checkpoint used for each supported language code.
 # Same entries as the table in models/intent_classifier.py, so both models
 # accept the same set of languages.
 BERT_MODEL_BY_LANGUAGE = {
    'en': "bert-base-cased",
    'fr': "dbmdz/bert-base-french-europeana-cased",
    'ar': 'asafaya/bert-base-arabic',
    'tn': 'dbmdz/bert-base-french-europeana-cased'
 }
@tfbp.default_export
 class SlotFiller(tfbp.Model):
    """Slot filling with BERT: one slot label is predicted per token.

    A dropout layer and a softmax Dense layer ("slot_classifier") are applied
    to BERT's last hidden state. Slot names are stored in
    extra_params["slot_names"] when the model is fitted; index 0 is treated as
    the padding label by evaluate() and get_slots_prediction().
    """
    default_hparams = {
        "language": "",
        "num_epochs": 2,
        "dropout_prob": 0.1,
        "slot_num_labels": 40
    }
    data_loader: JISFDL
    def __init__(self, *args, **kwargs):
        """Build the data loader, tokenizer, BERT backbone and output layers."""
        super().__init__(*args, **kwargs)
        # Init data loader
        self.data_loader = JISFDL(**kwargs)
        # Load Tokenizer from transformers
        # The same pretrained BERT checkpoint is used for the tokenizer and the model;
        # an empty language falls back to the English checkpoint.
        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
        self.tokenizer = AutoTokenizer.from_pretrained(
            bert_model_name, use_fast=False)
        self.bert = TFBertModel.from_pretrained(bert_model_name)
        self.dropout = Dropout(self.hparams.dropout_prob)
        self.slot_classifier = Dense(self.hparams.slot_num_labels,
                                     name="slot_classifier", activation="softmax")
    def call(self, inputs, **kwargs):
        """Forward pass: return the per-token slot probabilities for `inputs`."""
        trained_bert = self.bert(inputs, **kwargs)
        sequence_output = trained_bert.last_hidden_state
        # sequence_output will be used for slot_filling
        sequence_output = self.dropout(sequence_output,
                                       training=kwargs.get("training", False))
        slot_probas = self.slot_classifier(sequence_output)
        return slot_probas
    @tfbp.runnable
    def fit(self):
        """Train the slot filler, then persist it together with its slot names.

        Raises:
            ValueError: if hparams.slot_num_labels differs from the number of
                slot names returned by the data loader.
        """
        # Intents are returned by the shared data loader but not used here.
        encoded_texts, _, encoded_slots, _, slot_names = self.data_loader(
            self.tokenizer)
        if self.hparams.slot_num_labels != len(slot_names):
            raise ValueError(
                f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
            )
        # Hyperparams, Optimizer and Loss function
        opt = Adam(learning_rate=3e-5, epsilon=1e-08)
        # Single output (slots), hence a single loss
        losses = SparseCategoricalCrossentropy()
        metrics = [SparseCategoricalAccuracy("accuracy")]
        # Compile model
        self.compile(optimizer=opt, loss=losses, metrics=metrics)
        x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],
             "attention_mask": encoded_texts["attention_mask"]}
        super().fit(
            x, encoded_slots, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
        # Persist the model
        self.extra_params["slot_names"] = slot_names
        self.save()
    @tfbp.runnable
    def evaluate(self):
        """Print and return a per-slot classification report.

        Positions whose true label is the padding label (id 0) are ignored.

        Returns:
            The text report produced by sklearn's classification_report.
        """
        # Passing extra_params makes the data loader reuse the persisted slot names
        encoded_texts, _, encoded_slots, _, _ = self.data_loader(
            self.tokenizer, self.extra_params)
        # Get predictions
        predictions = self(encoded_texts)
        predicted_slot_ids = np.argmax(predictions, axis=-1)  # most probable slot id per token
        true_labels = encoded_slots.flatten()
        pred_labels = predicted_slot_ids.flatten()
        # Filter out padding tokens (assuming padding label id is 0)
        mask = true_labels != 0
        # Shift labels so that the first real slot is 0 (padding label removed).
        # A padding prediction on a real token becomes -1 and counts as a miss.
        filtered_true_labels = true_labels[mask] - 1
        filtered_pred_labels = pred_labels[mask] - 1
        # Get slot names excluding padding
        slot_names_no_pad = self.extra_params["slot_names"][1:]  # Exclude padding label
        report = classification_report(
            filtered_true_labels,
            filtered_pred_labels,
            # Explicit labels keep target_names aligned even when some slots are
            # absent from the evaluated data or when -1 shows up in predictions;
            # without them sklearn raises on a label/target_names count mismatch.
            labels=list(range(len(slot_names_no_pad))),
            target_names=slot_names_no_pad,
            zero_division=0
        )
        print(report)
        return report
    @tfbp.runnable
    def predict(self):
        """Run a prediction on the data loader's prediction text and print it as JSON."""
        text = self.data_loader.get_prediction_data()
        info = self.get_prediction(text)
        print(self.summary())
        print("Text : " + text)
        print(json.dumps(info, indent=2))
        return json.dumps(info, indent=2)
    def get_slots_prediction(self, text: str, inputs, slot_probas):
        """Turn per-token slot probabilities into a list of entities.

        Args:
            text: the original utterance, used to locate each slot value.
            inputs: encoded text; only inputs["input_ids"][0] is read.
            slot_probas: model output for `inputs`; only the first sequence is used.

        Returns:
            A list of dicts with keys "entity", "value", "start", "end" and
            "confidence". "start"/"end" are both -1 when the detokenized value
            cannot be found verbatim in `text`. Slots predicted only on special
            tokens produce no entity.
        """
        slot_names = self.extra_params["slot_names"]
        slot_probas_np = slot_probas.numpy()
        # Get the indices of the maximum values
        slot_ids = slot_probas_np.argmax(axis=-1)[0, :]
        # retrieving the tokenization that was used in the predictions
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        # We'd like to eliminate all special tokens from our output
        special_tokens = self.tokenizer.special_tokens_map.values()
        # slot id -> positions (in `tokens`) of the tokens collected for that slot.
        # Positions are tracked instead of token strings so that a token occurring
        # several times in the sentence is attributed to the right occurrence.
        positions_by_slot = {}
        for idx, (token, slot_id) in enumerate(zip(tokens, slot_ids)):
            if token in special_tokens:
                continue
            slot_id = int(slot_id)
            if slot_names[slot_id] == "<PAD>":
                continue
            positions = positions_by_slot.setdefault(slot_id, [])
            # A "##" piece belongs to the previous token: pull that one in too,
            # unless it was already collected for this slot.
            if token.startswith("##") and idx > 0 and (idx - 1) not in positions:
                positions.append(idx - 1)
            positions.append(idx)
        entities = []
        for slot_id, positions in positions_by_slot.items():
            slot_tokens = [tokens[position] for position in positions]
            slot_value = self.tokenizer.convert_tokens_to_string(
                slot_tokens).strip()
            start = text.find(slot_value)
            end = start + len(slot_value) if start >= 0 else -1
            # The confidence of a slot is the average confidence of tokens in that slot.
            # Converted to a plain float so the result is JSON serializable.
            confidence = float(slot_probas_np[0, positions, slot_id].mean())
            entities.append({
                "entity": slot_names[slot_id],
                "value": slot_value,
                "start": start,
                "end": end,
                "confidence": confidence,
            })
        return entities
    def get_prediction(self, text: str):
        """Predict the slots of `text`.

        Returns:
            {"text": text, "entities": [...]} where entities come from
            get_slots_prediction().
        """
        inputs = self.data_loader.encode_text(text, self.tokenizer)
        slot_probas = self(inputs)  # type: ignore
        entities = []
        if slot_probas is not None:
            entities = self.get_slots_prediction(text, inputs, slot_probas)
        return {
            "text": text,
            "entities": entities,
        }
--- a/nlu/utils/json_helper.py
+++ b/nlu/utils/json_helper.py
@ -4,7 +4,7 @@ import json
 class JsonHelper:
    data_folder: str
-    def __init__(self, model:str="jisf"):
+    def __init__(self, model:str = "intent_classifier"):
        self.data_folder=os.path.join("data",model)
    def read_dataset_json_file(self, filename):