Merge branch 'main' into 40-issue-prevent-users-from-deleting-their-own-roles

2024-11-24 04:53:41 +00:00 · 2024-09-23 11:16:20 +01:00 · 2024-09-23 11:16:20 +01:00 · a999604472
commit a999604472
parent 5936816a04 37f20f7d79
12 changed files with 431 additions and 176 deletions
--- a/docker/.env.example
+++ b/docker/.env.example
@ -45,7 +45,8 @@ AUTH_TOKEN=token123
 LANGUAGE_CLASSIFIER=language-classifier
 INTENT_CLASSIFIERS=en,fr
 TFLC_REPO_ID=Hexastack/tflc
-JISF_REPO_ID=Hexastack/jisf
+INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
+SLOT_FILLER_REPO_ID=Hexastack/slot-filler
 NLP_PORT=5000

 # Frontend (Next.js)
--- a/frontend/src/components/visual-editor/v2/Diagrams.tsx
+++ b/frontend/src/components/visual-editor/v2/Diagrams.tsx
@ -22,7 +22,6 @@ import {
  Tab,
  Tabs,
  Tooltip,
-  debounce,
  tabsClasses,
 } from "@mui/material";
 import {
@ -32,7 +31,13 @@ import {
  DiagramModel,
  DiagramModelGenerics,
 } from "@projectstorm/react-diagrams";
-import { SyntheticEvent, useEffect, useRef, useState } from "react";
+import {
+  SyntheticEvent,
+  useCallback,
+  useEffect,
+  useRef,
+  useState,
+} from "react";
 import { useTranslation } from "react-i18next";

 import { DeleteDialog } from "@/app-components/dialogs";
@ -41,6 +46,7 @@ import { useDelete, useDeleteFromCache } from "@/hooks/crud/useDelete";
 import { useFind } from "@/hooks/crud/useFind";
 import { useGetFromCache } from "@/hooks/crud/useGet";
 import { useUpdate, useUpdateCache } from "@/hooks/crud/useUpdate";
+import useDebouncedUpdate from "@/hooks/useDebouncedUpdate";
 import { getDisplayDialogs, useDialog } from "@/hooks/useDialog";
 import { useSearch } from "@/hooks/useSearch";
 import { EntityType, Format } from "@/services/types";
@ -108,29 +114,36 @@ const Diagrams = () => {
  const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, {
    invalidate: false,
  });
-  const debouncedZoomEvent = debounce((event) => {
-    if (selectedCategoryId) {
-      engine?.repaintCanvas();
-      updateCategory({
-        id: selectedCategoryId,
-        params: {
-          zoom: event.zoom,
-        },
-      });
-    }
-    event.stopPropagation();
-  }, 200);
-  const debouncedOffsetEvent = debounce((event) => {
-    if (selectedCategoryId) {
-      updateCategory({
-        id: selectedCategoryId,
-        params: {
-          offset: [event.offsetX, event.offsetY],
-        },
-      });
-    }
-    event.stopPropagation();
-  }, 200);
+  const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300);
+  const debouncedZoomEvent = useCallback(
+    (event: any) => {
+      if (selectedCategoryId) {
+        engine?.repaintCanvas();
+        debouncedUpdateCategory({
+          id: selectedCategoryId,
+          params: {
+            zoom: event.zoom,
+          },
+        });
+      }
+      event.stopPropagation();
+    },
+    [selectedCategoryId, engine, debouncedUpdateCategory],
+  );
+  const debouncedOffsetEvent = useCallback(
+    (event: any) => {
+      if (selectedCategoryId) {
+        debouncedUpdateCategory({
+          id: selectedCategoryId,
+          params: {
+            offset: [event.offsetX, event.offsetY],
+          },
+        });
+      }
+      event.stopPropagation();
+    },
+    [selectedCategoryId, debouncedUpdateCategory],
+  );
  const getBlockFromCache = useGetFromCache(EntityType.BLOCK);
  const updateCachedBlock = useUpdateCache(EntityType.BLOCK);
  const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK);
--- a/frontend/src/hooks/useDebouncedUpdate.tsx
+++ b/frontend/src/hooks/useDebouncedUpdate.tsx
@ -0,0 +1,54 @@
+/*
+ * Copyright © 2024 Hexastack. All rights reserved.
+ *
+ * Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms:
+ * 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission.
+ * 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file).
+ * 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited.
+ */
+
+import { debounce } from "@mui/material";
+import { useCallback, useEffect, useRef } from "react";
+
+type DebouncedUpdateParams = {
+  id: string;
+  params: Record<string, any>;
+};
+
+/**
+ * Batches rapid partial updates of a single entity into one API call.
+ *
+ * Every call to the returned function merges `params.params` into a pending
+ * payload and (re)starts a debounce timer; when the timer fires, the merged
+ * payload is sent through `apiUpdate` once.
+ *
+ * @param apiUpdate - Function performing the actual update. The most recent
+ *   function passed in is the one that gets called when the timer fires.
+ * @param delay - Debounce delay in milliseconds. Only the value from the
+ *   first render is used, because the debounced function is created once.
+ * @returns A stable callback that queues an update for `params.id`.
+ */
+function useDebouncedUpdate(
+  apiUpdate: (params: DebouncedUpdateParams) => void,
+  delay: number = 300,
+) {
+  // The debounced function below is created only once, so it must not close
+  // over `apiUpdate` directly or it would keep calling the first render's
+  // function forever. Read it through a ref that tracks the latest value.
+  const apiUpdateRef = useRef(apiUpdate);
+  const accumulatedUpdates = useRef<DebouncedUpdateParams | null>(null);
+
+  useEffect(() => {
+    apiUpdateRef.current = apiUpdate;
+  }, [apiUpdate]);
+
+  const processUpdates = useRef(
+    debounce(() => {
+      const pending = accumulatedUpdates.current;
+
+      if (pending) {
+        // Reset before calling out so a re-entrant update starts a new batch.
+        accumulatedUpdates.current = null;
+        apiUpdateRef.current(pending);
+      }
+    }, delay),
+  ).current;
+  const handleUpdate = useCallback(
+    (params: DebouncedUpdateParams) => {
+      const pending = accumulatedUpdates.current;
+
+      if (pending && pending.id !== params.id) {
+        // The target changed before the timer fired: send what was queued
+        // for the previous id now, instead of merging its params into the
+        // payload of a different entity.
+        accumulatedUpdates.current = null;
+        apiUpdateRef.current(pending);
+      }
+
+      accumulatedUpdates.current = {
+        id: params.id,
+        params: {
+          ...(accumulatedUpdates.current?.params ?? {}),
+          ...params.params,
+        },
+      };
+      processUpdates();
+    },
+    [processUpdates],
+  );
+
+  useEffect(() => {
+    return () => {
+      // Cancel the timer on unmount; updates still pending are dropped.
+      processUpdates.clear();
+    };
+  }, [processUpdates]);
+
+  return handleUpdate;
+}
+
+export default useDebouncedUpdate;
--- a/nlu/.env.dev
+++ b/nlu/.env.dev
@ -2,4 +2,5 @@ AUTH_TOKEN=123
 LANGUAGE_CLASSIFIER=language-classifier
 INTENT_CLASSIFIERS=ar,fr,tn
 TFLC_REPO_ID=Hexastack/tflc
-JISF_REPO_ID=Hexastack/jisf
+INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
+SLOT_FILLER_REPO_ID=Hexastack/slot-filler
--- a/nlu/.env.example
+++ b/nlu/.env.example
@ -1,5 +1,5 @@
 AUTH_TOKEN=
 LANGUAGE_CLASSIFIER=
 INTENT_CLASSIFIERS=
-TFLC_REPO_ID=
-JISF_REPO_ID=
+INTENT_CLASSIFIER_REPO_ID=
+SLOT_FILLER_REPO_ID=
--- a/nlu/README.md
+++ b/nlu/README.md
@ -40,7 +40,7 @@ pip install -r requirements.txt
 You should run `source env.sh` on each new shell session. This activates the virtualenv and creates a nice alias for `run.py`:
 ```bash
 $ cat env.sh
-source env/bin/activate
+source venv/bin/activate
 alias run='python run.py'
 ```

@ -53,7 +53,7 @@ run fit myexperiment1 mlp mnist --batch_size=32 --learning_rate=0.1
 Examples :
 ```bash
 # Intent classification
-run fit intent-classifier-en-30072024 jisf  --intent_num_labels=88 --slot_num_labels=17 --language=en
+run fit intent-classifier-en-30072024 intent_classifier  --intent_num_labels=88 --language=en
 run predict intent-classifier-fr-30072024  --intent_num_labels=7 --slot_num_labels=2 --language=fr

 # Language classification
--- a/nlu/data_loaders/jisfdl.py
+++ b/nlu/data_loaders/jisfdl.py
@ -4,8 +4,8 @@ import json
 import numpy as np
 from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer

+
 import boilerplate as tfbp
-from utils.jisf_data_mapper import JisfDataMapper
 from utils.json_helper import JsonHelper


@ -101,8 +101,11 @@ class JISFDL(tfbp.DataLoader):
        # Filter examples by language
        lang = self.hparams.language
        all_examples = data["common_examples"]
-        examples = filter(lambda exp: any(
-            e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
+
+        if not bool(lang):
+            examples = all_examples
+        else:
+            examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)

        # Parse raw data
        for exp in examples:
@ -145,7 +148,6 @@ class JISFDL(tfbp.DataLoader):
        # the classifier.
        texts = [d.text for d in dataset]
        encoded_texts = self.encode_texts(texts, tokenizer)
-
        # Map intents, load from the model (evaluate), recompute from dataset otherwise (train)
        intents = [d.intent for d in dataset]
        if not model_params:
@ -161,19 +163,35 @@ class JISFDL(tfbp.DataLoader):
            # To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.
            slot_names.insert(0, "<PAD>")
        else:
-            intent_names = model_params.intent_names
-            slot_names = model_params.slot_names
+            if "intent_names" in model_params:
+                intent_names = model_params["intent_names"]
+            else:
+                intent_names = None
            
-        intent_map = dict()  # Dict : intent -> index
-        for idx, ui in enumerate(intent_names):
-            intent_map[ui] = idx
+            if "slot_names" in model_params:
+                slot_names = model_params["slot_names"]
+            else:
+                slot_names = None
+
+        if intent_names:
+            intent_map = dict()  # Dict : intent -> index
+            for idx, ui in enumerate(intent_names):
+                intent_map[ui] = idx
+        else:
+            intent_map = None

        # Encode intents
-        encoded_intents = self.encode_intents(intents, intent_map)
+        if intent_map:
+            encoded_intents = self.encode_intents(intents, intent_map)
+        else:
+            encoded_intents = None

-        slot_map: Dict[str, int] = dict()  # slot -> index
-        for idx, us in enumerate(slot_names):
-            slot_map[us] = idx
+        if slot_names:
+            slot_map: Dict[str, int] = dict()  # slot -> index
+            for idx, us in enumerate(slot_names):
+                slot_map[us] = idx
+        else:
+            slot_map = None

        # Encode slots
        # Text : Add a tune to my elrow Guest List
@ -183,8 +201,12 @@ class JISFDL(tfbp.DataLoader):
        max_len = len(encoded_texts["input_ids"][0])  # type: ignore
        all_slots = [td.slots for td in dataset]
        all_texts = [td.text for td in dataset]
-        encoded_slots = self.encode_slots(tokenizer,
+        
+        if slot_map:
+            encoded_slots = self.encode_slots(tokenizer,
                                          all_slots, all_texts, slot_map, max_len)
+        else:
+            encoded_slots = None

        return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names

--- a/nlu/data_loaders/tflcdl.py
+++ b/nlu/data_loaders/tflcdl.py
@ -29,7 +29,7 @@ class TFLCDL(tfbp.DataLoader):

        self.json_helper = JsonHelper("tflc")
        self._save_dir = save_dir
-        print(hparams)
+
        # We will opt for a TF-IDF representation of the data as the frequency of word
        # roots should give us a good idea about which language we're dealing with.
        if method == "fit":
--- a/nlu/main.py
+++ b/nlu/main.py
@ -15,8 +15,8 @@ AUTH_TOKEN = os.getenv("AUTH_TOKEN", "TOKEN_MUST_BE_DEFINED")

 AVAILABLE_LANGUAGES = os.getenv("AVAILABLE_LANGUAGES", "en,fr").split(',')
 TFLC_REPO_ID = os.getenv("TFLC_REPO_ID")
-JISF_REPO_ID = os.getenv("JISF_REPO_ID")
-
+INTENT_CLASSIFIER_REPO_ID = os.getenv("INTENT_CLASSIFIER_REPO_ID")
+SLOT_FILLER_REPO_ID = os.getenv("SLOT_FILLER_REPO_ID")

 def load_language_classifier():
    # Init language classifier model
@ -27,21 +27,31 @@ def load_language_classifier():
    logging.info(f'Successfully loaded the language classifier model')
    return model

-
 def load_intent_classifiers():
-    Model = tfbp.get_model("jisf")
-    models = {}
+    Model = tfbp.get_model("intent_classifier")
+    intent_classifiers = {}
    for language in AVAILABLE_LANGUAGES:
        kwargs = {}
-        models[language] = Model(save_dir=language, method="predict", repo_id=JISF_REPO_ID, **kwargs)
-        models[language].load_model()
+        intent_classifiers[language] = Model(save_dir=language, method="predict", repo_id=INTENT_CLASSIFIER_REPO_ID, **kwargs)
+        intent_classifiers[language].load_model()
        logging.info(f'Successfully loaded the intent classifier {language} model')
-    return models
+    return intent_classifiers
+
+def load_slot_classifiers():
+    Model = tfbp.get_model("slot_classifier")
+    slot_fillers = {}
+    for language in AVAILABLE_LANGUAGES:
+        kwargs = {}
+        slot_fillers[language] = Model(save_dir=language, method="predict", repo_id=SLOT_FILLER_REPO_ID, **kwargs)
+        slot_fillers[language].load_model()
+        logging.info(f'Successfully loaded the slot filler {language} model')
+    return slot_fillers


 def load_models():
    app.language_classifier = load_language_classifier()  # type: ignore
    app.intent_classifiers = load_intent_classifiers()  # type: ignore
+    app.slot_fillers = load_intent_classifiers()  # type: ignore

 app = FastAPI()

@ -74,13 +84,20 @@ async def check_health():

@app.post("/parse")
 def parse(input: ParseInput, is_authenticated: Annotated[str, Depends(authenticate)]):
-    if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers'):
+    if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers') or not hasattr(app, 'slot_fillers'):
        headers = {"Retry-After": "120"}  # Suggest retrying after 2 minutes
-        return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are loading, please retry later."}, headers=headers)
+        return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are still loading, please retry later."}, headers=headers)
    
    language = app.language_classifier.get_prediction(input.q)  # type: ignore
    lang = language.get("value")
-    prediction = app.intent_classifiers[lang].get_prediction(
+    intent_prediction = app.intent_classifiers[lang].get_prediction(
        input.q)  # type: ignore
-    prediction.get("entities").append(language)
-    return prediction
+    slot_prediction = app.slot_fillers[lang].get_prediction(
+        input.q)  # type: ignore
+    slot_prediction.get("entities").append(language)
+
+    return {
+        "text": input.q,
+        "intent": intent_prediction.get("intent"),
+        "entities": slot_prediction.get("entities"),
+    }
--- a/nlu/models/intent_classifier.py
+++ b/nlu/models/intent_classifier.py
@ -1,4 +1,3 @@
-import functools
 import json
 import math
 from typing import Tuple, Dict, List
@ -22,8 +21,8 @@ from data_loaders.jisfdl import JISFDL
 import boilerplate as tfbp

 ##
-# JISF : Joint Intent Classification and Slot filling with BERT
-# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
+# Intent Classification with BERT
+# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
 # https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
 #
 # Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
@ -33,19 +32,16 @@ import boilerplate as tfbp
 BERT_MODEL_BY_LANGUAGE = {
    'en': "bert-base-cased",
    'fr': "dbmdz/bert-base-french-europeana-cased",
-    'ar': 'asafaya/bert-base-arabic',
-    'tn': 'dbmdz/bert-base-french-europeana-cased'
 }


@tfbp.default_export
-class JISF(tfbp.Model):
+class IntentClassifier(tfbp.Model):
    default_hparams = {
-        "language": "fr",
+        "language": "",
        "num_epochs": 2,
        "dropout_prob": 0.1,
        "intent_num_labels": 7,
-        "slot_num_labels": 40
    }
    data_loader: JISFDL

@ -57,8 +53,8 @@ class JISF(tfbp.Model):

        # Load Tokenizer from transformers
        # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier.
-        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language]
-        # bert_model_name = typing.cast(str, self.hparams.bert_model_name)
+        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
+
        self.tokenizer = AutoTokenizer.from_pretrained(
            bert_model_name, use_fast=False)
        self.bert = TFBertModel.from_pretrained(bert_model_name)
@ -66,27 +62,18 @@ class JISF(tfbp.Model):
        self.dropout = Dropout(self.hparams.dropout_prob)
        self.intent_classifier = Dense(self.hparams.intent_num_labels,
                                       name="intent_classifier", activation="softmax")
-        self.slot_classifier = Dense(self.hparams.slot_num_labels,
-                                     name="slot_classifier", activation="softmax")


    def call(self, inputs, **kwargs):
-        # two outputs from BERT
        trained_bert = self.bert(inputs, **kwargs)
        pooled_output = trained_bert.pooler_output
-        sequence_output = trained_bert.last_hidden_state
-
-        # sequence_output will be used for slot_filling / classification
-        sequence_output = self.dropout(sequence_output,
-                                       training=kwargs.get("training", False))
-        slot_probas = self.slot_classifier(sequence_output)
        
        # pooled_output for intent classification
        pooled_output = self.dropout(pooled_output,
                                     training=kwargs.get("training", False))
        intent_probas = self.intent_classifier(pooled_output)

-        return slot_probas, intent_probas
+        return intent_probas

    def load_data(self, data_loader) -> Tuple[BatchEncoding, tf.Tensor, ndarray, int, int]:
        return data_loader(self.tokenizer)
@ -137,18 +124,11 @@ class JISF(tfbp.Model):
            raise ValueError(
                f"Hyperparam intent_num_labels mismatch, should be : {len(intent_names)}"
            )
-        if self.hparams.slot_num_labels != len(slot_names):
-            raise ValueError(
-                f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
-            )

        # Hyperparams, Optimizer and Loss function
        opt = Adam(learning_rate=3e-5, epsilon=1e-08)

-        # two outputs, one for slots, another for intents
-        # we have to fine tune for both
-        losses = [SparseCategoricalCrossentropy(),
-                  SparseCategoricalCrossentropy()]
+        losses = SparseCategoricalCrossentropy()

        metrics = [SparseCategoricalAccuracy("accuracy")]

@ -159,11 +139,10 @@ class JISF(tfbp.Model):
             "attention_mask": encoded_texts["attention_mask"]}

        super().fit(
-            x, (encoded_slots, encoded_intents), epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
+            x, encoded_intents, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)

        # Persist the model
        self.extra_params["intent_names"] = intent_names
-        self.extra_params["slot_names"] = slot_names

        self.save()

@ -175,7 +154,7 @@ class JISF(tfbp.Model):
        metrics = [SparseCategoricalAccuracy("accuracy")]
        self.compile(metrics=metrics)

-        _, intent_probas = self(encoded_texts)  # type: ignore
+        intent_probas = self(encoded_texts)  # type: ignore
        
        scores = self.get_metrics_by_intent(intent_probas, encoded_intents)

@ -205,84 +184,9 @@ class JISF(tfbp.Model):

        return json.dumps(info, indent=2)

-    def get_slots_prediction(self, text: str, inputs, slot_probas):
-        slot_probas_np = slot_probas.numpy()
-        # Get the indices of the maximum values
-        slot_ids = slot_probas_np.argmax(axis=-1)[0, :]
-
-        # get all slot names and add to out_dict as keys
-        out_dict = {}
-        predicted_slots = set([self.extra_params["slot_names"][s]
-                            for s in slot_ids if s != 0])
-        for ps in predicted_slots:
-            out_dict[ps] = []
-        
-        # retrieving the tokenization that was used in the predictions
-        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
-
-        # We'd like to eliminate all special tokens from our output
-        special_tokens = self.tokenizer.special_tokens_map.values()
-
-        for token, slot_id in zip(tokens, slot_ids):
-            if token in special_tokens:
-                continue
-            # add all to out_dict
-            slot_name = self.extra_params["slot_names"][slot_id]
-
-            if slot_name == "<PAD>":
-                continue
-
-            # collect tokens
-            collected_tokens = [token]
-            idx = tokens.index(token)
-
-            # see if it starts with ##
-            # then it belongs to the previous token
-            if token.startswith("##"):
-                # check if the token already exists or not
-                if tokens[idx - 1] not in out_dict[slot_name]:
-                    collected_tokens.insert(0, tokens[idx - 1])
-
-            # add collected tokens to slots
-            out_dict[slot_name].extend(collected_tokens)
-
-        slot_names_to_ids = {value: key for key, value in enumerate(
-            self.extra_params["slot_names"])}
-
-        entities = []
-        # process out_dict
-        for slot_name in out_dict:
-            slot_id = slot_names_to_ids[slot_name]
-            slot_tokens = out_dict[slot_name]
-
-            slot_value = self.tokenizer.convert_tokens_to_string(
-                slot_tokens).strip()
-
-            entity = {
-                "entity": slot_name,
-                "value": slot_value,
-                "start": text.find(slot_value),
-                "end":  text.find(slot_value) + len(slot_value),
-                "confidence": 0,
-            }
-
-            # The confidence of a slot is the average confidence of tokens in that slot.
-            indices = [tokens.index(token) for token in slot_tokens]
-            if len(slot_tokens) > 0:
-                total = functools.reduce(
-                    lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0)
-                entity["confidence"] = total / len(slot_tokens)
-            else:
-                entity["confidence"] = 0
-
-            entities.append(entity)
-
-        return entities
-
-
    def get_prediction(self, text: str):
        inputs = self.data_loader.encode_text(text, self.tokenizer)
-        slot_probas, intent_probas = self(inputs)  # type: ignore
+        intent_probas = self(inputs)  # type: ignore

        intent_probas_np = intent_probas.numpy()
        
@ -292,15 +196,8 @@ class JISF(tfbp.Model):
        # get the confidences for each intent
        intent_confidences = intent_probas_np[0]

-
-        entities = []
-        if slot_probas is not None:
-            entities = self.get_slots_prediction(text, inputs, slot_probas)
-
        return {
            "text": text,
            "intent": {"name": self.extra_params["intent_names"][intent_id],
                       "confidence": float(intent_confidences[intent_id])},
-            "entities": entities,
        }
-
--- a/nlu/models/slot_filler.py
+++ b/nlu/models/slot_filler.py
@ -0,0 +1,250 @@
+import functools
+import json
+from transformers import TFBertModel, AutoTokenizer
+from keras.layers import Dropout, Dense
+from sys import platform
+
+if platform == "darwin":
+    from keras.optimizers.legacy import Adam
+else:
+    from keras.optimizers import Adam
+
+from keras.losses import SparseCategoricalCrossentropy
+from keras.metrics import SparseCategoricalAccuracy
+import numpy as np
+
+from data_loaders.jisfdl import JISFDL
+
+from sklearn.metrics import classification_report
+
+
+import boilerplate as tfbp
+
+##
+# Slot filling with BERT
+# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
+# https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
+#
+# Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
+# the paper with the original dataset.
+##
+
+BERT_MODEL_BY_LANGUAGE = {
+    'en': "bert-base-cased",
+    'fr': "dbmdz/bert-base-french-europeana-cased",
+}
+
+
+@tfbp.default_export
+class SlotFiller(tfbp.Model):
+    default_hparams = {
+        "language": "",
+        "num_epochs": 2,
+        "dropout_prob": 0.1,
+        "slot_num_labels": 40
+    }
+    data_loader: JISFDL
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Init data loader
+        self.data_loader = JISFDL(**kwargs)
+
+        # Load Tokenizer from transformers
+        # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier.
+        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            bert_model_name, use_fast=False)
+        self.bert = TFBertModel.from_pretrained(bert_model_name)
+
+        self.dropout = Dropout(self.hparams.dropout_prob)
+        self.slot_classifier = Dense(self.hparams.slot_num_labels,
+                                     name="slot_classifier", activation="softmax")
+
+
+    def call(self, inputs, **kwargs):
+        trained_bert = self.bert(inputs, **kwargs)
+        sequence_output = trained_bert.last_hidden_state
+
+        # sequence_output will be used for slot_filling
+        sequence_output = self.dropout(sequence_output,
+                                       training=kwargs.get("training", False))
+        slot_probas = self.slot_classifier(sequence_output)
+
+        return slot_probas
+
+    @tfbp.runnable
+    def fit(self):
+        """Training"""
+        encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names = self.data_loader(
+            self.tokenizer)
+
+        if self.hparams.slot_num_labels != len(slot_names):
+            raise ValueError(
+                f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
+            )
+
+        # Hyperparams, Optimizer and Loss function
+        opt = Adam(learning_rate=3e-5, epsilon=1e-08)
+
+        # two outputs, one for slots, another for intents
+        # we have to fine tune for both
+        losses = SparseCategoricalCrossentropy()
+
+        metrics = [SparseCategoricalAccuracy("accuracy")]
+
+        # Compile model
+        self.compile(optimizer=opt, loss=losses, metrics=metrics)
+
+        x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],
+             "attention_mask": encoded_texts["attention_mask"]}
+
+        super().fit(
+            x, encoded_slots, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
+
+        # Persist the model
+        self.extra_params["slot_names"] = slot_names
+
+        self.save()
+
+    @tfbp.runnable
+    def evaluate(self):
+        """Evaluation"""
+        # Load test data
+        # Assuming your data loader can return test data when mode='test' is specified
+        encoded_texts, _, encoded_slots, _, slot_names = self.data_loader(
+            self.tokenizer, self.extra_params)
+
+        # Get predictions
+        predictions = self(encoded_texts)
+        predicted_slot_ids = np.argmax(predictions, axis=-1)  # Shape: (batch_size, sequence_length)
+
+        true_labels = encoded_slots.flatten()
+        pred_labels = predicted_slot_ids.flatten()
+
+        # Filter out padding tokens (assuming padding label id is 0)
+        mask = true_labels != 0
+        filtered_true_labels = true_labels[mask]
+        filtered_pred_labels = pred_labels[mask]
+
+        # Adjust labels to start from 0 (since padding label 0 is removed)
+        filtered_true_labels -= 1
+        filtered_pred_labels -= 1
+
+        # Get slot names excluding padding
+        slot_names_no_pad = self.extra_params["slot_names"][1:]  # Exclude padding label
+
+
+        report = classification_report(
+            filtered_true_labels,
+            filtered_pred_labels,
+            target_names=slot_names_no_pad,
+            zero_division=0
+        )
+
+        print(report)
+
+        # Optionally, you can return the report as a string or dictionary
+        return report
+
+    @tfbp.runnable
+    def predict(self):
+        text = self.data_loader.get_prediction_data()
+
+        info = self.get_prediction(text)
+
+        print(self.summary())
+        print("Text : " + text)
+        print(json.dumps(info, indent=2))
+
+        return json.dumps(info, indent=2)
+    
+    def get_slots_prediction(self, text: str, inputs, slot_probas):
+        slot_probas_np = slot_probas.numpy()
+        # Get the indices of the maximum values
+        slot_ids = slot_probas_np.argmax(axis=-1)[0, :]
+
+        # get all slot names and add to out_dict as keys
+        out_dict = {}
+        predicted_slots = set([self.extra_params["slot_names"][s]
+                            for s in slot_ids if s != 0])
+        for ps in predicted_slots:
+            out_dict[ps] = []
+        
+        # retrieving the tokenization that was used in the predictions
+        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+
+        # We'd like to eliminate all special tokens from our output
+        special_tokens = self.tokenizer.special_tokens_map.values()
+
+        for token, slot_id in zip(tokens, slot_ids):
+            if token in special_tokens:
+                continue
+            # add all to out_dict
+            slot_name = self.extra_params["slot_names"][slot_id]
+
+            if slot_name == "<PAD>":
+                continue
+
+            # collect tokens
+            collected_tokens = [token]
+            idx = tokens.index(token)
+
+            # see if it starts with ##
+            # then it belongs to the previous token
+            if token.startswith("##"):
+                # check if the token already exists or not
+                if tokens[idx - 1] not in out_dict[slot_name]:
+                    collected_tokens.insert(0, tokens[idx - 1])
+
+            # add collected tokens to slots
+            out_dict[slot_name].extend(collected_tokens)
+
+        slot_names_to_ids = {value: key for key, value in enumerate(
+            self.extra_params["slot_names"])}
+
+        entities = []
+        # process out_dict
+        for slot_name in out_dict:
+            slot_id = slot_names_to_ids[slot_name]
+            slot_tokens = out_dict[slot_name]
+
+            slot_value = self.tokenizer.convert_tokens_to_string(
+                slot_tokens).strip()
+
+            entity = {
+                "entity": slot_name,
+                "value": slot_value,
+                "start": text.find(slot_value),
+                "end":  text.find(slot_value) + len(slot_value),
+                "confidence": 0,
+            }
+
+            # The confidence of a slot is the average confidence of tokens in that slot.
+            indices = [tokens.index(token) for token in slot_tokens]
+            if len(slot_tokens) > 0:
+                total = functools.reduce(
+                    lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0)
+                entity["confidence"] = total / len(slot_tokens)
+            else:
+                entity["confidence"] = 0
+
+            entities.append(entity)
+
+        return entities
+
+
+    def get_prediction(self, text: str):
+        inputs = self.data_loader.encode_text(text, self.tokenizer)
+        slot_probas = self(inputs)  # type: ignore
+
+        entities = []
+        if slot_probas is not None:
+            entities = self.get_slots_prediction(text, inputs, slot_probas)
+
+        return {
+            "text": text,
+            "entities": entities,
+        }
--- a/nlu/utils/json_helper.py
+++ b/nlu/utils/json_helper.py
@ -4,7 +4,7 @@ import json
 class JsonHelper:
    data_folder: str

-    def __init__(self, model:str="jisf"):
+    def __init__(self, model:str = "intent_classifier"):
        self.data_folder=os.path.join("data",model)
        
    def read_dataset_json_file(self, filename):