Merge branch 'main' into 40-issue-prevent-users-from-deleting-their-own-roles

This commit is contained in:
Emnaghz 2024-09-23 11:16:20 +01:00
commit a999604472
12 changed files with 431 additions and 176 deletions

View File

@ -45,7 +45,8 @@ AUTH_TOKEN=token123
LANGUAGE_CLASSIFIER=language-classifier LANGUAGE_CLASSIFIER=language-classifier
INTENT_CLASSIFIERS=en,fr INTENT_CLASSIFIERS=en,fr
TFLC_REPO_ID=Hexastack/tflc TFLC_REPO_ID=Hexastack/tflc
JISF_REPO_ID=Hexastack/jisf INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
SLOT_FILLER_REPO_ID=Hexastack/slot-filler
NLP_PORT=5000 NLP_PORT=5000
# Frontend (Next.js) # Frontend (Next.js)

View File

@ -22,7 +22,6 @@ import {
Tab, Tab,
Tabs, Tabs,
Tooltip, Tooltip,
debounce,
tabsClasses, tabsClasses,
} from "@mui/material"; } from "@mui/material";
import { import {
@ -32,7 +31,13 @@ import {
DiagramModel, DiagramModel,
DiagramModelGenerics, DiagramModelGenerics,
} from "@projectstorm/react-diagrams"; } from "@projectstorm/react-diagrams";
import { SyntheticEvent, useEffect, useRef, useState } from "react"; import {
SyntheticEvent,
useCallback,
useEffect,
useRef,
useState,
} from "react";
import { useTranslation } from "react-i18next"; import { useTranslation } from "react-i18next";
import { DeleteDialog } from "@/app-components/dialogs"; import { DeleteDialog } from "@/app-components/dialogs";
@ -41,6 +46,7 @@ import { useDelete, useDeleteFromCache } from "@/hooks/crud/useDelete";
import { useFind } from "@/hooks/crud/useFind"; import { useFind } from "@/hooks/crud/useFind";
import { useGetFromCache } from "@/hooks/crud/useGet"; import { useGetFromCache } from "@/hooks/crud/useGet";
import { useUpdate, useUpdateCache } from "@/hooks/crud/useUpdate"; import { useUpdate, useUpdateCache } from "@/hooks/crud/useUpdate";
import useDebouncedUpdate from "@/hooks/useDebouncedUpdate";
import { getDisplayDialogs, useDialog } from "@/hooks/useDialog"; import { getDisplayDialogs, useDialog } from "@/hooks/useDialog";
import { useSearch } from "@/hooks/useSearch"; import { useSearch } from "@/hooks/useSearch";
import { EntityType, Format } from "@/services/types"; import { EntityType, Format } from "@/services/types";
@ -108,10 +114,12 @@ const Diagrams = () => {
const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, { const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, {
invalidate: false, invalidate: false,
}); });
const debouncedZoomEvent = debounce((event) => { const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300);
const debouncedZoomEvent = useCallback(
(event: any) => {
if (selectedCategoryId) { if (selectedCategoryId) {
engine?.repaintCanvas(); engine?.repaintCanvas();
updateCategory({ debouncedUpdateCategory({
id: selectedCategoryId, id: selectedCategoryId,
params: { params: {
zoom: event.zoom, zoom: event.zoom,
@ -119,10 +127,13 @@ const Diagrams = () => {
}); });
} }
event.stopPropagation(); event.stopPropagation();
}, 200); },
const debouncedOffsetEvent = debounce((event) => { [selectedCategoryId, engine, debouncedUpdateCategory],
);
const debouncedOffsetEvent = useCallback(
(event: any) => {
if (selectedCategoryId) { if (selectedCategoryId) {
updateCategory({ debouncedUpdateCategory({
id: selectedCategoryId, id: selectedCategoryId,
params: { params: {
offset: [event.offsetX, event.offsetY], offset: [event.offsetX, event.offsetY],
@ -130,7 +141,9 @@ const Diagrams = () => {
}); });
} }
event.stopPropagation(); event.stopPropagation();
}, 200); },
[selectedCategoryId, debouncedUpdateCategory],
);
const getBlockFromCache = useGetFromCache(EntityType.BLOCK); const getBlockFromCache = useGetFromCache(EntityType.BLOCK);
const updateCachedBlock = useUpdateCache(EntityType.BLOCK); const updateCachedBlock = useUpdateCache(EntityType.BLOCK);
const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK); const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK);

View File

@ -0,0 +1,54 @@
/*
* Copyright © 2024 Hexastack. All rights reserved.
*
* Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms:
* 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission.
* 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file).
* 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited.
*/
import { debounce } from "@mui/material";
import { useCallback, useEffect, useRef } from "react";
type DebouncedUpdateParams = {
id: string;
params: Record<string, any>;
};
/**
 * Returns a function that batches partial updates and forwards them to
 * `apiUpdate` once no new update has arrived for `delay` milliseconds.
 *
 * Params of successive calls targeting the same `id` are shallow-merged
 * (later keys win) and sent as a single update. When a call targets a
 * different `id` than the pending batch, the pending batch is sent
 * immediately so that its params are never attributed to the wrong entity.
 *
 * @param apiUpdate - Callback that receives the merged update. The most
 *   recently rendered callback is always the one invoked.
 * @param delay - Debounce window in milliseconds. Only the value passed on
 *   the first render is used.
 * @returns A stable callback accepting `{ id, params }`.
 */
function useDebouncedUpdate(
  apiUpdate: (params: DebouncedUpdateParams) => void,
  delay: number = 300,
) {
  const accumulatedUpdates = useRef<DebouncedUpdateParams | null>(null);
  // Keep a handle on the latest callback: the debounced function below is
  // created only once and would otherwise keep calling the first one forever.
  const apiUpdateRef = useRef(apiUpdate);

  useEffect(() => {
    apiUpdateRef.current = apiUpdate;
  }, [apiUpdate]);

  // Sends whatever has been accumulated so far (if anything) and resets it.
  const flushUpdates = useRef(() => {
    const pending = accumulatedUpdates.current;

    if (pending) {
      accumulatedUpdates.current = null;
      apiUpdateRef.current(pending);
    }
  }).current;
  const processUpdates = useRef(debounce(flushUpdates, delay)).current;
  const handleUpdate = useCallback(
    (params: DebouncedUpdateParams) => {
      const pending = accumulatedUpdates.current;

      if (pending && pending.id !== params.id) {
        // The target changed within the debounce window: send the pending
        // batch for the previous id now instead of merging it into this one.
        processUpdates.clear();
        flushUpdates();
      }

      accumulatedUpdates.current = {
        id: params.id,
        params: {
          ...(accumulatedUpdates.current?.params ?? {}),
          ...params.params,
        },
      };
      processUpdates();
    },
    [flushUpdates, processUpdates],
  );

  useEffect(() => {
    // Cancel the pending timer on unmount; an unsent batch is dropped.
    return () => {
      processUpdates.clear();
    };
  }, [processUpdates]);

  return handleUpdate;
}
export default useDebouncedUpdate;

View File

@ -2,4 +2,5 @@ AUTH_TOKEN=123
LANGUAGE_CLASSIFIER=language-classifier LANGUAGE_CLASSIFIER=language-classifier
INTENT_CLASSIFIERS=ar,fr,tn INTENT_CLASSIFIERS=ar,fr,tn
TFLC_REPO_ID=Hexastack/tflc TFLC_REPO_ID=Hexastack/tflc
JISF_REPO_ID=Hexastack/jisf INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
SLOT_FILLER_REPO_ID=Hexastack/slot-filler

View File

@ -1,5 +1,5 @@
AUTH_TOKEN= AUTH_TOKEN=
LANGUAGE_CLASSIFIER= LANGUAGE_CLASSIFIER=
INTENT_CLASSIFIERS= INTENT_CLASSIFIERS=
TFLC_REPO_ID= INTENT_CLASSIFIER_REPO_ID=
JISF_REPO_ID= SLOT_FILLER_REPO_ID=

View File

@ -40,7 +40,7 @@ pip install -r requirements.txt
You should run `source env.sh` on each new shell session. This activates the virtualenv and creates a nice alias for `run.py`: You should run `source env.sh` on each new shell session. This activates the virtualenv and creates a nice alias for `run.py`:
```bash ```bash
$ cat env.sh $ cat env.sh
source env/bin/activate source venv/bin/activate
alias run='python run.py' alias run='python run.py'
``` ```
@ -53,7 +53,7 @@ run fit myexperiment1 mlp mnist --batch_size=32 --learning_rate=0.1
Examples : Examples :
```bash ```bash
# Intent classification # Intent classification
run fit intent-classifier-en-30072024 jisf --intent_num_labels=88 --slot_num_labels=17 --language=en run fit intent-classifier-en-30072024 intent_classifier --intent_num_labels=88 --slot_num_labels=17 --language=en
run predict intent-classifier-fr-30072024 --intent_num_labels=7 --slot_num_labels=2 --language=fr run predict intent-classifier-fr-30072024 --intent_num_labels=7 --slot_num_labels=2 --language=fr
# Language classification # Language classification

View File

@ -4,8 +4,8 @@ import json
import numpy as np import numpy as np
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer
import boilerplate as tfbp import boilerplate as tfbp
from utils.jisf_data_mapper import JisfDataMapper
from utils.json_helper import JsonHelper from utils.json_helper import JsonHelper
@ -101,8 +101,11 @@ class JISFDL(tfbp.DataLoader):
# Filter examples by language # Filter examples by language
lang = self.hparams.language lang = self.hparams.language
all_examples = data["common_examples"] all_examples = data["common_examples"]
examples = filter(lambda exp: any(
e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples) if not bool(lang):
examples = all_examples
else:
examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
# Parse raw data # Parse raw data
for exp in examples: for exp in examples:
@ -145,7 +148,6 @@ class JISFDL(tfbp.DataLoader):
# the classifier. # the classifier.
texts = [d.text for d in dataset] texts = [d.text for d in dataset]
encoded_texts = self.encode_texts(texts, tokenizer) encoded_texts = self.encode_texts(texts, tokenizer)
# Map intents, load from the model (evaluate), recompute from dataset otherwise (train) # Map intents, load from the model (evaluate), recompute from dataset otherwise (train)
intents = [d.intent for d in dataset] intents = [d.intent for d in dataset]
if not model_params: if not model_params:
@ -161,19 +163,35 @@ class JISFDL(tfbp.DataLoader):
# To handle those we need to add <PAD> to slots_names. It can be some other symbol as well. # To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.
slot_names.insert(0, "<PAD>") slot_names.insert(0, "<PAD>")
else: else:
intent_names = model_params.intent_names if "intent_names" in model_params:
slot_names = model_params.slot_names intent_names = model_params["intent_names"]
else:
intent_names = None
if "slot_names" in model_params:
slot_names = model_params["slot_names"]
else:
slot_names = None
if intent_names:
intent_map = dict() # Dict : intent -> index intent_map = dict() # Dict : intent -> index
for idx, ui in enumerate(intent_names): for idx, ui in enumerate(intent_names):
intent_map[ui] = idx intent_map[ui] = idx
else:
intent_map = None
# Encode intents # Encode intents
if intent_map:
encoded_intents = self.encode_intents(intents, intent_map) encoded_intents = self.encode_intents(intents, intent_map)
else:
encoded_intents = None
if slot_names:
slot_map: Dict[str, int] = dict() # slot -> index slot_map: Dict[str, int] = dict() # slot -> index
for idx, us in enumerate(slot_names): for idx, us in enumerate(slot_names):
slot_map[us] = idx slot_map[us] = idx
else:
slot_map = None
# Encode slots # Encode slots
# Text : Add a tune to my elrow Guest List # Text : Add a tune to my elrow Guest List
@ -183,8 +201,12 @@ class JISFDL(tfbp.DataLoader):
max_len = len(encoded_texts["input_ids"][0]) # type: ignore max_len = len(encoded_texts["input_ids"][0]) # type: ignore
all_slots = [td.slots for td in dataset] all_slots = [td.slots for td in dataset]
all_texts = [td.text for td in dataset] all_texts = [td.text for td in dataset]
if slot_map:
encoded_slots = self.encode_slots(tokenizer, encoded_slots = self.encode_slots(tokenizer,
all_slots, all_texts, slot_map, max_len) all_slots, all_texts, slot_map, max_len)
else:
encoded_slots = None
return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names

View File

@ -29,7 +29,7 @@ class TFLCDL(tfbp.DataLoader):
self.json_helper = JsonHelper("tflc") self.json_helper = JsonHelper("tflc")
self._save_dir = save_dir self._save_dir = save_dir
print(hparams)
# We will opt for a TF-IDF representation of the data as the frequency of word # We will opt for a TF-IDF representation of the data as the frequency of word
# roots should give us a good idea about which language we're dealing with. # roots should give us a good idea about which language we're dealing with.
if method == "fit": if method == "fit":

View File

@ -15,8 +15,8 @@ AUTH_TOKEN = os.getenv("AUTH_TOKEN", "TOKEN_MUST_BE_DEFINED")
AVAILABLE_LANGUAGES = os.getenv("AVAILABLE_LANGUAGES", "en,fr").split(',') AVAILABLE_LANGUAGES = os.getenv("AVAILABLE_LANGUAGES", "en,fr").split(',')
TFLC_REPO_ID = os.getenv("TFLC_REPO_ID") TFLC_REPO_ID = os.getenv("TFLC_REPO_ID")
JISF_REPO_ID = os.getenv("JISF_REPO_ID") INTENT_CLASSIFIER_REPO_ID = os.getenv("INTENT_CLASSIFIER_REPO_ID")
SLOT_FILLER_REPO_ID = os.getenv("SLOT_FILLER_REPO_ID")
def load_language_classifier(): def load_language_classifier():
# Init language classifier model # Init language classifier model
@ -27,21 +27,31 @@ def load_language_classifier():
logging.info(f'Successfully loaded the language classifier model') logging.info(f'Successfully loaded the language classifier model')
return model return model
def load_intent_classifiers(): def load_intent_classifiers():
Model = tfbp.get_model("jisf") Model = tfbp.get_model("intent_classifier")
models = {} intent_classifiers = {}
for language in AVAILABLE_LANGUAGES: for language in AVAILABLE_LANGUAGES:
kwargs = {} kwargs = {}
models[language] = Model(save_dir=language, method="predict", repo_id=JISF_REPO_ID, **kwargs) intent_classifiers[language] = Model(save_dir=language, method="predict", repo_id=INTENT_CLASSIFIER_REPO_ID, **kwargs)
models[language].load_model() intent_classifiers[language].load_model()
logging.info(f'Successfully loaded the intent classifier {language} model') logging.info(f'Successfully loaded the intent classifier {language} model')
return models return intent_classifiers
def load_slot_classifiers():
    """Load one slot-filler model per available language.

    Returns:
        dict mapping each entry of AVAILABLE_LANGUAGES to a loaded model
        instance created in "predict" mode from SLOT_FILLER_REPO_ID.
    """
    # NOTE(review): the model added alongside this loader lives in
    # models/slot_filler.py, so it is requested as "slot_filler" rather than
    # "slot_classifier" -- assumes tfbp.get_model resolves models by module
    # name, as the "intent_classifier" lookup suggests; confirm.
    Model = tfbp.get_model("slot_filler")
    slot_fillers = {}
    for language in AVAILABLE_LANGUAGES:
        slot_fillers[language] = Model(save_dir=language, method="predict", repo_id=SLOT_FILLER_REPO_ID)
        slot_fillers[language].load_model()
        logging.info(f'Successfully loaded the slot filler {language} model')
    return slot_fillers
def load_models():
    """Load every NLU model and attach it to the FastAPI app object.

    The /parse endpoint checks for these three attributes to decide whether
    the models are ready.
    """
    app.language_classifier = load_language_classifier()  # type: ignore
    app.intent_classifiers = load_intent_classifiers()  # type: ignore
    # Slot fillers must come from their own loader; reusing the intent loader
    # here would make /parse query intent models for entities.
    app.slot_fillers = load_slot_classifiers()  # type: ignore
app = FastAPI() app = FastAPI()
@ -74,13 +84,20 @@ async def check_health():
@app.post("/parse") @app.post("/parse")
def parse(input: ParseInput, is_authenticated: Annotated[str, Depends(authenticate)]): def parse(input: ParseInput, is_authenticated: Annotated[str, Depends(authenticate)]):
if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers'): if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers') or not hasattr(app, 'slot_fillers'):
headers = {"Retry-After": "120"} # Suggest retrying after 2 minutes headers = {"Retry-After": "120"} # Suggest retrying after 2 minutes
return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are loading, please retry later."}, headers=headers) return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are still loading, please retry later."}, headers=headers)
language = app.language_classifier.get_prediction(input.q) # type: ignore language = app.language_classifier.get_prediction(input.q) # type: ignore
lang = language.get("value") lang = language.get("value")
prediction = app.intent_classifiers[lang].get_prediction( intent_prediction = app.intent_classifiers[lang].get_prediction(
input.q) # type: ignore input.q) # type: ignore
prediction.get("entities").append(language) slot_prediction = app.slot_fillers[lang].get_prediction(
return prediction input.q) # type: ignore
slot_prediction.get("entities").append(language)
return {
"text": input.q,
"intent": intent_prediction.get("intent"),
"entities": slot_prediction.get("entities"),
}

View File

@ -1,4 +1,3 @@
import functools
import json import json
import math import math
from typing import Tuple, Dict, List from typing import Tuple, Dict, List
@ -22,8 +21,8 @@ from data_loaders.jisfdl import JISFDL
import boilerplate as tfbp import boilerplate as tfbp
## ##
# JISF : Joint Intent Classification and Slot filling with BERT # Intent Classification with BERT
# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019), # This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
# https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project. # https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
# #
# Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of # Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
@ -33,19 +32,16 @@ import boilerplate as tfbp
BERT_MODEL_BY_LANGUAGE = { BERT_MODEL_BY_LANGUAGE = {
'en': "bert-base-cased", 'en': "bert-base-cased",
'fr': "dbmdz/bert-base-french-europeana-cased", 'fr': "dbmdz/bert-base-french-europeana-cased",
'ar': 'asafaya/bert-base-arabic',
'tn': 'dbmdz/bert-base-french-europeana-cased'
} }
@tfbp.default_export @tfbp.default_export
class JISF(tfbp.Model): class IntentClassifier(tfbp.Model):
default_hparams = { default_hparams = {
"language": "fr", "language": "",
"num_epochs": 2, "num_epochs": 2,
"dropout_prob": 0.1, "dropout_prob": 0.1,
"intent_num_labels": 7, "intent_num_labels": 7,
"slot_num_labels": 40
} }
data_loader: JISFDL data_loader: JISFDL
@ -57,8 +53,8 @@ class JISF(tfbp.Model):
# Load Tokenizer from transformers # Load Tokenizer from transformers
# We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier. # We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier.
bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language] bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
# bert_model_name = typing.cast(str, self.hparams.bert_model_name)
self.tokenizer = AutoTokenizer.from_pretrained( self.tokenizer = AutoTokenizer.from_pretrained(
bert_model_name, use_fast=False) bert_model_name, use_fast=False)
self.bert = TFBertModel.from_pretrained(bert_model_name) self.bert = TFBertModel.from_pretrained(bert_model_name)
@ -66,27 +62,18 @@ class JISF(tfbp.Model):
self.dropout = Dropout(self.hparams.dropout_prob) self.dropout = Dropout(self.hparams.dropout_prob)
self.intent_classifier = Dense(self.hparams.intent_num_labels, self.intent_classifier = Dense(self.hparams.intent_num_labels,
name="intent_classifier", activation="softmax") name="intent_classifier", activation="softmax")
self.slot_classifier = Dense(self.hparams.slot_num_labels,
name="slot_classifier", activation="softmax")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
# two outputs from BERT
trained_bert = self.bert(inputs, **kwargs) trained_bert = self.bert(inputs, **kwargs)
pooled_output = trained_bert.pooler_output pooled_output = trained_bert.pooler_output
sequence_output = trained_bert.last_hidden_state
# sequence_output will be used for slot_filling / classification
sequence_output = self.dropout(sequence_output,
training=kwargs.get("training", False))
slot_probas = self.slot_classifier(sequence_output)
# pooled_output for intent classification # pooled_output for intent classification
pooled_output = self.dropout(pooled_output, pooled_output = self.dropout(pooled_output,
training=kwargs.get("training", False)) training=kwargs.get("training", False))
intent_probas = self.intent_classifier(pooled_output) intent_probas = self.intent_classifier(pooled_output)
return slot_probas, intent_probas return intent_probas
def load_data(self, data_loader) -> Tuple[BatchEncoding, tf.Tensor, ndarray, int, int]: def load_data(self, data_loader) -> Tuple[BatchEncoding, tf.Tensor, ndarray, int, int]:
return data_loader(self.tokenizer) return data_loader(self.tokenizer)
@ -137,18 +124,11 @@ class JISF(tfbp.Model):
raise ValueError( raise ValueError(
f"Hyperparam intent_num_labels mismatch, should be : {len(intent_names)}" f"Hyperparam intent_num_labels mismatch, should be : {len(intent_names)}"
) )
if self.hparams.slot_num_labels != len(slot_names):
raise ValueError(
f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
)
# Hyperparams, Optimizer and Loss function # Hyperparams, Optimizer and Loss function
opt = Adam(learning_rate=3e-5, epsilon=1e-08) opt = Adam(learning_rate=3e-5, epsilon=1e-08)
# two outputs, one for slots, another for intents losses = SparseCategoricalCrossentropy()
# we have to fine tune for both
losses = [SparseCategoricalCrossentropy(),
SparseCategoricalCrossentropy()]
metrics = [SparseCategoricalAccuracy("accuracy")] metrics = [SparseCategoricalAccuracy("accuracy")]
@ -159,11 +139,10 @@ class JISF(tfbp.Model):
"attention_mask": encoded_texts["attention_mask"]} "attention_mask": encoded_texts["attention_mask"]}
super().fit( super().fit(
x, (encoded_slots, encoded_intents), epochs=self.hparams.num_epochs, batch_size=32, shuffle=True) x, encoded_intents, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
# Persist the model # Persist the model
self.extra_params["intent_names"] = intent_names self.extra_params["intent_names"] = intent_names
self.extra_params["slot_names"] = slot_names
self.save() self.save()
@ -175,7 +154,7 @@ class JISF(tfbp.Model):
metrics = [SparseCategoricalAccuracy("accuracy")] metrics = [SparseCategoricalAccuracy("accuracy")]
self.compile(metrics=metrics) self.compile(metrics=metrics)
_, intent_probas = self(encoded_texts) # type: ignore intent_probas = self(encoded_texts) # type: ignore
scores = self.get_metrics_by_intent(intent_probas, encoded_intents) scores = self.get_metrics_by_intent(intent_probas, encoded_intents)
@ -205,84 +184,9 @@ class JISF(tfbp.Model):
return json.dumps(info, indent=2) return json.dumps(info, indent=2)
def get_slots_prediction(self, text: str, inputs, slot_probas):
slot_probas_np = slot_probas.numpy()
# Get the indices of the maximum values
slot_ids = slot_probas_np.argmax(axis=-1)[0, :]
# get all slot names and add to out_dict as keys
out_dict = {}
predicted_slots = set([self.extra_params["slot_names"][s]
for s in slot_ids if s != 0])
for ps in predicted_slots:
out_dict[ps] = []
# retrieving the tokenization that was used in the predictions
tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
# We'd like to eliminate all special tokens from our output
special_tokens = self.tokenizer.special_tokens_map.values()
for token, slot_id in zip(tokens, slot_ids):
if token in special_tokens:
continue
# add all to out_dict
slot_name = self.extra_params["slot_names"][slot_id]
if slot_name == "<PAD>":
continue
# collect tokens
collected_tokens = [token]
idx = tokens.index(token)
# see if it starts with ##
# then it belongs to the previous token
if token.startswith("##"):
# check if the token already exists or not
if tokens[idx - 1] not in out_dict[slot_name]:
collected_tokens.insert(0, tokens[idx - 1])
# add collected tokens to slots
out_dict[slot_name].extend(collected_tokens)
slot_names_to_ids = {value: key for key, value in enumerate(
self.extra_params["slot_names"])}
entities = []
# process out_dict
for slot_name in out_dict:
slot_id = slot_names_to_ids[slot_name]
slot_tokens = out_dict[slot_name]
slot_value = self.tokenizer.convert_tokens_to_string(
slot_tokens).strip()
entity = {
"entity": slot_name,
"value": slot_value,
"start": text.find(slot_value),
"end": text.find(slot_value) + len(slot_value),
"confidence": 0,
}
# The confidence of a slot is the average confidence of tokens in that slot.
indices = [tokens.index(token) for token in slot_tokens]
if len(slot_tokens) > 0:
total = functools.reduce(
lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0)
entity["confidence"] = total / len(slot_tokens)
else:
entity["confidence"] = 0
entities.append(entity)
return entities
def get_prediction(self, text: str): def get_prediction(self, text: str):
inputs = self.data_loader.encode_text(text, self.tokenizer) inputs = self.data_loader.encode_text(text, self.tokenizer)
slot_probas, intent_probas = self(inputs) # type: ignore intent_probas = self(inputs) # type: ignore
intent_probas_np = intent_probas.numpy() intent_probas_np = intent_probas.numpy()
@ -292,15 +196,8 @@ class JISF(tfbp.Model):
# get the confidences for each intent # get the confidences for each intent
intent_confidences = intent_probas_np[0] intent_confidences = intent_probas_np[0]
entities = []
if slot_probas is not None:
entities = self.get_slots_prediction(text, inputs, slot_probas)
return { return {
"text": text, "text": text,
"intent": {"name": self.extra_params["intent_names"][intent_id], "intent": {"name": self.extra_params["intent_names"][intent_id],
"confidence": float(intent_confidences[intent_id])}, "confidence": float(intent_confidences[intent_id])},
"entities": entities,
} }

250
nlu/models/slot_filler.py Normal file
View File

@ -0,0 +1,250 @@
import functools
import json
from transformers import TFBertModel, AutoTokenizer
from keras.layers import Dropout, Dense
from sys import platform
if platform == "darwin":
from keras.optimizers.legacy import Adam
else:
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy
import numpy as np
from data_loaders.jisfdl import JISFDL
from sklearn.metrics import classification_report
import boilerplate as tfbp
##
# Slot filling with BERT
# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
# https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
#
# Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
# the paper with the original dataset.
##
BERT_MODEL_BY_LANGUAGE = {
'en': "bert-base-cased",
'fr': "dbmdz/bert-base-french-europeana-cased",
}
@tfbp.default_export
class SlotFiller(tfbp.Model):
    """Slot filling model: a pretrained BERT encoder followed by a per-token
    softmax layer over the slot labels.

    The slot label names seen during training are persisted in
    ``extra_params["slot_names"]``; index 0 is treated as the ``<PAD>`` label
    by the prediction and evaluation code below.
    """

    default_hparams = {
        "language": "",
        "num_epochs": 2,
        "dropout_prob": 0.1,
        "slot_num_labels": 40
    }
    data_loader: JISFDL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Init data loader
        self.data_loader = JISFDL(**kwargs)

        # The same pretrained BERT checkpoint is used for the tokenizer and
        # the encoder; an empty language hyperparam falls back to English.
        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
        self.tokenizer = AutoTokenizer.from_pretrained(
            bert_model_name, use_fast=False)
        self.bert = TFBertModel.from_pretrained(bert_model_name)

        self.dropout = Dropout(self.hparams.dropout_prob)
        self.slot_classifier = Dense(self.hparams.slot_num_labels,
                                     name="slot_classifier", activation="softmax")

    def call(self, inputs, **kwargs):
        """Return the per-token slot probabilities for the encoded inputs."""
        trained_bert = self.bert(inputs, **kwargs)
        sequence_output = trained_bert.last_hidden_state

        # sequence_output (one vector per token) is used for slot filling
        sequence_output = self.dropout(sequence_output,
                                       training=kwargs.get("training", False))
        slot_probas = self.slot_classifier(sequence_output)

        return slot_probas

    @tfbp.runnable
    def fit(self):
        """Training"""
        encoded_texts, _, encoded_slots, _, slot_names = self.data_loader(
            self.tokenizer)

        if self.hparams.slot_num_labels != len(slot_names):
            raise ValueError(
                f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
            )

        # Hyperparams, Optimizer and Loss function
        opt = Adam(learning_rate=3e-5, epsilon=1e-08)

        # Single output (slot labels per token), hence a single loss
        losses = SparseCategoricalCrossentropy()
        metrics = [SparseCategoricalAccuracy("accuracy")]

        # Compile model
        self.compile(optimizer=opt, loss=losses, metrics=metrics)

        x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],
             "attention_mask": encoded_texts["attention_mask"]}

        super().fit(
            x, encoded_slots, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)

        # Persist the model together with the slot label names
        self.extra_params["slot_names"] = slot_names
        self.save()

    @tfbp.runnable
    def evaluate(self):
        """Evaluation

        Prints and returns a per-slot classification report computed over the
        tokens whose true label is not the padding label.
        """
        # Encode the evaluation data with the label names stored in the model
        encoded_texts, _, encoded_slots, _, _ = self.data_loader(
            self.tokenizer, self.extra_params)

        # Get predictions
        predictions = self(encoded_texts)
        predicted_slot_ids = np.argmax(predictions, axis=-1)  # Shape: (batch_size, sequence_length)

        true_labels = encoded_slots.flatten()
        pred_labels = predicted_slot_ids.flatten()

        # Filter out padding tokens (assuming padding label id is 0)
        mask = true_labels != 0
        filtered_true_labels = true_labels[mask]
        filtered_pred_labels = pred_labels[mask]

        # Adjust labels to start from 0 (since padding label 0 is removed)
        filtered_true_labels -= 1
        filtered_pred_labels -= 1

        # Get slot names excluding padding
        slot_names_no_pad = self.extra_params["slot_names"][1:]  # Exclude padding label

        # Pin the label set explicitly: a padding prediction on a real token
        # becomes -1 after the shift above, and some slots may be absent from
        # the evaluation data. Either would otherwise make the inferred label
        # set disagree with target_names.
        report = classification_report(
            filtered_true_labels,
            filtered_pred_labels,
            labels=list(range(len(slot_names_no_pad))),
            target_names=slot_names_no_pad,
            zero_division=0
        )

        print(report)

        return report

    @tfbp.runnable
    def predict(self):
        """Run a prediction on the text supplied by the data loader and
        print/return the result as indented JSON."""
        text = self.data_loader.get_prediction_data()

        info = self.get_prediction(text)

        print(self.summary())
        print("Text : " + text)
        print(json.dumps(info, indent=2))

        return json.dumps(info, indent=2)

    def get_slots_prediction(self, text: str, inputs, slot_probas):
        """Turn per-token slot probabilities into a list of entities.

        Args:
            text: the raw input text, used to locate each slot value.
            inputs: the encoded text; ``inputs["input_ids"][0]`` is read.
            slot_probas: model output for ``inputs``; the first batch item
                is used.

        Returns:
            A list of dicts with keys ``entity``, ``value``, ``start``,
            ``end`` and ``confidence``, one per predicted non-padding slot.
        """
        slot_names = self.extra_params["slot_names"]
        slot_probas_np = slot_probas.numpy()
        # Most probable slot id for every token of the first batch item
        slot_ids = slot_probas_np.argmax(axis=-1)[0, :]

        # Retrieve the tokenization that was used in the predictions
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        # We'd like to eliminate all special tokens from our output
        special_tokens = self.tokenizer.special_tokens_map.values()

        # slot name -> positions (indices into `tokens`) assigned to that slot.
        # Positions are tracked instead of token strings so that a token which
        # occurs several times in the text is attributed correctly.
        positions_by_slot = {slot_names[s]: [] for s in slot_ids if s != 0}

        for position, (token, slot_id) in enumerate(zip(tokens, slot_ids)):
            if token in special_tokens:
                continue
            slot_name = slot_names[slot_id]
            if slot_name == "<PAD>":
                continue
            positions = positions_by_slot.setdefault(slot_name, [])
            # A "##" piece continues the previous token: pull that token in
            # as well unless it already belongs to this slot.
            if token.startswith("##") and position > 0 and (position - 1) not in positions:
                positions.append(position - 1)
            positions.append(position)

        slot_names_to_ids = {value: key for key, value in enumerate(slot_names)}

        entities = []
        for slot_name, positions in positions_by_slot.items():
            slot_id = slot_names_to_ids[slot_name]
            slot_tokens = [tokens[p] for p in positions]
            slot_value = self.tokenizer.convert_tokens_to_string(
                slot_tokens).strip()
            start = text.find(slot_value)

            # The confidence of a slot is the average confidence of tokens in
            # that slot; cast to a plain float so the result is serializable.
            confidence = 0
            if positions:
                confidence = float(
                    slot_probas_np[0, positions, slot_id].sum()) / len(positions)

            entities.append({
                "entity": slot_name,
                "value": slot_value,
                "start": start,
                "end": start + len(slot_value),
                "confidence": confidence,
            })

        return entities

    def get_prediction(self, text: str):
        """Encode ``text``, run the model and return the text with its entities."""
        inputs = self.data_loader.encode_text(text, self.tokenizer)
        slot_probas = self(inputs)  # type: ignore

        entities = []
        if slot_probas is not None:
            entities = self.get_slots_prediction(text, inputs, slot_probas)

        return {
            "text": text,
            "entities": entities,
        }

View File

@ -4,7 +4,7 @@ import json
class JsonHelper: class JsonHelper:
data_folder: str data_folder: str
def __init__(self, model:str="jisf"): def __init__(self, model:str = "intent_classifier"):
self.data_folder=os.path.join("data",model) self.data_folder=os.path.join("data",model)
def read_dataset_json_file(self, filename): def read_dataset_json_file(self, filename):