mirror of
https://github.com/hexastack/hexabot
synced 2024-11-24 04:53:41 +00:00
Merge branch 'main' into 40-issue-prevent-users-from-deleting-their-own-roles
This commit is contained in:
commit
a999604472
@ -45,7 +45,8 @@ AUTH_TOKEN=token123
|
||||
LANGUAGE_CLASSIFIER=language-classifier
|
||||
INTENT_CLASSIFIERS=en,fr
|
||||
TFLC_REPO_ID=Hexastack/tflc
|
||||
JISF_REPO_ID=Hexastack/jisf
|
||||
INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
|
||||
SLOT_FILLER_REPO_ID=Hexastack/slot-filler
|
||||
NLP_PORT=5000
|
||||
|
||||
# Frontend (Next.js)
|
||||
|
@ -22,7 +22,6 @@ import {
|
||||
Tab,
|
||||
Tabs,
|
||||
Tooltip,
|
||||
debounce,
|
||||
tabsClasses,
|
||||
} from "@mui/material";
|
||||
import {
|
||||
@ -32,7 +31,13 @@ import {
|
||||
DiagramModel,
|
||||
DiagramModelGenerics,
|
||||
} from "@projectstorm/react-diagrams";
|
||||
import { SyntheticEvent, useEffect, useRef, useState } from "react";
|
||||
import {
|
||||
SyntheticEvent,
|
||||
useCallback,
|
||||
useEffect,
|
||||
useRef,
|
||||
useState,
|
||||
} from "react";
|
||||
import { useTranslation } from "react-i18next";
|
||||
|
||||
import { DeleteDialog } from "@/app-components/dialogs";
|
||||
@ -41,6 +46,7 @@ import { useDelete, useDeleteFromCache } from "@/hooks/crud/useDelete";
|
||||
import { useFind } from "@/hooks/crud/useFind";
|
||||
import { useGetFromCache } from "@/hooks/crud/useGet";
|
||||
import { useUpdate, useUpdateCache } from "@/hooks/crud/useUpdate";
|
||||
import useDebouncedUpdate from "@/hooks/useDebouncedUpdate";
|
||||
import { getDisplayDialogs, useDialog } from "@/hooks/useDialog";
|
||||
import { useSearch } from "@/hooks/useSearch";
|
||||
import { EntityType, Format } from "@/services/types";
|
||||
@ -108,29 +114,36 @@ const Diagrams = () => {
|
||||
const { mutateAsync: updateBlock } = useUpdate(EntityType.BLOCK, {
|
||||
invalidate: false,
|
||||
});
|
||||
const debouncedZoomEvent = debounce((event) => {
|
||||
if (selectedCategoryId) {
|
||||
engine?.repaintCanvas();
|
||||
updateCategory({
|
||||
id: selectedCategoryId,
|
||||
params: {
|
||||
zoom: event.zoom,
|
||||
},
|
||||
});
|
||||
}
|
||||
event.stopPropagation();
|
||||
}, 200);
|
||||
const debouncedOffsetEvent = debounce((event) => {
|
||||
if (selectedCategoryId) {
|
||||
updateCategory({
|
||||
id: selectedCategoryId,
|
||||
params: {
|
||||
offset: [event.offsetX, event.offsetY],
|
||||
},
|
||||
});
|
||||
}
|
||||
event.stopPropagation();
|
||||
}, 200);
|
||||
const debouncedUpdateCategory = useDebouncedUpdate(updateCategory, 300);
|
||||
const debouncedZoomEvent = useCallback(
|
||||
(event: any) => {
|
||||
if (selectedCategoryId) {
|
||||
engine?.repaintCanvas();
|
||||
debouncedUpdateCategory({
|
||||
id: selectedCategoryId,
|
||||
params: {
|
||||
zoom: event.zoom,
|
||||
},
|
||||
});
|
||||
}
|
||||
event.stopPropagation();
|
||||
},
|
||||
[selectedCategoryId, engine, debouncedUpdateCategory],
|
||||
);
|
||||
const debouncedOffsetEvent = useCallback(
|
||||
(event: any) => {
|
||||
if (selectedCategoryId) {
|
||||
debouncedUpdateCategory({
|
||||
id: selectedCategoryId,
|
||||
params: {
|
||||
offset: [event.offsetX, event.offsetY],
|
||||
},
|
||||
});
|
||||
}
|
||||
event.stopPropagation();
|
||||
},
|
||||
[selectedCategoryId, debouncedUpdateCategory],
|
||||
);
|
||||
const getBlockFromCache = useGetFromCache(EntityType.BLOCK);
|
||||
const updateCachedBlock = useUpdateCache(EntityType.BLOCK);
|
||||
const deleteCachedBlock = useDeleteFromCache(EntityType.BLOCK);
|
||||
|
54
frontend/src/hooks/useDebouncedUpdate.tsx
Normal file
54
frontend/src/hooks/useDebouncedUpdate.tsx
Normal file
@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright © 2024 Hexastack. All rights reserved.
|
||||
*
|
||||
* Licensed under the GNU Affero General Public License v3.0 (AGPLv3) with the following additional terms:
|
||||
* 1. The name "Hexabot" is a trademark of Hexastack. You may not use this name in derivative works without express written permission.
|
||||
* 2. All derivative works must include clear attribution to the original creator and software, Hexastack and Hexabot, in a prominent location (e.g., in the software's "About" section, documentation, and README file).
|
||||
* 3. SaaS Restriction: This software, or any derivative of it, may not be used to offer a competing product or service (SaaS) without prior written consent from Hexastack. Offering the software as a service or using it in a commercial cloud environment without express permission is strictly prohibited.
|
||||
*/
|
||||
|
||||
import { debounce } from "@mui/material";
|
||||
import { useCallback, useEffect, useRef } from "react";
|
||||
|
||||
type DebouncedUpdateParams = {
  id: string;
  params: Record<string, any>;
};

/**
 * Returns a function that coalesces rapid partial updates of one entity into
 * a single `apiUpdate` call.
 *
 * Every call shallow-merges `params.params` into the pending update and
 * (re)starts the debounce timer; when the timer fires, the merged update is
 * sent once. If a call arrives for a different `id` while an update is still
 * pending, the pending update is sent immediately so that params of one
 * entity are never merged into (and sent under) another entity's id.
 *
 * A pending update that has not fired yet is dropped when the component
 * unmounts.
 *
 * @param apiUpdate - Callback that performs the actual update. The most
 *   recently rendered callback is the one that gets invoked.
 * @param delay - Debounce delay in milliseconds. Only the value from the
 *   first render is used, because the debounced function is created once.
 * @returns A stable callback accepting `{ id, params }`.
 */
function useDebouncedUpdate(
  apiUpdate: (params: DebouncedUpdateParams) => void,
  delay: number = 300,
) {
  // The debounced function below is created once, so it must reach the
  // current `apiUpdate` through a ref instead of closing over the first one.
  const apiUpdateRef = useRef(apiUpdate);

  useEffect(() => {
    apiUpdateRef.current = apiUpdate;
  }, [apiUpdate]);

  const accumulatedUpdates = useRef<DebouncedUpdateParams | null>(null);
  const processUpdates = useRef(
    debounce(() => {
      const pending = accumulatedUpdates.current;

      if (pending) {
        // Clear before calling out so a re-entrant update starts from scratch.
        accumulatedUpdates.current = null;
        apiUpdateRef.current(pending);
      }
    }, delay),
  ).current;
  const handleUpdate = useCallback(
    (params: DebouncedUpdateParams) => {
      let pending = accumulatedUpdates.current;

      if (pending && pending.id !== params.id) {
        // Different entity: send what was queued for the previous one now
        // rather than leaking its params into this update.
        accumulatedUpdates.current = null;
        apiUpdateRef.current(pending);
        pending = null;
      }

      accumulatedUpdates.current = {
        id: params.id,
        params: {
          ...(pending?.params ?? {}),
          ...params.params,
        },
      };
      processUpdates();
    },
    [processUpdates],
  );

  useEffect(() => {
    return () => {
      // Cancel the timer on unmount; anything still pending is discarded.
      processUpdates.clear();
    };
  }, [processUpdates]);

  return handleUpdate;
}
|
||||
|
||||
export default useDebouncedUpdate;
|
@ -2,4 +2,5 @@ AUTH_TOKEN=123
|
||||
LANGUAGE_CLASSIFIER=language-classifier
|
||||
INTENT_CLASSIFIERS=ar,fr,tn
|
||||
TFLC_REPO_ID=Hexastack/tflc
|
||||
JISF_REPO_ID=Hexastack/jisf
|
||||
INTENT_CLASSIFIER_REPO_ID=Hexastack/intent-classifier
|
||||
SLOT_FILLER_REPO_ID=Hexastack/slot-filler
|
||||
|
@ -1,5 +1,5 @@
|
||||
AUTH_TOKEN=
|
||||
LANGUAGE_CLASSIFIER=
|
||||
INTENT_CLASSIFIERS=
|
||||
TFLC_REPO_ID=
|
||||
JISF_REPO_ID=
|
||||
INTENT_CLASSIFIER_REPO_ID=
|
||||
SLOT_FILLER_REPO_ID=
|
@ -40,7 +40,7 @@ pip install -r requirements.txt
|
||||
You should run `source env.sh` on each new shell session. This activates the virtualenv and creates a nice alias for `run.py`:
|
||||
```bash
|
||||
$ cat env.sh
|
||||
source env/bin/activate
|
||||
source venv/bin/activate
|
||||
alias run='python run.py'
|
||||
```
|
||||
|
||||
@ -53,7 +53,7 @@ run fit myexperiment1 mlp mnist --batch_size=32 --learning_rate=0.1
|
||||
Examples :
|
||||
```bash
|
||||
# Intent classification
|
||||
run fit intent-classifier-en-30072024 jisf --intent_num_labels=88 --slot_num_labels=17 --language=en
|
||||
run fit intent-classifier-en-30072024 intent_classifier --intent_num_labels=88 --slot_num_labels=17 --language=en
|
||||
run predict intent-classifier-fr-30072024 --intent_num_labels=7 --slot_num_labels=2 --language=fr
|
||||
|
||||
# Language classification
|
||||
|
@ -4,8 +4,8 @@ import json
|
||||
import numpy as np
|
||||
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer
|
||||
|
||||
|
||||
import boilerplate as tfbp
|
||||
from utils.jisf_data_mapper import JisfDataMapper
|
||||
from utils.json_helper import JsonHelper
|
||||
|
||||
|
||||
@ -101,8 +101,11 @@ class JISFDL(tfbp.DataLoader):
|
||||
# Filter examples by language
|
||||
lang = self.hparams.language
|
||||
all_examples = data["common_examples"]
|
||||
examples = filter(lambda exp: any(
|
||||
e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
|
||||
|
||||
if not bool(lang):
|
||||
examples = all_examples
|
||||
else:
|
||||
examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
|
||||
|
||||
# Parse raw data
|
||||
for exp in examples:
|
||||
@ -145,7 +148,6 @@ class JISFDL(tfbp.DataLoader):
|
||||
# the classifier.
|
||||
texts = [d.text for d in dataset]
|
||||
encoded_texts = self.encode_texts(texts, tokenizer)
|
||||
|
||||
# Map intents, load from the model (evaluate), recompute from dataset otherwise (train)
|
||||
intents = [d.intent for d in dataset]
|
||||
if not model_params:
|
||||
@ -161,19 +163,35 @@ class JISFDL(tfbp.DataLoader):
|
||||
# To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.
|
||||
slot_names.insert(0, "<PAD>")
|
||||
else:
|
||||
intent_names = model_params.intent_names
|
||||
slot_names = model_params.slot_names
|
||||
if "intent_names" in model_params:
|
||||
intent_names = model_params["intent_names"]
|
||||
else:
|
||||
intent_names = None
|
||||
|
||||
intent_map = dict() # Dict : intent -> index
|
||||
for idx, ui in enumerate(intent_names):
|
||||
intent_map[ui] = idx
|
||||
if "slot_names" in model_params:
|
||||
slot_names = model_params["slot_names"]
|
||||
else:
|
||||
slot_names = None
|
||||
|
||||
if intent_names:
|
||||
intent_map = dict() # Dict : intent -> index
|
||||
for idx, ui in enumerate(intent_names):
|
||||
intent_map[ui] = idx
|
||||
else:
|
||||
intent_map = None
|
||||
|
||||
# Encode intents
|
||||
encoded_intents = self.encode_intents(intents, intent_map)
|
||||
if intent_map:
|
||||
encoded_intents = self.encode_intents(intents, intent_map)
|
||||
else:
|
||||
encoded_intents = None
|
||||
|
||||
slot_map: Dict[str, int] = dict() # slot -> index
|
||||
for idx, us in enumerate(slot_names):
|
||||
slot_map[us] = idx
|
||||
if slot_names:
|
||||
slot_map: Dict[str, int] = dict() # slot -> index
|
||||
for idx, us in enumerate(slot_names):
|
||||
slot_map[us] = idx
|
||||
else:
|
||||
slot_map = None
|
||||
|
||||
# Encode slots
|
||||
# Text : Add a tune to my elrow Guest List
|
||||
@ -183,8 +201,12 @@ class JISFDL(tfbp.DataLoader):
|
||||
max_len = len(encoded_texts["input_ids"][0]) # type: ignore
|
||||
all_slots = [td.slots for td in dataset]
|
||||
all_texts = [td.text for td in dataset]
|
||||
encoded_slots = self.encode_slots(tokenizer,
|
||||
|
||||
if slot_map:
|
||||
encoded_slots = self.encode_slots(tokenizer,
|
||||
all_slots, all_texts, slot_map, max_len)
|
||||
else:
|
||||
encoded_slots = None
|
||||
|
||||
return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names
|
||||
|
||||
|
@ -29,7 +29,7 @@ class TFLCDL(tfbp.DataLoader):
|
||||
|
||||
self.json_helper = JsonHelper("tflc")
|
||||
self._save_dir = save_dir
|
||||
print(hparams)
|
||||
|
||||
# We will opt for a TF-IDF representation of the data as the frequency of word
|
||||
# roots should give us a good idea about which language we're dealing with.
|
||||
if method == "fit":
|
||||
|
43
nlu/main.py
43
nlu/main.py
@ -15,8 +15,8 @@ AUTH_TOKEN = os.getenv("AUTH_TOKEN", "TOKEN_MUST_BE_DEFINED")
|
||||
|
||||
AVAILABLE_LANGUAGES = os.getenv("AVAILABLE_LANGUAGES", "en,fr").split(',')
|
||||
TFLC_REPO_ID = os.getenv("TFLC_REPO_ID")
|
||||
JISF_REPO_ID = os.getenv("JISF_REPO_ID")
|
||||
|
||||
INTENT_CLASSIFIER_REPO_ID = os.getenv("INTENT_CLASSIFIER_REPO_ID")
|
||||
SLOT_FILLER_REPO_ID = os.getenv("SLOT_FILLER_REPO_ID")
|
||||
|
||||
def load_language_classifier():
|
||||
# Init language classifier model
|
||||
@ -27,21 +27,31 @@ def load_language_classifier():
|
||||
logging.info(f'Successfully loaded the language classifier model')
|
||||
return model
|
||||
|
||||
|
||||
def load_intent_classifiers():
|
||||
Model = tfbp.get_model("jisf")
|
||||
models = {}
|
||||
Model = tfbp.get_model("intent_classifier")
|
||||
intent_classifiers = {}
|
||||
for language in AVAILABLE_LANGUAGES:
|
||||
kwargs = {}
|
||||
models[language] = Model(save_dir=language, method="predict", repo_id=JISF_REPO_ID, **kwargs)
|
||||
models[language].load_model()
|
||||
intent_classifiers[language] = Model(save_dir=language, method="predict", repo_id=INTENT_CLASSIFIER_REPO_ID, **kwargs)
|
||||
intent_classifiers[language].load_model()
|
||||
logging.info(f'Successfully loaded the intent classifier {language} model')
|
||||
return models
|
||||
return intent_classifiers
|
||||
|
||||
def load_slot_classifiers():
    """Load one slot filler model per language in AVAILABLE_LANGUAGES.

    Returns:
        dict mapping each language code to its loaded slot filler model.
    """
    # NOTE(review): the model added for slot filling lives in
    # models/slot_filler.py and the intent loader asks for "intent_classifier";
    # this assumes tfbp.get_model resolves models by module name -- confirm.
    Model = tfbp.get_model("slot_filler")
    slot_fillers = {}
    for language in AVAILABLE_LANGUAGES:
        slot_fillers[language] = Model(
            save_dir=language, method="predict", repo_id=SLOT_FILLER_REPO_ID)
        slot_fillers[language].load_model()
        logging.info(f'Successfully loaded the slot filler {language} model')
    return slot_fillers


def load_models():
    """Load every model used by /parse and attach them to the FastAPI app."""
    app.language_classifier = load_language_classifier()  # type: ignore
    app.intent_classifiers = load_intent_classifiers()  # type: ignore
    # Slot fillers must come from the slot filler loader: /parse reads
    # "entities" from their predictions, which intent classifiers do not return.
    app.slot_fillers = load_slot_classifiers()  # type: ignore
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
@ -74,13 +84,20 @@ async def check_health():
|
||||
|
||||
@app.post("/parse")
|
||||
def parse(input: ParseInput, is_authenticated: Annotated[str, Depends(authenticate)]):
|
||||
if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers'):
|
||||
if not hasattr(app, 'language_classifier') or not hasattr(app, 'intent_classifiers') or not hasattr(app, 'slot_fillers'):
|
||||
headers = {"Retry-After": "120"} # Suggest retrying after 2 minutes
|
||||
return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are loading, please retry later."}, headers=headers)
|
||||
return JSONResponse(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, content={"message": "Models are still loading, please retry later."}, headers=headers)
|
||||
|
||||
language = app.language_classifier.get_prediction(input.q) # type: ignore
|
||||
lang = language.get("value")
|
||||
prediction = app.intent_classifiers[lang].get_prediction(
|
||||
intent_prediction = app.intent_classifiers[lang].get_prediction(
|
||||
input.q) # type: ignore
|
||||
prediction.get("entities").append(language)
|
||||
return prediction
|
||||
slot_prediction = app.slot_fillers[lang].get_prediction(
|
||||
input.q) # type: ignore
|
||||
slot_prediction.get("entities").append(language)
|
||||
|
||||
return {
|
||||
"text": input.q,
|
||||
"intent": intent_prediction.get("intent"),
|
||||
"entities": slot_prediction.get("entities"),
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
import functools
|
||||
import json
|
||||
import math
|
||||
from typing import Tuple, Dict, List
|
||||
@ -22,8 +21,8 @@ from data_loaders.jisfdl import JISFDL
|
||||
import boilerplate as tfbp
|
||||
|
||||
##
|
||||
# JISF : Joint Intent Classification and Slot filling with BERT
|
||||
# This notebook is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
|
||||
# Intent Classification with BERT
|
||||
# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
|
||||
# https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
|
||||
#
|
||||
# Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
|
||||
@ -33,19 +32,16 @@ import boilerplate as tfbp
|
||||
BERT_MODEL_BY_LANGUAGE = {
|
||||
'en': "bert-base-cased",
|
||||
'fr': "dbmdz/bert-base-french-europeana-cased",
|
||||
'ar': 'asafaya/bert-base-arabic',
|
||||
'tn': 'dbmdz/bert-base-french-europeana-cased'
|
||||
}
|
||||
|
||||
|
||||
@tfbp.default_export
|
||||
class JISF(tfbp.Model):
|
||||
class IntentClassifier(tfbp.Model):
|
||||
default_hparams = {
|
||||
"language": "fr",
|
||||
"language": "",
|
||||
"num_epochs": 2,
|
||||
"dropout_prob": 0.1,
|
||||
"intent_num_labels": 7,
|
||||
"slot_num_labels": 40
|
||||
}
|
||||
data_loader: JISFDL
|
||||
|
||||
@ -57,8 +53,8 @@ class JISF(tfbp.Model):
|
||||
|
||||
# Load Tokenizer from transformers
|
||||
# We will use a pretrained bert model bert-base-cased for both Tokenizer and our classifier.
|
||||
bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language]
|
||||
# bert_model_name = typing.cast(str, self.hparams.bert_model_name)
|
||||
bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
bert_model_name, use_fast=False)
|
||||
self.bert = TFBertModel.from_pretrained(bert_model_name)
|
||||
@ -66,27 +62,18 @@ class JISF(tfbp.Model):
|
||||
self.dropout = Dropout(self.hparams.dropout_prob)
|
||||
self.intent_classifier = Dense(self.hparams.intent_num_labels,
|
||||
name="intent_classifier", activation="softmax")
|
||||
self.slot_classifier = Dense(self.hparams.slot_num_labels,
|
||||
name="slot_classifier", activation="softmax")
|
||||
|
||||
|
||||
def call(self, inputs, **kwargs):
|
||||
# two outputs from BERT
|
||||
trained_bert = self.bert(inputs, **kwargs)
|
||||
pooled_output = trained_bert.pooler_output
|
||||
sequence_output = trained_bert.last_hidden_state
|
||||
|
||||
# sequence_output will be used for slot_filling / classification
|
||||
sequence_output = self.dropout(sequence_output,
|
||||
training=kwargs.get("training", False))
|
||||
slot_probas = self.slot_classifier(sequence_output)
|
||||
|
||||
# pooled_output for intent classification
|
||||
pooled_output = self.dropout(pooled_output,
|
||||
training=kwargs.get("training", False))
|
||||
intent_probas = self.intent_classifier(pooled_output)
|
||||
|
||||
return slot_probas, intent_probas
|
||||
return intent_probas
|
||||
|
||||
def load_data(self, data_loader) -> Tuple[BatchEncoding, tf.Tensor, ndarray, int, int]:
|
||||
return data_loader(self.tokenizer)
|
||||
@ -137,18 +124,11 @@ class JISF(tfbp.Model):
|
||||
raise ValueError(
|
||||
f"Hyperparam intent_num_labels mismatch, should be : {len(intent_names)}"
|
||||
)
|
||||
if self.hparams.slot_num_labels != len(slot_names):
|
||||
raise ValueError(
|
||||
f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
|
||||
)
|
||||
|
||||
# Hyperparams, Optimizer and Loss function
|
||||
opt = Adam(learning_rate=3e-5, epsilon=1e-08)
|
||||
|
||||
# two outputs, one for slots, another for intents
|
||||
# we have to fine tune for both
|
||||
losses = [SparseCategoricalCrossentropy(),
|
||||
SparseCategoricalCrossentropy()]
|
||||
losses = SparseCategoricalCrossentropy()
|
||||
|
||||
metrics = [SparseCategoricalAccuracy("accuracy")]
|
||||
|
||||
@ -159,11 +139,10 @@ class JISF(tfbp.Model):
|
||||
"attention_mask": encoded_texts["attention_mask"]}
|
||||
|
||||
super().fit(
|
||||
x, (encoded_slots, encoded_intents), epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
|
||||
x, encoded_intents, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)
|
||||
|
||||
# Persist the model
|
||||
self.extra_params["intent_names"] = intent_names
|
||||
self.extra_params["slot_names"] = slot_names
|
||||
|
||||
self.save()
|
||||
|
||||
@ -175,7 +154,7 @@ class JISF(tfbp.Model):
|
||||
metrics = [SparseCategoricalAccuracy("accuracy")]
|
||||
self.compile(metrics=metrics)
|
||||
|
||||
_, intent_probas = self(encoded_texts) # type: ignore
|
||||
intent_probas = self(encoded_texts) # type: ignore
|
||||
|
||||
scores = self.get_metrics_by_intent(intent_probas, encoded_intents)
|
||||
|
||||
@ -205,84 +184,9 @@ class JISF(tfbp.Model):
|
||||
|
||||
return json.dumps(info, indent=2)
|
||||
|
||||
def get_slots_prediction(self, text: str, inputs, slot_probas):
|
||||
slot_probas_np = slot_probas.numpy()
|
||||
# Get the indices of the maximum values
|
||||
slot_ids = slot_probas_np.argmax(axis=-1)[0, :]
|
||||
|
||||
# get all slot names and add to out_dict as keys
|
||||
out_dict = {}
|
||||
predicted_slots = set([self.extra_params["slot_names"][s]
|
||||
for s in slot_ids if s != 0])
|
||||
for ps in predicted_slots:
|
||||
out_dict[ps] = []
|
||||
|
||||
# retrieving the tokenization that was used in the predictions
|
||||
tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
|
||||
|
||||
# We'd like to eliminate all special tokens from our output
|
||||
special_tokens = self.tokenizer.special_tokens_map.values()
|
||||
|
||||
for token, slot_id in zip(tokens, slot_ids):
|
||||
if token in special_tokens:
|
||||
continue
|
||||
# add all to out_dict
|
||||
slot_name = self.extra_params["slot_names"][slot_id]
|
||||
|
||||
if slot_name == "<PAD>":
|
||||
continue
|
||||
|
||||
# collect tokens
|
||||
collected_tokens = [token]
|
||||
idx = tokens.index(token)
|
||||
|
||||
# see if it starts with ##
|
||||
# then it belongs to the previous token
|
||||
if token.startswith("##"):
|
||||
# check if the token already exists or not
|
||||
if tokens[idx - 1] not in out_dict[slot_name]:
|
||||
collected_tokens.insert(0, tokens[idx - 1])
|
||||
|
||||
# add collected tokens to slots
|
||||
out_dict[slot_name].extend(collected_tokens)
|
||||
|
||||
slot_names_to_ids = {value: key for key, value in enumerate(
|
||||
self.extra_params["slot_names"])}
|
||||
|
||||
entities = []
|
||||
# process out_dict
|
||||
for slot_name in out_dict:
|
||||
slot_id = slot_names_to_ids[slot_name]
|
||||
slot_tokens = out_dict[slot_name]
|
||||
|
||||
slot_value = self.tokenizer.convert_tokens_to_string(
|
||||
slot_tokens).strip()
|
||||
|
||||
entity = {
|
||||
"entity": slot_name,
|
||||
"value": slot_value,
|
||||
"start": text.find(slot_value),
|
||||
"end": text.find(slot_value) + len(slot_value),
|
||||
"confidence": 0,
|
||||
}
|
||||
|
||||
# The confidence of a slot is the average confidence of tokens in that slot.
|
||||
indices = [tokens.index(token) for token in slot_tokens]
|
||||
if len(slot_tokens) > 0:
|
||||
total = functools.reduce(
|
||||
lambda proba1, proba2: proba1+proba2, slot_probas_np[0, indices, slot_id], 0)
|
||||
entity["confidence"] = total / len(slot_tokens)
|
||||
else:
|
||||
entity["confidence"] = 0
|
||||
|
||||
entities.append(entity)
|
||||
|
||||
return entities
|
||||
|
||||
|
||||
def get_prediction(self, text: str):
|
||||
inputs = self.data_loader.encode_text(text, self.tokenizer)
|
||||
slot_probas, intent_probas = self(inputs) # type: ignore
|
||||
intent_probas = self(inputs) # type: ignore
|
||||
|
||||
intent_probas_np = intent_probas.numpy()
|
||||
|
||||
@ -292,15 +196,8 @@ class JISF(tfbp.Model):
|
||||
# get the confidences for each intent
|
||||
intent_confidences = intent_probas_np[0]
|
||||
|
||||
|
||||
entities = []
|
||||
if slot_probas is not None:
|
||||
entities = self.get_slots_prediction(text, inputs, slot_probas)
|
||||
|
||||
return {
|
||||
"text": text,
|
||||
"intent": {"name": self.extra_params["intent_names"][intent_id],
|
||||
"confidence": float(intent_confidences[intent_id])},
|
||||
"entities": entities,
|
||||
}
|
||||
|
250
nlu/models/slot_filler.py
Normal file
250
nlu/models/slot_filler.py
Normal file
@ -0,0 +1,250 @@
|
||||
import functools
|
||||
import json
|
||||
from transformers import TFBertModel, AutoTokenizer
|
||||
from keras.layers import Dropout, Dense
|
||||
from sys import platform
|
||||
|
||||
if platform == "darwin":
|
||||
from keras.optimizers.legacy import Adam
|
||||
else:
|
||||
from keras.optimizers import Adam
|
||||
|
||||
from keras.losses import SparseCategoricalCrossentropy
|
||||
from keras.metrics import SparseCategoricalAccuracy
|
||||
import numpy as np
|
||||
|
||||
from data_loaders.jisfdl import JISFDL
|
||||
|
||||
from sklearn.metrics import classification_report
|
||||
|
||||
|
||||
import boilerplate as tfbp
|
||||
|
||||
##
|
||||
# Slot filling with BERT
|
||||
# This code is based on the paper BERT for Joint Intent Classification and Slot Filling by Chen et al. (2019),
|
||||
# https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.
|
||||
#
|
||||
# Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of
|
||||
# the paper with the original dataset.
|
||||
##
|
||||
|
||||
BERT_MODEL_BY_LANGUAGE = {
|
||||
'en': "bert-base-cased",
|
||||
'fr': "dbmdz/bert-base-french-europeana-cased",
|
||||
}
|
||||
|
||||
|
||||
@tfbp.default_export
class SlotFiller(tfbp.Model):
    """Slot filling model: a BERT encoder followed by a per-token softmax
    over slot labels.

    Slot names are persisted in ``extra_params["slot_names"]`` by ``fit`` and
    read back by ``evaluate`` and the prediction helpers. Index 0 of that list
    is treated as the ``<PAD>`` label.
    """
    default_hparams = {
        "language": "",
        "num_epochs": 2,
        "dropout_prob": 0.1,
        "slot_num_labels": 40
    }
    data_loader: JISFDL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Init data loader
        self.data_loader = JISFDL(**kwargs)

        # Load the tokenizer and the pretrained BERT encoder for the
        # configured language; an empty language falls back to English.
        bert_model_name = BERT_MODEL_BY_LANGUAGE[self.hparams.language or "en"]

        self.tokenizer = AutoTokenizer.from_pretrained(
            bert_model_name, use_fast=False)
        self.bert = TFBertModel.from_pretrained(bert_model_name)

        self.dropout = Dropout(self.hparams.dropout_prob)
        self.slot_classifier = Dense(self.hparams.slot_num_labels,
                                     name="slot_classifier", activation="softmax")

    def call(self, inputs, **kwargs):
        """Return the slot probabilities for every token of ``inputs``."""
        trained_bert = self.bert(inputs, **kwargs)
        sequence_output = trained_bert.last_hidden_state

        # sequence_output will be used for slot_filling
        sequence_output = self.dropout(sequence_output,
                                       training=kwargs.get("training", False))
        slot_probas = self.slot_classifier(sequence_output)

        return slot_probas

    @tfbp.runnable
    def fit(self):
        """Training"""
        encoded_texts, _, encoded_slots, _, slot_names = self.data_loader(
            self.tokenizer)

        if self.hparams.slot_num_labels != len(slot_names):
            raise ValueError(
                f"Hyperparam slot_num_labels mismatch, should be : {len(slot_names)}"
            )

        # Hyperparams, Optimizer and Loss function
        opt = Adam(learning_rate=3e-5, epsilon=1e-08)

        # Single output (slot probabilities per token), hence a single loss
        losses = SparseCategoricalCrossentropy()

        metrics = [SparseCategoricalAccuracy("accuracy")]

        # Compile model
        self.compile(optimizer=opt, loss=losses, metrics=metrics)

        x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],
             "attention_mask": encoded_texts["attention_mask"]}

        super().fit(
            x, encoded_slots, epochs=self.hparams.num_epochs, batch_size=32, shuffle=True)

        # Persist the model
        self.extra_params["slot_names"] = slot_names

        self.save()

    @tfbp.runnable
    def evaluate(self):
        """Evaluation

        Prints and returns a per-slot classification report computed on the
        tokens whose true label is not padding.
        """
        # Load the evaluation data, reusing the slot names stored with the model
        # NOTE(review): which split the data loader returns here is not
        # visible from this file -- confirm.
        encoded_texts, _, encoded_slots, _, slot_names = self.data_loader(
            self.tokenizer, self.extra_params)

        # Get predictions
        predictions = self(encoded_texts)
        predicted_slot_ids = np.argmax(predictions, axis=-1)  # Shape: (batch_size, sequence_length)

        true_labels = encoded_slots.flatten()
        pred_labels = predicted_slot_ids.flatten()

        # Filter out padding tokens (assuming padding label id is 0), then
        # shift labels down by one so they index slot_names_no_pad.
        mask = true_labels != 0
        filtered_true_labels = true_labels[mask] - 1
        filtered_pred_labels = pred_labels[mask] - 1

        # Get slot names excluding padding
        slot_names_no_pad = self.extra_params["slot_names"][1:]  # Exclude padding label

        # Pass the label ids explicitly: without them the report infers the
        # labels from the data and raises when their number differs from
        # target_names (e.g. a slot absent from the data, or a token
        # predicted as padding, which becomes -1 after the shift).
        report = classification_report(
            filtered_true_labels,
            filtered_pred_labels,
            labels=list(range(len(slot_names_no_pad))),
            target_names=slot_names_no_pad,
            zero_division=0
        )

        print(report)

        # Optionally, you can return the report as a string or dictionary
        return report

    @tfbp.runnable
    def predict(self):
        """Run a prediction on the data loader's prediction text and print it."""
        text = self.data_loader.get_prediction_data()

        info = self.get_prediction(text)

        print(self.summary())
        print("Text : " + text)
        print(json.dumps(info, indent=2))

        return json.dumps(info, indent=2)

    def get_slots_prediction(self, text: str, inputs, slot_probas):
        """Turn per-token slot probabilities into a list of entities.

        Args:
            text: the original input text, used to locate each slot value.
            inputs: the encoded text; ``inputs["input_ids"][0]`` is read.
            slot_probas: model output; only the first sequence is used.

        Returns:
            A list of dicts with keys "entity", "value", "start", "end" and
            "confidence", one per predicted slot name.
        """
        slot_names = self.extra_params["slot_names"]
        slot_probas_np = slot_probas.numpy()
        # Most likely slot id for each token of the first sequence
        slot_ids = slot_probas_np.argmax(axis=-1)[0, :]

        # Every predicted non-padding slot gets an entry, even when all of
        # its tokens are skipped below.
        # Maps slot name -> positions (indices into `tokens`) assigned to it.
        positions_by_slot = {}
        for s in slot_ids:
            if s != 0:
                positions_by_slot.setdefault(slot_names[s], [])

        # retrieving the tokenization that was used in the predictions
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        # We'd like to eliminate all special tokens from our output
        special_tokens = self.tokenizer.special_tokens_map.values()

        # Track token positions rather than token strings: looking a token up
        # by value finds its first occurrence only, which picks the wrong
        # neighbour and the wrong probabilities when a token is repeated.
        for position, (token, slot_id) in enumerate(zip(tokens, slot_ids)):
            if token in special_tokens:
                continue

            slot_name = slot_names[slot_id]
            if slot_name == "<PAD>":
                continue

            positions = positions_by_slot[slot_name]

            # A "##" piece continues the previous token: pull that token in
            # unless it is already part of this slot.
            if token.startswith("##") and position > 0 and (position - 1) not in positions:
                positions.append(position - 1)

            positions.append(position)

        slot_names_to_ids = {value: key for key, value in enumerate(slot_names)}

        entities = []
        for slot_name, positions in positions_by_slot.items():
            slot_id = slot_names_to_ids[slot_name]
            slot_tokens = [tokens[p] for p in positions]

            slot_value = self.tokenizer.convert_tokens_to_string(
                slot_tokens).strip()

            # str.find returns -1 when the rebuilt value is not a substring
            # of the text; "end" is still derived from it.
            start = text.find(slot_value)

            # The confidence of a slot is the average confidence of tokens in
            # that slot, converted to a plain float so it can be serialised
            # to JSON.
            if len(positions) > 0:
                confidence = float(
                    np.mean(slot_probas_np[0, positions, slot_id]))
            else:
                confidence = 0

            entities.append({
                "entity": slot_name,
                "value": slot_value,
                "start": start,
                "end": start + len(slot_value),
                "confidence": confidence,
            })

        return entities

    def get_prediction(self, text: str):
        """Encode ``text``, run the model and return its text and entities."""
        inputs = self.data_loader.encode_text(text, self.tokenizer)
        slot_probas = self(inputs)  # type: ignore

        entities = []
        if slot_probas is not None:
            entities = self.get_slots_prediction(text, inputs, slot_probas)

        return {
            "text": text,
            "entities": entities,
        }
|
@ -4,7 +4,7 @@ import json
|
||||
class JsonHelper:
|
||||
data_folder: str
|
||||
|
||||
def __init__(self, model:str="jisf"):
|
||||
def __init__(self, model:str = "intent_classifier"):
|
||||
self.data_folder=os.path.join("data",model)
|
||||
|
||||
def read_dataset_json_file(self, filename):
|
||||
|
Loading…
Reference in New Issue
Block a user