mirror of
https://github.com/hexastack/hexabot
synced 2025-06-26 18:27:28 +00:00
Merge pull request #258 from MohamedAliBouhaouala/enhance/intent-classifier
Enhance/intent classifier
This commit is contained in:
commit
3f8d0a749c
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,3 +4,4 @@ certbot
|
|||||||
docker/**/data
|
docker/**/data
|
||||||
node_modules/
|
node_modules/
|
||||||
/docker/**/db_data
|
/docker/**/db_data
|
||||||
|
.idea/
|
@ -210,10 +210,6 @@ class JISFDL(tfbp.DataLoader):
|
|||||||
|
|
||||||
return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names
|
return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names
|
||||||
|
|
||||||
def get_prediction_data(self) -> str:
|
|
||||||
helper = JsonHelper()
|
|
||||||
dataset = helper.read_dataset_json_file('predict.json')
|
|
||||||
return dataset["text"]
|
|
||||||
|
|
||||||
def encode_text(self, text: str, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
|
def encode_text(self, text: str, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
|
||||||
return self.encode_texts([text], tokenizer)
|
return self.encode_texts([text], tokenizer)
|
||||||
|
@ -125,14 +125,6 @@ class TFLCDL(tfbp.DataLoader):
|
|||||||
else:
|
else:
|
||||||
raise ValueError("Unknown method!")
|
raise ValueError("Unknown method!")
|
||||||
|
|
||||||
def get_prediction_data(self):
|
|
||||||
# The predict file contains a single JSON object whose only key is text.
|
|
||||||
data = self.json_helper.read_dataset_json_file("predict.json")
|
|
||||||
text = self.strip_numbers(data["text"])
|
|
||||||
encoded_texts = np.array(self.tfidf.transform(
|
|
||||||
[text]).toarray()) # type: ignore
|
|
||||||
return np.array([text]), encoded_texts
|
|
||||||
|
|
||||||
def encode_text(self, text: str):
|
def encode_text(self, text: str):
|
||||||
sanitized_text = self.strip_numbers(text)
|
sanitized_text = self.strip_numbers(text)
|
||||||
return self.tfidf.transform([sanitized_text]).toarray() # type: ignore
|
return self.tfidf.transform([sanitized_text]).toarray() # type: ignore
|
||||||
|
@ -12,8 +12,8 @@ if platform == "darwin":
|
|||||||
else:
|
else:
|
||||||
from keras.optimizers import Adam
|
from keras.optimizers import Adam
|
||||||
|
|
||||||
from keras.losses import SparseCategoricalCrossentropy
|
|
||||||
from keras.metrics import SparseCategoricalAccuracy
|
from keras.metrics import SparseCategoricalAccuracy
|
||||||
|
from focal_loss import SparseCategoricalFocalLoss
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from data_loaders.jisfdl import JISFDL
|
from data_loaders.jisfdl import JISFDL
|
||||||
@ -42,6 +42,8 @@ class IntentClassifier(tfbp.Model):
|
|||||||
"num_epochs": 2,
|
"num_epochs": 2,
|
||||||
"dropout_prob": 0.1,
|
"dropout_prob": 0.1,
|
||||||
"intent_num_labels": 7,
|
"intent_num_labels": 7,
|
||||||
|
"gamma": 2,
|
||||||
|
"k": 3
|
||||||
}
|
}
|
||||||
data_loader: JISFDL
|
data_loader: JISFDL
|
||||||
|
|
||||||
@ -128,7 +130,7 @@ class IntentClassifier(tfbp.Model):
|
|||||||
# Hyperparams, Optimizer and Loss function
|
# Hyperparams, Optimizer and Loss function
|
||||||
opt = Adam(learning_rate=3e-5, epsilon=1e-08)
|
opt = Adam(learning_rate=3e-5, epsilon=1e-08)
|
||||||
|
|
||||||
losses = SparseCategoricalCrossentropy()
|
losses = SparseCategoricalFocalLoss(gamma=self.hparams.gamma)
|
||||||
|
|
||||||
metrics = [SparseCategoricalAccuracy("accuracy")]
|
metrics = [SparseCategoricalAccuracy("accuracy")]
|
||||||
|
|
||||||
@ -172,32 +174,48 @@ class IntentClassifier(tfbp.Model):
|
|||||||
|
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
@tfbp.runnable
|
|
||||||
def predict(self):
|
|
||||||
text = self.data_loader.get_prediction_data()
|
|
||||||
|
|
||||||
info = self.get_prediction(text)
|
|
||||||
|
|
||||||
print(self.summary())
|
|
||||||
print("Text : " + text)
|
|
||||||
print(json.dumps(info, indent=2))
|
|
||||||
|
|
||||||
return json.dumps(info, indent=2)
|
|
||||||
|
|
||||||
def get_prediction(self, text: str):
|
def get_prediction(self, text: str):
|
||||||
inputs = self.data_loader.encode_text(text, self.tokenizer)
|
inputs = self.data_loader.encode_text(text, self.tokenizer)
|
||||||
intent_probas = self(inputs) # type: ignore
|
intent_probas = self(inputs) # type: ignore
|
||||||
|
|
||||||
intent_probas_np = intent_probas.numpy()
|
intent_probas_np = intent_probas.numpy()
|
||||||
|
|
||||||
# Get the indices of the maximum values
|
# Get the indices of the maximum values
|
||||||
intent_id = intent_probas_np.argmax(axis=-1)[0]
|
intent_id = intent_probas_np.argmax(axis=-1)[0]
|
||||||
|
|
||||||
# get the confidences for each intent
|
# get the confidences for each intent
|
||||||
intent_confidences = intent_probas_np[0]
|
intent_confidences = intent_probas_np[0]
|
||||||
|
|
||||||
return {
|
margin = self.compute_normalized_confidence_margin(intent_probas_np)
|
||||||
|
output = {
|
||||||
"text": text,
|
"text": text,
|
||||||
"intent": {"name": self.extra_params["intent_names"][intent_id],
|
"intent": {"name": self.extra_params["intent_names"][intent_id],
|
||||||
"confidence": float(intent_confidences[intent_id])},
|
"confidence": float(intent_confidences[intent_id])},
|
||||||
|
"margin": margin,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
def compute_top_k_confidence(self, probs, k=3):
|
||||||
|
sorted_probas = np.sort(probs[0])[::-1] # Sort in descending order
|
||||||
|
top_k_sum = np.sum(sorted_probas[:k])
|
||||||
|
return top_k_sum
|
||||||
|
|
||||||
|
def compute_normalized_confidence_margin(self, probs):
|
||||||
|
highest_proba = np.max(probs[0])
|
||||||
|
sum_of_probas = self.compute_top_k_confidence(probs, self.hparams.k)
|
||||||
|
# Normalized margin
|
||||||
|
normalized_margin = highest_proba / sum_of_probas
|
||||||
|
return normalized_margin
|
||||||
|
|
||||||
|
@tfbp.runnable
|
||||||
|
def predict(self):
|
||||||
|
while True:
|
||||||
|
|
||||||
|
text = input("Provide text: ")
|
||||||
|
output = self.get_prediction(text)
|
||||||
|
print(output)
|
||||||
|
# Optionally, provide a way to exit the loop
|
||||||
|
if input("Try again? (y/n): ").lower() != 'y':
|
||||||
|
break
|
||||||
|
@ -151,16 +151,19 @@ class SlotFiller(tfbp.Model):
|
|||||||
|
|
||||||
@tfbp.runnable
|
@tfbp.runnable
|
||||||
def predict(self):
|
def predict(self):
|
||||||
text = self.data_loader.get_prediction_data()
|
while True:
|
||||||
|
text = input("Provide text: ")
|
||||||
|
info = self.get_prediction(text)
|
||||||
|
|
||||||
info = self.get_prediction(text)
|
print(self.summary())
|
||||||
|
print("Text : " + text)
|
||||||
|
print(info)
|
||||||
|
|
||||||
|
# Optionally, provide a way to exit the loop
|
||||||
|
if input("Try again? (y/n): ").lower() != 'y':
|
||||||
|
break
|
||||||
|
|
||||||
print(self.summary())
|
|
||||||
print("Text : " + text)
|
|
||||||
print(json.dumps(info, indent=2))
|
|
||||||
|
|
||||||
return json.dumps(info, indent=2)
|
|
||||||
|
|
||||||
def get_slots_prediction(self, text: str, inputs, slot_probas):
|
def get_slots_prediction(self, text: str, inputs, slot_probas):
|
||||||
slot_probas_np = slot_probas.numpy()
|
slot_probas_np = slot_probas.numpy()
|
||||||
# Get the indices of the maximum values
|
# Get the indices of the maximum values
|
||||||
|
@ -95,19 +95,27 @@ class TFLC(tfbp.Model):
|
|||||||
|
|
||||||
self.calculate_metrics(y_test, y_pred, languages)
|
self.calculate_metrics(y_test, y_pred, languages)
|
||||||
|
|
||||||
|
def preprocess_text(self, text):
|
||||||
|
# The predict file contains a single JSON object whose only key is text.
|
||||||
|
stripped_text = self.strip_numbers(text)
|
||||||
|
encoded_text = np.array(self.tfidf.transform(
|
||||||
|
[stripped_text]).toarray()) # type: ignore
|
||||||
|
return np.array([stripped_text]), encoded_text
|
||||||
|
|
||||||
@tfbp.runnable
|
@tfbp.runnable
|
||||||
def predict(self):
|
def predict(self):
|
||||||
languages = list(self.extra_params['languages'])
|
languages = list(self.extra_params['languages'])
|
||||||
texts, encoded_texts = self.data_loader.get_prediction_data()
|
input_provided = input("Provide text: ")
|
||||||
|
text, encoded_text = self.preprocess_text(input_provided)
|
||||||
# converting a one hot output to language index
|
# converting a one hot output to language index
|
||||||
probas = super().predict(encoded_texts)
|
probas = super().predict(encoded_text)
|
||||||
predictions = np.argmax(probas, axis=1)
|
predictions = np.argmax(probas, axis=1)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for idx, prediction in enumerate(predictions):
|
for idx, prediction in enumerate(predictions):
|
||||||
print('The sentence "{}" is in {}.'.format(
|
print('The sentence "{}" is in {}.'.format(
|
||||||
texts[idx], languages[prediction].upper()))
|
text[idx], languages[prediction].upper()))
|
||||||
results.append({'text': texts[idx], 'language': prediction})
|
results.append({'text': text[idx], 'language': prediction})
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def get_prediction(self, text: str):
|
def get_prediction(self, text: str):
|
||||||
|
@ -6,4 +6,5 @@ scikit_learn==1.2.2
|
|||||||
fastapi==0.100.0
|
fastapi==0.100.0
|
||||||
uvicorn[standard]==0.23.1
|
uvicorn[standard]==0.23.1
|
||||||
autopep8==2.0.2
|
autopep8==2.0.2
|
||||||
|
focal-loss==0.0.7
|
||||||
h5py --only-binary=h5py
|
h5py --only-binary=h5py
|
||||||
|
@ -6,7 +6,7 @@ class JsonHelper:
|
|||||||
|
|
||||||
def __init__(self, model:str = "intent_classifier"):
|
def __init__(self, model:str = "intent_classifier"):
|
||||||
self.data_folder=os.path.join("data",model)
|
self.data_folder=os.path.join("data",model)
|
||||||
|
|
||||||
def read_dataset_json_file(self, filename):
|
def read_dataset_json_file(self, filename):
|
||||||
file_path = os.path.join(self.data_folder, filename)
|
file_path = os.path.join(self.data_folder, filename)
|
||||||
if os.path.exists(file_path):
|
if os.path.exists(file_path):
|
||||||
|
Loading…
Reference in New Issue
Block a user