Merge pull request #258 from MohamedAliBouhaouala/enhance/intent-classifier

Enhance/intent classifier
commit 3f8d0a749c by Med Marrouchi, 2024-10-22 16:15:51 +01:00 (committed by GitHub)
8 changed files with 59 additions and 40 deletions

.gitignore

@@ -4,3 +4,4 @@ certbot
 docker/**/data
 node_modules/
 /docker/**/db_data
+.idea/

@@ -210,10 +210,6 @@ class JISFDL(tfbp.DataLoader):
         return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names
 
-    def get_prediction_data(self) -> str:
-        helper = JsonHelper()
-        dataset = helper.read_dataset_json_file('predict.json')
-        return dataset["text"]
 
     def encode_text(self, text: str, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
         return self.encode_texts([text], tokenizer)

@@ -125,14 +125,6 @@ class TFLCDL(tfbp.DataLoader):
         else:
             raise ValueError("Unknown method!")
 
-    def get_prediction_data(self):
-        # The predict file contains a single JSON object whose only key is text.
-        data = self.json_helper.read_dataset_json_file("predict.json")
-        text = self.strip_numbers(data["text"])
-        encoded_texts = np.array(self.tfidf.transform(
-            [text]).toarray())  # type: ignore
-        return np.array([text]), encoded_texts
-
     def encode_text(self, text: str):
         sanitized_text = self.strip_numbers(text)
         return self.tfidf.transform([sanitized_text]).toarray()  # type: ignore

@@ -12,8 +12,8 @@ if platform == "darwin":
 else:
     from keras.optimizers import Adam
-from keras.losses import SparseCategoricalCrossentropy
 from keras.metrics import SparseCategoricalAccuracy
+from focal_loss import SparseCategoricalFocalLoss
 
 import numpy as np
 from data_loaders.jisfdl import JISFDL
@@ -42,6 +42,8 @@ class IntentClassifier(tfbp.Model):
         "num_epochs": 2,
         "dropout_prob": 0.1,
         "intent_num_labels": 7,
+        "gamma": 2,
+        "k": 3
     }
 
     data_loader: JISFDL
@@ -128,7 +130,7 @@ class IntentClassifier(tfbp.Model):
         # Hyperparams, Optimizer and Loss function
         opt = Adam(learning_rate=3e-5, epsilon=1e-08)
-        losses = SparseCategoricalCrossentropy()
+        losses = SparseCategoricalFocalLoss(gamma=self.hparams.gamma)
 
         metrics = [SparseCategoricalAccuracy("accuracy")]
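
Context for the loss swap: focal loss scales cross-entropy by (1 - p)^gamma, so examples the model already classifies confidently contribute little, and training effort shifts toward hard or under-represented intents. Below is a minimal, self-contained sketch of the same compile step on a toy model; the layer sizes and random data are illustrative, and only the SparseCategoricalFocalLoss(gamma=...) call mirrors this diff:

    import numpy as np
    import tensorflow as tf
    from focal_loss import SparseCategoricalFocalLoss

    # Toy 7-way intent classifier standing in for the real model.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation="relu", input_shape=(32,)),
        tf.keras.layers.Dense(7, activation="softmax"),
    ])

    # gamma=2 matches the new hyperparameter default; larger gamma
    # down-weights easy examples more aggressively.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08),
        loss=SparseCategoricalFocalLoss(gamma=2),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")],
    )

    # Illustrative random data: 100 samples, integer labels in [0, 7).
    x = np.random.rand(100, 32).astype("float32")
    y = np.random.randint(0, 7, size=(100,))
    model.fit(x, y, epochs=1, batch_size=16)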
@@ -172,17 +174,6 @@ class IntentClassifier(tfbp.Model):
         return scores
 
-    @tfbp.runnable
-    def predict(self):
-        text = self.data_loader.get_prediction_data()
-        info = self.get_prediction(text)
-
-        print(self.summary())
-        print("Text : " + text)
-        print(json.dumps(info, indent=2))
-
-        return json.dumps(info, indent=2)
-
     def get_prediction(self, text: str):
         inputs = self.data_loader.encode_text(text, self.tokenizer)
@@ -196,8 +187,35 @@ class IntentClassifier(tfbp.Model):
         # get the confidences for each intent
         intent_confidences = intent_probas_np[0]
 
-        return {
+        margin = self.compute_normalized_confidence_margin(intent_probas_np)
+        output = {
             "text": text,
             "intent": {"name": self.extra_params["intent_names"][intent_id],
                        "confidence": float(intent_confidences[intent_id])},
+            "margin": margin,
         }
+        return output
+
+    def compute_top_k_confidence(self, probs, k=3):
+        sorted_probas = np.sort(probs[0])[::-1]  # Sort in descending order
+        top_k_sum = np.sum(sorted_probas[:k])
+        return top_k_sum
+
+    def compute_normalized_confidence_margin(self, probs):
+        highest_proba = np.max(probs[0])
+        sum_of_probas = self.compute_top_k_confidence(probs, self.hparams.k)
+
+        # Normalized margin
+        normalized_margin = highest_proba / sum_of_probas
+        return normalized_margin
+
+    @tfbp.runnable
+    def predict(self):
+        while True:
+            text = input("Provide text: ")
+            output = self.get_prediction(text)
+            print(output)
+
+            # Optionally, provide a way to exit the loop
+            if input("Try again? (y/n): ").lower() != 'y':
+                break
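
The normalized margin divides the top intent's probability by the sum of the top-k probabilities, so it ranges from 1/k (the top k intents are tied) up to 1.0 (one intent dominates), which makes it a handy gate for fallback behaviour when the classifier is unsure. A standalone sketch with made-up probability vectors (the numbers are illustrative only):

    import numpy as np

    def compute_top_k_confidence(probs, k=3):
        sorted_probas = np.sort(probs[0])[::-1]  # descending order
        return np.sum(sorted_probas[:k])

    def compute_normalized_confidence_margin(probs, k=3):
        return np.max(probs[0]) / compute_top_k_confidence(probs, k)

    # One intent dominates: margin is close to 1.
    confident = np.array([[0.90, 0.05, 0.03, 0.02]])
    print(compute_normalized_confidence_margin(confident))  # 0.90 / 0.98 ≈ 0.918

    # Top intents nearly tied: margin approaches the 1/k floor (≈ 0.333 for k=3).
    ambiguous = np.array([[0.35, 0.33, 0.30, 0.02]])
    print(compute_normalized_confidence_margin(ambiguous))  # 0.35 / 0.98 ≈ 0.357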

@@ -151,15 +151,18 @@ class SlotFiller(tfbp.Model):
 
     @tfbp.runnable
     def predict(self):
-        text = self.data_loader.get_prediction_data()
+        while True:
+            text = input("Provide text: ")
 
-        info = self.get_prediction(text)
+            info = self.get_prediction(text)
 
-        print(self.summary())
-        print("Text : " + text)
-        print(json.dumps(info, indent=2))
+            print(self.summary())
+            print("Text : " + text)
+            print(info)
+
+            # Optionally, provide a way to exit the loop
+            if input("Try again? (y/n): ").lower() != 'y':
+                break
 
         return json.dumps(info, indent=2)
 
     def get_slots_prediction(self, text: str, inputs, slot_probas):
         slot_probas_np = slot_probas.numpy()

@@ -95,19 +95,27 @@ class TFLC(tfbp.Model):
         self.calculate_metrics(y_test, y_pred, languages)
 
+    def preprocess_text(self, text):
+        # The predict file contains a single JSON object whose only key is text.
+        stripped_text = self.strip_numbers(text)
+        encoded_text = np.array(self.tfidf.transform(
+            [stripped_text]).toarray())  # type: ignore
+        return np.array([stripped_text]), encoded_text
+
     @tfbp.runnable
     def predict(self):
         languages = list(self.extra_params['languages'])
-        texts, encoded_texts = self.data_loader.get_prediction_data()
+        input_provided = input("Provide text: ")
+        text, encoded_text = self.preprocess_text(input_provided)
 
         # converting a one hot output to language index
-        probas = super().predict(encoded_texts)
+        probas = super().predict(encoded_text)
         predictions = np.argmax(probas, axis=1)
 
         results = []
         for idx, prediction in enumerate(predictions):
             print('The sentence "{}" is in {}.'.format(
-                texts[idx], languages[prediction].upper()))
-            results.append({'text': texts[idx], 'language': prediction})
+                text[idx], languages[prediction].upper()))
+            results.append({'text': text[idx], 'language': prediction})
         return results
 
     def get_prediction(self, text: str):
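
The new preprocess_text repeats the encoding the data loader used to perform on predict.json, but on arbitrary user input. A self-contained sketch of that TF-IDF step using scikit-learn; the tiny corpus and the digit-stripping helper are stand-ins for the model's fitted self.tfidf and its strip_numbers method:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Stand-in for the already-fitted self.tfidf vectorizer.
    tfidf = TfidfVectorizer()
    tfidf.fit(["bonjour le monde", "hello world", "hola mundo"])

    def preprocess_text(text):
        # Stand-in for strip_numbers: drop digits before encoding.
        stripped = "".join(ch for ch in text if not ch.isdigit())
        encoded = np.array(tfidf.transform([stripped]).toarray())
        return np.array([stripped]), encoded

    texts, encoded = preprocess_text("hello world 42")
    print(texts[0], encoded.shape)  # -> "hello world " and a (1, 7) matrix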

@@ -6,4 +6,5 @@ scikit_learn==1.2.2
 fastapi==0.100.0
 uvicorn[standard]==0.23.1
 autopep8==2.0.2
+focal-loss==0.0.7
 h5py --only-binary=h5py