fix: restore data loader

hexastack 2024-10-22 13:46:03 +01:00
parent 9b61e36c88
commit 13df530881
3 changed files with 28 additions and 27 deletions

@@ -99,28 +99,28 @@ class JISFDL(tfbp.DataLoader):
         k = 0
         # Filter examples by language
-        # lang = self.hparams.language
-        # all_examples = data["common_examples"]
-        #
-        # if not bool(lang):
-        #     examples = all_examples
-        # else:
-        #     examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
+        lang = self.hparams.language
+        all_examples = data["common_examples"]
+        if not bool(lang):
+            examples = all_examples
+        else:
+            examples = filter(lambda exp: any(e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)
         # Parse raw data
-        for exp in data:
+        for exp in examples:
             text = exp["text"]
             intent = exp["intent"]
-            # entities = exp["entities"]
+            entities = exp["entities"]
             # Filter out language entities
-            # slot_entities = filter(
-            #     lambda e: e["entity"] != "language", entities)
-            # slots = {e["entity"]: e["value"] for e in slot_entities}
-            # positions = [[e.get("start", -1), e.get("end", -1)]
-            #              for e in slot_entities]
+            slot_entities = filter(
+                lambda e: e["entity"] != "language", entities)
+            slots = {e["entity"]: e["value"] for e in slot_entities}
+            positions = [[e.get("start", -1), e.get("end", -1)]
+                         for e in slot_entities]
-            temp = JointRawData(k, intent, None, None, text)
+            temp = JointRawData(k, intent, positions, slots, text)
             k += 1
             intents.append(temp)
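
One caveat in the restored block: `filter()` returns a single-pass iterator, so once the `slots` comprehension consumes `slot_entities`, the `positions` comprehension sees an exhausted iterator and yields an empty list. A minimal sketch of the same parsing step with the filter materialized as a list (the example record here is made up):

# Sketch: materialize the filter with list(...) so slot_entities can be
# iterated twice (once for slots, once for positions).
exp = {
    "text": "book a table for 4",
    "intent": "reserve",
    "entities": [
        {"entity": "language", "value": "en"},
        {"entity": "guests", "value": "4", "start": 17, "end": 18},
    ],
}

entities = exp["entities"]
slot_entities = list(filter(lambda e: e["entity"] != "language", entities))
slots = {e["entity"]: e["value"] for e in slot_entities}
positions = [[e.get("start", -1), e.get("end", -1)] for e in slot_entities]

print(slots)      # {'guests': '4'}
print(positions)  # [[17, 18]]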
@@ -133,7 +133,7 @@ class JISFDL(tfbp.DataLoader):
         helper = JsonHelper()
         if self.method in ["fit", "train"]:
-            dataset = helper.read_dataset_json_file('english.json')
+            dataset = helper.read_dataset_json_file('train.json')
             train_data = self.parse_dataset_intents(dataset)
             return self._transform_dataset(train_data, tokenizer)
         elif self.method in ["evaluate"]:
@@ -154,14 +154,14 @@ class JISFDL(tfbp.DataLoader):
             intent_names = list(set(intents))
             # Map slots, load from the model (evaluate), recompute from dataset otherwise (train)
             slot_names = set()
-            # for td in dataset:
-            #     slots = td.slots
-            #     for slot in slots:
-            #         slot_names.add(slot)
-            # slot_names = list(slot_names)
-            # # To pad all the texts to the same length, the tokenizer will use special characters.
-            # # To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.
-            # slot_names.insert(0, "<PAD>")
+            for td in dataset:
+                slots = td.slots
+                for slot in slots:
+                    slot_names.add(slot)
+            slot_names = list(slot_names)
+            # To pad all the texts to the same length, the tokenizer will use special characters.
+            # To handle those we need to add <PAD> to slot_names. It can be some other symbol as well.
+            slot_names.insert(0, "<PAD>")
         else:
             if "intent_names" in model_params:
                 intent_names = model_params["intent_names"]
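
The restored comment carries the rationale: the tokenizer pads every text to a common length, and those padding positions need a slot label too, so `<PAD>` is reserved at index 0 of `slot_names`. A tiny sketch of the idea, with made-up labels and an assumed convention that padded positions map to the `<PAD>` id (the project's exact labeling scheme may differ):

# Sketch: index 0 is reserved for the padding label, so every padded
# position gets a valid class id when label sequences are aligned.
slot_names = ["<PAD>", "guests", "time"]
slot2id = {name: i for i, name in enumerate(slot_names)}

labels = ["guests", "time"]   # per-token slot labels for the real tokens
max_length = 5                # length every sequence is padded to

ids = [slot2id[l] for l in labels]
ids += [slot2id["<PAD>"]] * (max_length - len(ids))
print(ids)  # [1, 2, 0, 0, 0]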

@@ -43,6 +43,8 @@ class IntentClassifier(tfbp.Model):
         "num_epochs": 2,
         "dropout_prob": 0.1,
         "intent_num_labels": 7,
+        "gamma": 2,
+        "k": 3
     }

     data_loader: JISFDL
@@ -129,7 +131,7 @@ class IntentClassifier(tfbp.Model):
         # Hyperparams, Optimizer and Loss function
         opt = Adam(learning_rate=3e-5, epsilon=1e-08)
-        losses = SparseCategoricalFocalLoss(gamma=2.5)
+        losses = SparseCategoricalFocalLoss(gamma=self.hparams.gamma)
         metrics = [SparseCategoricalAccuracy("accuracy")]
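
Replacing the hard-coded `gamma=2.5` with `self.hparams.gamma` makes the focal-loss focusing strength tunable per run. As a reminder of what `gamma` does, here is a small NumPy illustration of the focal term `(1 - p)^gamma * -log(p)` (an illustration, not the focal_loss package itself):

import numpy as np

def focal_term(p_true, gamma):
    # Focal loss for one example: well-classified examples (high p) are
    # down-weighted by (1 - p)^gamma; gamma=0 is plain cross-entropy.
    return (1.0 - p_true) ** gamma * -np.log(p_true)

for gamma in (0, 2, 3):
    easy, hard = focal_term(0.9, gamma), focal_term(0.3, gamma)
    print(f"gamma={gamma}: easy={easy:.4f} hard={hard:.4f}")
# The easy example's loss shrinks ~100x from gamma=0 to gamma=2,
# while the hard example's loss shrinks only ~2x.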
@@ -203,7 +205,7 @@ class IntentClassifier(tfbp.Model):
     def compute_normalized_confidence_margin(self, probs):
         highest_proba = np.max(probs[0])
-        sum_of_probas = self.compute_top_k_confidence(probs)
+        sum_of_probas = self.compute_top_k_confidence(probs, self.hparams.k)
         # Normalized margin
         normalized_margin = highest_proba / sum_of_probas
         return normalized_margin
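
`compute_top_k_confidence` is not shown in this diff; assuming it sums the `k` largest class probabilities, the normalized margin is the top probability over that sum, approaching 1 when one intent dominates its nearest rivals. A hedged stand-alone sketch:

import numpy as np

def compute_top_k_confidence(probs, k):
    # Assumption: sum of the k largest probabilities in the first row.
    return np.sort(probs[0])[-k:].sum()

def compute_normalized_confidence_margin(probs, k=3):
    highest_proba = np.max(probs[0])
    sum_of_probas = compute_top_k_confidence(probs, k)
    return highest_proba / sum_of_probas

probs = np.array([[0.70, 0.15, 0.10, 0.05]])
print(compute_normalized_confidence_margin(probs))  # 0.70 / 0.95 ≈ 0.737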

@@ -6,7 +6,6 @@ class JsonHelper:
     def __init__(self, model:str = "intent_classifier"):
         self.data_folder=os.path.join("data",model)
-        # self.data_folder = os.path.join(os.path.dirname(__file__), '..', 'data', model)

     def read_dataset_json_file(self, filename):
         file_path = os.path.join(self.data_folder, filename)
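
With the `__file__`-relative variant commented out, `data_folder` resolves against the current working directory, so scripts must run from the repo root. A minimal sketch of the helper under that assumption (the `json.load` body is inferred, since the diff truncates after `file_path`):

import json
import os

class JsonHelper:
    def __init__(self, model: str = "intent_classifier"):
        # Relative path: resolved from the current working directory.
        self.data_folder = os.path.join("data", model)

    def read_dataset_json_file(self, filename):
        file_path = os.path.join(self.data_folder, filename)
        with open(file_path, "r") as f:  # assumed body; diff cuts off here
            return json.load(f)

# e.g. JsonHelper().read_dataset_json_file("train.json")
# reads data/intent_classifier/train.json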