mirror of
https://github.com/hexastack/hexabot
synced 2024-12-01 00:16:01 +00:00
92 lines
3.1 KiB
Python
92 lines
3.1 KiB
Python
from .json_helper import JsonHelper
|
|
|
|
"""
|
|
Transform data set from Rasa structure to a compliant one
|
|
|
|
How to use:
|
|
from utils.jisf_data_mapper import JisfDataMapper
|
|
|
|
|
|
mapper = JisfDataMapper()
|
|
|
|
#mapper.transform_to_new("train.json")
|
|
mapper.transform_to_new("test.json")
|
|
"""
|
|
|
|
class JisfDataMapper(object):
|
|
|
|
def transform_to_new(self, filename: str, reverse: bool = False):
|
|
"""this method allows for changing a file's data format."""
|
|
helper=JsonHelper()
|
|
|
|
data = helper.read_dataset_json_file(filename)
|
|
copy_file = "copy of "+filename
|
|
|
|
# we create a copy of the old data format
|
|
helper.write_dataset_json_file(data, copy_file)
|
|
|
|
# alternatively, we could use this method in the opposite direction
|
|
if not reverse:
|
|
data = self.old_to_new(data)
|
|
else:
|
|
data = self.new_to_old(data)
|
|
|
|
helper.write_dataset_json_file(data, filename)
|
|
|
|
def old_to_new(self,data:dict):
|
|
converted_data=dict()
|
|
converted_data["common_examples"]=[]
|
|
all_intents=set()
|
|
all_slots=dict()
|
|
for k in data.keys():
|
|
common_example=dict()
|
|
|
|
#text and intent are the same in both formats
|
|
common_example["text"]=data[k]["text"]
|
|
common_example["intent"]=data[k]["intent"]
|
|
common_example["entities"]=[]
|
|
all_intents.add(common_example["intent"])
|
|
|
|
#for every entity, we get its corresponding value as well as the index of its
|
|
#start and finish
|
|
for slot in data[k]["slots"].keys():
|
|
all_slots[slot]=all_slots.get(slot,set())
|
|
entity=dict()
|
|
entity["entity"]=slot
|
|
entity["value"]=data[k]["slots"][slot]
|
|
all_slots[slot].add(entity["value"])
|
|
entity["start"],entity["end"]=tuple(data[k]["positions"][slot])
|
|
common_example["entities"].append(entity)
|
|
converted_data["common_examples"].append(common_example)
|
|
|
|
#lookup tables store all the intents as well as all the slot values seen in the dataset
|
|
converted_data["lookup_tables"]=[]
|
|
all_slots["intent"]=all_intents
|
|
for name,value in all_slots.items():
|
|
converted_data["lookup_tables"].append({"name":name,"elements":list(value)})
|
|
|
|
#regex features and entity synonyms will remain empty for now
|
|
converted_data["regex_features"]=[]
|
|
converted_data["entity_synonyms"]=[]
|
|
|
|
return converted_data
|
|
|
|
def new_to_old(self,data:dict):
|
|
|
|
old_data=dict()
|
|
dataset=data["common_examples"]
|
|
|
|
#for each piece of text, we make a JSON object.
|
|
for i in range(len(dataset)):
|
|
item=dict()
|
|
item["text"]=dataset[i]["text"]
|
|
item["intent"]=dataset[i]["intent"]
|
|
item["slots"]=dict()
|
|
item["positions"]=dict()
|
|
for entity in dataset[i]["entities"]:
|
|
item["slots"][entity["entity"]]=entity["value"]
|
|
item["positions"][entity["entity"]]=[entity["start"],entity["end"]]
|
|
old_data[i]=item
|
|
|
|
return old_data
|