hexabot/nlu/data_loaders/jisfdl.py

from typing import Dict, List, Union
import tensorflow as tf
import json
import numpy as np
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer

import boilerplate as tfbp
from utils.jisf_data_mapper import JisfDataMapper
from utils.json_helper import JsonHelper


class JointRawData(object):
    id: str
    intent: str
    positions: Dict[str, List[int]]
    slots: Dict[str, str]
    text: str

    def __init__(self, id, intent, positions, slots, text):
        self.id = id
        self.intent = intent
        self.positions = positions
        self.slots = slots
        self.text = text

    def __repr__(self):
        return str(json.dumps(self.__dict__, indent=2))  # type: ignore
##
# JISFDL : Joint Intent and Slot Filling Model Data Loader
##


class JISFDL(tfbp.DataLoader):

    def encode_texts(self, texts: List[str], tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
        # https://huggingface.co/transformers/preprocessing.html
        return tokenizer(texts, padding=True, truncation=True, return_tensors="tf")

    def encode_intents(self, intents, intent_map) -> tf.Tensor:
        """Map to train_data values"""
        encoded = []
        for i in intents:
            encoded.append(intent_map[i])
        # convert to tf tensor
        return tf.convert_to_tensor(encoded, dtype="int32")

    def get_slot_from_token(self, token: str, slot_dict: Dict[str, str]):
        """ this function maps a token to its slot label"""
        # each token either belongs to a slot or has a null slot
        for slot_label, value in slot_dict.items():
            if token in value:
                return slot_label
        return None

    def encode_slots(self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
                     all_slots: List[Dict[str, str]], all_texts: List[str],
                     slot_map: Dict[str, int],  max_len: int):

        encoded_slots = np.zeros(
            shape=(len(all_texts), max_len), dtype=np.int32)
        # each slot is assigned to the tokenized sentence instead of the raw text
        # so that mapping a token to its slots is easier since we can use our bert tokenizer.
        for idx, slot_names in enumerate(all_slots):
            for slot_name, slot_text in slot_names.items():
                slot_names[slot_name] = tokenizer.tokenize(slot_text)
            # we now assign the sentence's slot dictionary to its index in all_slots .
            all_slots[idx] = slot_names

        for idx, text in enumerate(all_texts):
            enc = []  # for this idx, to be added at the end to encoded_slots

            # for each text, we retrieve all the slots with the
            # words in that slot.
            slot_names = all_slots[idx]

            # we tokenize our input text to match the tokens in the slot dictionary
            tokens = tokenizer.tokenize(text)

            for token in tokens:
                # each token is matched to its individual label
                token_slot_name = self.get_slot_from_token(token, slot_names)
                # if the token has no label, we give the null label <PAD>
                # the label is then appended to the labels of the current text
                if token_slot_name:
                    enc.append(slot_map[token_slot_name])
                else:
                    enc.append(0)

            # now add to encoded_slots
            # the first and the last elements
            # in encoded text are special characters
            encoded_slots[idx, 1:len(enc)+1] = enc

        return encoded_slots

    def parse_dataset_intents(self, data):

        intents = []
        k = 0

        # Filter examples by language
        lang = self.hparams.language
        all_examples = data["common_examples"]
        examples = filter(lambda exp: any(
            e['entity'] == 'language' and e['value'] == lang for e in exp['entities']), all_examples)

        # Parse raw data
        for exp in examples:
            text = exp["text"]
            intent = exp["intent"]
            entities = exp["entities"]

            # Filter out language entities
            slot_entities = filter(
                lambda e: e["entity"] != "language", entities)
            slots = {e["entity"]: e["value"] for e in slot_entities}
            positions = [[e.get("start", -1), e.get("end", -1)]
                         for e in slot_entities]

            temp = JointRawData(k, intent, positions, slots, text)
            k += 1
            intents.append(temp)

        return intents

    def __call__(self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], model_params = None):
        # I have already transformed the train and test datasets to the new format using
        # the transform to new hidden method.

        helper = JsonHelper()

        if self.method in ["fit", "train"]:
            dataset = helper.read_dataset_json_file('train.json')
            train_data = self.parse_dataset_intents(dataset)
            return self._transform_dataset(train_data, tokenizer)
        elif self.method in ["evaluate"]:
            dataset = helper.read_dataset_json_file('test.json')
            test_data = self.parse_dataset_intents(dataset)
            return self._transform_dataset(test_data, tokenizer, model_params)
        else:
            raise ValueError("Unknown method!")

    def _transform_dataset(self, dataset: List[JointRawData], tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], model_params = None):
        # We have to encode the texts using the tokenizer to create tensors for training
        # the classifier.
        texts = [d.text for d in dataset]
        encoded_texts = self.encode_texts(texts, tokenizer)

        # Map intents, load from the model (evaluate), recompute from dataset otherwise (train)
        intents = [d.intent for d in dataset]
        if not model_params:
            intent_names = list(set(intents))
            # Map slots, load from the model (evaluate), recompute from dataset otherwise (train)
            slot_names = set()
            for td in dataset:
                slots = td.slots
                for slot in slots:
                    slot_names.add(slot)
            slot_names = list(slot_names)
            # To pad all the texts to the same length, the tokenizer will use special characters.
            # To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.
            slot_names.insert(0, "<PAD>")
        else:
            intent_names = model_params.intent_names
            slot_names = model_params.slot_names

        intent_map = dict()  # Dict : intent -> index
        for idx, ui in enumerate(intent_names):
            intent_map[ui] = idx

        # Encode intents
        encoded_intents = self.encode_intents(intents, intent_map)

        slot_map: Dict[str, int] = dict()  # slot -> index
        for idx, us in enumerate(slot_names):
            slot_map[us] = idx

        # Encode slots
        # Text : Add a tune to my elrow Guest List
        # {'music_item': 'tune', 'playlist_owner': 'my', 'playlist': 'elrow Guest List'}
        # [ 0  0  0 18  0 26 12 12 12 12  0  0  0  0  0  0  0  0  0  0  0  0  0  0
        #   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
        max_len = len(encoded_texts["input_ids"][0])  # type: ignore
        all_slots = [td.slots for td in dataset]
        all_texts = [td.text for td in dataset]
        encoded_slots = self.encode_slots(tokenizer,
                                          all_slots, all_texts, slot_map, max_len)

        return encoded_texts, encoded_intents, encoded_slots, intent_names, slot_names

    def get_prediction_data(self) -> str:
        helper = JsonHelper()
        dataset = helper.read_dataset_json_file('predict.json')
        return dataset["text"]

    def encode_text(self, text: str, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]):
        return self.encode_texts([text], tokenizer)