Source code for deeppavlov.dataset_iterators.ranking_iterator

import random
from typing import Dict, List, Tuple

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator


@register('ranking_iterator')
class RankingIterator(DataLearningIterator):
    """The class contains methods for iterating over a dataset for ranking
    in training, validation and test mode.

    Note:
        Each sample in ``data['train']`` is arranged as follows:
        ``{'context': 21507, 'response': 7009, 'pos_pool': [7009, 7010], 'neg_pool': None}``.
        The context is stored under the ``context`` key of the sample and is represented
        by a single integer. The correct response is stored under the ``response`` key;
        its value is also always a single integer. The list of possible correct responses
        (there may be several) is stored under the ``pos_pool`` key, and the value of
        ``response`` should be equal to one of the items in that list. The list of possible
        negative responses (there can be many of them, 100-10000) is stored under the
        ``neg_pool`` key. Its value is ``None`` when global sampling is used, or a list of
        fixed length when sampling from predefined negative responses is used. It is
        important that the values in ``pos_pool`` and ``neg_pool`` do not overlap.
        Single items in ``context``, ``response``, ``pos_pool`` and ``neg_pool`` are
        represented by single integers that map to lists of integers via some dictionary
        `integer-list of integers`. These lists of integers are converted to lists of
        tokens with some dictionary `integer-token`. Samples in ``data['valid']`` and
        ``data['test']`` are represented almost the same way as the train sample
        shown above.

    Args:
        data: A dictionary containing training, validation and test parts of the dataset
            obtainable via ``train``, ``valid`` and ``test`` keys.
        sample_candidates_pool: Whether to sample candidates from a predefined pool of
            candidates for each sample in training mode. If ``False``, negative sampling
            from the whole data will be performed.
        sample_candidates_pool_valid: Whether to validate a model on a predefined pool
            of candidates for each sample. If ``False``, sampling from the whole data
            will be performed for validation.
        sample_candidates_pool_test: Whether to test a model on a predefined pool of
            candidates for each sample. If ``False``, sampling from the whole data
            will be performed for testing.
        num_negative_samples: A size of a predefined pool of candidates or a size of
            a data subsample from the whole data in training mode.
        num_ranking_samples_valid: A size of a predefined pool of candidates or a size
            of a data subsample from the whole data in validation mode.
        num_ranking_samples_test: A size of a predefined pool of candidates or a size
            of a data subsample from the whole data in test mode.
        seed: Random seed.
        shuffle: Whether to shuffle data.
        len_vocab: A length of a vocabulary to perform sampling in training, validation
            and test mode.
        pos_pool_sample: Whether to sample a response from ``pos_pool`` each time a batch
            is generated. If ``False``, the response from ``response`` will be used.
        pos_pool_rank: Whether to count samples from the whole ``pos_pool`` as correct
            answers in validation / test mode.
        random_batches: Whether to choose batches randomly or iterate over data
            sequentially in training mode.
        batches_per_epoch: A number of batches to choose per epoch in training mode.
            Only required if ``random_batches`` is set to ``True``.
        triplet_mode: Whether to use a model with triplet loss. If ``False``, a model
            with cross-entropy loss will be used.
        hard_triplets_sampling: Whether to use the hard triplets sampling method
            in training mode.
        num_positive_samples: A number of positive responses to sample from ``pos_pool``
            for each ``context``. Only required if ``hard_triplets_sampling`` is set
            to ``True``.
    """

    def __init__(self,
                 data: Dict[str, List],
                 sample_candidates_pool: bool = False,
                 sample_candidates_pool_valid: bool = True,
                 sample_candidates_pool_test: bool = True,
                 num_negative_samples: int = 10,
                 num_ranking_samples_valid: int = 10,
                 num_ranking_samples_test: int = 10,
                 seed: int = None,
                 shuffle: bool = False,
                 len_vocab: int = 0,
                 pos_pool_sample: bool = False,
                 pos_pool_rank: bool = True,
                 random_batches: bool = False,
                 batches_per_epoch: int = None,
                 triplet_mode: bool = True,
                 hard_triplets_sampling: bool = False,
                 num_positive_samples: int = 5):
        self.sample_candidates_pool = sample_candidates_pool
        self.sample_candidates_pool_valid = sample_candidates_pool_valid
        self.sample_candidates_pool_test = sample_candidates_pool_test
        self.num_negative_samples = num_negative_samples
        self.num_ranking_samples_valid = num_ranking_samples_valid
        self.num_ranking_samples_test = num_ranking_samples_test
        self.len_vocab = len_vocab
        self.pos_pool_sample = pos_pool_sample
        self.pos_pool_rank = pos_pool_rank
        self.random_batches = random_batches
        self.batches_per_epoch = batches_per_epoch
        self.triplet_mode = triplet_mode
        self.hard_triplets_sampling = hard_triplets_sampling
        self.num_positive_samples = num_positive_samples

        np.random.seed(seed)
        self.train = data.get('train', [])
        self.valid = data.get('valid', [])
        self.test = data.get('test', [])
        self.data = {'train': self.train,
                     'valid': self.valid,
                     'test': self.test,
                     'all': self.train + self.test + self.valid}
        super().__init__(self.data, seed=seed, shuffle=shuffle)
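    # A minimal construction sketch (not from the original source): the integer
    # ids below are hypothetical stand-ins for vocabulary entries, and
    # ``len_vocab`` is assumed to cover the id range used for global negative
    # sampling.
    #
    #     data = {'train': [{'context': 0, 'response': 50, 'label': 1,
    #                        'pos_pool': [50, 51], 'neg_pool': None}],
    #             'valid': [], 'test': []}
    #     iterator = RankingIterator(data, len_vocab=100, seed=42)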
    def gen_batches(self, batch_size: int, data_type: str = "train", shuffle: bool = True) -> \
            Tuple[List[List[Tuple[int, int]]], List[int]]:
        """Generate batches of inputs and expected outputs to train neural networks.

        Args:
            batch_size: number of samples in a batch
            data_type: can be either 'train', 'test', or 'valid'
            shuffle: whether to shuffle the dataset before batching

        Returns:
            A tuple of a batch of inputs and a batch of expected outputs.
            Inputs and expected outputs have different structure and meaning
            depending on class attribute values and ``data_type``.
        """
        data = self.data[data_type]
        if self.random_batches and self.batches_per_epoch is not None and data_type == "train":
            num_steps = self.batches_per_epoch
            assert batch_size <= len(data)
        else:
            num_steps = len(data) // batch_size
        if data_type == "train":
            if shuffle:
                np.random.shuffle(data)
            for i in range(num_steps):
                if self.random_batches:
                    context_response_data = np.random.choice(data, size=batch_size, replace=False)
                else:
                    context_response_data = data[i * batch_size:(i + 1) * batch_size]
                context = [el["context"] for el in context_response_data]
                if self.pos_pool_sample:
                    response = [random.choice(el["pos_pool"]) for el in context_response_data]
                else:
                    response = [el["response"] for el in context_response_data]
                if self.triplet_mode:
                    negative_response = self._create_neg_resp_rand(context_response_data, batch_size)
                    if self.hard_triplets_sampling:
                        labels = [el["label"] for el in context_response_data]
                        positives = [random.choices(el["pos_pool"], k=self.num_positive_samples)
                                     for el in context_response_data]
                        x = [[(context[i], el) for el in positives[i]]
                             for i in range(len(context_response_data))]
                        y = labels
                    else:
                        x = [[(context[i], el) for el in [response[i]] + [negative_response[i]]]
                             for i in range(len(context_response_data))]
                        y = batch_size * [np.ones(2)]
                else:
                    y = [el["label"] for el in context_response_data]
                    x = [[(context[i], response[i])] for i in range(len(context_response_data))]
                yield (x, y)
        if data_type in ["valid", "test"]:
            for i in range(num_steps + 1):
                if i < num_steps:
                    context_response_data = data[i * batch_size:(i + 1) * batch_size]
                else:
                    # The last, possibly incomplete batch; skip it when the data
                    # divides evenly into batches of size ``batch_size``.
                    context_response_data = data[i * batch_size:len(data)]
                    if len(context_response_data) == 0:
                        break
                context = [el["context"] for el in context_response_data]
                if data_type == "valid":
                    ranking_length = self.num_ranking_samples_valid
                    sample_candidates_pool = self.sample_candidates_pool_valid
                elif data_type == "test":
                    ranking_length = self.num_ranking_samples_test
                    sample_candidates_pool = self.sample_candidates_pool_test
                if not sample_candidates_pool:
                    ranking_length = self.len_vocab
                response_data = self._create_rank_resp(context_response_data, ranking_length)
                if self.pos_pool_rank:
                    y = [len(el["pos_pool"]) * np.ones(ranking_length) for el in context_response_data]
                else:
                    y = [np.ones(ranking_length) for _ in context_response_data]
                x = [[(context[i], el) for el in response_data[i]]
                     for i in range(len(context_response_data))]
                yield (x, y)
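    # A hedged usage sketch for ``gen_batches`` (assuming ``iterator`` from the
    # sketch above): in the default triplet mode, each train sample yields the
    # pairs (context, response) and (context, negative_response), and ``y`` is
    # a list of ``np.ones(2)`` placeholders.
    #
    #     for x, y in iterator.gen_batches(batch_size=1, data_type='train'):
    #         # x[0] == [(context, response), (context, negative_response)]
    #         print(x, y)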
    def _create_neg_resp_rand(self, context_response_data, batch_size):
        """Randomly choose a negative response for each context in a batch.

        Sampling is performed from predefined pools of candidates or from the whole data.

        Args:
            context_response_data: A batch from the train part of the dataset.
            batch_size: A batch size.

        Returns:
            One negative response for each context in a batch.
        """
        if self.sample_candidates_pool:
            negative_response_data = [random.choice(el["neg_pool"]) for el in context_response_data]
        else:
            candidates = []
            for i in range(batch_size):
                candidate = np.random.randint(0, self.len_vocab, 1)[0]
                # Resample until the candidate is not one of the correct responses.
                while candidate in context_response_data[i]["pos_pool"]:
                    candidate = np.random.randint(0, self.len_vocab, 1)[0]
                candidates.append(candidate)
            negative_response_data = candidates
        return negative_response_data

    def _create_rank_resp(self, context_response_data, ranking_length):
        """Choose a set of responses for each context in a batch to evaluate ranking quality.

        Negative responses are taken from predefined pools of candidates or from the whole data.

        Args:
            context_response_data: A batch from the valid or test part of the dataset.
            ranking_length: A number of responses for each context to evaluate ranking quality.

        Returns:
            A list of responses for each context in a batch.
        """
        response_data = []
        for i in range(len(context_response_data)):
            # Copy the pool so that reordering below does not mutate the dataset in place.
            pos_pool = list(context_response_data[i]["pos_pool"])
            resp = context_response_data[i]["response"]
            if self.pos_pool_rank:
                # Move the correct response to the head of the candidate list.
                pos_pool.insert(0, pos_pool.pop(pos_pool.index(resp)))
            else:
                pos_pool = [resp]
            neg_pool = context_response_data[i]["neg_pool"]
            response = pos_pool + neg_pool
            response_data.append(response[:ranking_length])
        return response_data
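
# A minimal smoke-test sketch, not part of the original module: it builds the
# iterator on hypothetical toy data (all ids and sizes below are illustrative
# assumptions) and drains one epoch of train and valid batches.
if __name__ == "__main__":
    toy_data = {
        'train': [{'context': c, 'response': c + 50, 'label': 1,
                   'pos_pool': [c + 50], 'neg_pool': None}
                  for c in range(4)],
        'valid': [{'context': c, 'response': c + 50, 'label': 1,
                   'pos_pool': [c + 50], 'neg_pool': list(range(90, 100))}
                  for c in range(4)],
        'test': [],
    }
    iterator = RankingIterator(toy_data, len_vocab=100, seed=42,
                               num_ranking_samples_valid=5)
    # Each train x item holds the (context, response) and (context, negative) pairs.
    for x, y in iterator.gen_batches(batch_size=2, data_type='train'):
        print('train batch:', x, y)
    # Each valid x item holds ``num_ranking_samples_valid`` candidate pairs.
    for x, y in iterator.gen_batches(batch_size=2, data_type='valid'):
        print('valid candidates per sample:', [len(el) for el in x])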