Source code for deeppavlov.dataset_iterators.ranking_iterator

import random
from typing import Dict, List, Tuple

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator


@register('ranking_iterator')
class RankingIterator(DataLearningIterator):
    """The class contains methods for iterating over a dataset for ranking
    in training, validation and test mode.

    Note:
        Each sample in ``data['train']`` is arranged as follows:
        ``{'context': 21507, 'response': 7009, 'pos_pool': [7009, 7010], 'neg_pool': None}``.
        The context is stored under the ``context`` key of the sample and is represented
        by a single integer. The correct response is stored under the ``response`` key;
        its value is also always a single integer. The list of possible correct responses
        (there may be several) is stored under the ``pos_pool`` key, and the value of
        ``response`` should be equal to one of the items in that list. The list of possible
        negative responses (there can be many of them, 100-10000) is stored under the
        ``neg_pool`` key. Its value is ``None`` when global sampling is used, or a list of
        fixed length when sampling from predefined negative responses is used. It is
        important that the values in ``pos_pool`` and ``neg_pool`` do not overlap.
        Single items in ``context``, ``response``, ``pos_pool`` and ``neg_pool`` are
        represented by single integers that map to lists of integers via some dictionary
        `integer-list of integers`. These lists of integers are converted to lists of
        tokens with some dictionary `integer-token`. Samples in ``data['valid']`` and
        ``data['test']`` are represented almost the same way as the train sample
        shown above.

    Args:
        data: A dictionary containing training, validation and test parts of the dataset
            obtainable via ``train``, ``valid`` and ``test`` keys.
        sample_candidates_pool: Whether to sample candidates from a predefined pool of
            candidates for each sample in training mode. If ``False``, negative sampling
            from the whole data will be performed.
        sample_candidates_pool_valid: Whether to validate a model on a predefined pool
            of candidates for each sample. If ``False``, sampling from the whole data
            will be performed for validation.
        sample_candidates_pool_test: Whether to test a model on a predefined pool of
            candidates for each sample. If ``False``, sampling from the whole data
            will be performed for testing.
        num_negative_samples: A size of a predefined pool of candidates or a size of
            a data subsample from the whole data in training mode.
        num_ranking_samples_valid: A size of a predefined pool of candidates or a size
            of a data subsample from the whole data in validation mode.
        num_ranking_samples_test: A size of a predefined pool of candidates or a size
            of a data subsample from the whole data in test mode.
        seed: Random seed.
        shuffle: Whether to shuffle data.
        len_vocab: A length of a vocabulary to perform sampling in training, validation
            and test mode.
        pos_pool_sample: Whether to sample a response from ``pos_pool`` each time a batch
            is generated. If ``False``, the response from ``response`` will be used.
        pos_pool_rank: Whether to count samples from the whole ``pos_pool`` as correct
            answers in validation / test mode.
        random_batches: Whether to choose batches randomly or iterate over data
            sequentially in training mode.
        batches_per_epoch: A number of batches to choose per epoch in training mode.
            Only required if ``random_batches`` is set to ``True``.
        triplet_mode: Whether to use a model with triplet loss. If ``False``, a model
            with cross-entropy loss will be used.
        hard_triplets_sampling: Whether to use the hard triplets sampling method
            in training mode.
        num_positive_samples: A number of positive responses to sample from ``pos_pool``
            for each ``context``. Only required if ``hard_triplets_sampling`` is set
            to ``True``.
    """

    def __init__(self,
                 data: Dict[str, List],
                 sample_candidates_pool: bool = False,
                 sample_candidates_pool_valid: bool = True,
                 sample_candidates_pool_test: bool = True,
                 num_negative_samples: int = 10,
                 num_ranking_samples_valid: int = 10,
                 num_ranking_samples_test: int = 10,
                 seed: int = None,
                 shuffle: bool = False,
                 len_vocab: int = 0,
                 pos_pool_sample: bool = False,
                 pos_pool_rank: bool = True,
                 random_batches: bool = False,
                 batches_per_epoch: int = None,
                 triplet_mode: bool = True,
                 hard_triplets_sampling: bool = False,
                 num_positive_samples: int = 5):
        self.sample_candidates_pool = sample_candidates_pool
        self.sample_candidates_pool_valid = sample_candidates_pool_valid
        self.sample_candidates_pool_test = sample_candidates_pool_test
        self.num_negative_samples = num_negative_samples
        self.num_ranking_samples_valid = num_ranking_samples_valid
        self.num_ranking_samples_test = num_ranking_samples_test
        self.len_vocab = len_vocab
        self.pos_pool_sample = pos_pool_sample
        self.pos_pool_rank = pos_pool_rank
        self.random_batches = random_batches
        self.batches_per_epoch = batches_per_epoch
        self.triplet_mode = triplet_mode
        self.hard_triplets_sampling = hard_triplets_sampling
        self.num_positive_samples = num_positive_samples

        np.random.seed(seed)
        self.train = data.get('train', [])
        self.valid = data.get('valid', [])
        self.test = data.get('test', [])
        self.data = {'train': self.train,
                     'valid': self.valid,
                     'test': self.test,
                     'all': self.train + self.test + self.valid}
        super().__init__(self.data, seed=seed, shuffle=shuffle)
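    # A minimal construction sketch (not from the original source): the integer
    # ids below are hypothetical stand-ins for vocabulary entries, and
    # ``len_vocab`` is assumed to cover the id range used for global negative
    # sampling.
    #
    #     data = {'train': [{'context': 0, 'response': 50, 'label': 1,
    #                        'pos_pool': [50, 51], 'neg_pool': None}],
    #             'valid': [], 'test': []}
    #     iterator = RankingIterator(data, len_vocab=100, seed=42)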
    def gen_batches(self, batch_size: int, data_type: str = "train", shuffle: bool = True) -> \
            Tuple[List[List[Tuple[int, int]]], List[int]]:
        """Generate batches of inputs and expected outputs to train neural networks.

        Args:
            batch_size: number of samples in a batch
            data_type: can be either 'train', 'test', or 'valid'
            shuffle: whether to shuffle the dataset before batching

        Returns:
            A tuple of a batch of inputs and a batch of expected outputs.
            Inputs and expected outputs have different structure and meaning
            depending on class attribute values and ``data_type``.
        """
        data = self.data[data_type]
        if self.random_batches and self.batches_per_epoch is not None and data_type == "train":
            num_steps = self.batches_per_epoch
            assert batch_size <= len(data)
        else:
            num_steps = len(data) // batch_size
        if data_type == "train":
            if shuffle:
                np.random.shuffle(data)
            for i in range(num_steps):
                if self.random_batches:
                    context_response_data = np.random.choice(data, size=batch_size, replace=False)
                else:
                    context_response_data = data[i * batch_size:(i + 1) * batch_size]
                context = [el["context"] for el in context_response_data]
                if self.pos_pool_sample:
                    response = [random.choice(el["pos_pool"]) for el in context_response_data]
                else:
                    response = [el["response"] for el in context_response_data]
                if self.triplet_mode:
                    negative_response = self._create_neg_resp_rand(context_response_data, batch_size)
                    if self.hard_triplets_sampling:
                        labels = [el["label"] for el in context_response_data]
                        positives = [random.choices(el["pos_pool"], k=self.num_positive_samples)
                                     for el in context_response_data]
                        x = [[(context[i], el) for el in positives[i]]
                             for i in range(len(context_response_data))]
                        y = labels
                    else:
                        x = [[(context[i], el) for el in [response[i]] + [negative_response[i]]]
                             for i in range(len(context_response_data))]
                        y = batch_size * [np.ones(2)]
                else:
                    y = [el["label"] for el in context_response_data]
                    x = [[(context[i], response[i])] for i in range(len(context_response_data))]
                yield (x, y)
        if data_type in ["valid", "test"]:
            for i in range(num_steps + 1):
                if i < num_steps:
                    context_response_data = data[i * batch_size:(i + 1) * batch_size]
                else:
                    # The last, possibly incomplete batch; skip it when the data
                    # divides evenly into batches of size ``batch_size``.
                    context_response_data = data[i * batch_size:len(data)]
                    if len(context_response_data) == 0:
                        break
                context = [el["context"] for el in context_response_data]
                if data_type == "valid":
                    ranking_length = self.num_ranking_samples_valid
                    sample_candidates_pool = self.sample_candidates_pool_valid
                elif data_type == "test":
                    ranking_length = self.num_ranking_samples_test
                    sample_candidates_pool = self.sample_candidates_pool_test
                if not sample_candidates_pool:
                    ranking_length = self.len_vocab
                response_data = self._create_rank_resp(context_response_data, ranking_length)
                if self.pos_pool_rank:
                    y = [len(el["pos_pool"]) * np.ones(ranking_length) for el in context_response_data]
                else:
                    y = [np.ones(ranking_length) for _ in context_response_data]
                x = [[(context[i], el) for el in response_data[i]]
                     for i in range(len(context_response_data))]
                yield (x, y)
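    # A hedged usage sketch for ``gen_batches`` (assuming ``iterator`` from the
    # sketch above): in the default triplet mode, each train sample yields the
    # pairs (context, response) and (context, negative_response), and ``y`` is
    # a list of ``np.ones(2)`` placeholders.
    #
    #     for x, y in iterator.gen_batches(batch_size=1, data_type='train'):
    #         # x[0] == [(context, response), (context, negative_response)]
    #         print(x, y)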
    def _create_neg_resp_rand(self, context_response_data, batch_size):
        """Randomly choose a negative response for each context in a batch.

        Sampling is performed from predefined pools of candidates or from the whole data.

        Args:
            context_response_data: A batch from the train part of the dataset.
            batch_size: A batch size.

        Returns:
            One negative response for each context in a batch.
        """
        if self.sample_candidates_pool:
            negative_response_data = [random.choice(el["neg_pool"]) for el in context_response_data]
        else:
            candidates = []
            for i in range(batch_size):
                candidate = np.random.randint(0, self.len_vocab, 1)[0]
                # Resample until the candidate is not one of the correct responses.
                while candidate in context_response_data[i]["pos_pool"]:
                    candidate = np.random.randint(0, self.len_vocab, 1)[0]
                candidates.append(candidate)
            negative_response_data = candidates
        return negative_response_data

    def _create_rank_resp(self, context_response_data, ranking_length):
        """Choose a set of responses for each context in a batch to evaluate ranking quality.

        Negative responses are taken from predefined pools of candidates or from the whole data.

        Args:
            context_response_data: A batch from the valid or test part of the dataset.
            ranking_length: A number of responses for each context to evaluate ranking quality.

        Returns:
            A list of responses for each context in a batch.
        """
        response_data = []
        for i in range(len(context_response_data)):
            # Copy the pool so that reordering below does not mutate the dataset in place.
            pos_pool = list(context_response_data[i]["pos_pool"])
            resp = context_response_data[i]["response"]
            if self.pos_pool_rank:
                # Move the correct response to the head of the candidate list.
                pos_pool.insert(0, pos_pool.pop(pos_pool.index(resp)))
            else:
                pos_pool = [resp]
            neg_pool = context_response_data[i]["neg_pool"]
            response = pos_pool + neg_pool
            response_data.append(response[:ranking_length])
        return response_data
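
# A minimal smoke-test sketch, not part of the original module: it builds the
# iterator on hypothetical toy data (all ids and sizes below are illustrative
# assumptions) and drains one epoch of train and valid batches.
if __name__ == "__main__":
    toy_data = {
        'train': [{'context': c, 'response': c + 50, 'label': 1,
                   'pos_pool': [c + 50], 'neg_pool': None}
                  for c in range(4)],
        'valid': [{'context': c, 'response': c + 50, 'label': 1,
                   'pos_pool': [c + 50], 'neg_pool': list(range(90, 100))}
                  for c in range(4)],
        'test': [],
    }
    iterator = RankingIterator(toy_data, len_vocab=100, seed=42,
                               num_ranking_samples_valid=5)
    # Each train x item holds the (context, response) and (context, negative) pairs.
    for x, y in iterator.gen_batches(batch_size=2, data_type='train'):
        print('train batch:', x, y)
    # Each valid x item holds ``num_ranking_samples_valid`` candidate pairs.
    for x, y in iterator.gen_batches(batch_size=2, data_type='valid'):
        print('valid candidates per sample:', [len(el) for el in x])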