# Source code for deeppavlov.dataset_iterators.siamese_iterator

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
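# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.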
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import Dict, List, Tuple

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator

log = getLogger(__name__)

@register('siamese_iterator')
class SiameseIterator(DataLearningIterator):
    """Iterate over a ranking dataset in training, validation and test mode.

    Args:
        data: A dictionary containing training, validation and test parts of
            the dataset obtainable via ``train``, ``valid`` and ``test`` keys.
        seed: Random seed.
        shuffle: Whether to shuffle data.
        num_samples: A number of data samples to use in ``train``,
            ``validation`` and ``test`` mode.
        random_batches: Whether to choose batches randomly or iterate over
            data sequentially in training mode.
        batches_per_epoch: A number of batches to choose per each epoch in
            training mode. Only required if ``random_batches`` is set to
            ``True``.
        **kwargs: ``len_valid`` and ``len_test`` (both default to 1000) set
            how many training samples :meth:`split` holds out when the
            validation / test part is empty. All keyword arguments are also
            forwarded to the base class constructor.
    """

    def __init__(self,
                 data: Dict[str, List],
                 seed: int = None,
                 shuffle: bool = False,
                 num_samples: int = None,
                 random_batches: bool = False,
                 batches_per_epoch: int = None,
                 *args, **kwargs) -> None:
        # Set before the base constructor runs because ``split`` reads them.
        # NOTE(review): presumably the base ``__init__`` calls ``split`` and
        # populates ``self.train`` / ``self.valid`` / ``self.test`` /
        # ``self.data`` / ``self.random`` -- confirm against
        # DataLearningIterator, which is not visible in this file.
        self.len_valid = kwargs.get("len_valid", 1000)
        self.len_test = kwargs.get("len_test", 1000)
        super().__init__(data, seed=seed, shuffle=shuffle, *args, **kwargs)

        self.random_batches = random_batches
        self.batches_per_epoch = batches_per_epoch

        # Truncate every part to at most ``num_samples`` items
        # (``None`` keeps the whole part).
        self.data["train"] = self.train[:num_samples]
        self.data["valid"] = self.valid[:num_samples]
        self.data["test"] = self.test[:num_samples]
        # "all" is built from the untruncated attributes, as in the original.
        self.data["all"] = self.train + self.valid + self.test

    def split(self, *args, **kwargs) -> None:
        """Carve missing validation / test parts out of the training data.

        If the validation part is empty and ``len_valid`` is non-zero, the
        training data is shuffled in place and its last ``len_valid`` items
        become the validation part. The same is then done for the test part
        with ``len_test``.
        """
        if len(self.valid) == 0 and self.len_valid != 0:
            self.random.shuffle(self.train)
            self.valid = self.train[-self.len_valid:]
            self.train = self.train[:-self.len_valid]
        # The ``len_test != 0`` guard mirrors the validation branch: without
        # it ``train[-0:]`` would move the whole training set into ``test``
        # and ``train[:-0]`` would leave ``train`` empty.
        if len(self.test) == 0 and self.len_test != 0:
            self.random.shuffle(self.train)
            self.test = self.train[-self.len_test:]
            self.train = self.train[:-self.len_test]

    def gen_batches(self, batch_size: int, data_type: str = "train", shuffle: bool = True) -> \
            Tuple[List[List[Tuple[int, int]]], List[int]]:
        """Generate batches of inputs and expected outputs to train neural networks.

        This method is a generator; the return annotation describes one
        yielded item.

        Args:
            batch_size: number of samples in batch
            data_type: can be either 'train', 'test', or 'valid'
            shuffle: whether to shuffle dataset before batching (only applied
                when ``data_type`` is 'train'; the stored list is shuffled in
                place)

        Yields:
            A tuple of a batch of inputs and a batch of expected outputs,
            obtained by transposing the batch samples with ``zip``. Inputs
            and expected outputs have different structure and meaning
            depending on class attributes values and ``data_type``.

            In 'train' mode exactly ``num_steps`` batches are produced, so a
            trailing partial batch is dropped when iterating sequentially.
            In 'valid' and 'test' mode every sample is produced, including a
            final partial batch. Any other ``data_type`` yields nothing.
        """
        data = self.data[data_type]

        if self.random_batches and self.batches_per_epoch is not None and data_type == "train":
            num_steps = self.batches_per_epoch
            # ``random.sample`` cannot draw more items than the population has.
            if batch_size > len(data):
                batch_size = len(data)
                log.warning("The batch size exceeds the dataset size. Setting it equal to the dataset size.")
        else:
            num_steps = len(data) // batch_size

        if data_type == "train":
            if shuffle:
                self.random.shuffle(data)
            for i in range(num_steps):
                if self.random_batches:
                    context_response_data = self.random.sample(data, k=batch_size)
                else:
                    context_response_data = data[i * batch_size:(i + 1) * batch_size]
                yield tuple(zip(*context_response_data))
        elif data_type in ("valid", "test"):
            # One extra step picks up the remainder; an empty slice (dataset
            # size divisible by ``batch_size``) is skipped.
            for i in range(num_steps + 1):
                context_response_data = data[i * batch_size:(i + 1) * batch_size]
                if context_response_data:
                    yield tuple(zip(*context_response_data))