Source code for deeppavlov.dataset_iterators.elmo_file_paths_iterator

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import Tuple, Iterator, Optional, Dict, List, Union

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.core.data.utils import chunk_generator
from deeppavlov.dataset_iterators.file_paths_iterator import FilePathsIterator
from deeppavlov.models.preprocessors.str_utf8_encoder import StrUTF8Encoder

log = getLogger(__name__)


@register('elmo_file_paths_iterator')
class ELMoFilePathsIterator(FilePathsIterator):
    """Dataset iterator for tokenized datasets like the 1 Billion Word Benchmark.

    It gets lists of file paths from the data dictionary and returns batches of lines from each file.

    Args:
        data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values that are
            lists of paths to dataset files
        load_path: path to load the vocabulary from
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching
        unroll_steps: number of unrolling steps
        n_gpus: number of GPUs to use
        max_word_length: maximum length of a word, in characters
        bos: tag denoting the beginning of a sentence
        eos: tag denoting the end of a sentence
    """

    def __init__(self,
                 data: Dict[str, List[Union[str, Path]]],
                 load_path: Union[str, Path],
                 seed: Optional[int] = None,
                 shuffle: bool = True,
                 unroll_steps: Optional[int] = None,
                 n_gpus: Optional[int] = None,
                 max_word_length: Optional[int] = None,
                 bos: str = "<S>",
                 eos: str = "</S>",
                 *args, **kwargs) -> None:
        self.unroll_steps = unroll_steps
        self.n_gpus = n_gpus
        self.bos = bos
        self.eos = eos
        # Character-level encoder: maps each token to a padded sequence of
        # UTF-8 byte ids with word-boundary special characters.
        self.str_utf8_encoder = StrUTF8Encoder(
            max_word_length=max_word_length,
            pad_special_char_use=True,
            word_boundary_special_char_use=True,
            sentence_boundary_special_char_use=False,
            reversed_sentense_tokens=False,
            bos=self.bos,
            eos=self.eos,
            save_path=load_path,
            load_path=load_path,
        )
        # Token-level vocabulary used to produce the target token ids.
        self.simple_vocab = SimpleVocabulary(
            min_freq=2,
            special_tokens=[self.eos, self.bos, "<UNK>"],
            unk_token="<UNK>",
            freq_drop_load=True,
            save_path=load_path,
            load_path=load_path,
        )
        super().__init__(data, seed, shuffle, *args, **kwargs)

    def _line2ids(self, line):
        line = [self.bos] + line.split() + [self.eos]

        # Character ids serve as inputs: the forward stream drops the final
        # position; the backward stream is the reversed sequence with its
        # final position dropped.
        char_ids = self.str_utf8_encoder(line)
        reversed_char_ids = list(reversed(char_ids))
        char_ids = char_ids[:-1]
        reversed_char_ids = reversed_char_ids[:-1]

        # Token ids serve as targets: shifted one position ahead of the
        # inputs, in each direction.
        token_ids = self.simple_vocab(line)
        reversed_token_ids = list(reversed(token_ids))
        token_ids = token_ids[1:]
        reversed_token_ids = reversed_token_ids[1:]

        return char_ids, reversed_char_ids, token_ids, reversed_token_ids

    def _line_generator(self, shard_generator):
        for shard in shard_generator:
            line_generator = chunk_generator(shard, 1)
            for line in line_generator:
                line = line[0]
                yield self._line2ids(line)

    @staticmethod
    def _batch_generator(line_generator, batch_size, unroll_steps):
        # One buffer per batch row and per field (forward/backward char ids,
        # forward/backward token ids); row lists are reused in place between yields.
        batch = [[[] for _ in range(4)] for _ in range(batch_size)]
        stream = [[[] for _ in range(4)] for _ in range(batch_size)]

        try:
            while True:
                for batch_item, stream_item in zip(batch, stream):
                    # Top up this row's stream with whole sentences until at
                    # least `unroll_steps` positions are buffered.
                    while len(stream_item[0]) < unroll_steps:
                        line = next(line_generator)
                        for sti, lni in zip(stream_item, line):
                            sti.extend(lni)
                    # Cut off exactly `unroll_steps` positions for this batch
                    # and keep the remainder for the next one, so sentences
                    # are packed back to back along each row.
                    for sti, bchi in zip(stream_item, batch_item):
                        _b = sti[:unroll_steps]
                        _s = sti[unroll_steps:]
                        bchi.clear()
                        bchi.extend(_b)
                        sti.clear()
                        sti.extend(_s)

                char_ids, reversed_char_ids, token_ids, reversed_token_ids = zip(*batch)
                yield char_ids, reversed_char_ids, token_ids, reversed_token_ids
        except StopIteration:
            # The line generator is exhausted; the incomplete batch is dropped.
            pass

    def gen_batches(self, batch_size: int, data_type: str = 'train',
                    shuffle: Optional[bool] = None) -> Iterator[List[Tuple[tuple, tuple]]]:
        if shuffle is None:
            shuffle = self.shuffle

        tgt_data = self.data[data_type]
        shard_generator = self._shard_generator(tgt_data, shuffle=shuffle)
        line_generator = self._line_generator(shard_generator)

        if data_type == 'train':
            unroll_steps = self.unroll_steps
            n_gpus = self.n_gpus
        else:
            # Fixed settings for evaluation: single-step unrolling, a batch
            # size of 256 and a single GPU.
            unroll_steps = 1
            batch_size = 256
            n_gpus = 1

        batch_generator = self._batch_generator(line_generator, batch_size * n_gpus, unroll_steps)

        for char_ids, reversed_char_ids, token_ids, reversed_token_ids in batch_generator:
            batch = [(char_ids, reversed_char_ids), (token_ids, reversed_token_ids)]
            yield batch
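To make the stream-packing logic of ``_batch_generator`` concrete, here is a self-contained toy sketch, not part of the module: it is simplified to a single field per line, and the helper name is made up for illustration. Each batch row keeps a persistent buffer that is topped up with whole sentences and cut into fixed windows of ``unroll_steps``, so sentences are packed back to back across successive batches.

# Toy illustration of the stream-packing idea (simplified to one field per line).
def toy_batch_generator(lines, batch_size, unroll_steps):
    streams = [[] for _ in range(batch_size)]  # persistent buffer per batch row
    lines = iter(lines)
    try:
        while True:
            batch = []
            for stream in streams:
                while len(stream) < unroll_steps:
                    stream.extend(next(lines))  # top up with the next whole sentence
                batch.append(stream[:unroll_steps])  # fixed window for this batch
                del stream[:unroll_steps]  # keep the remainder for the next batch
            yield batch
    except StopIteration:
        pass  # line source exhausted; drop the incomplete batch

# Two sentences packed back to back along one row of width 3:
print(list(toy_batch_generator([['a1', 'a2', 'a3', 'a4'], ['b1', 'b2', 'b3']], 1, 3)))
# -> [[['a1', 'a2', 'a3']], [['a4', 'b1', 'b2']]]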
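A minimal usage sketch, also not part of the module: the shard paths and vocabulary path below are hypothetical placeholders, and the vocabulary files are assumed to already exist at ``load_path``, since ``StrUTF8Encoder`` and ``SimpleVocabulary`` load from it.

data = {
    'train': ['data/train_shard_0.txt', 'data/train_shard_1.txt'],  # hypothetical shards
    'valid': ['data/valid_shard_0.txt'],
    'test': [],
}
iterator = ELMoFilePathsIterator(
    data=data,
    load_path='vocabs/elmo_vocab',  # hypothetical vocabulary location
    seed=42,
    unroll_steps=20,
    n_gpus=1,
    max_word_length=50,
)

# Each batch pairs the forward/backward character inputs with the
# forward/backward token targets.
for (char_ids, reversed_char_ids), (token_ids, reversed_token_ids) in \
        iterator.gen_batches(batch_size=128, data_type='train'):
    ...  # feed inputs and targets to the bidirectional language model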