# Source code for deeppavlov.dataset_iterators.elmo_file_paths_iterator

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path
from typing import Tuple, Iterator, Optional, Dict, List, Union

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.core.data.utils import chunk_generator
from deeppavlov.dataset_iterators.file_paths_iterator import FilePathsIterator
from deeppavlov.models.preprocessors.str_utf8_encoder import StrUTF8Encoder

log = getLogger(__name__)

@register('elmo_file_paths_iterator')
class ELMoFilePathsIterator(FilePathsIterator):
    """Dataset iterator for tokenized datasets like 1 Billion Word Benchmark.

    It gets lists of file paths from the data dictionary and returns batches
    of lines from each file.

    Args:
        data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values
            that are lists of file paths
        load_path: path to the vocabulary to be loaded from
        seed: random seed for data shuffling
        shuffle: whether to shuffle data during batching
        unroll_steps: number of unrolling steps
        n_gpus: number of gpu to use
        max_word_length: max length of word
        bos: tag of begin of sentence
        eos: tag of end of sentence

    """

    def __init__(self,
                 data: Dict[str, List[Union[str, Path]]],
                 load_path: Union[str, Path],
                 seed: Optional[int] = None,
                 shuffle: bool = True,
                 unroll_steps: Optional[int] = None,
                 n_gpus: Optional[int] = None,
                 max_word_length: Optional[int] = None,
                 bos: str = "<S>",
                 eos: str = "</S>",
                 *args, **kwargs) -> None:
        self.unroll_steps = unroll_steps
        self.n_gpus = n_gpus
        self.bos = bos
        self.eos = eos
        # Both helpers are pointed at ``load_path`` for loading and for saving.
        self.str_utf8_encoder = StrUTF8Encoder(
            max_word_length=max_word_length,
            pad_special_char_use=True,
            word_boundary_special_char_use=True,
            sentence_boundary_special_char_use=False,
            reversed_sentense_tokens=False,
            bos=self.bos,
            eos=self.eos,
            save_path=load_path,
            load_path=load_path,
        )
        self.simple_vocab = SimpleVocabulary(
            min_freq=2,
            special_tokens=[self.eos, self.bos, "<UNK>"],
            unk_token="<UNK>",
            freq_drop_load=True,
            save_path=load_path,
            load_path=load_path,
        )
        super().__init__(data, seed, shuffle, *args, **kwargs)

    def _line2ids(self, line):
        """Convert one whitespace-tokenized line into four id sequences.

        The line is split on whitespace and wrapped in ``bos``/``eos`` tags,
        then encoded with ``self.str_utf8_encoder`` (char ids) and
        ``self.simple_vocab`` (token ids).

        Args:
            line: a single text line

        Returns:
            tuple ``(char_ids, reversed_char_ids, token_ids, reversed_token_ids)``.
            The char sequences drop their last element and the token sequences
            drop their first one, so the char entry at position ``i`` is paired
            with the token that follows it in the respective reading direction.
        """
        tokens = [self.bos] + line.split() + [self.eos]

        char_ids = self.str_utf8_encoder(tokens)
        reversed_char_ids = list(reversed(char_ids))[:-1]
        char_ids = char_ids[:-1]

        token_ids = self.simple_vocab(tokens)
        reversed_token_ids = list(reversed(token_ids))[1:]
        token_ids = token_ids[1:]

        return char_ids, reversed_char_ids, token_ids, reversed_token_ids

    def _line_generator(self, shard_generator):
        """Yield the ``_line2ids`` result for every line of every shard.

        Args:
            shard_generator: iterable of shards; each shard is walked one line
                at a time through ``chunk_generator(shard, 1)``

        Yields:
            tuple ``(char_ids, reversed_char_ids, token_ids, reversed_token_ids)``
        """
        for shard in shard_generator:
            for chunk in chunk_generator(shard, 1):
                # Chunks are requested with size 1; the line is their only item.
                yield self._line2ids(chunk[0])

    @staticmethod
    def _batch_generator(line_generator, batch_size, unroll_steps):
        """Cut a stream of encoded lines into fixed-width batches.

        Every batch row owns a carry-over buffer for each of the four id
        sequences. A row pulls new lines until its buffer holds at least
        ``unroll_steps`` items, emits the first ``unroll_steps`` of them and
        keeps the remainder for the next step.

        Args:
            line_generator: iterable of 4-tuples as produced by
                ``_line_generator``
            batch_size: number of rows in every batch
            unroll_steps: number of items taken per row at each step

        Yields:
            tuple ``(char_ids, reversed_char_ids, token_ids, reversed_token_ids)``;
            each element is a tuple of ``batch_size`` lists. The lists are
            created anew at every step, so earlier batches stay intact while
            the generator advances.

        Raises:
            ValueError: if ``batch_size`` or ``unroll_steps`` is less than 1
        """
        if batch_size < 1 or unroll_steps < 1:
            # unroll_steps < 1 would otherwise never consume a line and yield
            # empty batches forever.
            raise ValueError(
                "batch_size and unroll_steps must be positive, "
                "got batch_size={} and unroll_steps={}".format(batch_size, unroll_steps))

        lines = iter(line_generator)
        streams = [[[] for _ in range(4)] for _ in range(batch_size)]

        while True:
            batch = []
            for stream_item in streams:
                while len(stream_item[0]) < unroll_steps:
                    try:
                        line = next(lines)
                    except StopIteration:
                        # Input exhausted: the incomplete batch is dropped.
                        return
                    for buffered, new_ids in zip(stream_item, line):
                        buffered.extend(new_ids)

                batch.append([buffered[:unroll_steps] for buffered in stream_item])
                for buffered in stream_item:
                    del buffered[:unroll_steps]

            char_ids, reversed_char_ids, token_ids, reversed_token_ids = zip(*batch)
            yield char_ids, reversed_char_ids, token_ids, reversed_token_ids

    def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: Optional[bool] = None) \
            -> Iterator[List[Tuple[tuple, tuple]]]:
        """Generate batches of encoded lines for the requested data split.

        Args:
            batch_size: number of rows per GPU in a training batch. For any
                ``data_type`` other than ``'train'`` this value is ignored and
                256 is used, together with ``unroll_steps=1`` and ``n_gpus=1``.
            data_type: name of the split to read
            shuffle: whether to shuffle; ``None`` falls back to ``self.shuffle``

        Yields:
            two-element list
            ``[(char_ids, reversed_char_ids), (token_ids, reversed_token_ids)]``

        Raises:
            ValueError: if ``data_type`` is ``'train'`` while ``unroll_steps``
                or ``n_gpus`` was left unset
        """
        if shuffle is None:
            shuffle = self.shuffle

        if data_type == 'train':
            unroll_steps = self.unroll_steps
            n_gpus = self.n_gpus
            if unroll_steps is None or n_gpus is None:
                raise ValueError(
                    "'unroll_steps' and 'n_gpus' must be set to generate training batches")
        else:
            unroll_steps = 1
            batch_size = 256
            n_gpus = 1

        # NOTE(review): assumes the parent iterator keeps the split -> file
        # paths mapping in ``self.data`` -- confirm against FilePathsIterator.
        tgt_data = self.data[data_type]
        shard_generator = self._shard_generator(tgt_data, shuffle=shuffle)
        line_generator = self._line_generator(shard_generator)

        batch_generator = self._batch_generator(line_generator, batch_size * n_gpus, unroll_steps)

        for char_ids, reversed_char_ids, token_ids, reversed_token_ids in batch_generator:
            yield [(char_ids, reversed_char_ids), (token_ids, reversed_token_ids)]