Source code for deeppavlov.models.preprocessors.siamese_preprocessor

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from typing import List, Union, Iterable, Optional

import numpy as np

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad_truncate
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.estimator import Estimator

log = getLogger(__name__)

[docs]@register('siamese_preprocessor') class SiamesePreprocessor(Estimator): """ Preprocessing of data samples containing text strings to feed them into a siamese network. The first ``num_context_turns`` strings in each data sample correspond to the dialogue ``context`` and the remaining string(s) in the sample is (are) the ``response(s)``. Args: save_path: The parameter is only needed to initialize the base class :class:`~deeppavlov.core.models.serializable.Serializable`. load_path: The parameter is only needed to initialize the base class :class:`~deeppavlov.core.models.serializable.Serializable`. max_sequence_length: A maximum length of text sequences in tokens. Longer sequences will be truncated and shorter ones will be padded. dynamic_batch: Whether to use dynamic batching. If ``True``, the maximum length of a sequence for a batch will be equal to the maximum of all sequence lengths from this batch, but not higher than ``max_sequence_length``. padding: Padding. Possible values are ``pre`` and ``post``. If set to ``pre`` a sequence will be padded at the beginning. If set to ``post`` it will be padded at the end. truncating: Truncating. Possible values are ``pre`` and ``post``. If set to ``pre`` a sequence will be truncated at the beginning. If set to ``post`` it will be truncated at the end. use_matrix: Whether to use a trainable matrix with token (word) embeddings. num_context_turns: The number of ``context`` turns in data samples. num_ranking_samples: The number of candidates for ranking, including the positive one. add_raw_text: Whether to add raw text sentences to the output data list. Use in conjunction with models that use sentence encoders. tokenizer: An instance of one of the :class:`deeppavlov.models.tokenizers`. vocab: An instance of :class:`deeppavlov.core.data.simple_vocab.SimpleVocabulary`. embedder: An instance of one of the :class:`deeppavlov.models.embedders`. sent_vocab: An instance of :class:`deeppavlov.core.data.simple_vocab.SimpleVocabulary`. It is used to store all ``responses`` and to find the best ``response`` to the user ``context`` in the ``interact`` mode. 
""" def __init__(self, save_path: str = './tok.dict', load_path: str = './tok.dict', max_sequence_length: int = None, dynamic_batch: bool = False, padding: str = 'post', truncating: str = 'post', use_matrix: bool = True, num_context_turns: int = 1, num_ranking_samples: int = 1, add_raw_text: bool = False, tokenizer: Component = None, vocab: Optional[Estimator] = None, embedder: Optional[Component] = None, sent_vocab: Optional[Estimator] = None, **kwargs): self.max_sequence_length = max_sequence_length self.padding = padding self.truncating = truncating self.dynamic_batch = dynamic_batch self.use_matrix = use_matrix self.num_ranking_samples = num_ranking_samples self.num_context_turns = num_context_turns self.add_raw_text = add_raw_text self.tokenizer = tokenizer self.embedder = embedder self.vocab = vocab self.sent_vocab = sent_vocab self.save_path = expand_path(save_path).resolve() self.load_path = expand_path(load_path).resolve() super().__init__(load_path=self.load_path, save_path=self.save_path, **kwargs) def fit(self, x: List[List[str]]) -> None: if self.sent_vocab is not None:[el[self.num_context_turns:] for el in x]) x_tok = [self.tokenizer(el) for el in x][el for x in x_tok for el in x]) def __call__(self, x: Union[List[List[str]], List[str]]) -> Iterable[List[List[np.ndarray]]]: if len(x) == 0 or isinstance(x[0], str): if len(x) == 1: # interact mode: len(batch) == 1 x_preproc = [[sent.strip() for sent in x[0].split('&')]] # List[str] -> List[List[str]] elif len(x) == 0: x_preproc = [['']] else: x_preproc = [[el] for el in x] else: x_preproc = [el[:self.num_context_turns + self.num_ranking_samples] for el in x] for el in x_preproc: x_tok = self.tokenizer(el) x_ctok = [y if len(y) != 0 else [''] for y in x_tok] if self.use_matrix: x_proc = self.vocab(x_ctok) else: x_proc = self.embedder(x_ctok) if self.dynamic_batch: msl = min((max([len(y) for el in x_tok for y in el]), self.max_sequence_length)) else: msl = self.max_sequence_length x_proc = 
zero_pad_truncate(x_proc, msl, pad=self.padding, trunc=self.truncating) x_proc = list(x_proc) if self.add_raw_text: x_proc += el # add (self.num_context_turns+self.num_ranking_samples) raw sentences yield x_proc def load(self) -> None: pass def save(self) -> None: if self.sent_vocab is not None: