Source code for deeppavlov.models.preprocessors.odqa_preprocessors

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from itertools import chain
from logging import getLogger
from typing import List, Callable, Union

from nltk import sent_tokenize

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

logger = getLogger(__name__)


@register('document_chunker')
class DocumentChunker(Component):
    """Make chunks from a document or a list of documents.
    Keeps sentences intact across chunk boundaries if needed.

    Args:
        sentencize_fn: a function for sentence segmentation
        keep_sentences: whether to keep sentences intact (i.e. not split them between chunks)
        tokens_limit: the maximum number of tokens in a single chunk (usually this number
            corresponds to the SQuAD model limit)
        flatten_result: whether to flatten the resulting list of lists of chunks
        paragraphs: whether to split the document by paragraphs; if set to True,
            tokens_limit is ignored

    Attributes:
        keep_sentences: whether to keep sentences intact (i.e. not split them between chunks)
        tokens_limit: the maximum number of tokens in a single chunk
        flatten_result: whether to flatten the resulting list of lists of chunks
        paragraphs: whether to split the document by paragraphs; if set to True,
            tokens_limit is ignored

    """

    def __init__(self, sentencize_fn: Callable = sent_tokenize, keep_sentences: bool = True,
                 tokens_limit: int = 400, flatten_result: bool = False,
                 paragraphs: bool = False, *args, **kwargs) -> None:
        self._sentencize_fn = sentencize_fn
        self.keep_sentences = keep_sentences
        self.tokens_limit = tokens_limit
        self.flatten_result = flatten_result
        self.paragraphs = paragraphs

    def __call__(self, batch_docs: List[Union[str, List[str]]]) -> \
            List[Union[List[str], List[List[str]]]]:
        """Make chunks from a batch of documents. There can be several documents in each batch.

        Args:
            batch_docs: a batch of documents / a batch of lists of documents

        Returns:
            chunks of docs, flattened or not

        """
        result = []

        for docs in batch_docs:
            batch_chunks = []
            if isinstance(docs, str):
                docs = [docs]
            for doc in docs:
                if self.paragraphs:
                    split_doc = doc.split('\n\n')
                    split_doc = [sd.strip() for sd in split_doc]
                    split_doc = list(filter(lambda x: len(x) > 40, split_doc))
                    batch_chunks.append(split_doc)
                else:
                    doc_chunks = []
                    if self.keep_sentences:
                        sentences = sent_tokenize(doc)
                        n_tokens = 0
                        keep = []
                        for s in sentences:
                            n_tokens += len(s.split())
                            if n_tokens > self.tokens_limit:
                                if keep:
                                    doc_chunks.append(' '.join(keep))
                                    n_tokens = 0
                                    keep.clear()
                            keep.append(s)
                        if keep:
                            doc_chunks.append(' '.join(keep))
                        batch_chunks.append(doc_chunks)
                    else:
                        split_doc = doc.split()
                        doc_chunks = [split_doc[i:i + self.tokens_limit] for i in
                                      range(0, len(split_doc), self.tokens_limit)]
                        batch_chunks.append(doc_chunks)
            result.append(batch_chunks)

        if self.flatten_result:
            if isinstance(result[0][0], list):
                for i in range(len(result)):
                    flattened = list(chain.from_iterable(result[i]))
                    result[i] = flattened

        return result
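
# Usage sketch (illustrative, not part of the original module). It assumes NLTK's
# 'punkt' tokenizer data is available for ``sent_tokenize``. With default settings,
# each document in the batch is split into sentence-preserving chunks of at most
# ``tokens_limit`` whitespace tokens, and one list of chunks is returned per document:
#
# >>> chunker = DocumentChunker(tokens_limit=400, keep_sentences=True)
# >>> chunker([["First doc. It has two sentences.", "Second doc."]])
# [[['First doc. It has two sentences.'], ['Second doc.']]]
#
# With ``flatten_result=True`` the per-document lists above would be merged into a
# single flat list of chunk strings for each batch element.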


@register('string_multiplier')
class StringMultiplier(Component):
    """Make a list of strings from a provided string. The length of the resulting list
    equals the length of the provided reference argument.

    """

    def __init__(self, **kwargs):
        pass

    def __call__(self, batch_s: List[str], ref: List[str]) -> List[List[str]]:
        """Multiply each string in a provided batch of strings.

        Args:
            batch_s: a batch of strings to be multiplied
            ref: a reference used to obtain the length of each resulting list

        Returns:
            a batch of lists, each containing the corresponding string repeated len(ref item) times

        """
        res = []
        for s, r in zip(batch_s, ref):
            res.append([s] * len(r))

        return res
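
# Usage sketch (illustrative, not part of the original module). In an ODQA pipeline a
# question is typically repeated once per retrieved chunk, so downstream components
# receive aligned (question, chunk) pairs; here ``ref`` is a batch with one list of
# chunks per question:
#
# >>> mult = StringMultiplier()
# >>> mult(['who wrote it?'], ref=[['chunk one', 'chunk two', 'chunk three']])
# [['who wrote it?', 'who wrote it?', 'who wrote it?']]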