Source code for deeppavlov.models.preprocessors.odqa_preprocessors

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Callable, Union
from itertools import chain

from nltk import sent_tokenize

from deeppavlov.core.common.log import get_logger
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

logger = get_logger(__name__)


[docs]@register('document_chunker')
class DocumentChunker(Component):
    """Make chunks from a document or a list of documents. Don't tear up sentences if needed.

    Args:
        sentencize_fn: a function for sentence segmentation
        keep_sentences: whether to tear up sentences between chunks or not
        tokens_limit: a number of tokens in a single chunk (usually this number corresponds to the squad model limit)
        flatten_result: whether to flatten the resulting list of lists of chunks
        paragraphs: whether to split document by paragrahs; if set to True, tokens_limit is ignored

    Attributes:
        keep_sentences: whether to tear up sentences between chunks or not
        tokens_limit: a number of tokens in a single chunk
        flatten_result: whether to flatten the resulting list of lists of chunks
        paragraphs: whether to split document by paragrahs; if set to True, tokens_limit is ignored

    """

    def __init__(self, sentencize_fn: Callable = sent_tokenize, keep_sentences: bool = True,
                 tokens_limit: int = 400, flatten_result: bool = False,
                 paragraphs: bool = False, *args, **kwargs) -> None:
        self._sentencize_fn = sentencize_fn
        self.keep_sentences = keep_sentences
        self.tokens_limit = tokens_limit
        self.flatten_result = flatten_result
        self.paragraphs = paragraphs

[docs]    def __call__(self, batch_docs: List[Union[str, List[str]]]) -> \
            List[Union[List[str], List[List[str]]]]:
        """Make chunks from a batch of documents. There can be several documents in each batch.
        Args:
            batch_docs: a batch of documents / a batch of lists of documents
        Returns:
            chunks of docs, flattened or not
        """

        result = []

        for docs in batch_docs:
            batch_chunks = []
            if isinstance(docs, str):
                docs = [docs]
            for doc in docs:
                if self.paragraphs:
                    split_doc = doc.split('\n\n')
                    split_doc = [sd.strip() for sd in split_doc]
                    split_doc = list(filter(lambda x: len(x) > 40, split_doc))
                    batch_chunks.append(split_doc)
                else:
                    doc_chunks = []
                    if self.keep_sentences:
                        sentences = sent_tokenize(doc)
                        n_tokens = 0
                        keep = []
                        for s in sentences:
                            n_tokens += len(s.split())
                            if n_tokens > self.tokens_limit:
                                if keep:
                                    doc_chunks.append(' '.join(keep))
                                    n_tokens = 0
                                    keep.clear()
                            keep.append(s)
                        if keep:
                            doc_chunks.append(' '.join(keep))
                        batch_chunks.append(doc_chunks)
                    else:
                        split_doc = doc.split()
                        doc_chunks = [split_doc[i:i + self.tokens_limit] for i in
                                      range(0, len(split_doc), self.tokens_limit)]
                        batch_chunks.append(doc_chunks)
            result.append(batch_chunks)

        if self.flatten_result:
            if isinstance(result[0][0], list):
                for i in range(len(result)):
                    flattened = list(chain.from_iterable(result[i]))
                    result[i] = flattened

        return result


[docs]@register('string_multiplier')
class StringMultiplier(Component):
    """Make a list of strings from a provided string. A length of the resulting list equals a length
    of a provided reference argument.

    """

    def __init__(self, **kwargs):
        pass

[docs]    def __call__(self, batch_s: List[str], ref: List[str]) -> List[List[str]]:
        """ Multiply each string in a provided batch of strings.

        Args:
            batch_s: a batch of strings to be multiplied
            ref: a reference to obtain a length of the resulting list

        Returns:
            a multiplied s as list

        """
        res = []
        for s, r in zip(batch_s, ref):
            res.append([s] * len(r))

        return res