# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Standard library
import string
from math import log10
from typing import Iterable, List, Tuple

# Project-local imports: component registry, base Component interface, logging helper,
# and the trie-based Damerau-Levenshtein candidate searcher used by this component.
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.common.log import get_logger
from .levenshtein_searcher import LevenshteinSearcher

# Module-level logger, named after this module per project convention.
logger = get_logger(__name__)
@register('spelling_levenshtein')
class LevenshteinSearcherComponent(Component):
    """Component that finds replacement candidates for tokens at a set Damerau-Levenshtein distance

    Args:
        words: list of every correct word
        max_distance: maximum allowed Damerau-Levenshtein distance between source words and candidates
        error_probability: assigned probability for every edit

    Attributes:
        max_distance: maximum allowed Damerau-Levenshtein distance between source words and candidates
        error_probability: assigned logarithmic probability for every edit
        vocab_penalty: assigned logarithmic probability of an out of vocabulary token being the correct one without
            changes
    """

    # Single-character punctuation tokens are passed through without correction.
    _punctuation = frozenset(string.punctuation)

    def __init__(self, words: Iterable[str], max_distance: int = 1, error_probability: float = 1e-4,
                 *args, **kwargs):
        # Normalize and deduplicate the vocabulary: strip whitespace, lowercase,
        # and collapse 'ё' to 'е' (standard Russian-text normalization).
        words = list({word.strip().lower().replace('ё', 'е') for word in words})
        # The search alphabet is derived from the normalized vocabulary itself.
        alphabet = sorted({letter for word in words for letter in word})
        self.max_distance = max_distance
        self.error_probability = log10(error_probability)
        # Keeping an out-of-vocabulary token unchanged is penalized as two edits.
        self.vocab_penalty = self.error_probability * 2
        # NOTE: 'euristics' (sic) is the searcher's actual parameter name.
        self.searcher = LevenshteinSearcher(alphabet, words, allow_spaces=True, euristics=2)

    def _infer_instance(self, tokens: Iterable[str]) -> List[List[Tuple[float, str]]]:
        """Propose (log-probability, candidate) pairs for each token of one sentence."""
        candidates = []
        for word in tokens:
            if word in self._punctuation:
                # Punctuation is kept as-is with log-probability 0 (i.e. probability 1).
                candidates.append([(0, word)])
            else:
                c = {candidate: self.error_probability * distance
                     for candidate, distance in self.searcher.search(word, d=self.max_distance)}
                # The source token is always among the candidates: if it was found in the
                # vocabulary it keeps its search score, otherwise it gets the OOV penalty.
                c[word] = c.get(word, self.vocab_penalty)
                candidates.append([(score, candidate) for candidate, score in c.items()])
        return candidates

    def __call__(self, batch: Iterable[Iterable[str]], *args, **kwargs) -> List[List[List[Tuple[float, str]]]]:
        """Propose candidates for tokens in sentences

        Args:
            batch: batch of tokenized sentences

        Returns:
            batch of lists of probabilities and candidates for every token
        """
        return [self._infer_instance(tokens) for tokens in batch]