# Source code for deeppavlov.models.spelling_correction.levenshtein.searcher_component

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import string
from math import log10
from typing import Iterable, List, Tuple

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.common.log import get_logger

from .levenshtein_searcher import LevenshteinSearcher


logger = get_logger(__name__)


@register('spelling_levenshtein')
class LevenshteinSearcherComponent(Component):
    """Propose replacement candidates for tokens within a bounded Damerau-Levenshtein distance.

    The vocabulary is normalised once at construction time (surrounding
    whitespace stripped, lower-cased, ``'ё'`` replaced with ``'е'``) and
    handed to a :class:`LevenshteinSearcher` together with the sorted set of
    characters that occur in it.

    Args:
        words: every correct word; duplicates are collapsed after normalisation
        max_distance: maximum allowed Damerau-Levenshtein distance between
            a source word and a candidate
        error_probability: probability assigned to a single edit; its
            ``log10`` is what gets stored and used for scoring

    Attributes:
        max_distance: maximum allowed Damerau-Levenshtein distance between
            a source word and a candidate
        error_probability: base-10 logarithm of the per-edit probability
        vocab_penalty: logarithmic score given to a token that the searcher
            did not return for itself and that is kept unchanged
            (twice ``error_probability``)
        searcher: the underlying :class:`LevenshteinSearcher` instance
    """

    # Single punctuation characters are passed through without any search.
    _punctuation = frozenset(string.punctuation)

    def __init__(self, words: Iterable[str], max_distance: int = 1,
                 error_probability: float = 1e-4, *args, **kwargs):
        # Normalise and de-duplicate the vocabulary.
        unique_words = set()
        for raw_word in words:
            unique_words.add(raw_word.strip().lower().replace('ё', 'е'))
        vocabulary = list(unique_words)

        # The searcher is given the sorted set of characters seen in the vocabulary.
        letters = set()
        for entry in vocabulary:
            letters.update(entry)
        alphabet = sorted(letters)

        self.max_distance = max_distance
        # Scores are kept in log10 space so that they scale linearly with distance.
        self.error_probability = log10(error_probability)
        self.vocab_penalty = 2 * self.error_probability
        self.searcher = LevenshteinSearcher(alphabet, vocabulary, allow_spaces=True, euristics=2)

    def _infer_instance(self, tokens: Iterable[str]) -> List[List[Tuple[float, str]]]:
        """Build the scored candidate list for every token of one sentence.

        Args:
            tokens: tokens of a single sentence

        Returns:
            one list per token holding ``(score, candidate)`` pairs, where the
            score is ``error_probability * distance`` for searcher results and
            ``vocab_penalty`` for the unchanged token when the searcher did
            not return it
        """
        # NOTE(review): tokens are searched exactly as given, while the vocabulary
        # was lower-cased and had 'ё' replaced — presumably the caller normalises
        # tokens beforehand; confirm against the pipeline configuration.
        proposals = []
        for token in tokens:
            if token in self._punctuation:
                # A lone punctuation mark is its own and only candidate.
                proposals.append([(0, token)])
                continue

            scores = {}
            for candidate, distance in self.searcher.search(token, d=self.max_distance):
                scores[candidate] = self.error_probability * distance
            # Always keep the original token as an option; penalise it only
            # when the searcher did not already score it.
            scores.setdefault(token, self.vocab_penalty)
            proposals.append([(score, candidate) for candidate, score in scores.items()])
        return proposals

    def __call__(self, batch: Iterable[Iterable[str]], *args, **kwargs) -> List[List[List[Tuple[float, str]]]]:
        """Propose candidates for tokens in sentences

        Args:
            batch: batch of tokenized sentences

        Returns:
            batch of lists of probabilities and candidates for every token
        """
        return list(map(self._infer_instance, batch))