# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
from collections import defaultdict
from logging import getLogger
from pathlib import Path
from typing import Union

import requests
from lxml import html

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import load_pickle, save_pickle
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import is_done, mark_done

log = getLogger(__name__)


@register('static_dictionary')
class StaticDictionary:
    """Trie vocabulary used in spelling correction algorithms.

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are
            interpreted as relative to the pipeline's data directory
        dictionary_name: logical name of the dictionary
        raw_dictionary_path: path to the source file with the list of words

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """
    def __init__(self, data_dir: Union[Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):
        data_dir = expand_path(data_dir) / dictionary_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'
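        # The directory doubles as a cache: is_done/mark_done leave a marker file there,
        # so the dictionary is built once and the pickled artifacts are reused afterwards.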
        if not is_done(data_dir):
            log.debug('Trying to build a dictionary in {}'.format(data_dir))
            if data_dir.is_dir():
                shutil.rmtree(str(data_dir))
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}
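            # _normalize lowercases every word and wraps it in the sentinel characters
            # '⟬' and '⟭' that mark word boundaries; they are stripped from the alphabet below.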
            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)
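            # The "trie" is stored flat: every proper prefix maps to the set of its
            # one-character extensions, and every complete word maps to an empty set.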
            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i + 1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}
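            # For example, the single word 'ab' (normalized to '⟬ab⟭') yields
            # {'': ['⟬'], '⟬': ['⟬a'], '⟬a': ['⟬ab'], '⟬ab': ['⟬ab⟭'], '⟬ab⟭': []}.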
            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            log.debug('built')
        else:
            log.debug('Loading a dictionary from {}'.format(data_dir))

        # The pickles are loaded unconditionally: right after a fresh build
        # and on every subsequent run that hits the cache.
        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)

    @staticmethod
    def _get_source(data_dir, raw_dictionary_path, *args, **kwargs):
        raw_path = expand_path(raw_dictionary_path)
        with raw_path.open(newline='', encoding='utf8') as f:
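            # Only the first tab-separated column of each line is kept, so source files
            # with extra columns (e.g. frequency counts) also work.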
            data = [line.strip().split('\t')[0] for line in f]
        return data

    @staticmethod
    def _normalize(word):
        # Lowercase, fold 'ё' to 'е' (a common Russian spelling variation) and add sentinels.
        return '⟬{}⟭'.format(word.strip().lower().replace('ё', 'е'))


@register('russian_words_vocab')
class RussianWordsVocab(StaticDictionary):
    """Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data
    from https://github.com/danakt/russian-words/

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are
            interpreted as relative to the pipeline's data directory

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """
    def __init__(self, data_dir: Union[Path, str] = '', *args, **kwargs):
        kwargs['dictionary_name'] = 'russian_words_vocab'
        super().__init__(data_dir, *args, **kwargs)

    @staticmethod
    def _get_source(*args, **kwargs):
        log.debug('Downloading russian vocab from https://github.com/danakt/russian-words/')
        url = 'https://github.com/danakt/russian-words/raw/master/russian.txt'
        page = requests.get(url)
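        # The word list is distributed in Windows-1251 encoding, hence the explicit decode.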
        return [word.strip() for word in page.content.decode('cp1251').strip().split('\n')]


@register('wikitionary_100K_vocab')
class Wiki100KDictionary(StaticDictionary):
    """Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data
    from `Wiktionary <https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#Project_Gutenberg>`__

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are
            interpreted as relative to the pipeline's data directory

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """
    def __init__(self, data_dir: Union[Path, str] = '', *args, **kwargs):
        kwargs['dictionary_name'] = 'wikipedia_100K_vocab'
        super().__init__(data_dir, *args, **kwargs)

    @staticmethod
    def _get_source(*args, **kwargs):
        words = []
        log.debug('Downloading english vocab from Wiktionary')
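        # The 100,000-word frequency list is split across ten pages of 10,000 words each:
        # 1-10000, 10001-20000, ..., 90001-100000.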
        for i in range(1, 100000, 10000):
            k = 10000 + i - 1
            url = 'https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/PG/2005/08/{}-{}'.format(i, k)
            page = requests.get(url)
            tree = html.fromstring(page.content)
            words += tree.xpath('//div[@class="mw-parser-output"]/p/a/text()')
        return words
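

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module. It assumes a writable
    # './data' directory (a hypothetical path) and, on the first run, network access;
    # later runs load the cached pickles instead of rebuilding.
    vocab = Wiki100KDictionary(data_dir='data')
    # Words are stored in normalized form, wrapped in the '⟬'/'⟭' sentinels.
    print('⟬the⟭' in vocab.words_set)
    # The trie maps a prefix to the sorted list of its one-character extensions.
    print(vocab.words_trie['⟬t'][:5])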