# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
from collections import defaultdict
from logging import getLogger
from pathlib import Path
from typing import Union

import requests
from lxml import html

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.file import load_pickle, save_pickle
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import is_done, mark_done

log = getLogger(__name__)


@register('static_dictionary')
class StaticDictionary:
    """Trie vocabulary used in spelling correction algorithms.

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are
            interpreted as relative to the pipeline's data directory
        dictionary_name: logical name of the dictionary
        raw_dictionary_path: path to the source file with the list of words

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """
    def __init__(self, data_dir: Union[Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):
        data_dir = expand_path(data_dir) / dictionary_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'
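        # The directory doubles as a cache: is_done/mark_done leave a marker file there,
        # so the dictionary is built once and the pickled artifacts are reused afterwards.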
        if not is_done(data_dir):
            log.debug('Trying to build a dictionary in {}'.format(data_dir))
            if data_dir.is_dir():
                shutil.rmtree(str(data_dir))
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}
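            # _normalize lowercases every word and wraps it in the sentinel characters
            # '⟬' and '⟭' that mark word boundaries; they are stripped from the alphabet below.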
            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)
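            # The "trie" is stored flat: every proper prefix maps to the set of its
            # one-character extensions, and every complete word maps to an empty set.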
            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i + 1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}
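            # For example, the single word 'ab' (normalized to '⟬ab⟭') yields
            # {'': ['⟬'], '⟬': ['⟬a'], '⟬a': ['⟬ab'], '⟬ab': ['⟬ab⟭'], '⟬ab⟭': []}.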
            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            log.debug('built')
        else:
            log.debug('Loading a dictionary from {}'.format(data_dir))

        # The pickles are loaded unconditionally: right after a fresh build
        # and on every subsequent run that hits the cache.
        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)

    @staticmethod
    def _get_source(data_dir, raw_dictionary_path, *args, **kwargs):
        raw_path = expand_path(raw_dictionary_path)
        with raw_path.open(newline='', encoding='utf8') as f:
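            # Only the first tab-separated column of each line is kept, so source files
            # with extra columns (e.g. frequency counts) also work.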
            data = [line.strip().split('\t')[0] for line in f]
        return data

    @staticmethod
    def _normalize(word):
        # Lowercase, fold 'ё' to 'е' (a common Russian spelling variation) and add sentinels.
        return '⟬{}⟭'.format(word.strip().lower().replace('ё', 'е'))


@register('russian_words_vocab')
class RussianWordsVocab(StaticDictionary):
    """Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data
    from https://github.com/danakt/russian-words/

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are
            interpreted as relative to the pipeline's data directory

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """
    def __init__(self, data_dir: Union[Path, str] = '', *args, **kwargs):
        kwargs['dictionary_name'] = 'russian_words_vocab'
        super().__init__(data_dir, *args, **kwargs)

    @staticmethod
    def _get_source(*args, **kwargs):
        log.debug('Downloading russian vocab from https://github.com/danakt/russian-words/')
        url = 'https://github.com/danakt/russian-words/raw/master/russian.txt'
        page = requests.get(url)
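        # The word list is distributed in Windows-1251 encoding, hence the explicit decode.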
        return [word.strip() for word in page.content.decode('cp1251').strip().split('\n')]


@register('wikitionary_100K_vocab')
class Wiki100KDictionary(StaticDictionary):
    """Implementation of :class:`~deeppavlov.vocabs.typos.StaticDictionary` that builds data
    from `Wiktionary <https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#Project_Gutenberg>`__

    Args:
        data_dir: path to the directory where the built trie will be stored. Relative paths are
            interpreted as relative to the pipeline's data directory

    Attributes:
        alphabet: set of all the characters used in this dictionary
        words_set: set of all the words
        words_trie: trie structure of all the words
    """
    def __init__(self, data_dir: Union[Path, str] = '', *args, **kwargs):
        kwargs['dictionary_name'] = 'wikipedia_100K_vocab'
        super().__init__(data_dir, *args, **kwargs)

    @staticmethod
    def _get_source(*args, **kwargs):
        words = []
        log.debug('Downloading english vocab from Wiktionary')
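        # The 100,000-word frequency list is split across ten pages of 10,000 words each:
        # 1-10000, 10001-20000, ..., 90001-100000.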
        for i in range(1, 100000, 10000):
            k = 10000 + i - 1
            url = 'https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/PG/2005/08/{}-{}'.format(i, k)
            page = requests.get(url)
            tree = html.fromstring(page.content)
            words += tree.xpath('//div[@class="mw-parser-output"]/p/a/text()')
        return words
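

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module. It assumes a writable
    # './data' directory (a hypothetical path) and, on the first run, network access;
    # later runs load the cached pickles instead of rebuilding.
    vocab = Wiki100KDictionary(data_dir='data')
    # Words are stored in normalized form, wrapped in the '⟬'/'⟭' sentinels.
    print('⟬the⟭' in vocab.words_set)
    # The trie maps a prefix to the sorted list of its one-character extensions.
    print(vocab.words_trie['⟬t'][:5])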