Source code for deeppavlov.dataset_readers.typos_reader

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
from typing import Dict, List, Tuple

import requests
from pathlib import Path

from lxml import html

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import is_done, download, mark_done
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.common.log import get_logger


log = get_logger(__name__)


@register('typos_custom_reader')
class TyposCustom(DatasetReader):
    """Base class for reading spelling corrections dataset files

    The dataset is a tab-separated file whose first row is a header and whose remaining rows
    hold two columns: the misspelled word followed by its correction.
    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Base method that interprets ``data_path`` argument.

        Args:
            data_path: path to the tsv-file containing erroneous and corrected words

        Returns:
            the same path as a :class:`~pathlib.Path` object
        """
        return Path(data_path)

    @classmethod
    def read(cls, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        Args:
            data_path: path that needs to be interpreted with
                :meth:`~deeppavlov.dataset_readers.typos_reader.TyposCustom.build`

        Returns:
            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`:
            a dictionary with a single ``'train'`` key holding a list of ``(mistake, correct)`` tuples

        Raises:
            ValueError: if a non-empty row does not contain exactly two columns
        """
        # ``cls.build`` (not ``TyposCustom.build``) so that subclasses can override how the file is obtained
        fname = cls.build(data_path)
        with fname.open(newline='', encoding='utf8') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            # The first row is a header and is discarded; the ``None`` default keeps
            # an empty file from raising StopIteration.
            next(reader, None)
            # csv.reader yields an empty list for a blank line; ``filter(None, ...)`` drops those
            # while rows with a wrong number of columns still fail loudly on unpacking.
            res = [(mistake, correct) for mistake, correct in filter(None, reader)]
        return {'train': res}
@register('typos_wikipedia_reader')
class TyposWikipedia(TyposCustom):
    """Implementation of :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom` that works with
    English Wikipedia's list of common misspellings
    """

    @staticmethod
    def build(data_path: str) -> Path:
        """Download and parse common misspellings list from `Wikipedia
        <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_

        The download is skipped when the target directory is already marked as done.

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting tsv-file

        Raises:
            requests.HTTPError: if the Wikipedia page could not be fetched successfully
        """
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            page = requests.get(url)
            # Report an HTTP failure as such instead of failing later on a page without a <pre> element
            page.raise_for_status()

            tree = html.fromstring(page.content)
            # The list is taken from the text of the first <pre> element, one "typo->fix1, fix2" entry per line
            raw = tree.xpath('//pre/text()')[0].splitlines()

            data = []
            for pair in raw:
                pair = pair.strip()
                if not pair:
                    # tolerate blank lines; any other malformed line still raises on unpacking
                    continue
                typo, corrects = pair.split('->')
                # one typo may list several comma-separated corrections: emit one row per correction
                for correct in corrects.split(','):
                    data.append([typo.strip(), correct.strip()])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='', encoding='utf8') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                # TyposCustom.read discards the first row of the file as a header, so write one:
                # without it the first misspelling pair would be silently dropped on read.
                writer.writerow(['mistake', 'correct'])
                writer.writerows(data)

            mark_done(data_path)

            log.info('Built')
        return fname
@register('typos_kartaslov_reader')
class TyposKartaslov(DatasetReader):
    """Dataset reader with the same ``build``/``read`` interface as
    :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom` that works with
    a Russian misspellings dataset from `kartaslov <https://github.com/dkulagin/kartaslov>`_
    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Download misspellings list from `github
        <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        The download is skipped when the target directory is already marked as done.

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname

    @staticmethod
    def read(data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        Args:
            data_path: path that needs to be interpreted with
                :meth:`~deeppavlov.dataset_readers.typos_reader.TyposKartaslov.build`

        Returns:
            train data to pass to a :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`:
            a dictionary with a single ``'train'`` key holding a list of ``(mistake, correct)`` tuples

        Raises:
            ValueError: if a non-empty row does not contain exactly three columns
        """
        fname = TyposKartaslov.build(data_path)
        with fname.open(newline='', encoding='utf8') as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            # The first row is a header and is discarded; the ``None`` default keeps
            # an empty file from raising StopIteration.
            next(reader, None)
            # Rows are ordered (correct, mistake, weight); the output swaps the first two and
            # drops the weight. Blank lines (empty rows from csv.reader) are skipped.
            res = [(mistake, correct) for correct, mistake, _weight in filter(None, reader)]
        return {'train': res}