Source code for deeppavlov.dataset_readers.typos_reader

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
from logging import getLogger
from pathlib import Path
from typing import Dict, List, Tuple

import requests
from lxml import html

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import is_done, download, mark_done

log = getLogger(__name__)


@register('typos_custom_reader')
class TyposCustom(DatasetReader):
    """Base class for reading spelling corrections dataset files

    Subclasses customise where the data comes from by overriding :meth:`build`;
    :meth:`read` then parses whatever tsv-file :meth:`build` returns.
    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Base method that interprets ``data_path`` argument.

        Args:
            data_path: path to the tsv-file containing erroneous and corrected words

        Returns:
            the same path as a :class:`~pathlib.Path` object
        """
        return Path(data_path)

    @classmethod
    def read(cls, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        The file is parsed as tab-separated values. The first row is always
        discarded as a header, and every remaining non-blank row must contain
        exactly two columns: the misspelled word followed by its correction.

        Args:
            data_path: path that needs to be interpreted with
                :meth:`~deeppavlov.dataset_readers.typos_reader.TyposCustom.build`

        Returns:
            train data to pass to a
            :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`

        Raises:
            ValueError: if a non-blank row does not have exactly two columns
        """
        # ``cls.build`` (not ``TyposCustom.build``) so that subclasses overriding
        # ``build`` are honoured.
        fname = cls.build(data_path)
        with fname.open(newline='', encoding='utf8') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            # Drop the header row; the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            # csv.reader yields an empty list for a blank line (e.g. a trailing
            # newline at the end of the file); such rows carry no data and
            # would otherwise break the two-column unpacking.
            res = [(mistake, correct) for mistake, correct in filter(None, reader)]
        return {'train': res}
@register('typos_wikipedia_reader')
class TyposWikipedia(TyposCustom):
    """Implementation of :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom` that works with
    English Wikipedia's list of common misspellings
    """

    @staticmethod
    def build(data_path: str) -> Path:
        """Download and parse common misspellings list from `Wikipedia
        <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_

        The page is fetched and converted only when ``data_path / 'typos_wiki'``
        has not been marked as done yet; otherwise the existing file path is
        returned without any network access.

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting tsv-file

        Raises:
            requests.RequestException: if the page cannot be fetched or the
                server answers with an error status
            IndexError: if the page contains no ``<pre>`` block to parse
        """
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            # Without a timeout ``requests`` may block forever on a stalled
            # connection; without the status check an error page would be
            # handed to the parser.
            page = requests.get(url, timeout=60)
            page.raise_for_status()

            tree = html.fromstring(page.content)
            raw = tree.xpath('//pre/text()')[0].splitlines()

            data = []
            for pair in raw:
                # Each useful line looks like ``typo->correct1, correct2``.
                typo, separator, corrects = pair.strip().partition('->')
                if not separator:
                    # Blank line or anything that is not a ``typo->corrections`` entry.
                    continue
                typo = typo.strip()
                for correct in corrects.split(','):
                    correct = correct.strip()
                    if correct:  # ignore empty alternatives such as a trailing comma
                        data.append([typo, correct])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='', encoding='utf8') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                # The inherited ``read`` always discards the first row as a
                # header, so write one; otherwise the first misspelling pair
                # would be silently lost.
                writer.writerow(['mistake', 'correct'])
                writer.writerows(data)

            # Mark as done only after the file has been written completely.
            mark_done(data_path)

            log.info('Built')
        return fname
@register('typos_kartaslov_reader')
class TyposKartaslov(DatasetReader):
    """Dataset reader that works with a Russian misspellings dataset from
    `kartaslov <https://github.com/dkulagin/kartaslov>`_

    Offers the same ``build``/``read`` pair as
    :class:`~deeppavlov.dataset_readers.typos_reader.TyposCustom`, but derives
    from ``DatasetReader`` directly because the source file uses a different
    layout (``;``-separated, three columns).
    """

    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str) -> Path:
        """Download misspellings list from `github
        <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        The file is downloaded only when ``data_path / 'kartaslov'`` has not
        been marked as done yet; otherwise the existing file path is returned.

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname

    @staticmethod
    def read(data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[str, str]]]:
        """Read train data for spelling corrections algorithms

        The file is parsed as ``;``-separated values. The first row is always
        discarded as a header, and every remaining non-blank row must contain
        exactly three columns: correct word, misspelled word and weight. The
        weight is ignored and the pair is returned as ``(mistake, correct)``.

        Args:
            data_path: path that needs to be interpreted with
                :meth:`~deeppavlov.dataset_readers.typos_reader.TyposKartaslov.build`

        Returns:
            train data to pass to a
            :class:`~deeppavlov.dataset_iterators.typos_iterator.TyposDatasetIterator`

        Raises:
            ValueError: if a non-blank row does not have exactly three columns
        """
        fname = TyposKartaslov.build(data_path)
        with fname.open(newline='', encoding='utf8') as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            # Drop the header row; the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            # csv.reader yields an empty list for a blank line; skip those so
            # they do not break the three-column unpacking. Note the column
            # order in the file is (correct, mistake), the output order is
            # (mistake, correct).
            res = [(mistake, correct) for correct, mistake, _weight in filter(None, reader)]
        return {'train': res}