Source code for deeppavlov.models.tokenizers.ru_tokenizer

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Generator, Any, Optional, Union, Tuple

from nltk.tokenize.toktok import ToktokTokenizer
# from nltk.corpus import stopwords
# STOPWORDS = stopwords.words('russian')
import pymorphy2

from deeppavlov.core.models.component import Component
from deeppavlov.core.common.registry import register
from deeppavlov.models.tokenizers.utils import detokenize, ngramize
from deeppavlov.core.common.log import get_logger

logger = get_logger(__name__)


@register('ru_tokenizer')
class RussianTokenizer(Component):
    """Tokenize or lemmatize a list of documents for the Russian language.

    Default models are :class:`ToktokTokenizer` tokenizer and :mod:`pymorphy2` lemmatizer.
    Return a list of tokens or lemmas for a whole document.
    If a document is passed as a list of tokens (``List[str]``), a detokenizing procedure
    is performed instead.

    Args:
        stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing
         and ngrams creation
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        lowercase: whether to perform lowercasing or not; is performed by default by
         :meth:`_tokenize` and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by
         :meth:`_filter` method

    Attributes:
        stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing
         and ngrams creation
        tokenizer: an instance of :class:`ToktokTokenizer` tokenizer class
        lemmatizer: an instance of :class:`pymorphy2.MorphAnalyzer` lemmatizer class
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        lowercase: whether to perform lowercasing or not; is performed by default by
         :meth:`_tokenize` and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by
         :meth:`_filter` method
        tok2morph: token-to-lemma cache

    """

    def __init__(self, stopwords: Optional[List[str]] = None, ngram_range: Optional[List[int]] = None,
                 lemmas: bool = False, lowercase: Optional[bool] = None,
                 alphas_only: Optional[bool] = None, **kwargs):
        if ngram_range is None:
            ngram_range = [1, 1]
        self.stopwords = stopwords or []
        self.tokenizer = ToktokTokenizer()
        self.lemmatizer = pymorphy2.MorphAnalyzer()
        self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only
        self.tok2morph = {}
    def __call__(self, batch: Union[List[str], List[List[str]]]) -> \
            Union[List[List[str]], List[str]]:
        """Tokenize or detokenize strings, depending on the type structure of the passed batch.

        Args:
            batch: a batch of documents to perform tokenizing/lemmatizing;
             or a batch of lists of tokens/lemmas to perform detokenizing

        Returns:
            a batch of lists of tokens/lemmas; or a batch of detokenized strings

        Raises:
            TypeError: if the first element of ``batch`` is neither ``list`` nor ``str``

        """
        if isinstance(batch[0], str):
            if self.lemmas:
                return list(self._lemmatize(batch))
            else:
                return list(self._tokenize(batch))
        if isinstance(batch[0], list):
            return [detokenize(doc) for doc in batch]
        raise TypeError(
            "RussianTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))
    def _tokenize(self, data: List[str], ngram_range: Tuple[int, int] = (1, 1),
                  lowercase: bool = True) -> Generator[List[str], Any, None]:
        """Tokenize a list of documents.

        Args:
            data: a list of documents to tokenize
            ngram_range: size of ngrams to create; only unigrams are returned by default
            lowercase: whether to perform lowercasing or not; is performed by default by
             :meth:`_tokenize` and :meth:`_lemmatize` methods

        Yields:
            list of lists of ngramized tokens or list of detokenized strings

        Returns:
            None

        """
        # DEBUG
        # size = len(data)
        _ngram_range = self.ngram_range or ngram_range

        if self.lowercase is None:
            _lowercase = lowercase
        else:
            _lowercase = self.lowercase

        for i, doc in enumerate(data):
            # DEBUG
            # logger.info("Tokenize doc {} from {}".format(i, size))
            tokens = self.tokenizer.tokenize(doc)
            if _lowercase:
                tokens = [t.lower() for t in tokens]
            filtered = self._filter(tokens)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def _lemmatize(self, data: List[str], ngram_range: Tuple[int, int] = (1, 1)) -> \
            Generator[List[str], Any, None]:
        """Lemmatize a list of documents.

        Args:
            data: a list of documents to tokenize
            ngram_range: size of ngrams to create; only unigrams are returned by default

        Yields:
            list of lists of ngramized tokens or list of detokenized strings

        Returns:
            None

        """
        # DEBUG
        # size = len(data)
        _ngram_range = self.ngram_range or ngram_range
        tokenized_data = list(self._tokenize(data))

        for i, doc in enumerate(tokenized_data):
            # DEBUG
            # logger.info("Lemmatize doc {} from {}".format(i, size))
            lemmas = []
            for token in doc:
                try:
                    lemma = self.tok2morph[token]
                except KeyError:
                    lemma = self.lemmatizer.parse(token)[0].normal_form
                    self.tok2morph[token] = lemma
                lemmas.append(lemma)
            filtered = self._filter(lemmas)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def _filter(self, items: List[str], alphas_only: bool = True) -> List[str]:
        """Filter a list of tokens/lemmas.

        Args:
            items: a list of tokens/lemmas to filter
            alphas_only: whether to filter out non-alpha tokens

        Returns:
            a list of filtered tokens/lemmas

        """
        if self.alphas_only is None:
            _alphas_only = alphas_only
        else:
            _alphas_only = self.alphas_only

        if _alphas_only:
            filter_fn = lambda x: x.isalpha() and not x.isspace() and x not in self.stopwords
        else:
            filter_fn = lambda x: not x.isspace() and x not in self.stopwords

        return list(filter(filter_fn, items))

    def set_stopwords(self, stopwords: List[str]) -> None:
        """Redefine a list of stopwords.

        Args:
            stopwords: a list of stopwords

        Returns:
            None

        """
        self.stopwords = stopwords
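

# --- Usage sketch (not part of the original module) --------------------------
# A minimal, hedged example of how the component above can be exercised
# directly.  It assumes nltk and pymorphy2 (with its Russian dictionaries) are
# installed; the sample sentences and expected outputs are illustrative only.
if __name__ == '__main__':
    tokenizer = RussianTokenizer(stopwords=['и', 'в'], lowercase=True)

    # A batch of raw strings goes through tokenizing, lowercasing and filtering.
    docs = ['Лондон - столица Великобритании.', 'Москва - столица России.']
    tokens = tokenizer(docs)
    print(tokens)  # e.g. [['лондон', 'столица', 'великобритании'], ...]

    # A batch of token lists triggers the detokenizing branch of __call__.
    print(tokenizer(tokens))  # e.g. ['лондон столица великобритании', ...]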
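
    # With lemmas=True the component lemmatizes tokens via pymorphy2 and caches
    # results in tok2morph; stopwords can be replaced later via set_stopwords().
    # Settings and the outputs shown in comments are illustrative only.
    lemma_tokenizer = RussianTokenizer(lemmas=True)
    lemma_tokenizer.set_stopwords(['это'])
    print(lemma_tokenizer(['Это столицы европейских государств.']))
    # e.g. [['столица', 'европейский', 'государство']]

    # ngram_range controls n-gram creation: [1, 2] returns unigrams and bigrams.
    bigram_tokenizer = RussianTokenizer(ngram_range=[1, 2])
    print(bigram_tokenizer(['столица России']))
    # e.g. [['столица', 'россии', 'столица россии']]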