Source code for deeppavlov.models.preprocessors.capitalization

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple, List, Optional

import numpy as np

from deeppavlov.core.common.registry import register
from import zero_pad
from deeppavlov.core.models.component import Component

@register('capitalization_featurizer')
class CapitalizationPreprocessor(Component):
    """Featurizer useful for NER task.

    Every token is mapped to a one-hot ``float32`` vector of length
    :attr:`dim` that encodes its capitalization pattern:

    - index 0: the first character is lowercase
    - index 1: the token is a single uppercase character
    - index 2: the token has several characters, starts with an uppercase
      character and contains at least one lowercase character
    - index 3: every character of the token is uppercase

    Empty tokens and tokens that match none of the patterns (e.g. tokens
    starting with a digit or punctuation) get an all-zero vector.

    Args:
        pad_zeros: whether to pad capitalization features batch with zeros up
            to maximal length or not.

    Attributes:
        dim: dimensionality of the feature vectors, produced by the featurizer
    """

    def __init__(self, pad_zeros: bool = True, *args, **kwargs) -> None:
        self.pad_zeros = pad_zeros
        self._num_of_features = 4

    @property
    def dim(self) -> int:
        """Length of a single token feature vector."""
        return self._num_of_features

    def _featurize_token(self, token: str) -> np.ndarray:
        """Return the one-hot capitalization vector for a single token."""
        # Sized from the same attribute that ``dim`` reports so the two
        # cannot drift apart.
        cap = np.zeros(self._num_of_features, np.float32)
        if len(token) == 0:
            return cap
        # The order of the checks matters: a token with a lowercase first
        # character is always class 0, whatever follows it.
        if token[0].islower():
            cap[0] = 1
        elif len(token) == 1 and token[0].isupper():
            cap[1] = 1
        elif len(token) > 1 and token[0].isupper() and any(ch.islower() for ch in token):
            cap[2] = 1
        elif all(ch.isupper() for ch in token):
            cap[3] = 1
        return cap

    def __call__(self, tokens_batch: List[List[str]], **kwargs):
        """Compute capitalization features for a batch of tokenized utterances.

        Args:
            tokens_batch: batch of utterances, each a sequence of tokens.

        Returns:
            the output of ``zero_pad`` applied to the nested list of feature
            vectors when ``pad_zeros`` is set (presumably a padded array --
            confirm against ``zero_pad``), otherwise the nested list itself
            (one list of vectors per utterance).
        """
        cap_batch = [[self._featurize_token(token) for token in utterance]
                     for utterance in tokens_batch]
        if self.pad_zeros:
            return zero_pad(cap_batch)
        return cap_batch
def process_word(word: str, to_lower: bool = False,
                 append_case: Optional[str] = None) -> Tuple[str, ...]:
    """Split a word into symbols, optionally lowercasing and marking its case.

    The method implements the following operations:

    1. detects the capitalization of the word (``'<ALL_UPPER>'`` when the
       word has more than one character and all of them are uppercase,
       ``'<FIRST_UPPER>'`` when only the first character is known to be
       uppercase),
    2. optionally converts the word to lowercase,
    3. replaces a purely numeric word with ``('<DIGIT>',)`` and a word
       starting with ``'http://'`` or ``'www.'`` with ``('<HTTP>',)``,
       otherwise splits it into a tuple of characters,
    4. optionally adds the capitalization mark.

    Args:
        word: input word; an empty string yields an empty tuple
        to_lower: whether to lowercase
        append_case: where to put the case mark: ``'first'`` prepends it,
            ``'last'`` appends it, any other value adds no mark. The mark is
            only added when ``to_lower`` is true, i.e. when the case
            information would otherwise be lost.

    Returns:
        a preprocessed word.

    Example:
        >>> process_word(word="Zaman", to_lower=True, append_case="first")
        ('<FIRST_UPPER>', 'z', 'a', 'm', 'a', 'n')
        >>> process_word(word="MSU", to_lower=True, append_case="last")
        ('m', 's', 'u', '<ALL_UPPER>')
    """
    # Checked character by character on purpose: ``str.isupper`` on the whole
    # word ignores uncased characters, so "A1" would count as all-uppercase.
    if len(word) > 1 and all(x.isupper() for x in word):
        uppercase = "<ALL_UPPER>"
    # Slicing instead of indexing keeps an empty word from raising IndexError.
    elif word[:1].isupper():
        uppercase = "<FIRST_UPPER>"
    else:
        uppercase = None
    if to_lower:
        word = word.lower()
    if word.isdigit():
        answer = ["<DIGIT>"]
    elif word.startswith(("http://", "www.")):
        answer = ["<HTTP>"]
    else:
        answer = list(word)
    if to_lower and uppercase is not None:
        if append_case == "first":
            answer = [uppercase] + answer
        elif append_case == "last":
            answer = answer + [uppercase]
    return tuple(answer)
@register('char_splitting_lowercase_preprocessor')
class CharSplittingLowercasePreprocessor(Component):
    """Batch-level callable built on top of :func:`process_word`.

    Receives a batch of tokenized sentences and returns, for each sentence,
    the list of its words converted by :func:`process_word`.

    Args:
        to_lower: forwarded to :func:`process_word` as ``to_lower``.
        append_case: forwarded to :func:`process_word` as ``append_case``.
    """

    def __init__(self, to_lower: bool = True, append_case: str = "first", *args, **kwargs):
        self.append_case = append_case
        self.to_lower = to_lower

    def __call__(self, tokens_batch: List[List[str]], **kwargs) -> List[List[Tuple[str]]]:
        """Apply :func:`process_word` to every word of every sentence."""
        return [
            [process_word(token, self.to_lower, self.append_case) for token in sentence]
            for sentence in tokens_batch
        ]