Source code for deeppavlov.models.syntax_parser.joint

import warnings
from typing import Union, List

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.common.chainer import Chainer

from deeppavlov.models.morpho_tagger.common import TagOutputPrettifier,\
    LemmatizedOutputPrettifier, DependencyOutputPrettifier


# Maps the keys of a word parse in the `json`/`dict` output format to the index
# of the corresponding tab-separated column of the word's parse row.
# Column 4 and the columns after 7 are not exported to the dictionary.
UD_COLUMN_FEAT_MAPPING = {"id": 0, "word": 1, "lemma": 2, "upos": 3, "feats": 5, "head": 6, "deprel": 7}


@register("joint_tagger_parser")
class JointTaggerParser(Component):
    """
    A class to perform joint morphological and syntactic parsing.
    It is just a wrapper that calls the models for tagging and parsing
    and comprises their results in a single output.

    Args:
        tagger: the morphological tagger model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        parser: the syntactic parser model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        output_format: the output format, it may be either `ud` (alias: `conllu`) or `json` (alias: `dict`).
            Any other value triggers a warning and falls back to `ud`.
        to_output_string: whether to convert the output to a list of strings

    Attributes:
        tagger: a morphological tagger model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        parser: a syntactic parser model (a :class:`~deeppavlov.core.common.chainer.Chainer` instance)
        output_format: the validated output format
        to_output_string: whether each sentence parse is returned as a single string

    Raises:
        ValueError: if the last component of the tagger or of the parser is not
            a prettifier of the expected type.
    """

    def __init__(self, tagger: Chainer, parser: Chainer,
                 output_format: str = "ud", to_output_string: bool = False,
                 *args, **kwargs):
        if output_format not in ["ud", "conllu", "json", "dict"]:
            # The warning has to be emitted explicitly: merely constructing
            # a UserWarning instance has no visible effect.
            warnings.warn("JointTaggerParser output_format can be only `ud`, `conllu` or `json`. "
                          "Unknown format: {}, setting the output_format to `ud`.".format(output_format),
                          UserWarning)
            output_format = "ud"
        self.output_format = output_format
        self.to_output_string = to_output_string
        self.tagger = tagger
        self.parser = parser
        self._check_models()

    def _check_models(self):
        """Validates the last components of both pipelines and configures them.

        Both prettifiers are switched to returning lists of rows instead of a single
        string, since :meth:`__call__` splits every word row on tabs.

        Raises:
            ValueError: if the last component of the tagger is neither a ``TagOutputPrettifier``
                nor a ``LemmatizedOutputPrettifier``, or if the last component of the parser
                is not a ``DependencyOutputPrettifier``.
        """
        tagger_prettifier = self.tagger[-1]
        if not isinstance(tagger_prettifier, (TagOutputPrettifier, LemmatizedOutputPrettifier)):
            raise ValueError("The tagger should output prettified data: last component of the config "
                             "should be either a TagOutputPrettifier or a LemmatizedOutputPrettifier "
                             "instance.")
        if isinstance(tagger_prettifier, TagOutputPrettifier):
            tagger_prettifier.set_format_mode("ud")
        tagger_prettifier.return_string = False
        parser_prettifier = self.parser[-1]
        if not isinstance(parser_prettifier, DependencyOutputPrettifier):
            raise ValueError("The parser should output prettified data: last component of the config "
                             "should be a DependencyOutputPrettifier instance.")
        parser_prettifier.return_string = False

    def __call__(self, data: Union[List[str], List[List[str]]])\
            -> Union[List[List[dict]], List[str], List[List[str]]]:
        r"""Parses a batch of sentences.

        Args:
            data: either a batch of tokenized sentences, or a batch of raw sentences

        Returns:
            `answer`, a batch of parsed sentences. A sentence parse is a list of single word parses.
            Each word parse is either a CoNLL-U-formatted string or a dictionary.
            A sentence parse is returned either as is if ``self.to_output_string`` is ``False``,
            or as a single string, where each word parse begins on a new line.

            More precisely, depending on the settings a word parse is:

            * a list of column values (``ud``/``conllu``, ``to_output_string=False``);
            * a tab-joined row (``ud``/``conllu``, ``to_output_string=True``);
            * a dictionary (``json``/``dict``, ``to_output_string=False``);
            * ``str()`` of that dictionary (``json``/``dict``, ``to_output_string=True``).

        The columns in the example below are tab-separated in the actual output.

        .. code-block:: python

            >>> from deeppavlov.core.commands.infer import build_model
            >>> model = build_model("ru_syntagrus_joint_parsing")
            >>> batch = ["Девушка пела в церковном хоре.", "У этой задачи есть сложное решение."]
            >>> print(*model(batch), sep="\n\n")
            1 Девушка девушка NOUN _ Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing 2 nsubj _ _
            2 пела петь VERB _ Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ _
            3 в в ADP _ _ 5 case _ _
            4 церковном церковный ADJ _ Case=Loc|Degree=Pos|Gender=Masc|Number=Sing 5 amod _ _
            5 хоре хор NOUN _ Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing 2 obl _ _
            6 . . PUNCT _ _ 2 punct _ _

            1 У у ADP _ _ 3 case _ _
            2 этой этот DET _ Case=Gen|Gender=Fem|Number=Sing 3 det _ _
            3 задачи задача NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 4 obl _ _
            4 есть быть VERB _ Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ _
            5 сложное сложный ADJ _ Case=Nom|Degree=Pos|Gender=Neut|Number=Sing 6 amod _ _
            6 решение решение NOUN _ Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing 4 nsubj _ _
            7 . . PUNCT _ _ 4 punct _ _

            >>> # Dirty hacks to change model parameters in the code, you should do it in the configuration file.
            >>> model["main"].to_output_string = False
            >>> model["main"].output_format = "json"
            >>> for sent_parse in model(batch):
            ...     for word_parse in sent_parse:
            ...         print(word_parse)
            ...     print("")
            {'id': '1', 'word': 'Девушка', 'lemma': 'девушка', 'upos': 'NOUN', 'feats': 'Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing', 'head': '2', 'deprel': 'nsubj'}
            {'id': '2', 'word': 'пела', 'lemma': 'петь', 'upos': 'VERB', 'feats': 'Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act', 'head': '0', 'deprel': 'root'}
            {'id': '3', 'word': 'в', 'lemma': 'в', 'upos': 'ADP', 'feats': '_', 'head': '5', 'deprel': 'case'}
            {'id': '4', 'word': 'церковном', 'lemma': 'церковный', 'upos': 'ADJ', 'feats': 'Case=Loc|Degree=Pos|Gender=Masc|Number=Sing', 'head': '5', 'deprel': 'amod'}
            {'id': '5', 'word': 'хоре', 'lemma': 'хор', 'upos': 'NOUN', 'feats': 'Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing', 'head': '2', 'deprel': 'obl'}
            {'id': '6', 'word': '.', 'lemma': '.', 'upos': 'PUNCT', 'feats': '_', 'head': '2', 'deprel': 'punct'}

            {'id': '1', 'word': 'У', 'lemma': 'у', 'upos': 'ADP', 'feats': '_', 'head': '3', 'deprel': 'case'}
            {'id': '2', 'word': 'этой', 'lemma': 'этот', 'upos': 'DET', 'feats': 'Case=Gen|Gender=Fem|Number=Sing', 'head': '3', 'deprel': 'det'}
            {'id': '3', 'word': 'задачи', 'lemma': 'задача', 'upos': 'NOUN', 'feats': 'Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing', 'head': '4', 'deprel': 'obl'}
            {'id': '4', 'word': 'есть', 'lemma': 'быть', 'upos': 'VERB', 'feats': 'Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act', 'head': '0', 'deprel': 'root'}
            {'id': '5', 'word': 'сложное', 'lemma': 'сложный', 'upos': 'ADJ', 'feats': 'Case=Nom|Degree=Pos|Gender=Neut|Number=Sing', 'head': '6', 'deprel': 'amod'}
            {'id': '6', 'word': 'решение', 'lemma': 'решение', 'upos': 'NOUN', 'feats': 'Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing', 'head': '4', 'deprel': 'nsubj'}
            {'id': '7', 'word': '.', 'lemma': '.', 'upos': 'PUNCT', 'feats': '_', 'head': '4', 'deprel': 'punct'}

        """
        tagger_output = self.tagger(data)
        parser_output = self.parser(data)
        # The output format does not change during the call, so test it once.
        as_dict = self.output_format in ["json", "dict"]
        answer = []
        for tagger_sent, parser_sent in zip(tagger_output, parser_output):
            curr_sent_answer = []
            for j, tagger_row in enumerate(tagger_sent):
                # Start from the tagger columns (split() already yields a fresh list) ...
                curr_word_answer = tagger_row.split("\t")
                # ... and take the head and dependency relation columns from the parser.
                # Indexing (rather than zipping) keeps a length mismatch between the two
                # models loud instead of silently truncating the sentence.
                curr_word_parser_output = parser_sent[j].split("\t")
                curr_word_answer[6:8] = curr_word_parser_output[6:8]
                if as_dict:
                    curr_word_answer = {key: curr_word_answer[index]
                                        for key, index in UD_COLUMN_FEAT_MAPPING.items()}
                    if self.to_output_string:
                        curr_word_answer = str(curr_word_answer)
                elif self.to_output_string:
                    curr_word_answer = "\t".join(curr_word_answer)
                curr_sent_answer.append(curr_word_answer)
            if self.to_output_string:
                # Every word parse is already a string at this point.
                curr_sent_answer = "\n".join(curr_sent_answer)
            answer.append(curr_sent_answer)
        return answer