# Source code for deeppavlov.models.kbqa.kb_answer_parser_wikidata

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
from logging import getLogger
from pathlib import Path
from string import punctuation
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.serializable import Serializable
from deeppavlov.models.kbqa.entity_linking import EntityLinker

log = getLogger(__name__)


@register('kb_answer_parser_wikidata')
class KBAnswerParserWikidata(Component, Serializable):
    """
    This class generates an answer for a given question using Wikidata.
    It searches for a matching triplet from Wikidata with the entity and
    relation mentioned in the question. It uses results of the Named Entity
    Recognition component to extract the entity mention and of the
    Classification component to determine the relation which connects the
    extracted entity and the answer entity.
    """

    def __init__(self, load_path: str, top_k_classes: int, linker: EntityLinker,
                 classes_vocab_keys: Tuple,
                 debug: bool = False,
                 relations_maping_filename: str = None,
                 templates_filename: str = None,
                 return_confidences: bool = True, *args, **kwargs) -> None:
        """
        Args:
            load_path: path to folder with wikidata files
            top_k_classes: number of relations with top k probabilities
            linker: component `deeppavlov.models.kbqa.entity_linking`
            classes_vocab_keys: list of relations predicted by
                `deeppavlov.models.ner.network` model
            debug: whether to print entities and relations extracted from the question
            relations_maping_filename: file with the dictionary of ids (keys) and
                titles (values) of relations from Wikidata
            templates_filename: file with the dictionary of question templates (keys)
                and relations for these templates (values)
            return_confidences: whether to return confidences of answers
            *args:
            **kwargs:
        """
        super().__init__(save_path=None, load_path=load_path)
        self.top_k_classes = top_k_classes
        self.classes = list(classes_vocab_keys)
        self._debug = debug
        self._relations_filename = relations_maping_filename
        self._templates_filename = templates_filename
        # Lazily populated by load(); q_to_name maps Wikidata Q-ids to entity info.
        self._q_to_name: Optional[Dict[str, Dict[str, str]]] = None
        self._relations_mapping: Optional[Dict[str, str]] = None
        self.templates: Optional[Dict[str, str]] = None
        self.return_confidences = return_confidences
        self.linker = linker
        self.load()

    def load(self) -> None:
        """Load pickled Wikidata dictionaries from ``load_path`` and its parent folder.

        NOTE(review): ``pickle.load`` executes arbitrary code on malicious data —
        these files must come from a trusted source (the DeepPavlov downloads).
        """
        with open(self.load_path, 'rb') as fl:
            self._q_to_name = pickle.load(fl)

        if self._relations_filename is not None:
            with open(self.load_path.parent / self._relations_filename, 'rb') as f:
                self._relations_mapping = pickle.load(f)

        if self._templates_filename is not None:
            with open(self.load_path.parent / self._templates_filename, 'rb') as t:
                self.templates = pickle.load(t)

    def save(self) -> None:
        # Read-only component: nothing to persist.
        pass

    def __call__(self, tokens_batch: List[List[str]],
                 tags_batch: List[List[int]],
                 relations_probs_batch: List[List[float]],
                 *args, **kwargs) -> Union[List[str], Tuple[List[str], List[float]]]:
        """Produce an answer string (and optionally a confidence) per question.

        Args:
            tokens_batch: batch of tokenized questions
            tags_batch: batch of per-token NER tags (0 marks a non-entity token)
            relations_probs_batch: batch of relation-classifier probabilities

        Returns:
            A list of answers, or ``(answers, confidences)`` when
            ``return_confidences`` is True.
        """
        objects_batch = []
        confidences_batch = []
        for tokens, tags, relations_probs in zip(tokens_batch, tags_batch, relations_probs_batch):
            is_kbqa = self.is_kbqa_question(tokens)
            if is_kbqa:
                if self._templates_filename is not None:
                    entity_from_template, relation_from_template = self.entities_and_rels_from_templates(tokens)
                else:
                    entity_from_template = None
                if entity_from_template:
                    # Template matched: the relation is known with certainty.
                    if self._debug:
                        relation_title = self._relations_mapping[relation_from_template]
                        log.debug("entity {}, relation {}".format(entity_from_template, relation_title))
                    entity_triplets, entity_linking_confidences = self.linker(entity_from_template, tokens)
                    relation_prob = 1.0
                    obj, confidence = self._match_triplet(entity_triplets,
                                                          entity_linking_confidences,
                                                          [relation_from_template],
                                                          [relation_prob])
                else:
                    # Fall back to NER-extracted entity + classifier-predicted relations.
                    entity_from_ner = self.extract_entities(tokens, tags)
                    entity_triplets, entity_linking_confidences = self.linker(entity_from_ner, tokens)
                    top_k_relations, top_k_probs = self._parse_relations_probs(relations_probs)
                    top_k_relation_names = [self._relations_mapping[rel] for rel in top_k_relations]
                    if self._debug:
                        log.debug("top k relations {}".format(str(top_k_relation_names)))
                    obj, confidence = self._match_triplet(entity_triplets,
                                                          entity_linking_confidences,
                                                          top_k_relations,
                                                          top_k_probs)
                objects_batch.append(obj)
                confidences_batch.append(confidence)
            else:
                objects_batch.append('')
                confidences_batch.append(0.0)

        parsed_objects_batch, confidences_batch = self._parse_wikidata_object(objects_batch, confidences_batch)
        if self.return_confidences:
            return parsed_objects_batch, confidences_batch
        else:
            return parsed_objects_batch

    def _parse_wikidata_object(self, objects_batch: List[str],
                               confidences_batch: List[float]) -> Tuple[List[str], List[float]]:
        """Replace Wikidata Q-ids with human-readable names; unknown ids get
        'Not Found' and zero confidence."""
        parsed_objects = []
        for n, obj in enumerate(objects_batch):
            if len(obj) > 0:
                if obj.startswith('Q'):
                    if obj in self._q_to_name:
                        parsed_object = self._q_to_name[obj]["name"]
                        parsed_objects.append(parsed_object)
                    else:
                        parsed_objects.append('Not Found')
                        confidences_batch[n] = 0.0
                else:
                    # Not a Q-id (e.g. a literal value) — return as-is.
                    parsed_objects.append(obj)
            else:
                parsed_objects.append('Not Found')
                confidences_batch[n] = 0.0
        return parsed_objects, confidences_batch

    @staticmethod
    def _match_triplet(entity_triplets: List[List[str]],
                       entity_linking_confidences: List[float],
                       relations: List[str],
                       relations_probs: List[float]) -> Tuple[str, float]:
        """Return the object of the first triplet whose relation matches one of
        the predicted relations, with confidence = linking_conf * relation_prob.

        Relations are iterated in descending-probability order, so the first hit
        is the best one; returns ('', 0.0) when nothing matches.
        """
        obj = ''
        confidence = 0.0
        for predicted_relation, rel_prob in zip(relations, relations_probs):
            for entities, linking_confidence in zip(entity_triplets, entity_linking_confidences):
                for rel_triplets in entities:
                    relation_from_wiki = rel_triplets[0]
                    if predicted_relation == relation_from_wiki:
                        obj = rel_triplets[1]
                        confidence = linking_confidence * rel_prob
                        return obj, confidence
        return obj, confidence

    def _parse_relations_probs(self, probs: List[float]) -> Tuple[List[str], List[float]]:
        """Return the top-k relation class names and their probabilities,
        sorted by descending probability."""
        top_k_inds = np.asarray(probs).argsort()[-self.top_k_classes:][::-1]
        top_k_classes = [self.classes[k] for k in top_k_inds]
        top_k_probs = [probs[k] for k in top_k_inds]
        return top_k_classes, top_k_probs

    @staticmethod
    def extract_entities(tokens: List[str], tags: List[int]) -> str:
        """Join the tokens whose NER tag is non-zero into a single entity mention."""
        entity = []
        for j, tok in enumerate(tokens):
            if tags[j] != 0:  # TODO: replace with tag 'O' (not necessary 0)
                entity.append(tok)
        entity = ' '.join(entity)
        return entity

    def entities_and_rels_from_templates(self, tokens: List[str]) -> Tuple[str, str]:
        """Match the question against templates of the form 'prefix xxx suffix';
        the text in place of 'xxx' is the entity, the template value the relation.

        Among all matching templates the shortest entity candidate wins.
        """
        s_sanitized = ' '.join([ch for ch in tokens if ch not in punctuation]).lower()
        ent = ''
        relation = ''
        for template in self.templates:
            template_start, template_end = template.lower().split('xxx')
            if template_start in s_sanitized and template_end in s_sanitized:
                template_start_pos = s_sanitized.find(template_start)
                template_end_pos = s_sanitized.find(template_end)
                # `template_end_pos or len(...)` keeps the original fallback:
                # an empty/zero-position suffix means "take to end of string".
                ent_cand = s_sanitized[template_start_pos + len(template_start):
                                       template_end_pos or len(s_sanitized)]
                if len(ent_cand) < len(ent) or len(ent) == 0:
                    ent = ent_cand
                    relation = self.templates[template]
        return ent, relation

    def is_kbqa_question(self, question_tokens: List[str]) -> bool:
        """Heuristically decide whether the question is answerable from Wikidata.

        A question is KBQA unless it contains a blacklisted phrase; whitelisted
        phrases (e.g. "как зовут") rescue questions that would otherwise be
        rejected by the generic "как " blacklist entry.
        """
        not_kbqa_question_templates = ["почему", "когда будет", "что будет", "что если", "для чего ", "как ",
                                       "что делать", "зачем", "что может"]
        kbqa_question_templates = ["как зовут", "как называется", "как звали", "как ты думаешь", "как твое мнение",
                                  "как ты считаешь"]
        question_init = ' '.join(question_tokens)
        question = ''.join([ch for ch in question_init if ch not in punctuation]).lower()
        # BUGFIX: the whitelist check previously used all(...), which can never
        # be true because the whitelist phrases are mutually exclusive (a single
        # question cannot contain both "как зовут" and "как называется"), making
        # the rescue branch dead code. any(...) implements the intended override.
        is_kbqa = (all(template not in question for template in not_kbqa_question_templates)
                   or any(template in question for template in kbqa_question_templates))
        return is_kbqa