Source code for deeppavlov.models.kbqa.wiki_parser

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import re
from logging import getLogger
from typing import List, Tuple, Dict
from collections import namedtuple

from hdt import HDTDocument

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

# Module-level logger, named after this module.
log = getLogger(__name__)


@register('wiki_parser')
class WikiParser:
    """Extract relations, objects or triplets from a Wikidata HDT file."""

    def __init__(self, wiki_filename: str, lang: str = "@en", **kwargs) -> None:
        """

        Args:
            wiki_filename: hdt file with wikidata
            lang: language tag suffix (default "@en") used to select literals
            **kwargs: accepted and ignored
        """
        log.debug(f'__init__ wiki_filename: {wiki_filename}')
        wiki_path = expand_path(wiki_filename)
        self.description_rel = "http://schema.org/description"
        self.lang = lang
        self.document = HDTDocument(str(wiki_path))

    def __call__(self, what_return: List[str],
                 query_seq: List[List[str]],
                 filter_info: List[Tuple[str]],
                 order_info: namedtuple) -> List[List[str]]:
        """Execute a sequence of triple patterns and return the requested variable values.

        Let us consider an example of the question "What is the deepest lake in Russia?"
        with the corresponding SPARQL query
        "SELECT ?ent WHERE { ?ent wdt:P31 wd:T1 . ?ent wdt:R1 ?obj . ?ent wdt:R2 wd:E1 } ORDER BY ASC(?obj) LIMIT 5"

        Example arguments:
            what_return: ["?obj"]
            query_seq: [["?ent", "http://www.wikidata.org/prop/direct/P17", "http://www.wikidata.org/entity/Q159"]
                        ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"],
                        ["?ent", "http://www.wikidata.org/prop/direct/P4511", "?obj"]]
            filter_info: []
            order_info: order_info(variable='?obj', sorting_order='asc')

        Args:
            what_return: variables whose values are returned; if the last item starts with "count",
                the number of combinations is appended instead of a value for that item
            query_seq: triple patterns (subject, relation, object); elements starting with "?" are variables
            filter_info: pairs (variable, substring); only combinations whose value for the variable
                contains the substring are kept
            order_info: object with attributes ``variable`` and ``sorting_order``; when ``variable`` is not None,
                combinations are sorted by the numeric value of that variable ("desc" sorts descending,
                anything else ascending) and only the first one is kept

        Returns:
            list of value lists, one per remaining combination (empty list if nothing matched)

        Raises:
            ValueError: if ordering is requested and a value of the ordering variable is not numeric
        """
        combs = []
        for n, query in enumerate(query_seq):
            unknown_elem_positions = [(pos, elem) for pos, elem in enumerate(query) if elem.startswith('?')]
            # n = 0, query = ["?ent", "http://www.wikidata.org/prop/direct/P17", "http://www.wikidata.org/entity/Q159"]
            #        unknown_elem_positions = [(0, "?ent")]
            # n = 1, query = ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"]
            #        unknown_elem_positions = [(0, "?ent")]
            # n = 2, query = ["?ent", "http://www.wikidata.org/prop/direct/P4511", "?obj"]
            #        unknown_elem_positions = [(0, "?ent"), (2, "?obj")]
            if n == 0:
                combs = self.search(query, unknown_elem_positions)
                # combs = [{"?ent": "http://www.wikidata.org/entity/Q5513"}, ...]
                continue
            if not combs:
                # Nothing left to extend, later patterns cannot add results.
                break

            # Variables of this pattern that already have a value (all combs share the same keys).
            known_elements = {elem for elem in query if elem in combs[0]}
            extended_combs = []
            # A pattern sharing no variable with the previous ones yields no results
            # (cross products are deliberately not built).
            if known_elements:
                for comb in combs:
                    # n = 1
                    # comb = {"?ent": "http://www.wikidata.org/entity/Q5513"}
                    # filled_query = ["http://www.wikidata.org/entity/Q5513",
                    #                 "http://www.wikidata.org/prop/direct/P31",
                    #                 "http://www.wikidata.org/entity/Q23397"]
                    #
                    # All bound variables are substituted at once, and only whole elements are
                    # replaced: substituting one variable at a time would leave the others as
                    # wildcards and overwrite their bindings, and substring replacement would
                    # corrupt a variable such as "?ent2" when "?ent" is bound.
                    filled_query = [comb[elem] if elem in known_elements else elem for elem in query]
                    for new_comb in self.search(filled_query, unknown_elem_positions):
                        extended_combs.append({**comb, **new_comb})
            combs = extended_combs

        if combs and filter_info:
            for filter_elem, filter_value in filter_info:
                combs = [comb for comb in combs if filter_value in comb[filter_elem]]

        # Filtering may have removed everything; indexing combs[0] below would fail.
        if not combs:
            return []

        if order_info.variable is not None:
            reverse = order_info.sorting_order == "desc"
            sort_elem = order_info.variable
            # Values look like '"+1642"^^<http://www.w3.org/2001/XMLSchema#decimal>'.
            combs = sorted(combs, key=lambda x: float(x[sort_elem].split('^^')[0].strip('"')), reverse=reverse)
            combs = combs[:1]

        if what_return and what_return[-1].startswith("count"):
            return [[combs[0][key] for key in what_return[:-1]] + [len(combs)]]
        return [[elem[key] for key in what_return] for elem in combs]

    def search(self, query: List[str], unknown_elem_positions: List[Tuple[int, str]]) -> List[Dict[str, str]]:
        """Find triplets matching a pattern and return the values found for its variables.

        Args:
            query: triple pattern (subject, relation, object); elements starting with "?" act as wildcards
            unknown_elem_positions: pairs (position in the triple, variable name) to extract from each match

        Returns:
            list of dicts mapping variable names to the values found in each matching triplet
        """
        # The document search uses an empty string as a wildcard.
        subj, rel, obj = ["" if elem.startswith('?') else elem for elem in query]
        triplets, _ = self.document.search_triples(subj, rel, obj)
        if rel == self.description_rel:
            # Descriptions are kept only in the configured language.
            triplets = [triplet for triplet in triplets if triplet[2].endswith(self.lang)]
        return [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets]

    def find_label(self, entity: str, question: str) -> str:
        """Turn an entity id, entity URI or literal into a human-readable answer.

        Args:
            entity: entity id ("Q5513"), entity URI, language-tagged literal ('"Lake Baikal"@en'),
                typed literal ('"+1642"^^<http://www.w3.org/2001/XMLSchema#decimal>') or a number
            question: text of the question; if it contains "how old" and the entity is a date,
                the number of years between the current year and the year of the date is returned

        Returns:
            the label or cleaned value, or "Not Found" if nothing suitable was found.
            Note that in the "how old" case the value returned is an int, not a str.
        """
        entity = str(entity).replace('"', '')
        # Only a bare id such as "Q5513" is expanded to a URI; a plain startswith("Q")
        # check would also capture literals like 'Queen@en' and report them as not found.
        if re.fullmatch(r"Q\d+", entity):
            entity = "http://www.wikidata.org/entity/" + entity
            # "http://www.wikidata.org/entity/Q5513"

        if entity.startswith("http://www.wikidata.org/entity/"):
            labels, _ = self.document.search_triples(entity, "http://www.w3.org/2000/01/rdf-schema#label", "")
            # labels = [["http://www.wikidata.org/entity/Q5513", "http://www.w3.org/2000/01/rdf-schema#label",
            #            '"Lake Baikal"@en'], ...]
            for label in labels:
                if label[2].endswith(self.lang):
                    return self._strip_lang(label[2]).replace('"', '')

        elif entity.endswith(self.lang):
            # entity: '"Lake Baikal"@en'
            return self._strip_lang(entity)

        elif "^^" in entity:
            # examples:
            #     '"1799-06-06T00:00:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime>' (date)
            #     '"+1642"^^<http://www.w3.org/2001/XMLSchema#decimal>' (number)
            entity = entity.split("^^")[0]
            for token in ["T00:00:00Z", "+"]:
                entity = entity.replace(token, '')
            year = re.findall(r"([\d]{3,4})-[\d]{1,2}-[\d]{1,2}", entity)
            if "how old" in question.lower() and year:
                entity = datetime.datetime.now().year - int(year[0])
            return entity

        elif entity.isdigit():
            return entity

        return "Not Found"

    def find_alias(self, entity: str) -> List[str]:
        """Return alternative labels of an entity in the configured language.

        Args:
            entity: entity URI; any other string gives an empty list

        Returns:
            list of aliases without language tag and surrounding quotes
        """
        aliases = []
        if entity.startswith("http://www.wikidata.org/entity/"):
            labels, _ = self.document.search_triples(entity, "http://www.w3.org/2004/02/skos/core#altLabel", "")
            aliases = [self._strip_lang(label[2]).strip('"') for label in labels if label[2].endswith(self.lang)]
        return aliases

    def find_rels(self, entity: str, direction: str, rel_type: str = "no_type") -> List[str]:
        """Return relations of triplets in which the entity is the subject or the object.

        Args:
            entity: entity id which is appended to "http://www.wikidata.org/entity/"
            direction: "forw" to search triplets with the entity as subject,
                any other value to search triplets with the entity as object
            rel_type: if not "no_type", only relations starting with
                "http://www.wikidata.org/prop/<rel_type>" are kept; otherwise relations
                starting with "http://www.wikidata.org/prop/P"

        Returns:
            list of relation URIs (one per matching triplet, duplicates are not removed)
        """
        if direction == "forw":
            triplets, _ = self.document.search_triples(f"http://www.wikidata.org/entity/{entity}", "", "")
        else:
            triplets, _ = self.document.search_triples("", "", f"http://www.wikidata.org/entity/{entity}")
        if rel_type != "no_type":
            start_str = f"http://www.wikidata.org/prop/{rel_type}"
        else:
            start_str = "http://www.wikidata.org/prop/P"
        return [triplet[1] for triplet in triplets if triplet[1].startswith(start_str)]

    def _strip_lang(self, literal: str) -> str:
        """Remove the trailing language tag (``self.lang``) from a literal, if present."""
        # str.strip(self.lang) would treat the tag as a set of characters, not as a suffix.
        if self.lang and literal.endswith(self.lang):
            return literal[:-len(self.lang)]
        return literal