Source code for deeppavlov.models.entity_extraction.entity_linking

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sqlite3
from logging import getLogger
from typing import List, Dict, Tuple, Any, Union
from collections import defaultdict

import nltk
import spacy
from hdt import HDTDocument
from nltk.corpus import stopwords
from rapidfuzz import fuzz

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.serializable import Serializable
from deeppavlov.models.entity_extraction.find_word import WordSearcher

log = getLogger(__name__)
nltk.download("stopwords")


@register("entity_linker")
class EntityLinker(Component, Serializable):
    """
    Class for linking entity substrings in a document to entities in Wikidata
    """
    def __init__(
        self,
        load_path: str,
        entity_ranker=None,
        entities_database_filename: str = None,
        words_dict_filename: str = None,
        ngrams_matrix_filename: str = None,
        num_entities_for_bert_ranking: int = 50,
        num_entities_for_conn_ranking: int = 5,
        num_entities_to_return: int = 10,
        max_text_len: int = 300,
        max_paragraph_len: int = 150,
        lang: str = "ru",
        use_descriptions: bool = True,
        alias_coef: float = 1.1,
        use_tags: bool = False,
        lemmatize: bool = False,
        full_paragraph: bool = False,
        use_connections: bool = False,
        kb_filename: str = None,
        prefixes: Dict[str, Any] = None,
        **kwargs,
    ) -> None:
        """
        Args:
            load_path: path to folder with inverted index files
            entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert
            entities_database_filename: filename with database with entities index
            words_dict_filename: filename with words and corresponding tags
            ngrams_matrix_filename: filename with char tfidf matrix
            num_entities_for_bert_ranking: number of candidate entities for BERT ranking using description
                and context
            num_entities_for_conn_ranking: number of candidate entities for ranking using connections in the
                knowledge graph
            num_entities_to_return: number of candidate entities for the substring which are returned
            max_text_len: maximal length of entity context
            max_paragraph_len: maximal length of context paragraphs
            lang: russian or english
            use_descriptions: whether to perform entity ranking by context and description
            alias_coef: coefficient which is multiplied by the substring matching confidence if the substring
                is the title of the entity
            use_tags: whether to filter candidate entities by tags
            lemmatize: whether to lemmatize tokens
            full_paragraph: whether to use full paragraph for entity context
            use_connections: whether to rank entities by connections in the knowledge graph
            kb_filename: filename with the knowledge base in HDT format
            prefixes: entity and title prefixes
            **kwargs:
        """
        super().__init__(save_path=None, load_path=load_path)
        self.lemmatize = lemmatize
        self.num_entities_for_bert_ranking = num_entities_for_bert_ranking
        self.num_entities_for_conn_ranking = num_entities_for_conn_ranking
        self.entity_ranker = entity_ranker
        self.entities_database_filename = entities_database_filename
        self.num_entities_to_return = num_entities_to_return
        self.max_text_len = max_text_len
        self.max_paragraph_len = max_paragraph_len
        self.lang = f"@{lang}"
        if self.lang == "@en":
            self.stopwords = set(stopwords.words("english"))
            self.nlp = spacy.load("en_core_web_sm")
        elif self.lang == "@ru":
            self.stopwords = set(stopwords.words("russian"))
            self.nlp = spacy.load("ru_core_news_sm")
        self.alias_coef = alias_coef
        self.use_descriptions = use_descriptions
        self.use_connections = use_connections
        self.use_tags = use_tags
        self.full_paragraph = full_paragraph
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.related_tags = {
            "loc": ["gpe", "country", "city", "us_state", "river"],
            "gpe": ["loc", "country", "city", "us_state"],
            "work_of_art": ["product", "law"],
            "product": ["work_of_art"],
            "law": ["work_of_art"],
            "org": ["fac", "business"],
            "business": ["org"],
        }
        self.word_searcher = None
        if words_dict_filename:
            self.word_searcher = WordSearcher(words_dict_filename, ngrams_matrix_filename, self.lang)
        self.kb_filename = kb_filename
        self.prefixes = prefixes
        self.load()
    def load(self) -> None:
        self.conn = sqlite3.connect(str(self.load_path / self.entities_database_filename))
        self.cur = self.conn.cursor()
        self.kb = None
        if self.kb_filename:
            self.kb = HDTDocument(str(expand_path(self.kb_filename)))

    def save(self) -> None:
        pass
    def __call__(
        self,
        substr_batch: List[List[str]],
        tags_batch: List[List[str]] = None,
        probas_batch: List[List[float]] = None,
        sentences_batch: List[List[str]] = None,
        offsets_batch: List[List[List[int]]] = None,
        sentences_offsets_batch: List[List[Tuple[int, int]]] = None,
        entities_to_link_batch: List[List[int]] = None,
    ):
        if (not sentences_offsets_batch or sentences_offsets_batch[0] is None) and sentences_batch is not None:
            sentences_offsets_batch = []
            for sentences_list in sentences_batch:
                sentences_offsets_list = []
                start = 0
                for sentence in sentences_list:
                    end = start + len(sentence)
                    sentences_offsets_list.append([start, end])
                    start = end + 1
                sentences_offsets_batch.append(sentences_offsets_list)

        if sentences_batch is None:
            sentences_batch = [[] for _ in substr_batch]
            sentences_offsets_batch = [[] for _ in substr_batch]
        if not entities_to_link_batch or entities_to_link_batch[0] is None:
            entities_to_link_batch = [[1 for _ in substr_list] for substr_list in substr_batch]

        log.debug(f"substr: {substr_batch} --- sentences_batch: {sentences_batch} --- offsets: {offsets_batch}")
        if (not offsets_batch or offsets_batch[0] is None) and sentences_batch:
            offsets_batch = []
            for substr_list, sentences_list in zip(substr_batch, sentences_batch):
                text = " ".join(sentences_list).lower()
                log.debug(f"text {text}")
                offsets_list = []
                for substr in substr_list:
                    st_offset = text.find(substr.lower())
                    end_offset = st_offset + len(substr)
                    offsets_list.append([st_offset, end_offset])
                offsets_batch.append(offsets_list)

        ids_batch, conf_batch, pages_batch, labels_batch = [], [], [], []
        for substr_list, offsets_list, tags_list, probas_list, sentences_list, sentences_offsets_list, \
                entities_to_link in zip(substr_batch, offsets_batch, tags_batch, probas_batch, sentences_batch,
                                        sentences_offsets_batch, entities_to_link_batch):
            ids_list, conf_list, pages_list, labels_list = \
                self.link_entities(substr_list, offsets_list, tags_list, probas_list, sentences_list,
                                   sentences_offsets_list, entities_to_link)
            log.debug(f"ids_list {ids_list} conf_list {conf_list}")
            if self.num_entities_to_return == 1:
                pages_list = [pages[0] for pages in pages_list]
            else:
                pages_list = [pages[: len(ids)] for pages, ids in zip(pages_list, ids_list)]
            ids_batch.append(ids_list)
            conf_batch.append(conf_list)
            pages_batch.append(pages_list)
            labels_batch.append(labels_list)
        return ids_batch, conf_batch, pages_batch, labels_batch
    def link_entities(
        self,
        substr_list: List[str],
        offsets_list: List[List[int]],
        tags_list: List[str],
        probas_list: List[float],
        sentences_list: List[str],
        sentences_offsets_list: List[List[int]],
        entities_to_link: List[int],
    ) -> Tuple[List[Any], List[Any], List[List[Union[str, Any]]], List[List[Union[str, Any]]]]:
        log.debug(f"substr_list {substr_list} tags_list {tags_list} probas {probas_list} offsets_list {offsets_list}")
        ids_list, conf_list, pages_list, label_list, descr_list = [], [], [], [], []
        if substr_list:
            entities_scores_list = []
            cand_ent_scores_list = []
            for substr, tags, proba in zip(substr_list, tags_list, probas_list):
                for old_symb, new_symb in [("'s", ""), ("@", ""), ("  ", " "), (".", ""), (",", ""), ("-", " "),
                                           ("'", " "), ("!", ""), (":", ""), ("&", ""), ("/", " "), ('"', ""),
                                           ("  ", " ")]:
                    substr = substr.replace(old_symb, new_symb)
                substr = substr.strip()
                cand_ent_init = defaultdict(set)
                if len(substr) > 1:
                    if isinstance(tags, str):
                        tags = [tags]
                    tags = [tag.lower() for tag in tags]
                    if tags and not isinstance(tags[0], (list, tuple)):
                        tags = [(tag, 1.0) for tag in tags]
                    if tags and tags[0][0] == "e":
                        use_tags_flag = False
                    else:
                        use_tags_flag = True
                    cand_ent_init = self.find_exact_match(substr, tags, use_tags=use_tags_flag)
                    new_substr = re.sub(r"\b([a-z]{1}) ([a-z]{1})\b", r"\1\2", substr)
                    if substr != new_substr:
                        new_cand_ent_init = self.find_exact_match(new_substr, tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                    init_substr_split = substr.lower().split(" ")
                    if tags[0][0] in {"person", "work_of_art"}:
                        substr_split = [word for word in substr.lower().split(" ") if len(word) > 0]
                    else:
                        substr_split = [word for word in substr.lower().split(" ")
                                        if word not in self.stopwords and len(word) > 0]
                    substr_split_lemm = [self.nlp(tok)[0].lemma_ for tok in substr_split]
                    substr_lemm = " ".join(substr_split_lemm)
                    if substr_split != substr_split_lemm \
                            or (tags[0][0] == "work_of_art" and len(substr_split) != len(init_substr_split)):
                        new_cand_ent_init = self.find_fuzzy_match(substr_split, tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                        if substr_split != substr_split_lemm:
                            new_cand_ent_init = self.find_exact_match(substr_lemm, tags, use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                            new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, tags,
                                                                      use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                    all_low_conf = self.define_all_low_conf(cand_ent_init, 1.0)
                    clean_tags, corr_tags, corr_clean_tags = self.correct_tags(tags)
                    log.debug(f"substr: {substr} --- lemm: {substr_split_lemm} --- tags: {tags} --- corr_tags: "
                              f"{corr_tags} --- all_low_conf: {all_low_conf} --- cand_ent_init: {len(cand_ent_init)}")
                    if (not cand_ent_init or all_low_conf) and corr_tags:
                        corr_cand_ent_init = self.find_exact_match(substr, corr_tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, corr_cand_ent_init)
                        if substr_split != substr_split_lemm:
                            new_cand_ent_init = self.find_exact_match(substr_lemm, corr_tags,
                                                                      use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                            new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, corr_tags,
                                                                      use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                    if not cand_ent_init and len(substr_split) == 1 and self.word_searcher:
                        corr_words = self.word_searcher(substr_split[0], set(clean_tags + corr_clean_tags))
                        if corr_words:
                            cand_ent_init = self.find_exact_match(corr_words[0], tags + corr_tags,
                                                                  use_tags=use_tags_flag)
                    if not cand_ent_init and len(substr_split) > 1:
                        cand_ent_init = self.find_fuzzy_match(substr_split, tags)
                    all_low_conf = self.define_all_low_conf(cand_ent_init, 0.85)
                    if (not cand_ent_init or all_low_conf) and tags[0][0] != "t":
                        use_tags_flag = False
                        new_cand_ent_init = self.find_exact_match(substr, tags, use_tags=use_tags_flag)
                        cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                        if substr_split != substr_split_lemm and (tags[0][0] == "e" or not cand_ent_init):
                            new_cand_ent_init = self.find_fuzzy_match(substr_split, tags, use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                            new_cand_ent_init = self.find_fuzzy_match(substr_split_lemm, tags,
                                                                      use_tags=use_tags_flag)
                            cand_ent_init = self.unite_dicts(cand_ent_init, new_cand_ent_init)
                cand_ent_scores = []
                for entity in cand_ent_init:
                    entities_scores = list(cand_ent_init[entity])
                    entities_scores = sorted(entities_scores, key=lambda x: (x[0], x[2], x[1]), reverse=True)
                    cand_ent_scores.append(([entity] + list(entities_scores[0])))
                cand_ent_scores = sorted(cand_ent_scores, key=lambda x: (x[1], x[3], x[2]), reverse=True)
                cand_ent_scores = cand_ent_scores[: self.num_entities_for_bert_ranking]
                cand_ent_scores_list.append(cand_ent_scores)
                entity_ids = [elem[0] for elem in cand_ent_scores]
                scores = [elem[1:4] for elem in cand_ent_scores]
                conf_list.append(scores)
                entities_scores_list.append(
                    {entity_id: entity_scores for entity_id, entity_scores in zip(entity_ids, scores)}
                )
                ids_list.append(entity_ids)
                pages = [elem[4] for elem in cand_ent_scores]
                entity_labels = [elem[5] for elem in cand_ent_scores]
                pages_list.append({entity_id: page for entity_id, page in zip(entity_ids, pages)})
                label_list.append(
                    {entity_id: entity_label for entity_id, entity_label in zip(entity_ids, entity_labels)})
                descr_list.append([elem[6] for elem in cand_ent_scores])

            scores_dict = {}
            if self.use_connections and self.kb:
                scores_dict = self.rank_by_connections(ids_list)
            substr_lens = [len(entity_substr.split()) for entity_substr in substr_list]
            ids_list, conf_list = self.rank_by_description(substr_list, tags_list, offsets_list, ids_list,
                                                           descr_list, entities_scores_list, sentences_list,
                                                           sentences_offsets_list, substr_lens, scores_dict)
            label_list = [[label_dict.get(entity_id, "") for entity_id in entity_ids]
                          for entity_ids, label_dict in zip(ids_list, label_list)]
            pages_list = [[pages_dict.get(entity_id, "") for entity_id in entity_ids]
                          for entity_ids, pages_dict in zip(ids_list, pages_list)]

        f_ids_list, f_conf_list, f_pages_list, f_label_list = [], [], [], []
        for ids, confs, pages, labels, add_flag in \
                zip(ids_list, conf_list, pages_list, label_list, entities_to_link):
            if add_flag:
                f_ids_list.append(ids)
                f_conf_list.append(confs)
                f_pages_list.append(pages)
                f_label_list.append(labels)
        return f_ids_list, f_conf_list, f_pages_list, f_label_list

    def define_all_low_conf(self, cand_ent_init, thres):
        all_low_conf = True
        for entity_id in cand_ent_init:
            entity_info_set = cand_ent_init[entity_id]
            for entity_info in entity_info_set:
                if entity_info[0] >= thres:
                    all_low_conf = False
                    break
            if not all_low_conf:
                break
        return all_low_conf

    def correct_tags(self, tags):
        clean_tags = [tag for tag, conf in tags]
        corr_tags, corr_clean_tags = [], []
        for tag, conf in tags:
            if tag in self.related_tags:
                corr_tag_list = self.related_tags[tag]
                for corr_tag in corr_tag_list:
                    if corr_tag not in clean_tags and corr_tag not in corr_clean_tags:
                        corr_tags.append([corr_tag, conf])
                        corr_clean_tags.append(corr_tag)
        return clean_tags, corr_tags, corr_clean_tags

    def unite_dicts(self, cand_ent_init, new_cand_ent_init):
        for entity_id in new_cand_ent_init:
            if entity_id in cand_ent_init:
                for entity_info in new_cand_ent_init[entity_id]:
                    cand_ent_init[entity_id].add(entity_info)
            else:
                cand_ent_init[entity_id] = new_cand_ent_init[entity_id]
        return cand_ent_init

    def process_cand_ent(self, cand_ent_init, entities_and_ids, substr_split, tag, tag_conf, use_tags):
        for title, entity_id, rels, ent_tag, page, label, descr in entities_and_ids:
            if (ent_tag == tag and use_tags) or not use_tags:
                substr_score = self.calc_substr_score(title, substr_split, tag, ent_tag, label)
                cand_ent_init[entity_id].add((substr_score, rels, tag_conf, page, label, descr))
        return cand_ent_init

    def sanitize_substr(self, entity_substr, tag):
        if tag == "person":
            entity_substr_split = entity_substr.split()
            if len(entity_substr_split) > 1 and len(entity_substr_split[-1]) > 1 \
                    and len(entity_substr_split[-2]) == 1:
                entity_substr = entity_substr_split[-1]
        return entity_substr

    def find_exact_match(self, entity_substr, tags, use_tags=True):
        entity_substr = entity_substr.lower()
        entity_substr_split = entity_substr.split()
        cand_ent_init = defaultdict(set)
        for tag, tag_conf in tags:
            entity_substr = self.sanitize_substr(entity_substr, tag)
            query = "SELECT * FROM inverted_index WHERE title MATCH ?;"
            entities_and_ids = []
            try:
                res = self.cur.execute(query, (entity_substr,))
                entities_and_ids = res.fetchall()
            except:
                log.info(f"error in query execute {query}")
            if entities_and_ids:
                cand_ent_init = self.process_cand_ent(
                    cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf, use_tags)
        return cand_ent_init

    def find_fuzzy_match(self, entity_substr_split, tags, use_tags=True):
        cand_ent_init = defaultdict(set)
        for tag, tag_conf in tags:
            if len(entity_substr_split) > 3:
                entity_substr_split = [" ".join(entity_substr_split[i:i + 2])
                                       for i in range(len(entity_substr_split) - 1)]
            for word in entity_substr_split:
                if len(word) > 1 and word not in self.stopwords:
                    query = "SELECT * FROM inverted_index WHERE title MATCH ?;"
                    part_entities_and_ids = []
                    try:
                        res = self.cur.execute(query, (word,))
                        part_entities_and_ids = res.fetchall()
                    except:
                        log.info(f"error in query execute {query}")
                    if part_entities_and_ids:
                        cand_ent_init = self.process_cand_ent(
                            cand_ent_init, part_entities_and_ids, entity_substr_split, tag, tag_conf, use_tags)
        return cand_ent_init

    def match_tokens(self, entity_substr_split, label_tokens):
        cnt = 0.0
        if not (len(entity_substr_split) > 1 and len(label_tokens) > 1
                and set(entity_substr_split) != set(label_tokens)
                and label_tokens[0] != label_tokens[-1]
                and ((entity_substr_split[0] == label_tokens[-1]) or (entity_substr_split[-1] == label_tokens[0]))):
            for ent_tok in entity_substr_split:
                found = False
                for label_tok in label_tokens:
                    if label_tok == ent_tok:
                        found = True
                        break
                if found:
                    cnt += 1.0
                else:
                    for label_tok in label_tokens:
                        if label_tok[:2] == ent_tok[:2]:
                            fuzz_score = fuzz.ratio(label_tok, ent_tok)
                            c_long_toks = len(label_tok) >= 8 and label_tok[:6] == ent_tok[:6] and fuzz_score > 70.0
                            c_shrt_toks = len(label_tokens) > 2 and len(label_tok) > 3 and label_tok[:4] == ent_tok[:4]
                            if (fuzz_score >= 75.0 or c_long_toks or c_shrt_toks) and not found:
                                cnt += fuzz_score * 0.01
                                break
        substr_score = round(cnt / max(len(label_tokens), len(entity_substr_split)), 3)
        if len(label_tokens) == 2 and len(entity_substr_split) == 1:
            if entity_substr_split[0] == label_tokens[1]:
                substr_score = 0.5
            elif entity_substr_split[0] == label_tokens[0]:
                substr_score = 0.3
        return substr_score

    def correct_substr_score(self, entity_substr_split, label_tokens, substr_score):
        if sum([len(tok) == 1 for tok in entity_substr_split]) == 2 and len(label_tokens) >= 2 \
                and any([(len(tok) == 2 and re.findall(r"[a-z]{2}", tok)) for tok in label_tokens]):
            new_label_tokens = []
            for tok in label_tokens:
                if len(tok) == 2 and re.findall(r"[a-z]{2}", tok):
                    new_label_tokens.append(tok[0])
                    new_label_tokens.append(tok[1])
                else:
                    new_label_tokens.append(tok)
            label_tokens = new_label_tokens
        if any([re.findall(r"[\d]{4}", tok) for tok in entity_substr_split]) \
                and any([re.findall(r"[\d]{4}–[\d]{2}", tok) for tok in label_tokens]):
            new_label_tokens = []
            for tok in label_tokens:
                if re.findall(r"[\d]{4}–[\d]{2}", tok):
                    new_label_tokens.append(tok[:4])
                    new_label_tokens.append(tok[5:])
                else:
                    new_label_tokens.append(tok)
            label_tokens = new_label_tokens
        new_substr_score = self.match_tokens(entity_substr_split, label_tokens)
        substr_score = max(substr_score, new_substr_score)
        return substr_score

    def calc_substr_score(self, entity_title, entity_substr_split, tag, ent_tag, entity_label):
        if self.lang == "@ru":
            entity_title = entity_title.replace("ё", "е")
        label_tokens = entity_title.split()
        substr_score = self.match_tokens(entity_substr_split, label_tokens)
        substr_score = self.correct_substr_score(entity_substr_split, label_tokens, substr_score)
        if re.findall(r" \(.*\)", entity_label):
            entity_label_split = entity_label.replace("(", "").replace(")", "").lower().split()
            lbl_substr_score = self.match_tokens(entity_substr_split, entity_label_split)
            substr_score = max(substr_score, lbl_substr_score)
        if tag == ent_tag and tag.lower() == "person" and len(entity_substr_split) > 1 \
                and len(entity_substr_split[-1]) > 1 and len(entity_substr_split[-2]) == 1 \
                and len(label_tokens) == len(entity_substr_split):
            cnt = 0.0
            for j in range(len(label_tokens) - 1):
                if label_tokens[j][0] == entity_substr_split[j][0]:
                    cnt += 1.0
            if label_tokens[-1] == entity_substr_split[-1]:
                cnt += 1.0
            new_substr_score = cnt / len(label_tokens)
            substr_score = max(substr_score, new_substr_score)
        if entity_title.lower() == entity_label.lower() and substr_score == 1.0:
            substr_score = substr_score * self.alias_coef
        return substr_score

    def rank_by_description(
        self,
        entity_substr_list: List[str],
        tags_list: List[str],
        entity_offsets_list: List[List[int]],
        cand_ent_list: List[List[str]],
        cand_ent_descr_list: List[List[str]],
        entities_scores_list: List[Dict[str, Tuple[int, float]]],
        sentences_list: List[str],
        sentences_offsets_list: List[Tuple[int, int]],
        substr_lens: List[int],
        scores_dict: Dict[str, int] = None,
    ) -> Tuple[List[Union[Union[float, List[Any], List[Union[float, Any]]], Any]],
               List[Union[Union[tuple, List[tuple], List[Any], List[Tuple[Union[float, Any], ...]]], Any]]]:
        entity_ids_list = []
        conf_list = []
        contexts = []
        for entity_offset in entity_offsets_list:
            context, sentence = "", ""
            if len(entity_offset) == 2:
                entity_start_offset, entity_end_offset = entity_offset
                rel_start_offset = 0
                rel_end_offset = 0
                found_sentence_num = 0
                for num, (sent, (sent_start_offset, sent_end_offset)) in enumerate(
                    zip(sentences_list, sentences_offsets_list)
                ):
                    if entity_start_offset >= sent_start_offset and entity_end_offset <= sent_end_offset:
                        sentence = sent
                        found_sentence_num = num
                        rel_start_offset = entity_start_offset - sent_start_offset
                        rel_end_offset = entity_end_offset - sent_start_offset
                        break
                if sentence:
                    start_of_sentence = 0
                    end_of_sentence = len(sentence)
                    if len(sentence) > self.max_text_len:
                        start_of_sentence = max(rel_start_offset - self.max_text_len // 2, 0)
                        end_of_sentence = min(rel_end_offset + self.max_text_len // 2, len(sentence))
                    text_before = sentence[start_of_sentence:rel_start_offset]
                    text_after = sentence[rel_end_offset:end_of_sentence]
                    context = text_before + "[ENT]" + text_after
                    if self.full_paragraph:
                        cur_sent_len = len(re.findall(self.re_tokenizer, context))
                        first_sentence_num = found_sentence_num
                        last_sentence_num = found_sentence_num
                        context = [context]
                        while True:
                            added = False
                            if last_sentence_num < len(sentences_list) - 1:
                                sentence_tokens = re.findall(self.re_tokenizer,
                                                             sentences_list[last_sentence_num + 1])
                                last_sentence_len = len(sentence_tokens)
                                if cur_sent_len + last_sentence_len < self.max_paragraph_len:
                                    context.append(sentences_list[last_sentence_num + 1])
                                    cur_sent_len += last_sentence_len
                                    last_sentence_num += 1
                                    added = True
                            if first_sentence_num > 0:
                                sentence_tokens = re.findall(self.re_tokenizer,
                                                             sentences_list[first_sentence_num - 1])
                                first_sentence_len = len(sentence_tokens)
                                if cur_sent_len + first_sentence_len < self.max_paragraph_len:
                                    context = [sentences_list[first_sentence_num - 1]] + context
                                    cur_sent_len += first_sentence_len
                                    first_sentence_num -= 1
                                    added = True
                            if not added:
                                break
                        context = " ".join(context)
            log.debug(f"rank, context: {context}")
            contexts.append(context)

        if self.use_descriptions:
            scores_list = self.entity_ranker(contexts, cand_ent_list, cand_ent_descr_list)
        else:
            scores_list = [[(entity_id, 1.0) for entity_id in cand_ent] for cand_ent in cand_ent_list]

        for entity_substr, tag, context, candidate_entities, substr_len, entities_scores, scores in zip(
            entity_substr_list, tags_list, contexts, cand_ent_list, substr_lens, entities_scores_list, scores_list
        ):
            entities_with_scores = []
            max_conn_score = 0
            if scores_dict and scores:
                max_conn_score = max([scores_dict.get(entity, 0) for entity, _ in scores])
            for entity, score in scores:
                substr_score = round(entities_scores.get(entity, (0.0, 0))[0], 2)
                num_rels = entities_scores.get(entity, (0.0, 0))[1]
                if len(context.split()) < 4:
                    score = 0.95
                elif scores_dict and 0 < max_conn_score == scores_dict.get(entity, 0):
                    score = 1.0
                    num_rels = 200
                entities_with_scores.append((entity, substr_score, num_rels, float(score)))
            if tag == "t":
                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[3]), reverse=True)
            else:
                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[3], x[2]), reverse=True)
            log.debug(f"{entity_substr} --- tag: {tag} --- entities_with_scores: {entities_with_scores}")

            if not entities_with_scores:
                top_entities = []
                top_conf = []
            elif entities_with_scores and substr_len == 1 and entities_with_scores[0][1] < 1.0:
                top_entities = []
                top_conf = []
            elif entities_with_scores and (
                entities_with_scores[0][1] < 0.3
                or (entities_with_scores[0][3] < 0.13 and entities_with_scores[0][2] < 20)
                or (entities_with_scores[0][3] < 0.3 and entities_with_scores[0][2] < 4)
                or entities_with_scores[0][1] < 0.6
            ):
                top_entities = []
                top_conf = []
            else:
                top_entities = [score[0] for score in entities_with_scores]
                top_conf = [score[1:] for score in entities_with_scores]

            high_conf_entities = []
            high_conf_nums = []
            for elem_num, (entity, conf) in enumerate(zip(top_entities, top_conf)):
                if len(conf) == 3 and conf[0] >= 1.0 and conf[1] > 50 and conf[2] > 0.3:
                    new_conf = list(conf)
                    if new_conf[1] > 55:
                        new_conf[2] = 1.0
                    new_conf = tuple(new_conf)
                    high_conf_entities.append((entity,) + new_conf)
                    high_conf_nums.append(elem_num)
            high_conf_entities = sorted(high_conf_entities, key=lambda x: (x[1], x[3], x[2]), reverse=True)
            log.debug(f"high_conf_entities: {high_conf_entities}")
            for n, elem_num in enumerate(high_conf_nums):
                if 0 <= elem_num - n < len(top_entities):
                    del top_entities[elem_num - n]
                    del top_conf[elem_num - n]
            top_entities = [elem[0] for elem in high_conf_entities] + top_entities
            top_conf = [elem[1:] for elem in high_conf_entities] + top_conf

            if not top_entities:
                entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[3]), reverse=True)
                top_entities = [score[0] for score in entities_with_scores]
                top_conf = [score[1:] for score in entities_with_scores]

            if self.num_entities_to_return == 1 and top_entities:
                entity_ids_list.append(top_entities[0])
                conf_list.append([round(cnf, 2) for cnf in top_conf[0]])
            elif self.num_entities_to_return == "max":
                if top_conf:
                    max_conf = top_conf[0][0]
                    max_rank_conf = top_conf[0][2]
                    entity_ids, confs = [], []
                    for entity_id, conf in zip(top_entities, top_conf):
                        if (conf[0] >= max_conf * 0.9 and max_rank_conf <= 1.0) \
                                or (max_rank_conf == 1.0 and conf[2] == 1.0):
                            entity_ids.append(entity_id)
                            confs.append([round(cnf, 2) for cnf in conf])
                    entity_ids_list.append(entity_ids)
                    conf_list.append(confs)
                else:
                    entity_ids_list.append([])
                    conf_list.append([])
            else:
                entity_ids_list.append(top_entities[: self.num_entities_to_return])
                conf_list.append([[round(cnf, 2) for cnf in conf]
                                  for conf in top_conf[: self.num_entities_to_return]])
            log.debug(f"{entity_substr} --- top entities {entity_ids_list[-1]} --- top_conf {conf_list[-1]}")
        return entity_ids_list, conf_list

    def sort_out_low_conf(self, entity_substr, top_entities, top_conf):
        if len(entity_substr.split()) > 1 and top_conf:
            f_top_entities, f_top_conf = [], []
            for top_conf_thres, conf_thres in [(1.0, 0.9), (0.9, 0.8)]:
                if top_conf[0][0] >= top_conf_thres:
                    for ent, conf in zip(top_entities, top_conf):
                        if conf[0] > conf_thres:
                            f_top_entities.append(ent)
                            f_top_conf.append(conf)
                    return f_top_entities, f_top_conf
        return top_entities, top_conf

    def rank_by_connections(self, ids_list):
        objects_sets_dict, scores_dict, conn_dict = {}, {}, {}
        for ids in ids_list:
            for entity_id in ids:
                scores_dict[entity_id] = 0
                conn_dict[entity_id] = set()
        for ids in ids_list:
            for entity_id in ids[:self.num_entities_for_conn_ranking]:
                objects = set()
                for prefix in self.prefixes["entity"]:
                    tr, _ = self.kb.search_triples(f"{prefix}/{entity_id}", "", "")
                    for subj, rel, obj in tr:
                        if rel.split("/")[-1] not in {"P31", "P279"}:
                            if any([obj.startswith(pr) for pr in self.prefixes["entity"]]):
                                objects.add(obj.split("/")[-1])
                            if rel.startswith(self.prefixes["rels"]["no_type"]):
                                tr2, _ = self.kb.search_triples(obj, "", "")
                                for _, rel2, obj2 in tr2:
                                    if rel2.startswith(self.prefixes["rels"]["statement"]) \
                                            or rel2.startswith(self.prefixes["rels"]["qualifier"]):
                                        if any([obj2.startswith(pr) for pr in self.prefixes["entity"]]):
                                            objects.add(obj2.split("/")[-1])
                objects_sets_dict[entity_id] = objects
                for obj in objects:
                    if obj not in objects_sets_dict:
                        objects_sets_dict[obj] = set()
                    objects_sets_dict[obj].add(entity_id)
        for i in range(len(ids_list)):
            for j in range(len(ids_list)):
                if i != j:
                    for entity_id1 in ids_list[i][:self.num_entities_for_conn_ranking]:
                        for entity_id2 in ids_list[j][:self.num_entities_for_conn_ranking]:
                            if entity_id1 in objects_sets_dict[entity_id2]:
                                conn_dict[entity_id1].add(entity_id2)
                                conn_dict[entity_id2].add(entity_id1)
        for entity_id in conn_dict:
            scores_dict[entity_id] = len(conn_dict[entity_id])
        return scores_dict
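

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the library source): the load path,
# database filename, and example text below are placeholders; running it
# requires a downloaded entity-linking SQLite database with an
# `inverted_index` table and the corresponding spaCy model.
if __name__ == "__main__":
    linker = EntityLinker(
        load_path="~/.deeppavlov/downloads/entity_linking_eng",  # placeholder path
        entities_database_filename="el_db_eng.sqlite",  # placeholder filename
        lang="en",
        num_entities_to_return=3,
        use_descriptions=False,  # no entity_ranker component is passed in this sketch
    )
    ids, confs, pages, labels = linker(
        [["Forrest Gump", "Robert Zemeckis"]],  # entity substrings per document
        [["work_of_art", "person"]],  # NER tags per substring
        [[1.0, 1.0]],  # tag probabilities per substring
        [["Forrest Gump is a 1994 film directed by Robert Zemeckis."]],  # sentences per document
    )
    print(ids, confs, pages, labels)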