# Source code for deeppavlov.models.ranking.tfidf_ranker

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict, Any, Tuple

import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.common.log import get_logger
from deeppavlov.core.models.estimator import Estimator
from deeppavlov.models.vectorizers.hashing_tfidf_vectorizer import HashingTfIdfVectorizer
from deeppavlov.core.data.data_fitting_iterator import DataFittingIterator

logger = get_logger(__name__)


@register("tfidf_ranker")
class TfidfRanker(Estimator):
    """Rank documents according to input strings.

    Args:
        vectorizer: a vectorizer class
        top_n: a number of doc ids to return
        active: whether to return a number specified by :attr:`top_n`
            (``True``) or all ids (``False``)

    Attributes:
        top_n: a number of doc ids to return
        vectorizer: an instance of vectorizer class
        active: whether to return a number specified by :attr:`top_n` or all ids
        tfidf_matrix: a loaded tfidf matrix
        ngram_range: ngram range used when tfidf matrix was created
        hash_size: hash size of the tfidf matrix
        term_freqs: a dictionary with tfidf terms and their frequences
        doc_index: a dictionary of doc ids and corresponding doc titles
        index2doc: inverted :attr:`doc_index`
        iterator: a dataset iterator used for generating batches while
            fitting the vectorizer

    """

    def __init__(self, vectorizer: HashingTfIdfVectorizer, top_n: int = 5,
                 active: bool = True, **kwargs) -> None:
        self.top_n = top_n
        self.vectorizer = vectorizer
        self.active = active

        # NOTE(review): original read kwargs['mode'], which raises KeyError
        # when no 'mode' kwarg is supplied; .get() keeps identical behavior
        # for every caller that does pass 'mode' while removing the crash.
        if kwargs.get('mode') != 'train':
            if self.vectorizer.load_path.exists():
                self.tfidf_matrix, opts = self.vectorizer.load()
                self.ngram_range = opts['ngram_range']
                self.hash_size = opts['hash_size']
                self.term_freqs = opts['term_freqs'].squeeze()
                self.doc_index = opts['doc_index']

                # share the loaded state with the vectorizer so that queries
                # are hashed against the same vocabulary as the matrix
                self.vectorizer.doc_index = self.doc_index
                self.vectorizer.term_freqs = self.term_freqs
                self.vectorizer.hash_size = self.hash_size

                self.index2doc = self.get_index2doc()
            else:
                self.iterator = None
                logger.warning("TfidfRanker load_path doesn't exist, is waiting for training.")

    def get_main_component(self) -> 'TfidfRanker':
        """Temporary stub to run REST API

        Returns:
            self
        """
        return self

    def get_index2doc(self) -> Dict[Any, int]:
        """Invert doc_index.

        Returns:
            inverted doc_index dict
        """
        # explicit comprehension instead of dict(zip(values(), keys())),
        # which relied on the two views iterating in matching order
        return {index: doc for doc, index in self.doc_index.items()}

    def __call__(self, questions: List[str]) -> Tuple[List[Any], List[float]]:
        """Rank documents and return top n document titles with scores.

        Args:
            questions: list of queries used in ranking

        Returns:
            a tuple of selected doc ids and their scores
        """
        batch_doc_ids, batch_docs_scores = [], []

        q_tfidfs = self.vectorizer(questions)

        for q_tfidf in q_tfidfs:
            scores = q_tfidf * self.tfidf_matrix
            # add a small value to eliminate zero scores
            scores = np.squeeze(scores.toarray() + 0.0001)

            # when inactive, return scores for the whole document index
            thresh = self.top_n if self.active else len(self.doc_index)

            # argpartition needs kth < len(scores); fall back to a full
            # partition when the requested threshold covers all documents
            if thresh >= len(scores):
                o = np.argpartition(-scores, len(scores) - 1)[0:thresh]
            else:
                o = np.argpartition(-scores, thresh)[0:thresh]

            # order the selected candidates by descending score
            o_sort = o[np.argsort(-scores[o])]
            doc_scores = scores[o_sort]
            doc_ids = [self.index2doc[i] for i in o_sort]

            batch_doc_ids.append(doc_ids)
            batch_docs_scores.append(doc_scores)

        return batch_doc_ids, batch_docs_scores

    def fit_batches(self, iterator: DataFittingIterator, batch_size: int) -> None:
        """Generate a batch to be fit to a vectorizer.

        Args:
            iterator: an instance of an iterator class
            batch_size: a size of a generated batch

        Returns:
            None
        """
        self.vectorizer.doc_index = iterator.doc2index
        for x, y in iterator.gen_batches(batch_size):
            self.vectorizer.fit_batch(x, y)

    def fit(self) -> None:
        """Pass method to :class:`Chainer`.

        Returns:
            None
        """
        pass

    def save(self) -> None:
        """Pass method to :attr:`vectorizer`.

        Returns:
            None
        """
        self.vectorizer.save()

    def load(self) -> None:
        """Pass method to :attr:`vectorizer`.

        Returns:
            None
        """
        self.vectorizer.load()