from typing import Optional

import numpy as np
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.log import get_logger

log = get_logger(__name__)


class EmbDict(object):
"""The class that provides token (word) embeddings.
Args:
save_path: A path including filename to store the instance of
:class:`deeppavlov.models.ranking.ranking_network.RankingNetwork`.
load_path: A path including filename to load the instance of
:class:`deeppavlov.models.ranking.ranking_network.RankingNetwork`.
max_sequence_length: A maximum length of a sequence in tokens.
Longer sequences will be truncated and shorter ones will be padded.
seed: Random seed.
embeddings: A type of embeddings. Possible values are ``fasttext``, ``word2vec`` and ``random``.
embeddings_path: A path to an embeddings model including filename.
The type of the model should coincide with the type of embeddings defined by the ``embeddings`` parameter.
embedding_dim: Dimensionality of token (word) embeddings.
use_matrix: Whether to use trainable matrix with token (word) embeddings.
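
    Example:
        A minimal construction sketch; the paths and the model filename below
        are hypothetical::

            emb_dict = EmbDict(save_path="./model/emb/int2emb.npy",
                               load_path="./model/emb/int2emb.npy",
                               embeddings_path="./downloads/wiki.en.bin",
                               max_sequence_length=28,
                               embeddings="fasttext")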
"""

    def __init__(self,
                 save_path: str,
                 load_path: str,
                 embeddings_path: str,
                 max_sequence_length: int,
                 embedding_dim: int = 300,
                 embeddings: str = "word2vec",
                 seed: Optional[int] = None,
                 use_matrix: bool = False):
        np.random.seed(seed)
        # Only the parent directories of the given paths are used: the matrix
        # itself is always stored under the fixed name "int2emb.npy".
        save_path = expand_path(save_path).resolve().parent
        load_path = expand_path(load_path).resolve().parent
        self.int2emb_save_path = save_path / "int2emb.npy"
        self.int2emb_load_path = load_path / "int2emb.npy"
        self.embeddings = embeddings
        self.embedding_dim = embedding_dim
        self.max_sequence_length = max_sequence_length
        self.emb_model_file = expand_path(embeddings_path)
        self.use_matrix = use_matrix
        self.emb_matrix = None

    def init_from_scratch(self, tok2int_vocab):
        """Load an embeddings model and build the embeddings matrix for the vocabulary."""
        if self.embeddings == "fasttext":
            self.embeddings_model = FastText.load_fasttext_format(str(self.emb_model_file))
        elif self.embeddings == "word2vec":
            self.embeddings_model = KeyedVectors.load_word2vec_format(str(self.emb_model_file),
                                                                      binary=True)
        elif self.embeddings == "random":
            # Sample a random vector for every token in the vocabulary.
            self.embeddings_model = {el: np.random.uniform(-0.6, 0.6, self.embedding_dim)
                                     for el in tok2int_vocab.keys()}
        else:
            raise ValueError("unsupported embeddings type: {}".format(self.embeddings))
        log.info("[initializing new `{}`]".format(self.__class__.__name__))
        self.build_emb_matrix(tok2int_vocab)

    def load(self):
        """Initialize the embeddings matrix from the file."""
        if not self.use_matrix:
            log.info("[initializing `{}` from saved]".format(self.__class__.__name__))
            if self.int2emb_load_path.is_file():
                self.emb_matrix = np.load(self.int2emb_load_path)

    def save(self):
        """Save the embeddings matrix to the file."""
        if not self.use_matrix:
            log.info("[saving `{}`]".format(self.__class__.__name__))
            if not self.int2emb_save_path.is_file():
                np.save(self.int2emb_save_path, self.emb_matrix)

    def build_emb_matrix(self, tok2int_vocab):
        """Fill ``self.emb_matrix`` with an embedding vector for every token index."""
        self.emb_matrix = np.zeros((len(tok2int_vocab), self.embedding_dim))
        for tok, i in tok2int_vocab.items():
            if tok == '<UNK>':
                self.emb_matrix[i] = np.random.uniform(-0.6, 0.6, self.embedding_dim)
            else:
                try:
                    self.emb_matrix[i] = self.embeddings_model[tok]
                except KeyError:
                    # Out-of-vocabulary token: fall back to a random vector.
                    self.emb_matrix[i] = np.random.uniform(-0.6, 0.6, self.embedding_dim)
        # The source model is no longer needed once the matrix is built.
        del self.embeddings_model

    def get_embs(self, ints):
        """Map a batch of padded token-index sequences to embedding tensors.

        Returns an array of shape
        ``(batch_size, max_sequence_length, embedding_dim)``.
        """
        embs = []
        for el in ints:
            emb = []
            for int_tok in el:
                assert isinstance(int_tok, (int, np.integer))
                emb.append(self.emb_matrix[int_tok])
            emb = np.vstack(emb)
            embs.append(emb)
        embs = [np.reshape(el, (1, self.max_sequence_length, self.embedding_dim)) for el in embs]
        embs = np.vstack(embs)
        return embs
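

# A minimal end-to-end sketch, assuming a toy vocabulary and the "random"
# embeddings backend (which needs no pretrained model file). The paths below
# are hypothetical and only their parent directories matter; no file I/O
# happens until ``save()``/``load()`` are called.
if __name__ == "__main__":
    tok2int = {"<UNK>": 0, "hello": 1, "world": 2}
    emb_dict = EmbDict(save_path="./tmp/emb/int2emb.npy",       # hypothetical path
                       load_path="./tmp/emb/int2emb.npy",       # hypothetical path
                       embeddings_path="./tmp/emb/unused.bin",  # not read for "random"
                       max_sequence_length=3,
                       embedding_dim=8,
                       embeddings="random",
                       seed=42)
    emb_dict.init_from_scratch(tok2int)
    batch = emb_dict.get_embs([[1, 2, 0]])  # one padded sequence of three token ids
    print(batch.shape)  # -> (1, 3, 8)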