Source code for deeppavlov.models.preprocessors.assemble_embeddings_matrix

import numpy as np
from deeppavlov.core.common.registry import register
from sklearn.decomposition import PCA


@register('emb_mat_assembler')
class EmbeddingsMatrixAssembler:
    """Assembles matrix of embeddings obtained from some embedder.

    Row ``n`` of :attr:`emb_mat` holds the vector for the ``n``-th item produced
    by iterating over ``vocab``.

    Args:
        embedder: object that is iterable over its tokens, callable on a batch
            of tokenized samples (``embedder([[token]])[0][0]`` is taken as the
            vector of ``token``) and exposes a ``dim`` attribute.
        vocab: sized iterable of tokens (or of characters when
            ``character_level`` is true).
        character_level: when true, the vector of a vocab item is the mean of
            the vectors of the sampled embedder tokens that contain that item
            (``item in token``).
        emb_dim: number of columns of the resulting matrix; defaults to
            ``embedder.dim``. When smaller than ``embedder.dim`` the vectors are
            projected with a PCA fitted on the sampled embedder tokens.
        estimate_by_n: how many of the first tokens yielded by ``embedder`` are
            used to estimate statistics (standard deviation, PCA, character
            averages).
        *args: accepted and ignored.
        **kwargs: accepted and ignored.

    Attributes:
        emb_mat: ``float32`` array of shape ``(len(vocab), emb_dim)``.

    Raises:
        RuntimeError: if ``emb_dim`` is greater than ``embedder.dim``.
    """
    def __init__(self, embedder, vocab, character_level=False, emb_dim=None, estimate_by_n=10000, *args, **kwargs):
        if emb_dim is None:
            emb_dim = embedder.dim
        # Fail fast: vectors can only be reduced, never enlarged, so reject the
        # request before any embedding is computed.
        if emb_dim > embedder.dim:
            raise RuntimeError(f'Model dimension must be greater then requsted embeddings '
                               f'dimension! model_dim = {embedder.dim}, requested_dim = {emb_dim}')

        self.emb_mat = np.zeros([len(vocab), emb_dim], dtype=np.float32)
        tokens_for_estimation = list(embedder)[:estimate_by_n]
        estimation_matrix = np.array([embedder([[word]])[0][0] for word in tokens_for_estimation], dtype=np.float32)
        # Scale used for randomly initialised rows of unknown tokens.
        emb_std = np.std(estimation_matrix)

        pca = None
        if emb_dim < embedder.dim:
            pca = PCA(n_components=emb_dim)
            pca.fit(estimation_matrix)

        for n, token in enumerate(vocab):
            if character_level:
                # Select the sampled tokens that contain this character.
                char_in_word_bool = np.array([token in word for word in tokens_for_estimation], dtype=bool)
                all_words_with_character = estimation_matrix[char_in_word_bool]
                if len(all_words_with_character) != 0:
                    if pca is not None:
                        all_words_with_character = pca.transform(all_words_with_character)
                    self.emb_mat[n] = all_words_with_character.mean(axis=0)
                else:
                    # Unseen character: random row scaled like the rows filled so
                    # far. With no previous rows (n == 0) that std is NaN, so the
                    # embedder-wide std is used instead.
                    scale = np.std(self.emb_mat[:n]) if n > 0 else emb_std
                    self.emb_mat[n] = np.random.randn(emb_dim) * scale
            else:
                try:
                    token_vectors = embedder([[token]])[0]
                except KeyError:
                    # A KeyError from the embedder is treated as "unknown token":
                    # use a random row scaled by the embedder-wide std.
                    self.emb_mat[n] = np.random.randn(emb_dim) * emb_std
                    continue
                if pca is not None:
                    token_vectors = pca.transform(token_vectors)
                self.emb_mat[n] = token_vectors[0]

    @property
    def dim(self):
        """Number of columns of :attr:`emb_mat`."""
        return self.emb_mat.shape[1]


@register('random_emb_mat')
class RandomEmbeddingsMatrix:
    """Assembles matrix of random embeddings.

    Args:
        vocab_len: number of rows of the matrix.
        emb_dim: number of columns of the matrix.
        *args: accepted and ignored.
        **kwargs: accepted and ignored.

    Attributes:
        emb_mat: ``float32`` array of shape ``(vocab_len, emb_dim)`` holding
            standard normal samples divided by ``sqrt(emb_dim)``.
    """
    def __init__(self, vocab_len, emb_dim, *args, **kwargs):
        # Scale first and cast last: ``np.sqrt`` yields a float64 scalar, and
        # dividing an already-cast float32 array by it promotes the result back
        # to float64 under NumPy >= 2 promotion rules.
        scaled = np.random.randn(vocab_len, emb_dim) / np.sqrt(emb_dim)
        self.emb_mat = scaled.astype(np.float32)