Source code for deeppavlov.models.preprocessors.assemble_embeddings_matrix

import numpy as np
from deeppavlov.core.common.registry import register
from sklearn.decomposition import PCA


@register('emb_mat_assembler')
class EmbeddingsMatrixAssembler:
    """Assembles matrix of embeddings obtained from some embedder.

    Row ``n`` of :attr:`emb_mat` holds the vector for the ``n``-th item
    produced by iterating over ``vocab``.

    Args:
        embedder: source of embeddings. It is used in three ways: it is
            iterated over to obtain tokens, it is called on a batch
            ``[[token]]`` to embed a single token, and its ``dim`` attribute
            gives the native embedding size.
        vocab: sized iterable of items to build rows for.
        character_level: if true, every ``vocab`` item is embedded as the mean
            of the vectors of the estimation tokens ``word`` for which
            ``item in word`` holds, instead of being looked up directly.
        emb_dim: requested row size. Defaults to ``embedder.dim``. When it is
            smaller than ``embedder.dim``, vectors are reduced with a PCA
            fitted on the estimation tokens.
        estimate_by_n: how many of the embedder's tokens (taken from the start
            of its iteration order) are used to estimate the embedding
            standard deviation, to fit the PCA and, in character-level mode,
            to build the rows.
        *args: ignored.
        **kwargs: ignored.

    Attributes:
        emb_mat: ``float32`` array of shape ``(len(vocab), emb_dim)``.

    Raises:
        RuntimeError: if ``emb_dim`` is greater than ``embedder.dim``.
    """

    def __init__(self, embedder, vocab, character_level: bool = False, emb_dim=None,
                 estimate_by_n: int = 10000, *args, **kwargs) -> None:
        if emb_dim is None:
            emb_dim = embedder.dim
        # Fail fast, before any embeddings are computed.
        if emb_dim > embedder.dim:
            raise RuntimeError(f'Model dimension must be greater than requested embeddings '
                               f'dimension! model_dim = {embedder.dim}, requested_dim = {emb_dim}')

        self.emb_mat = np.zeros([len(vocab), emb_dim], dtype=np.float32)

        tokens_for_estimation = list(embedder)[:estimate_by_n]
        estimation_matrix = np.array([embedder([[word]])[0][0] for word in tokens_for_estimation],
                                     dtype=np.float32)
        # Scale used for randomly initialised rows.
        emb_std = np.std(estimation_matrix)

        pca = None
        if emb_dim < embedder.dim:
            pca = PCA(n_components=emb_dim)
            pca.fit(estimation_matrix)

        for n, token in enumerate(vocab):
            if character_level:
                char_in_word_bool = np.array([token in word for word in tokens_for_estimation],
                                             dtype=bool)
                all_words_with_character = estimation_matrix[char_in_word_bool]
                if len(all_words_with_character) != 0:
                    if pca is not None:
                        all_words_with_character = pca.transform(all_words_with_character)
                    self.emb_mat[n] = all_words_with_character.mean(axis=0)
                else:
                    # No estimation token contains this item: draw a random
                    # row scaled like the rows filled so far. For the very
                    # first row there is nothing to measure (the std of an
                    # empty slice is NaN), so use the embedder-wide estimate.
                    scale = np.std(self.emb_mat[:n]) if n > 0 else emb_std
                    self.emb_mat[n] = np.random.randn(emb_dim) * scale
            else:
                try:
                    token_vectors = embedder([[token]])[0]
                except KeyError:
                    # The embedder signalled a missing token: random row.
                    self.emb_mat[n] = np.random.randn(emb_dim) * emb_std
                else:
                    if pca is not None:
                        # A PCA estimator is not callable; project explicitly.
                        self.emb_mat[n] = pca.transform(token_vectors)[0]
                    else:
                        self.emb_mat[n] = token_vectors[0]

    @property
    def dim(self):
        """Number of columns of :attr:`emb_mat`."""
        return self.emb_mat.shape[1]
@register('random_emb_mat')
class RandomEmbeddingsMatrix:
    """Assembles matrix of random embeddings.

    Args:
        vocab_len: number of rows of the matrix.
        emb_dim: number of columns of the matrix.
        *args: ignored.
        **kwargs: ignored.

    Attributes:
        emb_mat: ``float32`` array of shape ``(vocab_len, emb_dim)`` holding
            standard-normal samples divided by ``sqrt(emb_dim)``.
    """

    def __init__(self, vocab_len, emb_dim, *args, **kwargs) -> None:
        # ``np.sqrt`` returns a float64 scalar. Under NumPy >= 2 promotion
        # rules, dividing a float32 array by it yields float64, which would
        # silently undo the cast below. Keep the scale in float32 as well.
        scale = np.float32(np.sqrt(emb_dim))
        self.emb_mat = np.random.randn(vocab_len, emb_dim).astype(np.float32) / scale