Source code for deeppavlov.models.preprocessors.assemble_embeddings_matrix

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from sklearn.decomposition import PCA

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.models.embedders.abstract_embedder import Embedder


@register('emb_mat_assembler')
class EmbeddingsMatrixAssembler:
    """For a given vocabulary, assembles a matrix of embeddings obtained from some `Embedder`.

    This class can also assemble embeddings of characters, using the averaged embeddings
    of the words that contain each character.

    Args:
        embedder: an instance of the class that converts tokens to vectors.
            For example :class:`~deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder` or
            :class:`~deeppavlov.models.embedders.glove_embedder.GloVeEmbedder`
        vocab: instance of :class:`~deeppavlov.core.data.simple_vocab.SimpleVocabulary`.
            The matrix of embeddings will be assembled using every token in the
            vocabulary; the row indexing will match the vocabulary indexing.
        character_level: whether to perform assembling on the character level. This
            procedure will assemble a matrix with an embedding for every character,
            computed as the average of the embeddings of the words that contain
            this character.
        emb_dim: dimensionality of the resulting embeddings. If not `None`, it must be
            less than or equal to the dimensionality of the embeddings provided by
            `Embedder`. The reduction of dimensionality is performed by keeping the
            principal components found by PCA.
        estimate_by_n: how many samples to use to estimate the covariance matrix for
            PCA. 10000 seems to be enough.

    Attributes:
        dim: dimensionality of the embeddings (can be less than the dimensionality of
            the embeddings produced by `Embedder`).
    """

    def __init__(self,
                 embedder: Embedder,
                 vocab: SimpleVocabulary,
                 character_level: bool = False,
                 emb_dim: int = None,
                 estimate_by_n: int = 10000,
                 *args, **kwargs) -> None:
        if emb_dim is None:
            emb_dim = embedder.dim
        self.emb_mat = np.zeros([len(vocab), emb_dim], dtype=np.float32)

        # Sample tokens from the embedder to estimate the embedding statistics
        # (standard deviation for random initialization, covariance for PCA)
        tokens_for_estimation = list(embedder)[:estimate_by_n]
        estimation_matrix = np.array([embedder([[word]])[0][0] for word in tokens_for_estimation],
                                     dtype=np.float32)
        emb_std = np.std(estimation_matrix)

        if emb_dim < embedder.dim:
            pca = PCA(n_components=emb_dim)
            pca.fit(estimation_matrix)
        elif emb_dim > embedder.dim:
            raise RuntimeError(f'Model dimension must be greater than or equal to the requested '
                               f'embeddings dimension! model_dim = {embedder.dim}, '
                               f'requested_dim = {emb_dim}')
        else:
            pca = None

        for n, token in enumerate(vocab):
            if character_level:
                # Average the embeddings of all estimation words containing this character
                char_in_word_bool = np.array([token in word for word in tokens_for_estimation],
                                             dtype=bool)
                all_words_with_character = estimation_matrix[char_in_word_bool]
                if len(all_words_with_character) != 0:
                    if pca is not None:
                        all_words_with_character = pca.transform(all_words_with_character)
                    self.emb_mat[n] = sum(all_words_with_character) / len(all_words_with_character)
                else:
                    self.emb_mat[n] = np.random.randn(emb_dim) * np.std(self.emb_mat[:n])
            else:
                try:
                    if pca is not None:
                        self.emb_mat[n] = pca.transform(embedder([[token]])[0])[0]
                    else:
                        self.emb_mat[n] = embedder([[token]])[0][0]
                except KeyError:
                    # Out-of-vocabulary token: draw a random vector with matching scale
                    self.emb_mat[n] = np.random.randn(emb_dim) * emb_std

    @property
    def dim(self):
        return self.emb_mat.shape[1]
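
Since this page shows only the source, a minimal usage sketch may help. The ``ToyEmbedder`` below is a hypothetical stand-in (not part of DeepPavlov) that implements just the interface the assembler relies on: a ``dim`` attribute, iteration over known tokens, batch-style calls ``embedder([[token]])``, and a ``KeyError`` for unknown tokens. In practice you would pass a real ``Embedder`` (e.g. ``FasttextEmbedder``) and a fitted ``SimpleVocabulary``; a plain list of tokens is used here only because the assembler needs nothing beyond ``len()`` and iteration from its vocabulary.

import numpy as np

from deeppavlov.models.preprocessors.assemble_embeddings_matrix import EmbeddingsMatrixAssembler


class ToyEmbedder:
    """Hypothetical stand-in exposing the minimal interface the assembler uses."""

    def __init__(self):
        rng = np.random.default_rng(0)
        self.dim = 8  # dimensionality of the "pretrained" vectors
        self._vectors = {w: rng.standard_normal(self.dim).astype(np.float32)
                         for w in ['the', 'cat', 'sat', 'mat']}

    def __iter__(self):
        # Iterating over the embedder yields its known tokens
        return iter(self._vectors)

    def __call__(self, batch):
        # Batch of utterances in, batch of vector lists out; KeyError for OOV tokens
        return [[self._vectors[token] for token in utt] for utt in batch]


vocab = ['<UNK>', 'the', 'cat', 'dog']  # stands in for a fitted SimpleVocabulary
assembler = EmbeddingsMatrixAssembler(ToyEmbedder(), vocab, emb_dim=4)
print(assembler.emb_mat.shape)  # (4, 4): one row per vocab entry, PCA-reduced from 8 to 4 dims
print(assembler.dim)            # 4

Known tokens ('the', 'cat') get their PCA-projected vectors; OOV tokens ('<UNK>', 'dog') hit the ``KeyError`` branch and receive random rows scaled by the estimated standard deviation.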