Source code for deeppavlov.models.elmo.elmo

# originally based on https://github.com/allenai/bilm-tf/blob/master/bilm/training.py

# Modifications copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import json
from logging import getLogger
from typing import Optional, List

import numpy as np
import tensorflow as tf
from overrides import overrides

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.nn_model import NNModel
from deeppavlov.models.elmo.bilm_model import LanguageModel
from deeppavlov.models.elmo.elmo2tfhub import export2hub
from deeppavlov.models.elmo.train_utils import average_gradients, clip_grads, safely_str2int, dump_weights

log = getLogger(__name__)


@register('elmo_model')
class ELMo(NNModel):
    """
    The :class:`~deeppavlov.models.elmo.elmo.ELMo` is a deep contextualized word representation that models both
    complex characteristics of word use (e.g., syntax and semantics), and how these uses vary across linguistic
    contexts (i.e., to model polysemy).

    You can use this component for LM training, fine-tuning, dumping ELMo weights to an hdf5 file and wrapping them
    into a TensorFlow Hub module.

    Parameters:
        options_json_path: Path to the json configuration file.
        char_cnn: Options of char_cnn. For example {"activation":"relu","embedding":{"dim":16},
            "filters":[[1,32],[2,32],[3,64],[4,128],[5,256],[6,512],[7,1024]],"max_characters_per_token":50,
            "n_characters":261,"n_highway":2}
        bidirectional: Whether to use a bidirectional language model or not.
        unroll_steps: Number of unrolling steps.
        n_tokens_vocab: A size of a vocabulary.
        lstm: Options of lstm. It is a dict of "cell_clip":int, "dim":int, "n_layers":int, "proj_clip":int,
            "projection_dim":int, "use_skip_connections":bool
        dropout: Dropout rate, values from 0 to 1.
        n_negative_samples_batch: Number of negative samples per batch (used by the sampled softmax loss).
        all_clip_norm_val: Value at which the global norm of the gradients is clipped.
        initial_accumulator_value: Starting value for the accumulators of the Adagrad optimizer.
        learning_rate: Learning rate to use during the training (usually from 0.1 to 0.0001).
        n_gpus: Number of gpu to use.
        seed: Random seed.
        batch_size: A size of a train batch.
        load_epoch_num: An index of the epoch to load.
        epoch_load_path: An epoch loading path relative to save_path.
        epoch_save_path: An epoch saving path relative to save_path.
            If epoch_save_path is None then epoch_save_path = epoch_load_path.
        dumps_save_path: A dump saving path relative to save_path.
        tf_hub_save_path: A tf_hub saving path relative to save_path.

    To train ELMo representations from the paper `Deep contextualized word representations
    <https://arxiv.org/abs/1802.05365>`__ you can use multiple GPUs by setting the ``n_gpus`` parameter.

    You can explicitly specify the path to a json file with the ELMo training hyperparameters via the
    ``options_json_path`` parameter. The json file must have the same format as the json files from the
    `original ELMo implementation <https://github.com/allenai/bilm-tf>`__. Alternatively, you can define the
    architecture using the separate parameters.
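    For illustration only, a direct-parameter specification might look like the sketch below. The numeric values
    (``unroll_steps``, ``n_tokens_vocab``, the ``lstm`` sizes, ``dropout`` and the training options) are
    placeholders rather than recommended hyperparameters, the paths are arbitrary, and ``save_path``/``load_path``
    are the standard DeepPavlov serialization keyword arguments passed through ``**kwargs``:

    .. code:: python

        elmo = ELMo(char_cnn={"activation": "relu", "embedding": {"dim": 16},
                              "filters": [[1, 32], [2, 32], [3, 64], [4, 128],
                                          [5, 256], [6, 512], [7, 1024]],
                              "max_characters_per_token": 50,
                              "n_characters": 261, "n_highway": 2},
                    bidirectional=True,
                    unroll_steps=20,          # placeholder value
                    n_tokens_vocab=100000,    # placeholder value: the size of your vocabulary
                    lstm={"cell_clip": 3, "dim": 4096, "n_layers": 2, "proj_clip": 3,
                          "projection_dim": 512, "use_skip_connections": True},
                    dropout=0.1,
                    n_negative_samples_batch=8192,
                    all_clip_norm_val=10.0,
                    save_path='./elmo_model/saves/model',
                    load_path='./elmo_model/saves/model')

    Instantiating the component builds the TensorFlow graph and saves an initial checkpoint to ``save_path``.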
    Saving the model takes place in directories with the following structure::

        {MODELS_PATH}/
            elmo_model/
                saves/
                    epochs/
                        1/, 2/, ...  # directories of epochs
                    dumps/
                        weights_epoch_n_1.hdf5, weights_epoch_n_2.hdf5, ...  # hdf5 files of dumped ELMo weights
                    hubs/
                        tf_hub_model_epoch_n_1/, tf_hub_model_epoch_n_2/, ...  # directories of tensorflow hub wrapped ELMo

    Intermediate checkpoints are saved to the `saves` directory. To specify load/save paths use ``load_epoch_num``,
    ``epoch_load_path``, ``epoch_save_path``, ``dumps_save_path``, ``tf_hub_save_path``. Dumping and tf_hub wrapping
    of ELMo occur after each epoch.

    Training the LM requires a dataset like the 1 Billion Word Benchmark. You can see how datasets should look from
    the configs of the examples below.

    The vocabulary file is a text file with one token per line, separated by newlines. Each token in the vocabulary
    is cached as the appropriate 50 character id sequence once. It is recommended to always include the special
    ``<S>`` and ``</S>`` tokens (case sensitive) in the vocabulary file.
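    A minimal sketch of writing such a vocabulary file from Python (the token order and the inclusion of an
    ``<UNK>`` token are assumptions that depend on the conventions of your data reader, not requirements fixed by
    this class):

    .. code:: python

        # the special tokens followed by the corpus tokens, most frequent first
        tokens = ['<S>', '</S>', '<UNK>', 'the', 'of', 'and', 'to', 'in']
        with open('vocab.txt', 'w', encoding='utf8') as fout:
            for token in tokens:
                print(token, file=fout)  # one token per line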
    For fine-tuning of the LM on specific data, it is enough to save a base model to the path
    ``{MODELS_PATH}/elmo_model/saves/epochs/0/`` and start training.

    For fine-tuning on specific data you can also use LM models pre-trained on different Russian-language datasets.

    An LM model pre-trained on the `ru-news` dataset ( lines = 63M, tokens = 946M, size = 12GB ) is available via the
    :config:`elmo_lm_ready4fine_tuning_ru_news </elmo/elmo_lm_ready4fine_tuning_ru_news.json>` configuration file or
    the :config:`elmo_lm_ready4fine_tuning_ru_news_simple </elmo/elmo_lm_ready4fine_tuning_ru_news_simple.json>`
    configuration file.

    An LM model pre-trained on the `ru-twitter` dataset ( lines = 104M, tokens = 810M, size = 8.5GB ) is available via
    the :config:`elmo_lm_ready4fine_tuning_ru_twitter </elmo/elmo_lm_ready4fine_tuning_ru_twitter.json>` configuration
    file or the
    :config:`elmo_lm_ready4fine_tuning_ru_twitter_simple </elmo/elmo_lm_ready4fine_tuning_ru_twitter_simple.json>`
    configuration file.

    An LM model pre-trained on the `ru-wiki` dataset ( lines = 1M, tokens = 386M, size = 5GB ) is available via the
    :config:`elmo_lm_ready4fine_tuning_ru_wiki </elmo/elmo_lm_ready4fine_tuning_ru_wiki.json>` configuration file or
    the :config:`elmo_lm_ready4fine_tuning_ru_wiki_simple </elmo/elmo_lm_ready4fine_tuning_ru_wiki_simple.json>`
    configuration file.

    A `simple` configuration file describes a model without the special output-vocabulary tags that were used for the
    initial training.

    .. note::

        You need to download about **4 GB** of data; by default, about **32 GB** of RAM and **10 GB** of GPU memory
        are required to run a :config:`elmo_lm_ready4fine_tuning_ru_* </elmo/>` configuration on one GPU.

    After training you can use ``{MODELS_PATH}/elmo_model/saves/hubs/tf_hub_model_epoch_n_*/`` as a ``ModuleSpec``
    with `TensorFlow Hub <https://www.tensorflow.org/hub/overview>`__ or with the DeepPavlov
    :class:`~deeppavlov.models.embedders.elmo_embedder.ELMoEmbedder`.
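    For illustration, a minimal sketch of loading such an exported module directly with TensorFlow Hub. It assumes
    the exported module exposes the standard ELMo hub signatures (whitespace-joined token strings as the ``default``
    input and an ``elmo`` output); the ELMoEmbedder route is shown in the Examples section below:

    .. code:: python

        import tensorflow as tf
        import tensorflow_hub as hub

        spec = '{MODELS_PATH}/elmo_model/saves/hubs/tf_hub_model_epoch_n_1/'  # path to an exported epoch
        elmo = hub.Module(spec, trainable=False)
        embeddings = elmo(['вопрос жизни Вселенной и вообще всего', '42'],
                          signature='default', as_dict=True)['elmo']

        with tf.Session() as sess:
            sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
            print(sess.run(embeddings).shape)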
    You can learn more about the ELMo model from the `original ELMo implementation
    <https://github.com/allenai/bilm-tf>`__.

    If some required packages are missing, install all the requirements by running this in the command line:

    .. code:: bash

        python -m deeppavlov install <path_to_config>

    where ``<path_to_config>`` is a path to one of the :config:`provided config files <elmo_embedder>` or its name
    without an extension, for example:

    .. code:: bash

        python -m deeppavlov install elmo_1b_benchmark_test

    Examples:
        For a quick start, you can run a test training of the test model on small data with this command from bash:

        .. code:: bash

            python -m deeppavlov train deeppavlov/configs/elmo/elmo_1b_benchmark_test.json -d

        To download the prepared `1 Billion Word Benchmark dataset <http://www.statmt.org/lm-benchmark/>`__ and start
        training a model, use this command from bash:

        .. note::

            You need to download about **2 GB** of data; by default, about **10 GB** of RAM and **10 GB** of GPU
            memory are required to run :config:`elmo_1b_benchmark <elmo/elmo_1b_benchmark.json>` on one GPU.

        .. code:: bash

            python -m deeppavlov train deeppavlov/configs/elmo/elmo_1b_benchmark.json -d
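        The same training can also be launched from Python instead of the command line; a minimal sketch, assuming a
        working DeepPavlov installation and the config shipped with the library:

        .. code:: python

            from deeppavlov import train_model

            # equivalent to `python -m deeppavlov train ... -d`: downloads the data
            # required by the config and runs training
            train_model('deeppavlov/configs/elmo/elmo_1b_benchmark.json', download=True)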
        To fine-tune ELMo as an LM model on the `1 Billion Word Benchmark dataset
        <http://www.statmt.org/lm-benchmark/>`__ use these commands from bash:

        .. code:: bash

            # download the prepared 1 Billion Word Benchmark dataset
            python -m deeppavlov download deeppavlov/configs/elmo/elmo_1b_benchmark.json
            # copy model checkpoint, network configuration and vocabulary of the pre-trained LM model
            mkdir -p ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0
            cp my_ckpt.data-00000-of-00001 ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0/model.data-00000-of-00001
            cp my_ckpt.index ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0/model.index
            cp my_ckpt.meta ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0/model.meta
            cp checkpoint ${MODELS_PATH}/elmo-1b-benchmark/saves/epochs/0/checkpoint
            cp my_options.json ${MODELS_PATH}/elmo-1b-benchmark/options.json
            cp my_vocab ${MODELS_PATH}/elmo-1b-benchmark/vocab-2016-09-10.txt
            # start fine-tuning
            python -m deeppavlov train deeppavlov/configs/elmo/elmo_1b_benchmark.json

        After training you can use the ELMo model from the tf_hub wrapper with `TensorFlow Hub
        <https://www.tensorflow.org/hub/overview>`__ or with the DeepPavlov
        :class:`~deeppavlov.models.embedders.elmo_embedder.ELMoEmbedder`:

        >>> from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
        >>> spec = f"{MODELS_PATH}/elmo-1b-benchmark_test/saves/hubs/tf_hub_model_epoch_n_1/"
        >>> elmo = ELMoEmbedder(spec)
        >>> elmo([['вопрос', 'жизни', 'Вселенной', 'и', 'вообще', 'всего'], ['42']])
        array([[ 0.00719104,  0.08544601, -0.07179783, ...,  0.10879009, -0.18630421, -0.2189409 ],
               [ 0.16325025, -0.04736076,  0.12354863, ..., -0.1889013 ,  0.04972512,  0.83029324]], dtype=float32)

    """

    def __init__(self,
                 options_json_path: Optional[str] = None,  # Configure by json file
                 char_cnn: Optional[dict] = None,  # Net architecture by direct params; overwrites the json arch.
                 bidirectional: Optional[bool] = None,
                 unroll_steps: Optional[int] = None,
                 n_tokens_vocab: Optional[int] = None,
                 lstm: Optional[dict] = None,
                 dropout: Optional[float] = None,  # Regularization
                 n_negative_samples_batch: Optional[int] = None,  # Train options
                 all_clip_norm_val: Optional[float] = None,
                 initial_accumulator_value: float = 1.0,
                 learning_rate: float = 2e-1,  # For AdagradOptimizer
                 n_gpus: int = 1,  # TODO: Add cpu support
                 seed: Optional[int] = None,  # Other
                 batch_size: int = 128,  # Data params
                 load_epoch_num: Optional[int] = None,
                 epoch_load_path: str = 'epochs',
                 epoch_save_path: Optional[str] = None,
                 dumps_save_path: str = 'dumps',
                 tf_hub_save_path: str = 'hubs',
                 **kwargs) -> None:

        # ================ Checking input args =================
        if not (options_json_path or (char_cnn and bidirectional and unroll_steps
                                      and n_tokens_vocab and lstm and dropout and
                                      n_negative_samples_batch and all_clip_norm_val)):
            raise Warning('Use options_json_path or/and direct params to set net architecture.')

        self.options = self._load_options(options_json_path)
        self._update_arch_options(char_cnn, bidirectional, unroll_steps, n_tokens_vocab, lstm)
        self._update_other_options(dropout, n_negative_samples_batch, all_clip_norm_val)

        # Special options
        self.options['learning_rate'] = learning_rate
        self.options['initial_accumulator_value'] = initial_accumulator_value
        self.options['seed'] = seed
        self.options['n_gpus'] = n_gpus
        self.options['batch_size'] = batch_size

        self.permanent_options = self.options

        self.train_options = {}
        self.valid_options = {'batch_size': 256, 'unroll_steps': 1, 'n_gpus': 1}
        self.model_mode = ''

        tf.set_random_seed(seed)
        np.random.seed(seed)

        super().__init__(**kwargs)

        self.epoch_load_path = epoch_load_path

        if load_epoch_num is None:
            load_epoch_num = self._get_epoch_from(self.epoch_load_path, None)

        if epoch_save_path is None:
            self.epoch_save_path = self.epoch_load_path
        else:
            self.epoch_save_path = epoch_save_path

        self.save_epoch_num = self._get_epoch_from(self.epoch_save_path)

        self.dumps_save_path = dumps_save_path
        self.tf_hub_save_path = tf_hub_save_path

        self._build_model(train=False, epoch=load_epoch_num)

        self.save()
        # after building the model and saving it to the specified save path,
        # change load_path so that intermediate checkpoints are loaded from there
        self.load_path = self.save_path

    def _load_options(self, options_json_path):
        if options_json_path:
            options_json_path = expand_path(options_json_path)
            with open(options_json_path, 'r') as fin:
                options = json.load(fin)
        else:
            options = {}
        return options

    def _update_arch_options(self, char_cnn, bidirectional, unroll_steps, n_tokens_vocab, lstm):
        if char_cnn is not None:
            self.options['char_cnn'] = char_cnn
        if bidirectional is not None:
            self.options['bidirectional'] = bidirectional
        if unroll_steps is not None:
            self.options['unroll_steps'] = unroll_steps
        if n_tokens_vocab is not None:
            self.options['n_tokens_vocab'] = n_tokens_vocab
        if lstm is not None:
            self.options['lstm'] = lstm

    def _update_other_options(self, dropout, n_negative_samples_batch, all_clip_norm_val):
        if dropout is not None:
            self.options['dropout'] = dropout
        if n_negative_samples_batch is not None:
            self.options['n_negative_samples_batch'] = n_negative_samples_batch
        if all_clip_norm_val is not None:
            self.options['all_clip_norm_val'] = all_clip_norm_val

    def _get_epoch_from(self, epoch_load_path, default=0):
        path = self.load_path
        path = path.parent / epoch_load_path
        candidates = path.resolve().glob('[0-9]*')
        candidates = list(safely_str2int(i.parts[-1]) for i in candidates
                          if safely_str2int(i.parts[-1]) is not None)
        epoch_num = max(candidates, default=default)
        return epoch_num

    def _build_graph(self, graph, train=True):
        with graph.as_default():
            with tf.device('/cpu:0'):
                init_step = 0
                global_step = tf.get_variable(
                    'global_step', [],
                    initializer=tf.constant_initializer(init_step), trainable=False)
                self.global_step = global_step
                # set up the optimizer
                opt = tf.train.AdagradOptimizer(
                    learning_rate=self.options['learning_rate'],
                    initial_accumulator_value=self.options['initial_accumulator_value'])

                # calculate the gradients on each GPU
                tower_grads = []
                models = []
                loss = tf.get_variable(
                    'train_perplexity', [],
                    initializer=tf.constant_initializer(0.0), trainable=False)
                for k in range(self.options['n_gpus']):
                    with tf.device('/gpu:%d' % k):
                        with tf.variable_scope('lm', reuse=k > 0):
                            # calculate the loss for one model replica and get
                            # lstm states
                            model = LanguageModel(self.options, True)
                            total_train_loss = model.total_train_loss
                            total_eval_loss = model.total_eval_loss
                            models.append(model)
                            # get gradients
                            grads = opt.compute_gradients(
                                tf.reduce_mean(total_train_loss) * self.options['unroll_steps'],
                                aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE,
                            )
                            tower_grads.append(grads)
                            # keep track of loss across all GPUs
                            if train:
                                loss += total_train_loss
                            else:
                                loss += total_eval_loss

                # calculate the mean of each gradient across all GPUs
                grads = average_gradients(tower_grads, self.options['batch_size'], self.options)
                grads, _ = clip_grads(grads, self.options, True, global_step)
                loss = loss / self.options['n_gpus']
                train_op = opt.apply_gradients(grads, global_step=global_step)
        return models, train_op, loss, graph

    def _init_session(self):
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=sess_config)
        self.sess.run(tf.global_variables_initializer())

        batch_size = self.options['batch_size']
        unroll_steps = self.options['unroll_steps']

        # get the initial lstm states
        init_state_tensors = []
        final_state_tensors = []
        for model in self.models:
            init_state_tensors.extend(model.init_lstm_state)
            final_state_tensors.extend(model.final_lstm_state)

        char_inputs = 'char_cnn' in self.options
        if char_inputs:
            max_chars = self.options['char_cnn']['max_characters_per_token']

        if not char_inputs:
            feed_dict = {
                model.token_ids: np.zeros([batch_size, unroll_steps], dtype=np.int64)
                for model in self.models
            }
        else:
            feed_dict = {
                model.tokens_characters: np.zeros([batch_size, unroll_steps, max_chars], dtype=np.int32)
                for model in self.models
            }

        if self.options['bidirectional']:
            if not char_inputs:
                feed_dict.update({
                    model.token_ids_reverse: np.zeros([batch_size, unroll_steps], dtype=np.int64)
                    for model in self.models
                })
            else:
                feed_dict.update({
                    model.tokens_characters_reverse: np.zeros([batch_size, unroll_steps, max_chars], dtype=np.int32)
                    for model in self.models
                })

        init_state_values = self.sess.run(init_state_tensors, feed_dict=feed_dict)

        return init_state_values, init_state_tensors, final_state_tensors

    def _fill_feed_dict(self,
                        char_ids_batches,
                        reversed_char_ids_batches,
                        token_ids_batches=None,
                        reversed_token_ids_batches=None):
        # init state tensors
        feed_dict = {t: v for t, v in zip(self.init_state_tensors, self.init_state_values)}

        for k, model in enumerate(self.models):
            start = k * self.options['batch_size']
            end = (k + 1) * self.options['batch_size']

            # character inputs
            char_ids = char_ids_batches[start:end]  # get char_ids
            feed_dict[model.tokens_characters] = char_ids

            if self.options['bidirectional']:
                feed_dict[model.tokens_characters_reverse] = \
                    reversed_char_ids_batches[start:end]  # get tokens_characters_reverse

            if token_ids_batches is not None:
                feed_dict[model.next_token_id] = token_ids_batches[start:end]  # get next_token_id
                if self.options['bidirectional']:
                    feed_dict[model.next_token_id_reverse] = \
                        reversed_token_ids_batches[start:end]  # get next_token_id_reverse

        return feed_dict

    def __call__(self, x, y, *args, **kwargs) -> List[float]:
        if len(args) != 0:
            return []
        char_ids_batches, reversed_char_ids_batches = x
        token_ids_batches, reversed_token_ids_batches = y
        feed_dict = self._fill_feed_dict(char_ids_batches, reversed_char_ids_batches,
                                         token_ids_batches, reversed_token_ids_batches)
        with self.graph.as_default():
            loss, self.init_state_values = self.sess.run([self.loss, self.final_state_tensors], feed_dict)
        return loss

    @overrides
    def load(self, epoch: Optional[int] = None) -> None:
        """Load model parameters from self.load_path"""
        path = self.load_path

        if epoch is not None:
            path = path.parent / self.epoch_save_path / str(epoch) / path.parts[-1]
            path.resolve()
            log.info(f'[loading {epoch} epoch]')

        # path.parent.mkdir(parents=True, exist_ok=True)
        path = str(path)

        # Check presence of the model files
        if tf.train.checkpoint_exists(path):
            log.info(f'[loading model from {path}]')
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess, path)
        else:
            log.info(f'[A checkpoint not found in {path}]')

    @overrides
    def save(self, epoch: Optional[int] = None) -> None:
        """Save model parameters to self.save_path"""
        path = self.save_path

        if epoch is not None:
            path = path.parent / self.epoch_save_path / str(epoch) / path.parts[-1]
            path.resolve()
            log.info(f'[saving {epoch} epoch]')

        path.parent.mkdir(parents=True, exist_ok=True)
        path = str(path)

        log.info(f'[saving model to {path}]')
        with self.graph.as_default():
            saver = tf.train.Saver()
            saver.save(self.sess, path)

    def train_on_batch(self,
                       x_char_ids: list,
                       y_token_ids: list) -> List[float]:
        """
        This
        method is called by the trainer to make one training step on one batch.

        Args:
            x_char_ids: a batch of char_ids
            y_token_ids: a batch of token_ids

        Returns:
            value of loss function on batch
        """
        char_ids_batches, reversed_char_ids_batches = x_char_ids
        token_ids_batches, reversed_token_ids_batches = y_token_ids

        feed_dict = self._fill_feed_dict(char_ids_batches, reversed_char_ids_batches,
                                         token_ids_batches, reversed_token_ids_batches)

        with self.graph.as_default():
            loss, _, self.init_state_values = self.sess.run([self.loss, self.train_op, self.final_state_tensors],
                                                            feed_dict)

        return np.mean(loss)

    def _build_model(self, train: bool, epoch: Optional[int] = None, **kwargs):

        if hasattr(self, 'sess'):
            self.sess.close()

        self.options = copy.deepcopy(self.permanent_options)

        if train:
            self.options.update(self.train_options)
            self.options.update(kwargs)

            self.models, self.train_op, self.loss, self.graph = self._build_graph(tf.Graph())
        else:
            self.options.update(self.valid_options)
            self.options.update(kwargs)

            self.models, self.train_op, self.loss, self.graph = self._build_graph(tf.Graph(), train=False)

        with self.graph.as_default():
            self.init_state_values, self.init_state_tensors, self.final_state_tensors = \
                self._init_session()
        self.load(epoch)

    def process_event(self, event_name, data):
        if event_name == 'before_train' and self.model_mode != 'train':
            self._build_model(train=True)
            self.model_mode = 'train'
        elif event_name == 'before_validation' and self.model_mode != 'validation':
            epoch = self.save_epoch_num + int(data['epochs_done'])
            self.save(epoch)
            self.save()
            self.elmo_export(epoch)

            self._build_model(train=False)
            self.model_mode = 'validation'

    def elmo_export(self, epoch: Optional[int] = None) -> None:
        """
        Dump the trained weights from a model to a HDF5 file and export a TF-Hub module.
        """
        if hasattr(self, 'sess'):
            self.sess.close()
        path = self.save_path
        if epoch:
            from_path = path.parent / self.epoch_save_path / str(epoch) / path.parts[-1]
            weights_to_path = path.parent / self.dumps_save_path / f'weights_epoch_n_{epoch}.hdf5'
            tf_hub_to_path = path.parent / self.tf_hub_save_path / f'tf_hub_model_epoch_n_{epoch}'
            from_path.resolve()
            weights_to_path.resolve()
            tf_hub_to_path.resolve()
            log.info(f'[exporting {epoch} epoch]')
        else:
            from_path = path
            weights_to_path = path.parent / self.dumps_save_path / 'weights.hdf5'
            tf_hub_to_path = path.parent / self.tf_hub_save_path / 'tf_hub_model'

        weights_to_path.parent.mkdir(parents=True, exist_ok=True)
        tf_hub_to_path.parent.mkdir(parents=True, exist_ok=True)

        # Check presence of the model files
        if tf.train.checkpoint_exists(str(from_path)):
            dump_weights(from_path.parent, weights_to_path, self.permanent_options)

            options = copy.deepcopy(self.permanent_options)
            options['char_cnn']['n_characters'] = 262
            export2hub(weights_to_path, tf_hub_to_path, options)

    def destroy(self) -> None:
        """
        Delete model from memory

        Returns:
            None
        """
        if hasattr(self, 'sess'):
            for k in list(self.sess.graph.get_all_collection_keys()):
                self.sess.graph.clear_collection(k)
        super().destroy()