Source code for deeppavlov.models.bert.bert_as_summarizer

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from logging import getLogger
from typing import List, Optional

import numpy as np
import tensorflow.compat.v1 as tf

from bert_dp.modeling import BertConfig, BertModel, create_initializer, get_assignment_map_from_checkpoint
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.tf_model import TFModel
from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor

logger = getLogger(__name__)


@register('bert_as_summarizer')
class BertAsSummarizer(TFModel):
    """Naive Extractive Summarization model based on BERT.

    The BERT model was trained on the Masked Language Modeling (MLM) and Next Sentence Prediction (NSP) tasks.
    The NSP head was trained to detect, for ``[CLS] text_a [SEP] text_b [SEP]``, whether text_b follows text_a
    in the original document.

    This NSP head can be used to stack sentences from a long document, starting from an initial sentence:

    summary_0 = init_sentence

    summary_1 = summary_0 + argmax(nsp_score(candidates))

    summary_2 = summary_1 + argmax(nsp_score(candidates))

    ...

    where candidates are all sentences from the document.

    Args:
        bert_config_file: path to Bert configuration file
        pretrained_bert: path to pretrained Bert checkpoint
        vocab_file: path to Bert vocabulary
        max_summary_length: limit on summary length; the number of sentences is used if
            ``max_summary_length_in_tokens`` is set to False, else the number of tokens is used.
        max_summary_length_in_tokens: use the number of tokens as the length of the summary.
            Defaults to ``False``.
        max_seq_length: max sequence length in subtokens, including ``[SEP]`` and ``[CLS]`` tokens.
            ``max_seq_length`` is used in Bert to compute NSP scores. Defaults to ``128``.
        do_lower_case: set ``True`` if lowercasing is needed. Defaults to ``False``.
        lang: use ru_sent_tokenize for 'ru' and nltk.sent_tokenize for other languages. Defaults to ``'ru'``.
    """

    def __init__(self, bert_config_file: str,
                 pretrained_bert: str,
                 vocab_file: str,
                 max_summary_length: int,
                 max_summary_length_in_tokens: Optional[bool] = False,
                 max_seq_length: Optional[int] = 128,
                 do_lower_case: Optional[bool] = False,
                 lang: Optional[str] = 'ru',
                 **kwargs) -> None:
        self.max_summary_length = max_summary_length
        self.max_summary_length_in_tokens = max_summary_length_in_tokens
        self.bert_config = BertConfig.from_json_file(str(expand_path(bert_config_file)))
        self.bert_preprocessor = BertPreprocessor(vocab_file=vocab_file, do_lower_case=do_lower_case,
                                                  max_seq_length=max_seq_length)

        # simple regexp tokenizer, used only to count tokens when limiting summary length
        self.tokenize_reg = re.compile(r"[\w']+|[^\w ]")

        if lang == 'ru':
            from ru_sent_tokenize import ru_sent_tokenize
            self.sent_tokenizer = ru_sent_tokenize
        else:
            from nltk import sent_tokenize
            self.sent_tokenizer = sent_tokenize

        self.sess_config = tf.ConfigProto(allow_soft_placement=True)
        self.sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.sess_config)

        self._init_graph()

        self.sess.run(tf.global_variables_initializer())

        if pretrained_bert is not None:
            pretrained_bert = str(expand_path(pretrained_bert))

            if tf.train.checkpoint_exists(pretrained_bert):
                logger.info('[initializing model with Bert from {}]'.format(pretrained_bert))
                tvars = tf.trainable_variables()
                assignment_map, _ = get_assignment_map_from_checkpoint(tvars, pretrained_bert)
                tf.train.init_from_checkpoint(pretrained_bert, assignment_map)

    def _init_graph(self):
        self._init_placeholders()

        self.bert = BertModel(config=self.bert_config,
                              is_training=self.is_train_ph,
                              input_ids=self.input_ids_ph,
                              input_mask=self.input_masks_ph,
                              token_type_ids=self.token_types_ph,
                              use_one_hot_embeddings=False)

        # next sentence prediction head
        with tf.variable_scope("cls/seq_relationship"):
            output_weights = tf.get_variable(
                "output_weights",
                shape=[2, self.bert_config.hidden_size],
                initializer=create_initializer(self.bert_config.initializer_range))
            output_bias = tf.get_variable(
                "output_bias", shape=[2], initializer=tf.zeros_initializer())

        nsp_logits = tf.matmul(self.bert.get_pooled_output(), output_weights, transpose_b=True)
        nsp_logits = tf.nn.bias_add(nsp_logits, output_bias)
        self.nsp_probs = tf.nn.softmax(nsp_logits, axis=-1)

    def _init_placeholders(self):
        self.input_ids_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='ids_ph')
        self.input_masks_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='masks_ph')
        self.token_types_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='token_types_ph')
        self.is_train_ph = tf.placeholder_with_default(False, shape=[], name='is_train_ph')

    def _build_feed_dict(self, input_ids, input_masks, token_types):
        feed_dict = {
            self.input_ids_ph: input_ids,
            self.input_masks_ph: input_masks,
            self.token_types_ph: token_types,
        }
        return feed_dict
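
    # Note (added comment): each placeholder above holds a batch of shape
    # [batch_size, seq_length]; `token_types` follows the standard BERT segment-id
    # convention, marking text_a subtokens with 0 and text_b subtokens with 1,
    # which is the input format the NSP head was pretrained on.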

    def _get_nsp_predictions(self, sentences: List[str], candidates: List[str]):
        """Compute the NextSentence probability for every (sentence_i, candidate_i) pair.

        Each pair is encoded as ``[CLS] sentence_i [SEP] candidate_i [SEP]``.

        Args:
            sentences: list of sentences
            candidates: list of candidates to be the next sentence

        Returns:
            probabilities that each candidate is the next sentence
        """
        features = self.bert_preprocessor(texts_a=sentences, texts_b=candidates)
        input_ids = [f.input_ids for f in features]
        input_masks = [f.input_mask for f in features]
        input_type_ids = [f.input_type_ids for f in features]
        feed_dict = self._build_feed_dict(input_ids, input_masks, input_type_ids)
        nsp_probs = self.sess.run(self.nsp_probs, feed_dict=feed_dict)
        # column 0 of the NSP softmax corresponds to the "is next sentence" class
        return nsp_probs[:, 0]

    def __call__(self, texts: List[str], init_sentences: Optional[List[str]] = None) -> List[List[str]]:
        """Builds a summary for each text in `texts`.

        Args:
            texts: texts to build summaries for
            init_sentences: optional initial sentences; each ``init_sentence`` is used as the first
                sentence of the corresponding summary. Defaults to None.

        Returns:
            List[List[str]]: summaries tokenized into sentences
        """
        summaries = []
        # build a summary for each (text, init_sentence) pair
        if init_sentences is None:
            init_sentences = [None] * len(texts)

        for text, init_sentence in zip(texts, init_sentences):
            text_sentences = self.sent_tokenizer(text)

            if init_sentence is None:
                init_sentence = text_sentences[0]
                text_sentences = text_sentences[1:]

            # remove duplicates
            text_sentences = list(set(text_sentences))
            # remove init_sentence from text sentences
            text_sentences = [sent for sent in text_sentences if sent != init_sentence]

            summary = [init_sentence]
            if self.max_summary_length_in_tokens:
                # measure summary length in tokens
                def get_length(x):
                    return len(self.tokenize_reg.findall(' '.join(x)))
            else:
                # measure summary length as the number of sentences
                get_length = len

            candidates = text_sentences[:]
            while len(candidates) > 0:
                # todo: use batches
                candidates_scores = [self._get_nsp_predictions([' '.join(summary)], [cand])
                                     for cand in candidates]
                best_candidate_idx = np.argmax(candidates_scores)
                best_candidate = candidates[best_candidate_idx]
                del candidates[best_candidate_idx]
                if get_length(summary + [best_candidate]) > self.max_summary_length:
                    break
                summary = summary + [best_candidate]
            summaries += [summary]
        return summaries
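
    # Added illustration of the greedy loop above: with init_sentence S0 and candidates
    # {A, B, C}, the first pass scores nsp(S0, A), nsp(S0, B), nsp(S0, C) and appends the
    # best-scoring candidate (say B); the next pass scores nsp(S0 + B, A) and nsp(S0 + B, C),
    # and so on, stopping as soon as adding the best candidate would exceed max_summary_length.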

    def train_on_batch(self, **kwargs):
        raise NotImplementedError
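

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# It shows how BertAsSummarizer might be instantiated and called directly from
# Python. The file paths and the sample text below are placeholders, not files
# shipped with DeepPavlov: point them at the config, checkpoint and vocabulary
# of the BERT model you actually use. With lang='en', nltk.sent_tokenize is
# used, which requires nltk's 'punkt' data (nltk.download('punkt')).
if __name__ == '__main__':
    summarizer = BertAsSummarizer(
        bert_config_file='path/to/bert_config.json',  # placeholder path
        pretrained_bert='path/to/bert_model.ckpt',    # placeholder path
        vocab_file='path/to/vocab.txt',               # placeholder path
        max_summary_length=3,  # keep each summary to at most 3 sentences
        lang='en')             # use nltk.sent_tokenize instead of ru_sent_tokenize

    summaries = summarizer(['A first sentence. A second sentence. A third sentence. A fourth sentence.'])
    print(summaries)  # one summary per input text; each summary is a list of sentences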