## @package translate
# Module caffe2.python.models.seq2seq.translate
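#
# Example invocation (illustrative file paths):
#   python -m caffe2.python.models.seq2seq.translate \
#       --source-corpus source.txt --target-corpus target.txt \
#       --checkpoint model.mdb --use-attention < input.txt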


from abc import ABCMeta, abstractmethod
import argparse
from future.utils import viewitems
import logging
import numpy as np
import sys

from caffe2.python import core, rnn_cell, workspace
from caffe2.python.models.seq2seq.beam_search import BeamSearchForwardOnly
from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper
import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))


def _weighted_sum(model, values, weight, output_name):
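    # model.net.WeightedSum expects interleaved inputs
    # [value_0, weight_0, value_1, weight_1, ...]; every value here shares
    # the same scalar weight blob.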
    values_weights = zip(values, [weight] * len(values))
    values_weights_flattened = [x for v_w in values_weights for x in v_w]
    return model.net.WeightedSum(
        values_weights_flattened,
        output_name,
    )


class Seq2SeqModelCaffe2EnsembleDecoderBase(metaclass=ABCMeta):

    @abstractmethod
    def get_model_file(self, model):
        pass

    @abstractmethod
    def get_db_type(self):
        pass

    def build_word_rewards(self, vocab_size, word_reward, unk_reward):
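        # Special tokens get no per-word reward; UNK additionally receives
        # unk_reward (typically negative) on top of the base word_reward.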
        word_rewards = np.full([vocab_size], word_reward, dtype=np.float32)
        word_rewards[seq2seq_util.PAD_ID] = 0
        word_rewards[seq2seq_util.GO_ID] = 0
        word_rewards[seq2seq_util.EOS_ID] = 0
        word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward
        return word_rewards

    def load_models(self):
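        # Load each ensemble member's checkpoint into its own name scope;
        # the same reader blob is recreated for every model.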
        db_reader = 'reader'
        for model, scope_name in zip(
            self.models,
            self.decoder_scope_names,
        ):
            params_for_current_model = [
                param
                for param in self.model.GetAllParams()
                if str(param).startswith(scope_name)
            ]
            assert workspace.RunOperatorOnce(core.CreateOperator(
                'CreateDB',
                [], [db_reader],
                db=self.get_model_file(model),
                db_type=self.get_db_type(),
            )), 'Failed to create db {}'.format(self.get_model_file(model))
            assert workspace.RunOperatorOnce(core.CreateOperator(
                'Load',
                [db_reader],
                params_for_current_model,
                load_all=1,
                add_prefix=scope_name + '/',
                strip_prefix='gpu_0/',
            ))
            logger.info('Model {} loaded from checkpoint {}'.format(
                scope_name, self.get_model_file(model)))


class Seq2SeqModelCaffe2EnsembleDecoder(Seq2SeqModelCaffe2EnsembleDecoderBase):

    def get_model_file(self, model):
        return model['model_file']

    def get_db_type(self):
        return 'minidb'

    def scope(self, scope_name, blob_name):
        return (
            scope_name + '/' + blob_name
            if scope_name is not None
            else blob_name
        )

    def _build_decoder(
        self,
        model,
        step_model,
        model_params,
        scope,
        previous_tokens,
        timestep,
        fake_seq_lengths,
    ):
        attention_type = model_params['attention']
        assert attention_type in ['none', 'regular']
        use_attention = (attention_type != 'none')

        with core.NameScope(scope):
            encoder_embeddings = seq2seq_util.build_embeddings(
                model=model,
                vocab_size=self.source_vocab_size,
                embedding_size=model_params['encoder_embedding_size'],
                name='encoder_embeddings',
                freeze_embeddings=False,
            )

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_states,
            final_encoder_cell_states,
            encoder_units_per_layer,
        ) = seq2seq_util.build_embedding_encoder(
            model=model,
            encoder_params=model_params['encoder_type'],
            num_decoder_layers=len(model_params['decoder_layer_configs']),
            inputs=self.encoder_inputs,
            input_lengths=self.encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=encoder_embeddings,
            embedding_size=model_params['encoder_embedding_size'],
            use_attention=use_attention,
            num_gpus=0,
            forward_only=True,
            scope=scope,
        )
        with core.NameScope(scope):
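            # Tile encoder-side blobs along the batch axis so that all
            # beam_size hypotheses attend over the same source sequence.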
            if use_attention:
                # [max_source_length, beam_size, encoder_output_dim]
                encoder_outputs = model.net.Tile(
                    encoder_outputs,
                    'encoder_outputs_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )

            if weighted_encoder_outputs is not None:
                weighted_encoder_outputs = model.net.Tile(
                    weighted_encoder_outputs,
                    'weighted_encoder_outputs_tiled',
                    tiles=self.beam_size,
                    axis=1,
                )

            decoder_embeddings = seq2seq_util.build_embeddings(
                model=model,
                vocab_size=self.target_vocab_size,
                embedding_size=model_params['decoder_embedding_size'],
                name='decoder_embeddings',
                freeze_embeddings=False,
            )
            embedded_tokens_t_prev = step_model.net.Gather(
                [decoder_embeddings, previous_tokens],
                'embedded_tokens_t_prev',
            )

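        # Build the stacked decoder cells: layer 0 consumes the token
        # embedding, and every later layer consumes the output of the layer
        # below it.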
        decoder_cells = []
        decoder_units_per_layer = []
        for i, layer_config in enumerate(model_params['decoder_layer_configs']):
            num_units = layer_config['num_units']
            decoder_units_per_layer.append(num_units)
            if i == 0:
                input_size = model_params['decoder_embedding_size']
            else:
                input_size = (
                    model_params['decoder_layer_configs'][i - 1]['num_units']
                )

            cell = rnn_cell.LSTMCell(
                forward_only=True,
                input_size=input_size,
                hidden_size=num_units,
                forget_bias=0.0,
                memory_optimization=False,
            )
            decoder_cells.append(cell)

        with core.NameScope(scope):
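            # As with the encoder outputs above, tile the final encoder
            # states so every hypothesis starts from the same initial state.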
            if final_encoder_hidden_states is not None:
                for i in range(len(final_encoder_hidden_states)):
                    if final_encoder_hidden_states[i] is not None:
                        final_encoder_hidden_states[i] = model.net.Tile(
                            final_encoder_hidden_states[i],
                            'final_encoder_hidden_tiled_{}'.format(i),
                            tiles=self.beam_size,
                            axis=1,
                        )
            if final_encoder_cell_states is not None:
                for i in range(len(final_encoder_cell_states)):
                    if final_encoder_cell_states[i] is not None:
                        final_encoder_cell_states[i] = model.net.Tile(
                            final_encoder_cell_states[i],
                            'final_encoder_cell_tiled_{}'.format(i),
                            tiles=self.beam_size,
                            axis=1,
                        )
            initial_states = seq2seq_util.build_initial_rnn_decoder_states(
                model=model,
                encoder_units_per_layer=encoder_units_per_layer,
                decoder_units_per_layer=decoder_units_per_layer,
                final_encoder_hidden_states=final_encoder_hidden_states,
                final_encoder_cell_states=final_encoder_cell_states,
                use_attention=use_attention,
            )

        attention_decoder = seq2seq_util.LSTMWithAttentionDecoder(
            encoder_outputs=encoder_outputs,
            encoder_output_dim=encoder_units_per_layer[-1],
            encoder_lengths=None,
            vocab_size=self.target_vocab_size,
            attention_type=attention_type,
            embedding_size=model_params['decoder_embedding_size'],
            decoder_num_units=decoder_units_per_layer[-1],
            decoder_cells=decoder_cells,
            weighted_encoder_outputs=weighted_encoder_outputs,
            name=scope,
        )
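        # The step net reads each recurrent state from an external input
        # blob named '<scope>/<state_name>_prev'.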
        states_prev = step_model.net.AddExternalInputs(*[
            '{}/{}_prev'.format(scope, s)
            for s in attention_decoder.get_state_names()
        ])
        decoder_outputs, states = attention_decoder.apply(
            model=step_model,
            input_t=embedded_tokens_t_prev,
            seq_lengths=fake_seq_lengths,
            states=states_prev,
            timestep=timestep,
        )

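        # Link every recurrent state into the beam search step net: each
        # step reads the state at offset 0 and writes its successor at
        # offset 1, with a window of a single timestep.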
        state_configs = [
            BeamSearchForwardOnly.StateConfig(
                initial_value=initial_state,
                state_prev_link=BeamSearchForwardOnly.LinkConfig(
                    blob=state_prev,
                    offset=0,
                    window=1,
                ),
                state_link=BeamSearchForwardOnly.LinkConfig(
                    blob=state,
                    offset=1,
                    window=1,
                ),
            )
            for initial_state, state_prev, state in zip(
                initial_states,
                states_prev,
                states,
            )
        ]

        with core.NameScope(scope):
            decoder_outputs_flattened, _ = step_model.net.Reshape(
                [decoder_outputs],
                [
                    'decoder_outputs_flattened',
                    'decoder_outputs_and_contexts_combination_old_shape',
                ],
                shape=[-1, attention_decoder.get_output_dim()],
            )
            output_logits = seq2seq_util.output_projection(
                model=step_model,
                decoder_outputs=decoder_outputs_flattened,
                decoder_output_size=attention_decoder.get_output_dim(),
                target_vocab_size=self.target_vocab_size,
                decoder_softmax_size=model_params['decoder_softmax_size'],
            )
            # [1, beam_size, target_vocab_size]
            output_probs = step_model.net.Softmax(
                output_logits,
                'output_probs',
            )
            output_log_probs = step_model.net.Log(
                output_probs,
                'output_log_probs',
            )
            if use_attention:
                attention_weights = attention_decoder.get_attention_weights()
            else:
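                # Without attention there are no real weights; build an
                # all-zero tensor of the right shape so the ensemble
                # averaging below still gets one input per decoder.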
                attention_weights = step_model.net.ConstantFill(
                    [self.encoder_inputs],
                    'zero_attention_weights_tmp_1',
                    value=0.0,
                )
                attention_weights = step_model.net.Transpose(
                    attention_weights,
                    'zero_attention_weights_tmp_2',
                )
                attention_weights = step_model.net.Tile(
                    attention_weights,
                    'zero_attention_weights_tmp',
                    tiles=self.beam_size,
                    axis=0,
                )

        return (
            state_configs,
            output_log_probs,
            attention_weights,
        )

    def __init__(
        self,
        translate_params,
    ):
        self.models = translate_params['ensemble_models']
        decoding_params = translate_params['decoding_params']
        self.beam_size = decoding_params['beam_size']

        assert len(self.models) > 0
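        # All ensemble members must share vocabularies, since their per-token
        # log-probabilities are averaged over a common vocabulary index space.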
        source_vocab = self.models[0]['source_vocab']
        target_vocab = self.models[0]['target_vocab']
        for model in self.models:
            assert model['source_vocab'] == source_vocab
            assert model['target_vocab'] == target_vocab

        self.source_vocab_size = len(source_vocab)
        self.target_vocab_size = len(target_vocab)

        self.decoder_scope_names = [
            'model{}'.format(i) for i in range(len(self.models))
        ]

        self.model = Seq2SeqModelHelper(init_params=True)

        self.encoder_inputs = self.model.net.AddExternalInput('encoder_inputs')
        self.encoder_lengths = self.model.net.AddExternalInput(
            'encoder_lengths'
        )
        self.max_output_seq_len = self.model.net.AddExternalInput(
            'max_output_seq_len'
        )

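        # The step net processes one token per invocation, so real sequence
        # lengths are never needed; feed a large dummy length per hypothesis.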
        fake_seq_lengths = self.model.param_init_net.ConstantFill(
            [],
            'fake_seq_lengths',
            shape=[self.beam_size],
            value=100000,
            dtype=core.DataType.INT32,
        )

        beam_decoder = BeamSearchForwardOnly(
            beam_size=self.beam_size,
            model=self.model,
            go_token_id=seq2seq_util.GO_ID,
            eos_token_id=seq2seq_util.EOS_ID,
        )
        step_model = beam_decoder.get_step_model()

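        # Build one decoder sub-network per ensemble member; each contributes
        # recurrent state configs, per-step log-probs and, when it uses
        # attention, its attention weights.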
        state_configs = []
        output_log_probs = []
        attention_weights = []
        for model, scope_name in zip(
            self.models,
            self.decoder_scope_names,
        ):
            (
                state_configs_per_decoder,
                output_log_probs_per_decoder,
                attention_weights_per_decoder,
            ) = self._build_decoder(
                model=self.model,
                step_model=step_model,
                model_params=model['model_params'],
                scope=scope_name,
                previous_tokens=beam_decoder.get_previous_tokens(),
                timestep=beam_decoder.get_timestep(),
                fake_seq_lengths=fake_seq_lengths,
            )
            state_configs.extend(state_configs_per_decoder)
            output_log_probs.append(output_log_probs_per_decoder)
            if attention_weights_per_decoder is not None:
                attention_weights.append(attention_weights_per_decoder)

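        # Ensemble averaging: a WeightedSum with constant weight 1/N over the
        # N decoders' outputs computes their arithmetic mean.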
        assert len(attention_weights) > 0
        num_decoders_with_attention_blob = (
            self.model.param_init_net.ConstantFill(
                [],
                'num_decoders_with_attention_blob',
                value=1 / float(len(attention_weights)),
                shape=[1],
            )
        )
        # [beam_size, encoder_length, 1]
        attention_weights_average = _weighted_sum(
            model=step_model,
            values=attention_weights,
            weight=num_decoders_with_attention_blob,
            output_name='attention_weights_average',
        )

        num_decoders_blob = self.model.param_init_net.ConstantFill(
            [],
            'num_decoders_blob',
            value=1 / float(len(output_log_probs)),
            shape=[1],
        )
        # [beam_size, target_vocab_size]
        output_log_probs_average = _weighted_sum(
            model=step_model,
            values=output_log_probs,
            weight=num_decoders_blob,
            output_name='output_log_probs_average',
        )
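        # Placeholder blob; the actual rewards from build_word_rewards are
        # fed in below, after param_init_net has run.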
        word_rewards = self.model.param_init_net.ConstantFill(
            [],
            'word_rewards',
            shape=[self.target_vocab_size],
            value=0.0,
            dtype=core.DataType.FLOAT,
        )
        (
            self.output_token_beam_list,
            self.output_prev_index_beam_list,
            self.output_score_beam_list,
            self.output_attention_weights_beam_list,
        ) = beam_decoder.apply(
            inputs=self.encoder_inputs,
            length=self.max_output_seq_len,
            log_probs=output_log_probs_average,
            attentions=attention_weights_average,
            state_configs=state_configs,
            data_dependencies=[],
            word_rewards=word_rewards,
        )

        workspace.RunNetOnce(self.model.param_init_net)
        workspace.FeedBlob(
            'word_rewards',
            self.build_word_rewards(
                vocab_size=self.target_vocab_size,
                word_reward=translate_params['decoding_params']['word_reward'],
                unk_reward=translate_params['decoding_params']['unk_reward'],
            ),
        )

        workspace.CreateNet(
            self.model.net,
            input_blobs=[
                str(self.encoder_inputs),
                str(self.encoder_lengths),
                str(self.max_output_seq_len),
            ],
        )

        logger.info('Params created:')
        for param in self.model.params:
            logger.info(param)

    def decode(self, numberized_input, max_output_seq_len):
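        # The source sentence is fed in reverse order; attention weights are
        # flipped back to source order at the end of this method.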
        workspace.FeedBlob(
            self.encoder_inputs,
            np.array([
                [token_id] for token_id in reversed(numberized_input)
            ]).astype(dtype=np.int32),
        )
        workspace.FeedBlob(
            self.encoder_lengths,
            np.array([len(numberized_input)]).astype(dtype=np.int32),
        )
        workspace.FeedBlob(
            self.max_output_seq_len,
            np.array([max_output_seq_len]).astype(dtype=np.int64),
        )

        workspace.RunNet(self.model.net)

        num_steps = max_output_seq_len
        score_beam_list = workspace.FetchBlob(self.output_score_beam_list)
        token_beam_list = workspace.FetchBlob(self.output_token_beam_list)
        prev_index_beam_list = workspace.FetchBlob(
            self.output_prev_index_beam_list
        )
        attention_weights_beam_list = workspace.FetchBlob(
            self.output_attention_weights_beam_list
        )
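        # Pick the best-scoring hypothesis: one that emitted EOS at some step,
        # or any hypothesis alive at the final step.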
        best_indices = (num_steps, 0)
        for i in range(num_steps + 1):
            for hyp_index in range(self.beam_size):
                if (
                    (
                        token_beam_list[i][hyp_index][0] ==
                        seq2seq_util.EOS_ID or
                        i == num_steps
                    ) and
                    (
                        score_beam_list[i][hyp_index][0] >
                        score_beam_list[best_indices[0]][best_indices[1]][0]
                    )
                ):
                    best_indices = (i, hyp_index)

        i, hyp_index = best_indices
        output = []
        attention_weights_per_token = []
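        # Note the sign flip: the reported score negates the stored beam score.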
        best_score = -score_beam_list[i][hyp_index][0]
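        # Walk the backpointers from the chosen endpoint down to step 1,
        # collecting output tokens and per-step attention along the way.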
        while i > 0:
            output.append(token_beam_list[i][hyp_index][0])
            attention_weights_per_token.append(
                attention_weights_beam_list[i][hyp_index]
            )
            hyp_index = prev_index_beam_list[i][hyp_index][0]
            i -= 1

        attention_weights_per_token = reversed(attention_weights_per_token)
        # encoder_inputs were fed reversed (see the FeedBlob call above),
        # so flip each attention vector back into source order
        attention_weights_per_token = [
            list(reversed(attention_weights))[:len(numberized_input)]
            for attention_weights in attention_weights_per_token
        ]
        output = list(reversed(output))
        return output, attention_weights_per_token, best_score


def run_seq2seq_beam_decoder(args, model_params, decoding_params):
    source_vocab = seq2seq_util.gen_vocab(
        args.source_corpus,
        args.unk_threshold,
    )
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    target_vocab = seq2seq_util.gen_vocab(
        args.target_corpus,
        args.unk_threshold,
    )
    inversed_target_vocab = {v: k for (k, v) in viewitems(target_vocab)}
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    decoder = Seq2SeqModelCaffe2EnsembleDecoder(
        translate_params=dict(
            ensemble_models=[dict(
                source_vocab=source_vocab,
                target_vocab=target_vocab,
                model_params=model_params,
                model_file=args.checkpoint,
            )],
            decoding_params=decoding_params,
        ),
    )
    decoder.load_models()

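    # Translate stdin line by line, capping output length with the common
    # 2 * source_length + 5 heuristic.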
    for line in sys.stdin:
        numberized_source_sentence = seq2seq_util.get_numberized_sentence(
            line,
            source_vocab,
        )
        translation, alignment, _ = decoder.decode(
            numberized_source_sentence,
            2 * len(numberized_source_sentence) + 5,
        )
        print(' '.join([inversed_target_vocab[tid] for tid in translation]))


def main():
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Translation',
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to source corpus in a text file format. '
                        'Each line in the file should contain a single '
                        'sentence',
                        required=True)
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to target corpus in a text file format',
                        required=True)
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Frequency threshold below which a token is '
                        'replaced with the unknown token')

    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set flag to use a bidirectional recurrent '
                        'network in the encoder')
    parser.add_argument('--use-attention', action='store_true',
                        help='Set flag to use a seq2seq model with attention')
    parser.add_argument('--encoder-cell-num-units', type=int, default=512,
                        help='Number of cell units per encoder layer')
    parser.add_argument('--encoder-num-layers', type=int, default=2,
                        help='Number of encoder layers')
    parser.add_argument('--decoder-cell-num-units', type=int, default=512,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--decoder-num-layers', type=int, default=2,
                        help='Number of decoder layers')
    parser.add_argument('--encoder-embedding-size', type=int, default=256,
                        help='Size of embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=512,
                        help='Size of embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=None,
                        help='Size of softmax layer in the decoder')

    parser.add_argument('--beam-size', type=int, default=6,
                        help='Size of beam for the decoder')
    parser.add_argument('--word-reward', type=float, default=0.0,
                        help='Reward for each word generated')
    parser.add_argument('--unk-reward', type=float, default=0.0,
                        help='Reward for each UNK token generated. '
                        'Typically should be negative')

    parser.add_argument('--checkpoint', type=str, default=None,
                        help='Path to checkpoint', required=True)

    args = parser.parse_args()

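    # Use a comprehension so each layer gets its own config dict; a
    # [dict(...)] * n list would alias a single shared dict, making the
    # bidirectional halving below apply to every layer at once.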
    encoder_layer_configs = [
        dict(num_units=args.encoder_cell_num_units)
        for _ in range(args.encoder_num_layers)
    ]

    if args.use_bidirectional_encoder:
        assert args.encoder_cell_num_units % 2 == 0
        # Integer division: each direction of the bidirectional first layer
        # gets half of the units
        encoder_layer_configs[0]['num_units'] //= 2

    decoder_layer_configs = [
        dict(num_units=args.decoder_cell_num_units)
        for _ in range(args.decoder_num_layers)
    ]

    run_seq2seq_beam_decoder(
        args,
        model_params=dict(
            attention=('regular' if args.use_attention else 'none'),
            decoder_layer_configs=decoder_layer_configs,
            encoder_type=dict(
                encoder_layer_configs=encoder_layer_configs,
                use_bidirectional_encoder=args.use_bidirectional_encoder,
            ),
            encoder_embedding_size=args.encoder_embedding_size,
            decoder_embedding_size=args.decoder_embedding_size,
            decoder_softmax_size=args.decoder_softmax_size,
        ),
        decoding_params=dict(
            beam_size=args.beam_size,
            word_reward=args.word_reward,
            unk_reward=args.unk_reward,
        ),
    )


if __name__ == '__main__':
    main()