From 9ccfde9f90e561423008bdb15f751da651ef54d8 Mon Sep 17 00:00:00 2001
From: Tiberiu
Date: Mon, 12 Mar 2018 11:15:15 +0200
Subject: [PATCH 01/38] Added num_encoder_layers/num_decoder_layers to WMT16
 standard hparams.

---
 nmt/standard_hparams/wmt16_gnmt_4_layer.json | 2 ++
 nmt/standard_hparams/wmt16_gnmt_8_layer.json | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/nmt/standard_hparams/wmt16_gnmt_4_layer.json b/nmt/standard_hparams/wmt16_gnmt_4_layer.json
index 0031a54e9..5a0fcafc9 100644
--- a/nmt/standard_hparams/wmt16_gnmt_4_layer.json
+++ b/nmt/standard_hparams/wmt16_gnmt_4_layer.json
@@ -14,6 +14,8 @@
   "metrics": ["bleu"],
   "num_buckets": 5,
   "num_layers": 4,
+  "num_encoder_layers": 4,
+  "num_decoder_layers": 4,
   "num_train_steps": 340000,
   "decay_scheme": "luong10",
   "num_units": 1024,
diff --git a/nmt/standard_hparams/wmt16_gnmt_8_layer.json b/nmt/standard_hparams/wmt16_gnmt_8_layer.json
index 438ddcf55..8a3120f08 100644
--- a/nmt/standard_hparams/wmt16_gnmt_8_layer.json
+++ b/nmt/standard_hparams/wmt16_gnmt_8_layer.json
@@ -14,6 +14,8 @@
   "metrics": ["bleu"],
   "num_buckets": 5,
   "num_layers": 8,
+  "num_encoder_layers": 8,
+  "num_decoder_layers": 8,
   "num_train_steps": 340000,
   "decay_scheme": "luong10",
   "num_units": 1024,

From 8522665bb1ac9e17c728f940326044d3fe72a80f Mon Sep 17 00:00:00 2001
From: Thang Luong
Date: Wed, 3 Jan 2018 12:23:22 -0800
Subject: [PATCH 02/38] Add TrainOutputTuple, EvalOutputTuple, InferOutputTuple
 to allow for flexibility in extending models. Clean up and factor train.py

PiperOrigin-RevId: 180703151
---
 nmt/model.py        | 206 ++++++++++++++++++++++++++++----------------
 nmt/model_helper.py |  15 ++--
 nmt/model_test.py   |  13 +--
 nmt/train.py        |  14 +--
 4 files changed, 155 insertions(+), 93 deletions(-)

diff --git a/nmt/model.py b/nmt/model.py
index fb789815a..0b2affa85 100644
--- a/nmt/model.py
+++ b/nmt/model.py
@@ -19,7 +19,7 @@ from __future__ import print_function

 import abc
-
+import collections
 import tensorflow as tf
 from tensorflow.python.layers import core as layers_core

@@ -33,6 +33,27 @@
 __all__ = ["BaseModel", "Model"]


+class TrainOutputTuple(collections.namedtuple(
+    "TrainOutputTuple", ("train_summary", "train_loss", "predict_count",
+                         "global_step", "word_count", "batch_size", "grad_norm",
+                         "learning_rate"))):
+  """To allow for flexibility in returning different outputs."""
+  pass
+
+
+class EvalOutputTuple(collections.namedtuple(
+    "EvalOutputTuple", ("eval_loss", "predict_count", "batch_size"))):
+  """To allow for flexibility in returning different outputs."""
+  pass
+
+
+class InferOutputTuple(collections.namedtuple(
+    "InferOutputTuple", ("infer_logits", "infer_summary", "sample_id",
+                         "sample_words"))):
+  """To allow for flexibility in returning different outputs."""
+  pass
+
+
 class BaseModel(object):
   """Sequence-to-sequence base class.
   """
@@ -60,44 +81,10 @@ def __init__(self,
      extra_args: model_helper.ExtraArgs, for passing customizable functions.
""" - assert isinstance(iterator, iterator_utils.BatchedInput) - self.iterator = iterator - self.mode = mode - self.src_vocab_table = source_vocab_table - self.tgt_vocab_table = target_vocab_table - - self.src_vocab_size = hparams.src_vocab_size - self.tgt_vocab_size = hparams.tgt_vocab_size - self.num_gpus = hparams.num_gpus - self.time_major = hparams.time_major - - # extra_args: to make it flexible for adding external customizable code - self.single_cell_fn = None - if extra_args: - self.single_cell_fn = extra_args.single_cell_fn - - # Set num layers - self.num_encoder_layers = hparams.num_encoder_layers - self.num_decoder_layers = hparams.num_decoder_layers - assert self.num_encoder_layers - assert self.num_decoder_layers - - # Set num residual layers - if hasattr(hparams, "num_residual_layers"): # compatible common_test_utils - self.num_encoder_residual_layers = hparams.num_residual_layers - self.num_decoder_residual_layers = hparams.num_residual_layers - else: - self.num_encoder_residual_layers = hparams.num_encoder_residual_layers - self.num_decoder_residual_layers = hparams.num_decoder_residual_layers - - # Initializer - initializer = model_helper.get_initializer( - hparams.init_op, hparams.random_seed, hparams.init_weight) - tf.get_variable_scope().set_initializer(initializer) - - # Embeddings - self.init_embeddings(hparams, scope) - self.batch_size = tf.size(self.iterator.source_sequence_length) + # Set params + self._set_params_initializer(hparams, mode, iterator, + source_vocab_table, target_vocab_table, + scope, extra_args) # Projection with tf.variable_scope(scope or "build_network"): @@ -107,7 +94,6 @@ def __init__(self, ## Train graph res = self.build_graph(hparams, scope=scope) - if self.mode == tf.contrib.learn.ModeKeys.TRAIN: self.train_loss = res[1] self.word_count = tf.reduce_sum( @@ -125,7 +111,6 @@ def __init__(self, self.predict_count = tf.reduce_sum( self.iterator.target_sequence_length) - self.global_step = tf.Variable(0, trainable=False) params = tf.trainable_variables() # Gradients and SGD update operation for training the model. 
@@ -140,9 +125,10 @@ def __init__(self, # Optimizer if hparams.optimizer == "sgd": opt = tf.train.GradientDescentOptimizer(self.learning_rate) - tf.summary.scalar("lr", self.learning_rate) elif hparams.optimizer == "adam": opt = tf.train.AdamOptimizer(self.learning_rate) + else: + raise ValueError("Unknown optimizer type %s" % hparams.optimizer) # Gradients gradients = tf.gradients( @@ -152,18 +138,15 @@ def __init__(self, clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip( gradients, max_gradient_norm=hparams.max_gradient_norm) + self.grad_norm_summary = grad_norm_summary self.grad_norm = grad_norm self.update = opt.apply_gradients( zip(clipped_grads, params), global_step=self.global_step) # Summary - self.train_summary = tf.summary.merge([ - tf.summary.scalar("lr", self.learning_rate), - tf.summary.scalar("train_loss", self.train_loss), - ] + grad_norm_summary) - - if self.mode == tf.contrib.learn.ModeKeys.INFER: + self.train_summary = self._get_train_summary() + elif self.mode == tf.contrib.learn.ModeKeys.INFER: self.infer_summary = self._get_infer_summary(hparams) # Saver @@ -176,6 +159,60 @@ def __init__(self, utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()), param.op.device)) + def _set_params_initializer(self, + hparams, + mode, + iterator, + source_vocab_table, + target_vocab_table, + scope, + extra_args=None): + """Set various params for self and initialize.""" + assert isinstance(iterator, iterator_utils.BatchedInput) + self.iterator = iterator + self.mode = mode + self.src_vocab_table = source_vocab_table + self.tgt_vocab_table = target_vocab_table + + self.src_vocab_size = hparams.src_vocab_size + self.tgt_vocab_size = hparams.tgt_vocab_size + self.num_gpus = hparams.num_gpus + self.time_major = hparams.time_major + + # extra_args: to make it flexible for adding external customizable code + self.single_cell_fn = None + if extra_args: + self.single_cell_fn = extra_args.single_cell_fn + + # Set num layers + self.num_encoder_layers = hparams.num_encoder_layers + self.num_decoder_layers = hparams.num_decoder_layers + assert self.num_encoder_layers + assert self.num_decoder_layers + + # Set num residual layers + if hasattr(hparams, "num_residual_layers"): # compatible common_test_utils + self.num_encoder_residual_layers = hparams.num_residual_layers + self.num_decoder_residual_layers = hparams.num_residual_layers + else: + self.num_encoder_residual_layers = hparams.num_encoder_residual_layers + self.num_decoder_residual_layers = hparams.num_decoder_residual_layers + + # Batch size + self.batch_size = tf.size(self.iterator.source_sequence_length) + + # Global step + self.global_step = tf.Variable(0, trainable=False) + + # Initializer + initializer = model_helper.get_initializer( + hparams.init_op, hparams.random_seed, hparams.init_weight) + tf.get_variable_scope().set_initializer(initializer) + + # Embeddings + self.init_embeddings(hparams, scope) + + def _get_learning_rate_warmup(self, hparams): """Get learning rate warmup.""" warmup_steps = hparams.warmup_steps @@ -201,8 +238,8 @@ def _get_learning_rate_warmup(self, hparams): lambda: self.learning_rate, name="learning_rate_warump_cond") - def _get_learning_rate_decay(self, hparams): - """Get learning rate decay.""" + def _get_decay_info(self, hparams): + """Return decay info based on decay_scheme.""" if hparams.decay_scheme in ["luong5", "luong10", "luong234"]: decay_factor = 0.5 if hparams.decay_scheme == "luong5": @@ -222,6 +259,11 @@ def _get_learning_rate_decay(self, hparams): 
decay_factor = 1.0 elif hparams.decay_scheme: raise ValueError("Unknown decay scheme %s" % hparams.decay_scheme) + return start_decay_step, decay_steps, decay_factor + + def _get_learning_rate_decay(self, hparams): + """Get learning rate decay.""" + start_decay_step, decay_steps, decay_factor = self._get_decay_info(hparams) utils.print_out(" decay_scheme=%s, start_decay_step=%d, decay_steps %d, " "decay_factor %g" % (hparams.decay_scheme, start_decay_step, @@ -253,23 +295,34 @@ def init_embeddings(self, hparams, scope): tgt_embed_file=hparams.tgt_embed_file, scope=scope,)) + def _get_train_summary(self): + """Get train summary.""" + train_summary = tf.summary.merge( + [tf.summary.scalar("lr", self.learning_rate), + tf.summary.scalar("train_loss", self.train_loss)] + + self.grad_norm_summary) + return train_summary + def train(self, sess): + """Execute train graph.""" assert self.mode == tf.contrib.learn.ModeKeys.TRAIN - return sess.run([self.update, - self.train_loss, - self.predict_count, - self.train_summary, - self.global_step, - self.word_count, - self.batch_size, - self.grad_norm, - self.learning_rate]) + output_tuple = TrainOutputTuple(train_summary=self.train_summary, + train_loss=self.train_loss, + predict_count=self.predict_count, + global_step=self.global_step, + word_count=self.word_count, + batch_size=self.batch_size, + grad_norm=self.grad_norm, + learning_rate=self.learning_rate) + return sess.run([self.update, output_tuple]) def eval(self, sess): + """Execute eval graph.""" assert self.mode == tf.contrib.learn.ModeKeys.EVAL - return sess.run([self.eval_loss, - self.predict_count, - self.batch_size]) + output_tuple = EvalOutputTuple(eval_loss=self.eval_loss, + predict_count=self.predict_count, + batch_size=self.batch_size) + return sess.run(output_tuple) def build_graph(self, hparams, scope=None): """Subclass must implement this method. @@ -280,11 +333,12 @@ def build_graph(self, hparams, scope=None): scope: VariableScope for the created subgraph; default "dynamic_seq2seq". Returns: - A tuple of the form (logits, loss, final_context_state), + A tuple of the form (logits, loss_tuple, final_context_state, sample_id), where: logits: float32 Tensor [batch_size x num_decoder_symbols]. - loss: the total loss / batch_size. - final_context_state: The final state of decoder RNN. + loss: loss = the total loss / batch_size. + final_context_state: the final state of decoder RNN. + sample_id: sampling indices. 
Raises: ValueError: if encoder_type differs from mono and bi, or @@ -308,7 +362,7 @@ def build_graph(self, hparams, scope=None): self.num_gpus)): loss = self._compute_loss(logits) else: - loss = None + loss = tf.constant(0.0) return logits, loss, final_context_state, sample_id @@ -426,7 +480,6 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): length_penalty_weight = hparams.length_penalty_weight start_tokens = tf.fill([self.batch_size], tgt_sos_id) end_token = tgt_eos_id - if beam_width > 0: my_decoder = tf.contrib.seq2seq.BeamSearchDecoder( cell=cell, @@ -513,13 +566,16 @@ def _compute_loss(self, logits): return loss def _get_infer_summary(self, hparams): + del hparams return tf.no_op() def infer(self, sess): assert self.mode == tf.contrib.learn.ModeKeys.INFER - return sess.run([ - self.infer_logits, self.infer_summary, self.sample_id, self.sample_words - ]) + output_tuple = InferOutputTuple(infer_logits=self.infer_logits, + infer_summary=self.infer_summary, + sample_id=self.sample_id, + sample_words=self.sample_words) + return sess.run(output_tuple) def decode(self, sess): """Decode a batch. @@ -531,7 +587,9 @@ def decode(self, sess): A tuple consiting of outputs, infer_summary. outputs: of size [batch_size, time] """ - _, infer_summary, _, sample_words = self.infer(sess) + output_tuple = self.infer(sess) + sample_words = output_tuple.sample_words + infer_summary = output_tuple.infer_summary # make sure outputs is of shape [batch_size, time] or [beam_width, # batch_size, time] when using beam search. @@ -650,7 +708,7 @@ def _build_bidirectional_rnn(self, inputs, sequence_length, return tf.concat(bi_outputs, -1), bi_state def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, - source_sequence_length): + source_sequence_length, base_gpu=0): """Build an RNN cell that can be used by decoder.""" # We only make use of encoder_outputs in attention-based models if hparams.attention: @@ -665,7 +723,9 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, dropout=hparams.dropout, num_gpus=self.num_gpus, mode=self.mode, - single_cell_fn=self.single_cell_fn) + single_cell_fn=self.single_cell_fn, + base_gpu=base_gpu + ) # For beam search, we need to replicate encoder infos beam_width times if self.mode == tf.contrib.learn.ModeKeys.INFER and hparams.beam_width > 0: diff --git a/nmt/model_helper.py b/nmt/model_helper.py index efe18c1e8..1cf61778c 100644 --- a/nmt/model_helper.py +++ b/nmt/model_helper.py @@ -2,20 +2,17 @@ from __future__ import print_function import collections -import six import os import time - import numpy as np +import six import tensorflow as tf from tensorflow.python.ops import lookup_ops - from .utils import iterator_utils from .utils import misc_utils as utils from .utils import vocab_utils - __all__ = [ "get_initializer", "get_device_str", "create_train_model", "create_eval_model", "create_infer_model", @@ -136,6 +133,9 @@ def create_eval_model(model_creator, hparams, scope=None, extra_args=None): with graph.as_default(), tf.container(scope or "eval"): src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables( src_vocab_file, tgt_vocab_file, hparams.share_vocab) + reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file( + tgt_vocab_file, default_value=vocab_utils.UNK) + src_file_placeholder = tf.placeholder(shape=(), dtype=tf.string) tgt_file_placeholder = tf.placeholder(shape=(), dtype=tf.string) src_dataset = tf.data.TextLineDataset(src_file_placeholder) @@ -158,6 +158,7 @@ def 
create_eval_model(model_creator, hparams, scope=None, extra_args=None): mode=tf.contrib.learn.ModeKeys.EVAL, source_vocab_table=src_vocab_table, target_vocab_table=tgt_vocab_table, + reverse_target_vocab_table=reverse_tgt_vocab_table, scope=scope, extra_args=extra_args) return EvalModel( @@ -594,9 +595,9 @@ def compute_perplexity(model, sess, name): while True: try: - loss, predict_count, batch_size = model.eval(sess) - total_loss += loss * batch_size - total_predict_count += predict_count + output_tuple = model.eval(sess) + total_loss += output_tuple.eval_loss * output_tuple.batch_size + total_predict_count += output_tuple.predict_count except tf.errors.OutOfRangeError: break diff --git a/nmt/model_test.py b/nmt/model_test.py index 5af64df7f..5c2659117 100644 --- a/nmt/model_test.py +++ b/nmt/model_test.py @@ -250,8 +250,8 @@ def _assertModelVariable(self, variable, sess, name): def _assertTrainStepsLoss(self, m, sess, name, num_steps=1): for _ in range(num_steps): - _, loss, _, _, _, _, _, _, _ = m.train(sess) - + _, output_tuple = m.train(sess) + loss = output_tuple.train_loss print('{} {}-th step loss is: '.format(name, num_steps), loss) expected_loss = self.expected_train_values[name + '/loss'] self.actual_train_values[name + '/loss'] = loss @@ -259,8 +259,9 @@ def _assertTrainStepsLoss(self, m, sess, name, num_steps=1): self.assertAllClose(expected_loss, loss) def _assertEvalLossAndPredictCount(self, m, sess, name): - loss, predict_count, _ = m.eval(sess) - + output_tuple = m.eval(sess) + loss = output_tuple.eval_loss + predict_count = output_tuple.predict_count print('{} eval loss is: '.format(name), loss) print('{} predict count is: '.format(name), predict_count) expected_loss = self.expected_eval_values[name + '/loss'] @@ -272,8 +273,8 @@ def _assertEvalLossAndPredictCount(self, m, sess, name): self.assertAllClose(expected_predict_count, predict_count) def _assertInferLogits(self, m, sess, name): - results = m.infer(sess) - logits_sum = np.sum(results[0]) + output_tuple = m.infer(sess) + logits_sum = np.sum(output_tuple.infer_logits) print('{} infer logits sum is: '.format(name), logits_sum) expected_logits_sum = self.expected_infer_values[name + '/logits_sum'] diff --git a/nmt/train.py b/nmt/train.py index 75978ec44..aa1aeeae1 100644 --- a/nmt/train.py +++ b/nmt/train.py @@ -204,17 +204,17 @@ def init_stats(): def update_stats(stats, start_time, step_result): """Update stats: write summary and accumulate statistics.""" - (_, step_loss, step_predict_count, step_summary, global_step, - step_word_count, batch_size, grad_norm, learning_rate) = step_result + _, output_tuple = step_result # Update statistics stats["step_time"] += (time.time() - start_time) - stats["loss"] += (step_loss * batch_size) - stats["predict_count"] += step_predict_count - stats["total_count"] += float(step_word_count) - stats["grad_norm"] += grad_norm + stats["loss"] += (output_tuple.train_loss * output_tuple.batch_size) + stats["predict_count"] += output_tuple.predict_count + stats["total_count"] += float(output_tuple.word_count) + stats["grad_norm"] += output_tuple.grad_norm - return global_step, learning_rate, step_summary + return (output_tuple.global_step, output_tuple.learning_rate, + output_tuple.train_summary) def print_step_info(prefix, global_step, info, result_summary, log_f): From 80ff1c9fbad7dcfe801188e01f7aed51f5175c38 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Fri, 5 Jan 2018 12:26:59 -0800 Subject: [PATCH 03/38] Wrap try/except for saver.restore and print variables in loaded checkpoints 
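The new diagnostic boils down to a small helper plus a guarded restore; a minimal
sketch (names follow the load_model/print_variables_in_ckpt changes in the
model_helper.py hunk below, printing via plain print() here for brevity):

    import tensorflow as tf

    def print_variables_in_ckpt(ckpt_path):
      # List every variable name and shape stored in the checkpoint.
      reader = tf.train.NewCheckpointReader(ckpt_path)
      for name, shape in sorted(reader.get_variable_to_shape_map().items()):
        print("  %s: %s" % (name, shape))

load_model() now wraps saver.restore() in try/except tf.errors.NotFoundError and
calls this helper before re-printing the error, so a checkpoint whose variables do
not match the graph is easy to diagnose.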
PiperOrigin-RevId: 180960478 --- nmt/model_helper.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/nmt/model_helper.py b/nmt/model_helper.py index 1cf61778c..02a5f0aeb 100644 --- a/nmt/model_helper.py +++ b/nmt/model_helper.py @@ -479,9 +479,25 @@ def gradient_clip(gradients, max_gradient_norm): return clipped_gradients, gradient_norm_summary, gradient_norm +def print_variables_in_ckpt(ckpt): + """Print a list of variables in a checkpoint together with their shapes.""" + utils.print_out("# Variables in ckpt %s" % ckpt) + reader = tf.train.NewCheckpointReader(ckpt) + variable_map = reader.get_variable_to_shape_map() + for key in sorted(variable_map.keys()): + utils.print_out(" %s: %s" % (key, variable_map[key])) + + def load_model(model, ckpt, session, name): + """Load model from a checkpoint.""" start_time = time.time() - model.saver.restore(session, ckpt) + try: + model.saver.restore(session, ckpt) + except tf.errors.NotFoundError as e: + utils.print_out("Can't load checkpoint") + print_variables_in_ckpt(ckpt) + utils.print_out("%s" % str(e)) + session.run(tf.tables_initializer()) utils.print_out( " loaded %s model parameters from %s, time %.2fs" % From 653da78682f851ef7be14945cfbbec0ddb0cb8f9 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Fri, 5 Jan 2018 16:24:41 -0800 Subject: [PATCH 04/38] Improve several behaviors regarding loading hparams for train/inference: (a) During inference, given --ckpt, we can try to load hparams in the same dir (b) When loading models and override_loaded_hparams=False, we still overwrite ["beam_width", "length_penalty_weight", "sampling_temperature", "num_translations_per_input"] (c) Introduce _add_argument to smartly add argument to hparams, so extend_hparams can be called when loading hparams. This is useful for old checkpoints. (d) Handle old checkpoints before the separation of num_layers into num_encoder_layers and num_decoder_layers. Minor clean-ups of misc_utils.py. PiperOrigin-RevId: 180989949 --- nmt/nmt.py | 131 +++++++++++++++++++++++++--------------- nmt/utils/misc_utils.py | 6 +- 2 files changed, 82 insertions(+), 55 deletions(-) diff --git a/nmt/nmt.py b/nmt/nmt.py index aa18acd95..7d92582d0 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -257,6 +257,8 @@ def add_arguments(parser): help=("""\ Reference file to compute evaluation scores (if provided).\ """)) + + # Advanced inference arguments parser.add_argument("--beam_width", type=int, default=0, help=("""\ beam width when using beam search decoder. 
If 0 (default), use standard @@ -341,6 +343,8 @@ def create_hparams(flags): src_max_len_infer=flags.src_max_len_infer, tgt_max_len_infer=flags.tgt_max_len_infer, infer_batch_size=flags.infer_batch_size, + + # Advanced inference arguments beam_width=flags.beam_width, length_penalty_weight=flags.length_penalty_weight, sampling_temperature=flags.sampling_temperature, @@ -370,16 +374,17 @@ def create_hparams(flags): ) -def extend_hparams(hparams): - """Extend training hparams.""" - assert hparams.num_encoder_layers and hparams.num_decoder_layers - if hparams.num_encoder_layers != hparams.num_decoder_layers: - hparams.pass_hidden_state = False - utils.print_out("Num encoder layer %d is different from num decoder layer" - " %d, so set pass_hidden_state to False" % ( - hparams.num_encoder_layers, - hparams.num_decoder_layers)) +def _add_argument(hparams, key, value, update=True): + """Add an argument to hparams; if exists, change the value if update==True.""" + if hasattr(hparams, key): + if update: + setattr(hparams, key, value) + else: + hparams.add_hparam(key, value) + +def extend_hparams(hparams): + """Add new arguments to hparams.""" # Sanity checks if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0: raise ValueError("For bi, num_encoder_layers %d should be even" % @@ -389,6 +394,17 @@ def extend_hparams(hparams): raise ValueError("For gnmt attention architecture, " "num_encoder_layers %d should be >= 2" % hparams.num_encoder_layers) + if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]: + raise ValueError("subword option must be either spm, or bpe") + + # Different number of encoder / decoder layers + assert hparams.num_encoder_layers and hparams.num_decoder_layers + if hparams.num_encoder_layers != hparams.num_decoder_layers: + hparams.pass_hidden_state = False + utils.print_out("Num encoder layer %d is different from num decoder layer" + " %d, so set pass_hidden_state to False" % ( + hparams.num_encoder_layers, + hparams.num_decoder_layers)) # Set residual layers num_encoder_residual_layers = 0 @@ -408,20 +424,10 @@ def extend_hparams(hparams): # Compatible for GNMT models if hparams.num_encoder_layers == hparams.num_decoder_layers: num_decoder_residual_layers = num_encoder_residual_layers - hparams.add_hparam("num_encoder_residual_layers", num_encoder_residual_layers) - hparams.add_hparam("num_decoder_residual_layers", num_decoder_residual_layers) - - if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]: - raise ValueError("subword option must be either spm, or bpe") - - # Flags - utils.print_out("# hparams:") - utils.print_out(" src=%s" % hparams.src) - utils.print_out(" tgt=%s" % hparams.tgt) - utils.print_out(" train_prefix=%s" % hparams.train_prefix) - utils.print_out(" dev_prefix=%s" % hparams.dev_prefix) - utils.print_out(" test_prefix=%s" % hparams.test_prefix) - utils.print_out(" out_dir=%s" % hparams.out_dir) + _add_argument(hparams, "num_encoder_residual_layers", + num_encoder_residual_layers) + _add_argument(hparams, "num_decoder_residual_layers", + num_decoder_residual_layers) ## Vocab # Get vocab file names first @@ -453,14 +459,14 @@ def extend_hparams(hparams): sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) - hparams.add_hparam("src_vocab_size", src_vocab_size) - hparams.add_hparam("tgt_vocab_size", tgt_vocab_size) - hparams.add_hparam("src_vocab_file", src_vocab_file) - hparams.add_hparam("tgt_vocab_file", tgt_vocab_file) + _add_argument(hparams, "src_vocab_size", src_vocab_size) + 
_add_argument(hparams, "tgt_vocab_size", tgt_vocab_size) + _add_argument(hparams, "src_vocab_file", src_vocab_file) + _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) # Pretrained Embeddings: - hparams.add_hparam("src_embed_file", "") - hparams.add_hparam("tgt_embed_file", "") + _add_argument(hparams, "src_embed_file", "") + _add_argument(hparams, "tgt_embed_file", "") if hparams.embed_prefix: src_embed_file = hparams.embed_prefix + "." + hparams.src tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt @@ -471,23 +477,18 @@ def extend_hparams(hparams): if tf.gfile.Exists(tgt_embed_file): hparams.tgt_embed_file = tgt_embed_file - # Check out_dir - if not tf.gfile.Exists(hparams.out_dir): - utils.print_out("# Creating output directory %s ..." % hparams.out_dir) - tf.gfile.MakeDirs(hparams.out_dir) - # Evaluation for metric in hparams.metrics: - hparams.add_hparam("best_" + metric, 0) # larger is better best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric) - hparams.add_hparam("best_" + metric + "_dir", best_metric_dir) tf.gfile.MakeDirs(best_metric_dir) + _add_argument(hparams, "best_" + metric, 0, update=False) + _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir) if hparams.avg_ckpts: - hparams.add_hparam("avg_best_" + metric, 0) # larger is better best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric) - hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir) tf.gfile.MakeDirs(best_metric_dir) + _add_argument(hparams, "avg_best_" + metric, 0, update=False) + _add_argument(hparams, "avg_best_" + metric + "_dir", best_metric_dir) return hparams @@ -496,6 +497,11 @@ def ensure_compatible_hparams(hparams, default_hparams, hparams_path): """Make sure the loaded hparams is compatible with new changes.""" default_hparams = utils.maybe_parse_standard_hparams( default_hparams, hparams_path) + # Set num encoder/decoder layers (for old checkpoints) + if not hasattr(hparams, "num_encoder_layers"): + hparams.add_hparam("num_encoder_layers", hparams.num_layers) + if not hasattr(hparams, "num_decoder_layers"): + hparams.add_hparam("num_decoder_layers", hparams.num_layers) # For compatible reason, if there are new fields in default_hparams, # we add them to the current hparams @@ -507,12 +513,17 @@ def ensure_compatible_hparams(hparams, default_hparams, hparams_path): # Update all hparams' keys if override_loaded_hparams=True if default_hparams.override_loaded_hparams: - for key in default_config: - if getattr(hparams, key) != default_config[key]: - utils.print_out("# Updating hparams.%s: %s -> %s" % - (key, str(getattr(hparams, key)), - str(default_config[key]))) - setattr(hparams, key, default_config[key]) + overwritten_keys = default_config.keys() + else: + # For inference + overwritten_keys = ["infer_batch_size", "beam_width", "length_penalty_weight", + "sampling_temperature", "num_translations_per_input"] + for key in overwritten_keys: + if getattr(hparams, key) != default_config[key]: + utils.print_out("# Updating hparams.%s: %s -> %s" % + (key, str(getattr(hparams, key)), + str(default_config[key]))) + setattr(hparams, key, default_config[key]) return hparams @@ -524,9 +535,9 @@ def create_or_load_hparams( hparams = default_hparams hparams = utils.maybe_parse_standard_hparams( hparams, hparams_path) - hparams = extend_hparams(hparams) else: hparams = ensure_compatible_hparams(hparams, default_hparams, hparams_path) + hparams = extend_hparams(hparams) # Save HParams if save_hparams: @@ -553,15 +564,36 @@ def run_main(flags, 
default_hparams, train_fn, inference_fn, target_session=""): random.seed(random_seed + jobid) np.random.seed(random_seed + jobid) - ## Train / Decode + # Model output directory out_dir = flags.out_dir - if not tf.gfile.Exists(out_dir): tf.gfile.MakeDirs(out_dir) + if out_dir and not tf.gfile.Exists(out_dir): + utils.print_out("# Creating output directory %s ..." % out_dir) + tf.gfile.MakeDirs(out_dir) # Load hparams. - hparams = create_or_load_hparams( - out_dir, default_hparams, flags.hparams_path, save_hparams=(jobid == 0)) + loaded_hparams = False + if flags.ckpt: # Try to load hparams from the same directory as ckpt + ckpt_dir = os.path.dirname(flags.ckpt) + ckpt_hparams_file = os.path.join(ckpt_dir, "hparams") + if tf.gfile.Exists(ckpt_hparams_file) or flags.hparams_path: + hparams = create_or_load_hparams( + ckpt_dir, default_hparams, flags.hparams_path, + save_hparams=False) + loaded_hparams = True + if not loaded_hparams: # Try to load from out_dir + assert out_dir + hparams = create_or_load_hparams( + out_dir, default_hparams, flags.hparams_path, + save_hparams=(jobid == 0)) + ## Train / Decode if flags.inference_input_file: + # Inference output directory + trans_file = flags.inference_output_file + assert trans_file + trans_dir = os.path.dirname(trans_file) + if not tf.gfile.Exists(trans_dir): tf.gfile.MakeDirs(trans_dir) + # Inference indices hparams.inference_indices = None if flags.inference_list: @@ -569,7 +601,6 @@ def run_main(flags, default_hparams, train_fn, inference_fn, target_session=""): [int(token) for token in flags.inference_list.split(",")]) # Inference - trans_file = flags.inference_output_file ckpt = flags.ckpt if not ckpt: ckpt = tf.train.latest_checkpoint(out_dir) diff --git a/nmt/utils/misc_utils.py b/nmt/utils/misc_utils.py index a680a5cf2..dc9903601 100644 --- a/nmt/utils/misc_utils.py +++ b/nmt/utils/misc_utils.py @@ -100,14 +100,10 @@ def load_hparams(model_dir): def maybe_parse_standard_hparams(hparams, hparams_path): """Override hparams values with existing standard hparams config.""" - if not hparams_path: - return hparams - - if tf.gfile.Exists(hparams_path): + if hparams_path and tf.gfile.Exists(hparams_path): print_out("# Loading standard hparams from %s" % hparams_path) with tf.gfile.GFile(hparams_path, "r") as f: hparams.parse_json(f.read()) - return hparams From 64e04363e3662bee27d34730e9c33dc367bd0cee Mon Sep 17 00:00:00 2001 From: Anonymous Date: Sun, 7 Jan 2018 13:33:59 -0800 Subject: [PATCH 05/38] pretrained our models on PiperOrigin-RevId: 181096467 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a16ffbf9b..809db9ab3 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ with the latest research ideas. We achieve this goal by: We believe that it is important to provide benchmarks that people can easily replicate. As a result, we have provided full experimental results and -pretrained on models on the following publicly available datasets: +pretrained our models on the following publicly available datasets: 1. *Small-scale*: English-Vietnamese parallel corpus of TED talks (133K sentence pairs) provided by From cfa4f48efeb5dc877e42da046305212f5a3b02f9 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Sun, 7 Jan 2018 20:57:09 -0800 Subject: [PATCH 06/38] Factor out model creation code in train.py and inference.py. Update attention_model.py so that we can specify GNMT encoder without attention. 
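Both train.py and inference.py now use the same dispatch; a rough sketch,
paraphrasing the get_model_creator helper added in the diffs below:

    def get_model_creator(hparams):
      # Pick the model class from the encoder/attention configuration.
      if (hparams.encoder_type == "gnmt" or
          hparams.attention_architecture in ["gnmt", "gnmt_v2"]):
        return gnmt_model.GNMTModel
      elif hparams.attention_architecture == "standard":
        return attention_model.AttentionModel
      elif not hparams.attention:
        return nmt_model.Model
      raise ValueError("Unknown attention architecture %s" %
                       hparams.attention_architecture)

Note that the GNMT check now comes first, so a GNMT encoder can be selected even
when attention is disabled.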
PiperOrigin-RevId: 181117151 --- nmt/attention_model.py | 27 ++++++++++++++++----------- nmt/inference.py | 24 ++++++++++++++++-------- nmt/train.py | 31 ++++++++++++++++++------------- 3 files changed, 50 insertions(+), 32 deletions(-) diff --git a/nmt/attention_model.py b/nmt/attention_model.py index af0ee0b7e..899b73637 100644 --- a/nmt/attention_model.py +++ b/nmt/attention_model.py @@ -44,11 +44,14 @@ def __init__(self, reverse_target_vocab_table=None, scope=None, extra_args=None): + self.has_attention = hparams.attention_architecture and hparams.attention + # Set attention_mechanism_fn - if extra_args and extra_args.attention_mechanism_fn: - self.attention_mechanism_fn = extra_args.attention_mechanism_fn - else: - self.attention_mechanism_fn = create_attention_mechanism + if self.has_attention: + if extra_args and extra_args.attention_mechanism_fn: + self.attention_mechanism_fn = extra_args.attention_mechanism_fn + else: + self.attention_mechanism_fn = create_attention_mechanism super(AttentionModel, self).__init__( hparams=hparams, @@ -66,12 +69,13 @@ def __init__(self, def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, source_sequence_length): """Build a RNN cell with attention mechanism that can be used by decoder.""" - attention_option = hparams.attention - attention_architecture = hparams.attention_architecture - - if attention_architecture != "standard": + # No Attention + if not self.has_attention: + return super(AttentionModel, self)._build_decoder_cell( + hparams, encoder_outputs, encoder_state, source_sequence_length) + elif hparams.attention_architecture != "standard": raise ValueError( - "Unknown attention architecture %s" % attention_architecture) + "Unknown attention architecture %s" % hparams.attention_architecture) num_units = hparams.num_units num_layers = self.num_decoder_layers @@ -97,8 +101,9 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, else: batch_size = self.batch_size + # Attention attention_mechanism = self.attention_mechanism_fn( - attention_option, num_units, memory, source_sequence_length, self.mode) + hparams.attention, num_units, memory, source_sequence_length, self.mode) cell = model_helper.create_rnn_cell( unit_type=hparams.unit_type, @@ -136,7 +141,7 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, return cell, decoder_initial_state def _get_infer_summary(self, hparams): - if hparams.beam_width > 0: + if not self.has_attention or hparams.beam_width > 0: return tf.no_op() return _create_attention_images_summary(self.final_context_state) diff --git a/nmt/inference.py b/nmt/inference.py index 6f589337a..2a9876c2f 100644 --- a/nmt/inference.py +++ b/nmt/inference.py @@ -80,6 +80,21 @@ def load_data(inference_input_file, hparams=None): return inference_data +def get_model_creator(hparams): + """Get the right model class depending on configuration.""" + if (hparams.encoder_type == "gnmt" or + hparams.attention_architecture in ["gnmt", "gnmt_v2"]): + model_creator = gnmt_model.GNMTModel + elif hparams.attention_architecture == "standard": + model_creator = attention_model.AttentionModel + elif not hparams.attention: + model_creator = nmt_model.Model + else: + raise ValueError("Unknown attention architecture %s" % + hparams.attention_architecture) + return model_creator + + def inference(ckpt, inference_input_file, inference_output_file, @@ -91,14 +106,7 @@ def inference(ckpt, if hparams.inference_indices: assert num_workers == 1 - if not hparams.attention: - model_creator = 
nmt_model.Model - elif hparams.attention_architecture == "standard": - model_creator = attention_model.AttentionModel - elif hparams.attention_architecture in ["gnmt", "gnmt_v2"]: - model_creator = gnmt_model.GNMTModel - else: - raise ValueError("Unknown model architecture") + model_creator = get_model_creator(hparams) infer_model = model_helper.create_infer_model(model_creator, hparams, scope) if num_workers == 1: diff --git a/nmt/train.py b/nmt/train.py index aa1aeeae1..20c2375b0 100644 --- a/nmt/train.py +++ b/nmt/train.py @@ -35,7 +35,7 @@ __all__ = [ "run_sample_decode", "run_internal_eval", "run_external_eval", "run_avg_external_eval", "run_full_eval", "init_stats", "update_stats", - "print_step_info", "process_stats", "train" + "print_step_info", "process_stats", "train", "get_model_creator" ] @@ -268,6 +268,21 @@ def before_train(loaded_train_model, train_model, train_sess, global_step, return stats, info, start_train_time +def get_model_creator(hparams): + """Get the right model class depending on configuration.""" + if (hparams.encoder_type == "gnmt" or + hparams.attention_architecture in ["gnmt", "gnmt_v2"]): + model_creator = gnmt_model.GNMTModel + elif hparams.attention_architecture == "standard": + model_creator = attention_model.AttentionModel + elif not hparams.attention: + model_creator = nmt_model.Model + else: + raise ValueError("Unknown attention architecture %s" % + hparams.attention_architecture) + return model_creator + + def train(hparams, scope=None, target_session=""): """Train a translation model.""" log_device_placement = hparams.log_device_placement @@ -281,18 +296,8 @@ def train(hparams, scope=None, target_session=""): if not steps_per_external_eval: steps_per_external_eval = 5 * steps_per_eval - if not hparams.attention: - model_creator = nmt_model.Model - else: # Attention - if (hparams.encoder_type == "gnmt" or - hparams.attention_architecture in ["gnmt", "gnmt_v2"]): - model_creator = gnmt_model.GNMTModel - elif hparams.attention_architecture == "standard": - model_creator = attention_model.AttentionModel - else: - raise ValueError("Unknown attention architecture %s" % - hparams.attention_architecture) - + # Create model + model_creator = get_model_creator(hparams) train_model = model_helper.create_train_model(model_creator, hparams, scope) eval_model = model_helper.create_eval_model(model_creator, hparams, scope) infer_model = model_helper.create_infer_model(model_creator, hparams, scope) From 989683db00f63e1d777f5e0fd71b67529fe3dadd Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Mon, 8 Jan 2018 17:37:39 -0800 Subject: [PATCH 07/38] Standardize vocab in test to use , , PiperOrigin-RevId: 181244462 --- nmt/model_test.py | 11 +++++++---- nmt/testdata/test_infer_vocab.src | 6 +++--- nmt/testdata/test_infer_vocab.tgt | 7 +++---- nmt/utils/common_test_utils.py | 6 +++--- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/nmt/model_test.py b/nmt/model_test.py index 5c2659117..35548e329 100644 --- a/nmt/model_test.py +++ b/nmt/model_test.py @@ -34,6 +34,9 @@ int32 = np.int32 array = np.array +SOS = '' +EOS = '' + class ModelTest(tf.test.TestCase): @@ -186,15 +189,15 @@ def setUpClass(cls): cls.actual_beam_sentences = {} cls.expected_beam_sentences = { 'BeamSearchAttentionModel: batch 0 of beam 0': '', - 'BeamSearchAttentionModel: batch 0 of beam 1': 'sos a sos a', + 'BeamSearchAttentionModel: batch 0 of beam 1': '%s a %s a' % (SOS, SOS), 'BeamSearchAttentionModel: batch 1 of beam 0': '', 'BeamSearchAttentionModel: batch 1 of beam 1': 'b', 
'BeamSearchBasicModel: batch 0 of beam 0': 'b b b b', - 'BeamSearchBasicModel: batch 0 of beam 1': 'b b b sos', + 'BeamSearchBasicModel: batch 0 of beam 1': 'b b b %s' % SOS, 'BeamSearchBasicModel: batch 0 of beam 2': 'b b b c', 'BeamSearchBasicModel: batch 1 of beam 0': 'b b b b', 'BeamSearchBasicModel: batch 1 of beam 1': 'a b b b', - 'BeamSearchBasicModel: batch 1 of beam 2': 'b b b sos', + 'BeamSearchBasicModel: batch 1 of beam 2': 'b b b %s' % SOS, 'BeamSearchGNMTModel: batch 0 of beam 0': '', 'BeamSearchGNMTModel: batch 1 of beam 0': '', } @@ -289,7 +292,7 @@ def _assertBeamSearchOutputs(self, m, sess, assert_top_k_sentence, name): output_words = nmt_outputs[i] for j in range(output_words.shape[0]): sentence = nmt_utils.get_translation( - output_words, j, tgt_eos='eos', subword_option='') + output_words, j, tgt_eos=EOS, subword_option='') sentence_key = ('%s: batch %d of beam %d' % (name, j, i)) self.actual_beam_sentences[sentence_key] = sentence expected_sentence = self.expected_beam_sentences[sentence_key] diff --git a/nmt/testdata/test_infer_vocab.src b/nmt/testdata/test_infer_vocab.src index ccecabbca..0e441f86b 100644 --- a/nmt/testdata/test_infer_vocab.src +++ b/nmt/testdata/test_infer_vocab.src @@ -1,5 +1,5 @@ -unk -eos -sos + + + test1 test2 diff --git a/nmt/testdata/test_infer_vocab.tgt b/nmt/testdata/test_infer_vocab.tgt index 6c60b1194..279be587b 100644 --- a/nmt/testdata/test_infer_vocab.tgt +++ b/nmt/testdata/test_infer_vocab.tgt @@ -1,6 +1,5 @@ -unk -eos -test1 -test2 + + + test3 test4 diff --git a/nmt/utils/common_test_utils.py b/nmt/utils/common_test_utils.py index f76fd3b10..528ac0691 100644 --- a/nmt/utils/common_test_utils.py +++ b/nmt/utils/common_test_utils.py @@ -47,7 +47,7 @@ def create_test_hparams(unit_type="lstm", standard_hparams = standard_hparams_utils.create_standard_hparams() # Networks - standard_hparams.num_units = 5 + standard_hparams.num_units = 5 standard_hparams.num_encoder_layers = num_layers standard_hparams.num_decoder_layers = num_layers standard_hparams.dropout = 0.5 @@ -77,8 +77,8 @@ def create_test_hparams(unit_type="lstm", # Vocab standard_hparams.src_vocab_size = 5 standard_hparams.tgt_vocab_size = 5 - standard_hparams.eos = "eos" - standard_hparams.sos = "sos" + standard_hparams.eos = "" + standard_hparams.sos = "" standard_hparams.src_vocab_file = "" standard_hparams.tgt_vocab_file = "" standard_hparams.src_embed_file = "" From 656fb12eaefdd293165f5dcf90a383d74a3dffcd Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Mon, 8 Jan 2018 20:37:26 -0800 Subject: [PATCH 08/38] Clean up inference_test.py and add more sharing code to _createTestInferCheckpoint(); Rename ckpt into ckpt_path in inference.py and model_helper.py PiperOrigin-RevId: 181260899 --- nmt/inference.py | 14 ++--- nmt/inference_test.py | 116 ++++++++++++++---------------------------- nmt/model_helper.py | 14 ++--- 3 files changed, 51 insertions(+), 93 deletions(-) diff --git a/nmt/inference.py b/nmt/inference.py index 2a9876c2f..a5ad45d89 100644 --- a/nmt/inference.py +++ b/nmt/inference.py @@ -95,7 +95,7 @@ def get_model_creator(hparams): return model_creator -def inference(ckpt, +def inference(ckpt_path, inference_input_file, inference_output_file, hparams, @@ -112,14 +112,14 @@ def inference(ckpt, if num_workers == 1: single_worker_inference( infer_model, - ckpt, + ckpt_path, inference_input_file, inference_output_file, hparams) else: multi_worker_inference( infer_model, - ckpt, + ckpt_path, inference_input_file, inference_output_file, hparams, @@ -128,7 +128,7 @@ def 
inference(ckpt, def single_worker_inference(infer_model, - ckpt, + ckpt_path, inference_input_file, inference_output_file, hparams): @@ -141,7 +141,7 @@ def single_worker_inference(infer_model, with tf.Session( graph=infer_model.graph, config=utils.get_config_proto()) as sess: loaded_infer_model = model_helper.load_model( - infer_model.model, ckpt, sess, "infer") + infer_model.model, ckpt_path, sess, "infer") sess.run( infer_model.iterator.initializer, feed_dict={ @@ -174,7 +174,7 @@ def single_worker_inference(infer_model, def multi_worker_inference(infer_model, - ckpt, + ckpt_path, inference_input_file, inference_output_file, hparams, @@ -200,7 +200,7 @@ def multi_worker_inference(infer_model, with tf.Session( graph=infer_model.graph, config=utils.get_config_proto()) as sess: loaded_infer_model = model_helper.load_model( - infer_model.model, ckpt, sess, "infer") + infer_model.model, ckpt_path, sess, "infer") sess.run(infer_model.iterator.initializer, { infer_model.src_placeholder: infer_data, diff --git a/nmt/inference_test.py b/nmt/inference_test.py index 7c342f9f9..046048cec 100644 --- a/nmt/inference_test.py +++ b/nmt/inference_test.py @@ -23,11 +23,8 @@ import numpy as np import tensorflow as tf -from . import attention_model -from . import model_helper -from . import model as nmt_model -from . import gnmt_model from . import inference +from . import model_helper from .utils import common_test_utils float32 = np.float32 @@ -37,24 +34,26 @@ class InferenceTest(tf.test.TestCase): - def _createTestInferCheckpoint(self, hparams, out_dir): - if not hparams.attention: - model_creator = nmt_model.Model - elif hparams.attention_architecture == "standard": - model_creator = attention_model.AttentionModel - elif hparams.attention_architecture in ["gnmt", "gnmt_v2"]: - model_creator = gnmt_model.GNMTModel - else: - raise ValueError("Unknown model architecture") + def _createTestInferCheckpoint(self, hparams, name): + # Prepare + hparams.vocab_prefix = ( + "nmt/testdata/test_infer_vocab") + hparams.src_vocab_file = hparams.vocab_prefix + "." + hparams.src + hparams.tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt + out_dir = os.path.join(tf.test.get_temp_dir(), name) + os.makedirs(out_dir) + hparams.out_dir = out_dir + # Create check point + model_creator = inference.get_model_creator(hparams) infer_model = model_helper.create_infer_model(model_creator, hparams) with self.test_session(graph=infer_model.graph) as sess: loaded_model, global_step = model_helper.create_or_load_model( infer_model.model, out_dir, sess, "infer_name") - ckpt = loaded_model.saver.save( + ckpt_path = loaded_model.saver.save( sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step) - return ckpt + return ckpt_path def testBasicModel(self): hparams = common_test_utils.create_test_hparams( @@ -63,17 +62,10 @@ def testBasicModel(self): attention="", attention_architecture="", use_residual=False,) - vocab_prefix = "nmt/testdata/test_infer_vocab" - hparams.src_vocab_file = vocab_prefix + "." + hparams.src - hparams.tgt_vocab_file = vocab_prefix + "." 
+ hparams.tgt - + ckpt_path = self._createTestInferCheckpoint(hparams, "basic_infer") infer_file = "nmt/testdata/test_infer_file" - out_dir = os.path.join(tf.test.get_temp_dir(), "basic_infer") - hparams.out_dir = out_dir - os.makedirs(out_dir) - output_infer = os.path.join(out_dir, "output_infer") - ckpt = self._createTestInferCheckpoint(hparams, out_dir) - inference.inference(ckpt, infer_file, output_infer, hparams) + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) with open(output_infer) as f: self.assertEqual(5, len(list(f))) @@ -87,17 +79,10 @@ def testBasicModelWithMultipleTranslations(self): num_translations_per_input=2, beam_width=2, ) - vocab_prefix = "nmt/testdata/test_infer_vocab" - hparams.src_vocab_file = vocab_prefix + "." + hparams.src - hparams.tgt_vocab_file = vocab_prefix + "." + hparams.tgt - + ckpt_path = self._createTestInferCheckpoint(hparams, "multi_basic_infer") infer_file = "nmt/testdata/test_infer_file" - out_dir = os.path.join(tf.test.get_temp_dir(), "multi_basic_infer") - hparams.out_dir = out_dir - os.makedirs(out_dir) - output_infer = os.path.join(out_dir, "output_infer") - ckpt = self._createTestInferCheckpoint(hparams, out_dir) - inference.inference(ckpt, infer_file, output_infer, hparams) + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) with open(output_infer) as f: self.assertEqual(10, len(list(f))) @@ -108,17 +93,10 @@ def testAttentionModel(self): attention="scaled_luong", attention_architecture="standard", use_residual=False,) - vocab_prefix = "nmt/testdata/test_infer_vocab" - hparams.src_vocab_file = vocab_prefix + "." + hparams.src - hparams.tgt_vocab_file = vocab_prefix + "." + hparams.tgt - + ckpt_path = self._createTestInferCheckpoint(hparams, "attention_infer") infer_file = "nmt/testdata/test_infer_file" - out_dir = os.path.join(tf.test.get_temp_dir(), "attention_infer") - hparams.out_dir = out_dir - os.makedirs(out_dir) - output_infer = os.path.join(out_dir, "output_infer") - ckpt = self._createTestInferCheckpoint(hparams, out_dir) - inference.inference(ckpt, infer_file, output_infer, hparams) + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) with open(output_infer) as f: self.assertEqual(5, len(list(f))) @@ -129,15 +107,6 @@ def testMultiWorkers(self): attention="scaled_luong", attention_architecture="standard", use_residual=False,) - vocab_prefix = "nmt/testdata/test_infer_vocab" - hparams.src_vocab_file = vocab_prefix + "." + hparams.src - hparams.tgt_vocab_file = vocab_prefix + "." + hparams.tgt - - infer_file = "nmt/testdata/test_infer_file" - out_dir = os.path.join(tf.test.get_temp_dir(), "multi_worker_infer") - hparams.out_dir = out_dir - os.makedirs(out_dir) - output_infer = os.path.join(out_dir, "output_infer") num_workers = 3 @@ -146,17 +115,19 @@ def testMultiWorkers(self): # cases. 
hparams.batch_size = 3 - ckpt = self._createTestInferCheckpoint(hparams, out_dir) + ckpt_path = self._createTestInferCheckpoint(hparams, "multi_worker_infer") + infer_file = "nmt/testdata/test_infer_file" + output_infer = os.path.join(hparams.out_dir, "output_infer") inference.inference( - ckpt, infer_file, output_infer, hparams, num_workers, jobid=1) + ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=1) inference.inference( - ckpt, infer_file, output_infer, hparams, num_workers, jobid=2) + ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=2) # Note: Need to start job 0 at the end; otherwise, it will block the testing # thread. inference.inference( - ckpt, infer_file, output_infer, hparams, num_workers, jobid=0) + ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=0) with open(output_infer) as f: self.assertEqual(5, len(list(f))) @@ -169,17 +140,11 @@ def testBasicModelWithInferIndices(self): attention_architecture="", use_residual=False, inference_indices=[0]) - vocab_prefix = "nmt/testdata/test_infer_vocab" - hparams.src_vocab_file = vocab_prefix + "." + hparams.src - hparams.tgt_vocab_file = vocab_prefix + "." + hparams.tgt - + ckpt_path = self._createTestInferCheckpoint(hparams, + "basic_infer_with_indices") infer_file = "nmt/testdata/test_infer_file" - out_dir = os.path.join(tf.test.get_temp_dir(), "basic_infer_with_indices") - hparams.out_dir = out_dir - os.makedirs(out_dir) - output_infer = os.path.join(out_dir, "output_infer") - ckpt = self._createTestInferCheckpoint(hparams, out_dir) - inference.inference(ckpt, infer_file, output_infer, hparams) + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) with open(output_infer) as f: self.assertEqual(1, len(list(f))) @@ -193,18 +158,11 @@ def testAttentionModelWithInferIndices(self): inference_indices=[1, 2]) # TODO(rzhao): Make infer indices support batch_size > 1. hparams.infer_batch_size = 1 - vocab_prefix = "nmt/testdata/test_infer_vocab" - hparams.src_vocab_file = vocab_prefix + "." + hparams.src - hparams.tgt_vocab_file = vocab_prefix + "." 
+ hparams.tgt - + ckpt_path = self._createTestInferCheckpoint(hparams, + "attention_infer_with_indices") infer_file = "nmt/testdata/test_infer_file" - out_dir = os.path.join(tf.test.get_temp_dir(), - "attention_infer_with_indices") - hparams.out_dir = out_dir - os.makedirs(out_dir) - output_infer = os.path.join(out_dir, "output_infer") - ckpt = self._createTestInferCheckpoint(hparams, out_dir) - inference.inference(ckpt, infer_file, output_infer, hparams) + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) with open(output_infer) as f: self.assertEqual(2, len(list(f))) self.assertTrue(os.path.exists(output_infer+str(1)+".png")) diff --git a/nmt/model_helper.py b/nmt/model_helper.py index 02a5f0aeb..ef9e8c277 100644 --- a/nmt/model_helper.py +++ b/nmt/model_helper.py @@ -479,29 +479,29 @@ def gradient_clip(gradients, max_gradient_norm): return clipped_gradients, gradient_norm_summary, gradient_norm -def print_variables_in_ckpt(ckpt): +def print_variables_in_ckpt(ckpt_path): """Print a list of variables in a checkpoint together with their shapes.""" - utils.print_out("# Variables in ckpt %s" % ckpt) - reader = tf.train.NewCheckpointReader(ckpt) + utils.print_out("# Variables in ckpt %s" % ckpt_path) + reader = tf.train.NewCheckpointReader(ckpt_path) variable_map = reader.get_variable_to_shape_map() for key in sorted(variable_map.keys()): utils.print_out(" %s: %s" % (key, variable_map[key])) -def load_model(model, ckpt, session, name): +def load_model(model, ckpt_path, session, name): """Load model from a checkpoint.""" start_time = time.time() try: - model.saver.restore(session, ckpt) + model.saver.restore(session, ckpt_path) except tf.errors.NotFoundError as e: utils.print_out("Can't load checkpoint") - print_variables_in_ckpt(ckpt) + print_variables_in_ckpt(ckpt_path) utils.print_out("%s" % str(e)) session.run(tf.tables_initializer()) utils.print_out( " loaded %s model parameters from %s, time %.2fs" % - (name, ckpt, time.time() - start_time)) + (name, ckpt_path, time.time() - start_time)) return model From bdbba21abe2c8cdb468a542c752e5fb32e7340d0 Mon Sep 17 00:00:00 2001 From: Daniel De Freitas Adiwardana Date: Mon, 8 Jan 2018 20:49:04 -0800 Subject: [PATCH 09/38] NMT: Improving GPU availability debugging. op.device actually returns what the user requested not the actual device. This can be misleading as it can return "GPU0" even if no GPU is available. 
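To show what is actually usable, the change logs the devices TensorFlow itself
reports at startup; roughly (the real call added in nmt.py is in the diff below):

    import tensorflow as tf
    print("# Devices visible to TensorFlow: %s"
          % repr(tf.Session().list_devices()))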
For context see: https://github.com/tensorflow/tensorflow/issues/1344 PiperOrigin-RevId: 181261953 --- nmt/model.py | 1 + nmt/nmt.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/nmt/model.py b/nmt/model.py index 0b2affa85..2cc2ec27d 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -155,6 +155,7 @@ def __init__(self, # Print trainable variables utils.print_out("# Trainable variables") + utils.print_out("Format: , , <(soft) device placement>") for param in params: utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()), param.op.device)) diff --git a/nmt/nmt.py b/nmt/nmt.py index 7d92582d0..8f1a8acff 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -557,6 +557,10 @@ def run_main(flags, default_hparams, train_fn, inference_fn, target_session=""): num_workers = flags.num_workers utils.print_out("# Job id %d" % jobid) + # GPU device + utils.print_out( + "# Devices visible to TensorFlow: %s" % repr(tf.Session().list_devices())) + # Random random_seed = flags.random_seed if random_seed is not None and random_seed > 0: From e73f83e563bdf9e42b37f23fd190a3d82af04104 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Tue, 9 Jan 2018 11:17:37 -0800 Subject: [PATCH 10/38] Add add_info_summaries to automatically add summaries from info dict. Rename _get_best_results to get_best_results. Update avg_grad_norm computation to divide by the number of examples instead. PiperOrigin-RevId: 181346178 --- nmt/train.py | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/nmt/train.py b/nmt/train.py index 20c2375b0..75432d282 100644 --- a/nmt/train.py +++ b/nmt/train.py @@ -35,7 +35,8 @@ __all__ = [ "run_sample_decode", "run_internal_eval", "run_external_eval", "run_avg_external_eval", "run_full_eval", "init_stats", "update_stats", - "print_step_info", "process_stats", "train", "get_model_creator" + "print_step_info", "process_stats", "train", "get_model_creator", + "add_info_summaries", "get_best_results" ] @@ -198,8 +199,11 @@ def run_full_eval(model_dir, infer_model, infer_sess, eval_model, eval_sess, def init_stats(): """Initialize statistics that we want to accumulate.""" - return {"step_time": 0.0, "loss": 0.0, "predict_count": 0.0, - "total_count": 0.0, "grad_norm": 0.0} + return {"step_time": 0.0, "train_loss": 0.0, + "predict_count": 0.0, # word count on the target side + "word_count": 0.0, # word counts for both source and target + "sequence_count": 0.0, # number of training examples processed + "grad_norm": 0.0} def update_stats(stats, start_time, step_result): @@ -207,11 +211,13 @@ def update_stats(stats, start_time, step_result): _, output_tuple = step_result # Update statistics - stats["step_time"] += (time.time() - start_time) - stats["loss"] += (output_tuple.train_loss * output_tuple.batch_size) - stats["predict_count"] += output_tuple.predict_count - stats["total_count"] += float(output_tuple.word_count) + batch_size = output_tuple.batch_size + stats["step_time"] += time.time() - start_time + stats["train_loss"] += output_tuple.train_loss * batch_size stats["grad_norm"] += output_tuple.grad_norm + stats["predict_count"] += output_tuple.predict_count + stats["word_count"] += output_tuple.word_count + stats["sequence_count"] += batch_size return (output_tuple.global_step, output_tuple.learning_rate, output_tuple.train_summary) @@ -227,13 +233,25 @@ def print_step_info(prefix, global_step, info, result_summary, log_f): log_f) +def add_info_summaries(summary_writer, global_step, info): + """Add stuffs in info to 
summaries.""" + excluded_list = ["learning_rate"] + for key in info: + if key not in excluded_list: + utils.add_summary(summary_writer, global_step, key, info[key]) + + def process_stats(stats, info, global_step, steps_per_stats, log_f): """Update info and check for overflow.""" - # Update info + # Per-step info info["avg_step_time"] = stats["step_time"] / steps_per_stats info["avg_grad_norm"] = stats["grad_norm"] / steps_per_stats - info["train_ppl"] = utils.safe_exp(stats["loss"] / stats["predict_count"]) - info["speed"] = stats["total_count"] / (1000 * stats["step_time"]) + info["avg_sequence_count"] = stats["sequence_count"] / steps_per_stats + info["speed"] = stats["word_count"] / (1000 * stats["step_time"]) + + # Per-predict info + info["train_ppl"] = ( + utils.safe_exp(stats["train_loss"] / stats["predict_count"])) # Check for overflow is_overflow = False @@ -250,8 +268,10 @@ def before_train(loaded_train_model, train_model, train_sess, global_step, hparams, log_f): """Misc tasks to do before training.""" stats = init_stats() - info = {"train_ppl": 0.0, "speed": 0.0, "avg_step_time": 0.0, + info = {"train_ppl": 0.0, "speed": 0.0, + "avg_step_time": 0.0, "avg_grad_norm": 0.0, + "avg_sequence_count": 0.0, "learning_rate": loaded_train_model.learning_rate.eval( session=train_sess)} start_train_time = time.time() @@ -386,7 +406,7 @@ def train(hparams, scope=None, target_session=""): last_stats_step = global_step is_overflow = process_stats( stats, info, global_step, steps_per_stats, log_f) - print_step_info(" ", global_step, info, _get_best_results(hparams), + print_step_info(" ", global_step, info, get_best_results(hparams), log_f) if is_overflow: break @@ -397,8 +417,7 @@ def train(hparams, scope=None, target_session=""): if global_step - last_eval_step >= steps_per_eval: last_eval_step = global_step utils.print_out("# Save eval, global step %d" % global_step) - utils.add_summary(summary_writer, global_step, "train_ppl", - info["train_ppl"]) + add_info_summaries(summary_writer, global_step, info) # Save checkpoint loaded_train_model.saver.save( @@ -487,7 +506,7 @@ def _format_results(name, ppl, scores, metrics): return result_str -def _get_best_results(hparams): +def get_best_results(hparams): """Summary of the current best results.""" tokens = [] for metric in hparams.metrics: From ae33cf1d287ba3df0b438b1f8b61a104c8a1b674 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Tue, 9 Jan 2018 17:22:25 -0800 Subject: [PATCH 11/38] Internal change only PiperOrigin-RevId: 181399302 --- nmt/model.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/nmt/model.py b/nmt/model.py index 2cc2ec27d..a4b86992f 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -351,11 +351,11 @@ def build_graph(self, hparams, scope=None): with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype): # Encoder - encoder_outputs, encoder_state = self._build_encoder(hparams) + self.encoder_outputs, encoder_state = self._build_encoder(hparams) ## Decoder logits, sample_id, final_context_state = self._build_decoder( - encoder_outputs, encoder_state, hparams) + self.encoder_outputs, encoder_state, hparams) ## Loss if self.mode != tf.contrib.learn.ModeKeys.INFER: @@ -601,6 +601,16 @@ def decode(self, sess): sample_words = sample_words.transpose([2, 0, 1]) return sample_words, infer_summary + def compute_encoder_states(self, sess): + """Compute encoder states. 
Return tensor [batch, length, layer, size].""" + assert self.mode == tf.contrib.learn.ModeKeys.INFER + encoder_states = self.encoder_outputs + + # We only return the top layer for now, so set the third dim to 1. + if len(encoder_states.shape) == 3: + encoder_states = tf.expand_dims(encoder_states, 2) + return sess.run(encoder_states) + class Model(BaseModel): """Sequence-to-sequence dynamic model. From 4eba2d882b94da43c4f8d7bf4b039da28f8a766a Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Fri, 12 Jan 2018 10:57:45 -0800 Subject: [PATCH 12/38] Make compute_encoder_states() in model.py more general PiperOrigin-RevId: 181765024 --- nmt/model.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/nmt/model.py b/nmt/model.py index a4b86992f..b1c01f183 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -604,12 +604,7 @@ def decode(self, sess): def compute_encoder_states(self, sess): """Compute encoder states. Return tensor [batch, length, layer, size].""" assert self.mode == tf.contrib.learn.ModeKeys.INFER - encoder_states = self.encoder_outputs - - # We only return the top layer for now, so set the third dim to 1. - if len(encoder_states.shape) == 3: - encoder_states = tf.expand_dims(encoder_states, 2) - return sess.run(encoder_states) + return sess.run(tf.stack(self.encoder_state_list, 2)) class Model(BaseModel): @@ -675,6 +670,10 @@ def _build_encoder(self, hparams): encoder_state = tuple(encoder_state) else: raise ValueError("Unknown encoder_type %s" % hparams.encoder_type) + + # Use the top layer for now + self.encoder_state_list = [encoder_outputs] + return encoder_outputs, encoder_state def _build_bidirectional_rnn(self, inputs, sequence_length, From 9be88ea0f99d3bee0615fd65d0d1c7f88ae28fba Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Wed, 17 Jan 2018 22:24:54 -0800 Subject: [PATCH 13/38] Set self.encoder_state_list in build_encoder() for gnmt_model.py PiperOrigin-RevId: 182319861 --- nmt/gnmt_model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nmt/gnmt_model.py b/nmt/gnmt_model.py index 6f1219ec3..eec452f92 100644 --- a/nmt/gnmt_model.py +++ b/nmt/gnmt_model.py @@ -116,6 +116,9 @@ def _build_encoder(self, hparams): encoder_state = (bi_encoder_state[1],) + ( (encoder_state,) if num_uni_layers == 1 else encoder_state) + # Use the top layer for now + self.encoder_state_list = [encoder_outputs] + return encoder_outputs, encoder_state def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, From 438e29ad118baa02a7b64499c61995fab43d237b Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Thu, 18 Jan 2018 13:10:44 -0800 Subject: [PATCH 14/38] Add language_model flag to train a language model by ignoring the encoder. PiperOrigin-RevId: 182426171 --- nmt/model.py | 16 +++++++++++++--- nmt/nmt.py | 19 ++++++++++++++++++- nmt/utils/common_test_utils.py | 1 + nmt/utils/standard_hparams_utils.py | 3 +++ 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/nmt/model.py b/nmt/model.py index b1c01f183..363299f6f 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -179,6 +179,7 @@ def _set_params_initializer(self, self.tgt_vocab_size = hparams.tgt_vocab_size self.num_gpus = hparams.num_gpus self.time_major = hparams.time_major + self.dtype = tf.float32 # extra_args: to make it flexible for adding external customizable code self.single_cell_fn = None @@ -347,11 +348,14 @@ def build_graph(self, hparams, scope=None): bahdanau | normed_bahdanau). """ utils.print_out("# creating %s graph ..." 
% self.mode) - dtype = tf.float32 - with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype): + with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype): # Encoder - self.encoder_outputs, encoder_state = self._build_encoder(hparams) + if hparams.language_model: # no encoder for language modeling + self.encoder_outputs = None + encoder_state = None + else: + self.encoder_outputs, encoder_state = self._build_encoder(hparams) ## Decoder logits, sample_id, final_context_state = self._build_decoder( @@ -737,6 +741,12 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, base_gpu=base_gpu ) + if hparams.language_model: + encoder_state = cell.zero_state(self.batch_size, self.dtype) + elif not hparams.pass_hidden_state: + raise ValueError("For non-attentional model, " + "pass_hidden_state needs to be set to True") + # For beam search, we need to replicate encoder infos beam_width times if self.mode == tf.contrib.learn.ModeKeys.INFER and hparams.beam_width > 0: decoder_initial_state = tf.contrib.seq2seq.tile_batch( diff --git a/nmt/nmt.py b/nmt/nmt.py index 8f1a8acff..af9185797 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -240,6 +240,9 @@ def add_arguments(parser): Average the last N checkpoints for external evaluation. N can be controlled by setting --num_keep_ckpts.\ """)) + parser.add_argument("--language_model", type="bool", nargs="?", + const=True, default=False, + help="True to train a language model, ignoring encoder") # Inference parser.add_argument("--ckpt", type=str, default="", @@ -369,6 +372,7 @@ def create_hparams(flags): override_loaded_hparams=flags.override_loaded_hparams, num_keep_ckpts=flags.num_keep_ckpts, avg_ckpts=flags.avg_ckpts, + language_model=flags.language_model, num_intra_threads=flags.num_intra_threads, num_inter_threads=flags.num_inter_threads, ) @@ -429,6 +433,16 @@ def extend_hparams(hparams): _add_argument(hparams, "num_decoder_residual_layers", num_decoder_residual_layers) + # Language modeling + if hparams.language_model: + hparams.attention = "" + hparams.attention_architecture = "" + hparams.pass_hidden_state = False + hparams.share_vocab = True + hparams.src = hparams.tgt + utils.print_out("For language modeling, we turn off attention and " + "pass_hidden_state; turn on share_vocab; set src to tgt.") + ## Vocab # Get vocab file names first if hparams.vocab_prefix: @@ -464,10 +478,13 @@ def extend_hparams(hparams): _add_argument(hparams, "src_vocab_file", src_vocab_file) _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) - # Pretrained Embeddings: + # Pretrained Embeddings _add_argument(hparams, "src_embed_file", "") _add_argument(hparams, "tgt_embed_file", "") if hparams.embed_prefix: + hparams.num_embeddings_partitions = 1 + utils.print_out( + "For pretrained embeddings, set num_embeddings_partitions to 1") src_embed_file = hparams.embed_prefix + "." + hparams.src tgt_embed_file = hparams.embed_prefix + "." 
+ hparams.tgt diff --git a/nmt/utils/common_test_utils.py b/nmt/utils/common_test_utils.py index 528ac0691..28d4681db 100644 --- a/nmt/utils/common_test_utils.py +++ b/nmt/utils/common_test_utils.py @@ -73,6 +73,7 @@ def create_test_hparams(unit_type="lstm", # Misc standard_hparams.forget_bias = 0.0 standard_hparams.random_seed = 3 + language_model=False # Vocab standard_hparams.src_vocab_size = 5 diff --git a/nmt/utils/standard_hparams_utils.py b/nmt/utils/standard_hparams_utils.py index 15f294a5d..f93f707af 100644 --- a/nmt/utils/standard_hparams_utils.py +++ b/nmt/utils/standard_hparams_utils.py @@ -101,4 +101,7 @@ def create_standard_hparams(): infer_batch_size=32, sampling_temperature=0.0, num_translations_per_input=1, + + # Language model + language_model=False, ) From 3bb4930639b4380500dac7fcc236748e287d8884 Mon Sep 17 00:00:00 2001 From: Rui Zhao Date: Tue, 23 Jan 2018 11:05:04 -0800 Subject: [PATCH 15/38] Add infer_mode option to specify which type of decoder to use during inference. PiperOrigin-RevId: 182960914 --- nmt/attention_model.py | 33 ++++++++++++++++-------- nmt/gnmt_model.py | 20 +++++++-------- nmt/inference.py | 6 +++-- nmt/inference_test.py | 2 ++ nmt/model.py | 39 +++++++++++++++++------------ nmt/model_test.py | 3 +++ nmt/nmt.py | 15 +++++++++-- nmt/train.py | 5 ++-- nmt/utils/nmt_utils.py | 12 ++++++--- nmt/utils/standard_hparams_utils.py | 1 + 10 files changed, 88 insertions(+), 48 deletions(-) diff --git a/nmt/attention_model.py b/nmt/attention_model.py index 899b73637..38120c3cb 100644 --- a/nmt/attention_model.py +++ b/nmt/attention_model.py @@ -66,6 +66,19 @@ def __init__(self, if self.mode == tf.contrib.learn.ModeKeys.INFER: self.infer_summary = self._get_infer_summary(hparams) + + def _prepare_beam_search_decoder_inputs( + self, beam_width, memory, source_sequence_length, encoder_state): + memory = tf.contrib.seq2seq.tile_batch( + memory, multiplier=beam_width) + source_sequence_length = tf.contrib.seq2seq.tile_batch( + source_sequence_length, multiplier=beam_width) + encoder_state = tf.contrib.seq2seq.tile_batch( + encoder_state, multiplier=beam_width) + batch_size = self.batch_size * beam_width + return memory, source_sequence_length, encoder_state, batch_size + + def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, source_sequence_length): """Build a RNN cell with attention mechanism that can be used by decoder.""" @@ -80,7 +93,7 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, num_units = hparams.num_units num_layers = self.num_decoder_layers num_residual_layers = self.num_decoder_residual_layers - beam_width = hparams.beam_width + infer_mode = hparams.infer_mode dtype = tf.float32 @@ -90,14 +103,12 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, else: memory = encoder_outputs - if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0: - memory = tf.contrib.seq2seq.tile_batch( - memory, multiplier=beam_width) - source_sequence_length = tf.contrib.seq2seq.tile_batch( - source_sequence_length, multiplier=beam_width) - encoder_state = tf.contrib.seq2seq.tile_batch( - encoder_state, multiplier=beam_width) - batch_size = self.batch_size * beam_width + if (self.mode == tf.contrib.learn.ModeKeys.INFER and + infer_mode == "beam_search"): + memory, source_sequence_length, encoder_state, batch_size = ( + self._prepare_beam_search_decoder_inputs( + hparams.beam_width, memory, source_sequence_length, + encoder_state)) else: batch_size = self.batch_size @@ -118,7 +129,7 @@ def 
_build_decoder_cell(self, hparams, encoder_outputs, encoder_state, # Only generate alignment in greedy INFER mode. alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and - beam_width == 0) + infer_mode != "beam_search") cell = tf.contrib.seq2seq.AttentionWrapper( cell, attention_mechanism, @@ -141,7 +152,7 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, return cell, decoder_initial_state def _get_infer_summary(self, hparams): - if not self.has_attention or hparams.beam_width > 0: + if not self.has_attention or hparams.infer_mode == "beam_search": return tf.no_op() return _create_attention_images_summary(self.final_context_state) diff --git a/nmt/gnmt_model.py b/nmt/gnmt_model.py index eec452f92..1d3f6e68c 100644 --- a/nmt/gnmt_model.py +++ b/nmt/gnmt_model.py @@ -133,7 +133,7 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, attention_option = hparams.attention attention_architecture = hparams.attention_architecture num_units = hparams.num_units - beam_width = hparams.beam_width + infer_mode = hparams.infer_mode dtype = tf.float32 @@ -142,14 +142,12 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, else: memory = encoder_outputs - if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0: - memory = tf.contrib.seq2seq.tile_batch( - memory, multiplier=beam_width) - source_sequence_length = tf.contrib.seq2seq.tile_batch( - source_sequence_length, multiplier=beam_width) - encoder_state = tf.contrib.seq2seq.tile_batch( - encoder_state, multiplier=beam_width) - batch_size = self.batch_size * beam_width + if (self.mode == tf.contrib.learn.ModeKeys.INFER and + infer_mode == "beam_search"): + memory, source_sequence_length, encoder_state, batch_size = ( + self._prepare_beam_search_decoder_inputs( + hparams.beam_width, memory, source_sequence_length, + encoder_state)) else: batch_size = self.batch_size @@ -174,7 +172,7 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, # Only generate alignment in greedy INFER mode. alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and - beam_width == 0) + infer_mode != "beam_search") attention_cell = tf.contrib.seq2seq.AttentionWrapper( attention_cell, attention_mechanism, @@ -210,7 +208,7 @@ def _get_infer_summary(self, hparams): return super(GNMTModel, self)._get_infer_summary(hparams) # GNMT attention - if hparams.beam_width > 0: + if hparams.infer_mode == "beam_search": return tf.no_op() return attention_model._create_attention_images_summary( self.final_context_state[0]) diff --git a/nmt/inference.py b/nmt/inference.py index a5ad45d89..b0759a205 100644 --- a/nmt/inference.py +++ b/nmt/inference.py @@ -170,7 +170,8 @@ def single_worker_inference(infer_model, subword_option=hparams.subword_option, beam_width=hparams.beam_width, tgt_eos=hparams.eos, - num_translations_per_input=hparams.num_translations_per_input) + num_translations_per_input=hparams.num_translations_per_input, + infer_mode=hparams.infer_mode) def multi_worker_inference(infer_model, @@ -218,7 +219,8 @@ def multi_worker_inference(infer_model, subword_option=hparams.subword_option, beam_width=hparams.beam_width, tgt_eos=hparams.eos, - num_translations_per_input=hparams.num_translations_per_input) + num_translations_per_input=hparams.num_translations_per_input, + infer_mode=hparams.infer_mode) # Change file name to indicate the file writing is completed. 
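  # (Presumably each worker writes its own output shard; renaming it to the
  # "_done" variant acts as a completion marker that the coordinating job can
  # poll for before merging the per-worker translations.)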
tf.gfile.Rename(output_infer, output_infer_done, overwrite=True) diff --git a/nmt/inference_test.py b/nmt/inference_test.py index 046048cec..317024b81 100644 --- a/nmt/inference_test.py +++ b/nmt/inference_test.py @@ -79,6 +79,8 @@ def testBasicModelWithMultipleTranslations(self): num_translations_per_input=2, beam_width=2, ) + hparams.infer_mode = "beam_search" + ckpt_path = self._createTestInferCheckpoint(hparams, "multi_basic_infer") infer_file = "nmt/testdata/test_infer_file" output_infer = os.path.join(hparams.out_dir, "output_infer") diff --git a/nmt/model.py b/nmt/model.py index 363299f6f..734d9967a 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -481,11 +481,13 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): ## Inference else: - beam_width = hparams.beam_width - length_penalty_weight = hparams.length_penalty_weight + infer_mode = hparams.infer_mode start_tokens = tf.fill([self.batch_size], tgt_sos_id) end_token = tgt_eos_id - if beam_width > 0: + if infer_mode == "beam_search": + beam_width = hparams.beam_width + length_penalty_weight = hparams.length_penalty_weight + my_decoder = tf.contrib.seq2seq.BeamSearchDecoder( cell=cell, embedding=self.embedding_decoder, @@ -495,19 +497,23 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): beam_width=beam_width, output_layer=self.output_layer, length_penalty_weight=length_penalty_weight) - else: + elif infer_mode == "sample": # Helper sampling_temperature = hparams.sampling_temperature - if sampling_temperature > 0.0: - helper = tf.contrib.seq2seq.SampleEmbeddingHelper( - self.embedding_decoder, start_tokens, end_token, - softmax_temperature=sampling_temperature, - seed=hparams.random_seed) - else: - helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( - self.embedding_decoder, start_tokens, end_token) - - # Decoder + assert sampling_temperature > 0.0, ( + "sampling_temperature must greater than 0.0 when using sample" + " decoder.") + helper = tf.contrib.seq2seq.SampleEmbeddingHelper( + self.embedding_decoder, start_tokens, end_token, + softmax_temperature=sampling_temperature, + seed=hparams.random_seed) + elif infer_mode == "greedy": + helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( + self.embedding_decoder, start_tokens, end_token) + else: + raise ValueError("Unknown infer_mode '%s'", infer_mode) + + if infer_mode != "beam_search": my_decoder = tf.contrib.seq2seq.BasicDecoder( cell, helper, @@ -523,7 +529,7 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): swap_memory=True, scope=decoder_scope) - if beam_width > 0: + if infer_mode == "beam_search": logits = tf.no_op() sample_id = outputs.predicted_ids else: @@ -748,7 +754,8 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, "pass_hidden_state needs to be set to True") # For beam search, we need to replicate encoder infos beam_width times - if self.mode == tf.contrib.learn.ModeKeys.INFER and hparams.beam_width > 0: + if (self.mode == tf.contrib.learn.ModeKeys.INFER and + hparams.infer_mode == "beam_search"): decoder_initial_state = tf.contrib.seq2seq.tile_batch( encoder_state, multiplier=hparams.beam_width) else: diff --git a/nmt/model_test.py b/nmt/model_test.py index 35548e329..41f7b64ab 100644 --- a/nmt/model_test.py +++ b/nmt/model_test.py @@ -943,6 +943,7 @@ def testBeamSearchBasicModel(self): attention_architecture='', use_residual=False,) hparams.beam_width = 3 + hparams.infer_mode = "beam_search" hparams.tgt_max_len_infer = 4 assert_top_k_sentence = 3 @@ -960,6 +961,7 @@ def 
testBeamSearchAttentionModel(self): num_layers=2, use_residual=False,) hparams.beam_width = 3 + hparams.infer_mode = "beam_search" hparams.tgt_max_len_infer = 4 assert_top_k_sentence = 2 @@ -976,6 +978,7 @@ def testBeamSearchGNMTModel(self): attention='scaled_luong', attention_architecture='gnmt') hparams.beam_width = 3 + hparams.infer_mode = "beam_search" hparams.tgt_max_len_infer = 4 assert_top_k_sentence = 1 diff --git a/nmt/nmt.py b/nmt/nmt.py index af9185797..27ba77ed7 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -262,6 +262,9 @@ def add_arguments(parser): """)) # Advanced inference arguments + parser.add_argument("--infer_mode", type=str, default="greedy", + choices=["greedy", "sample", "beam_search"], + help="Which type of decoder to use during inference.") parser.add_argument("--beam_width", type=int, default=0, help=("""\ beam width when using beam search decoder. If 0 (default), use standard @@ -348,6 +351,7 @@ def create_hparams(flags): infer_batch_size=flags.infer_batch_size, # Advanced inference arguments + infer_mode=flags.infer_mode, beam_width=flags.beam_width, length_penalty_weight=flags.length_penalty_weight, sampling_temperature=flags.sampling_temperature, @@ -400,6 +404,12 @@ def extend_hparams(hparams): hparams.num_encoder_layers) if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]: raise ValueError("subword option must be either spm, or bpe") + if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0: + raise ValueError("beam_width must greater than 0 when using beam_search" + "decoder.") + if hparams.infer_mode == "sample" and hparams.sampling_temperature <= 0.0: + raise ValueError("sampling_temperature must greater than 0.0 when using" + "sample decoder.") # Different number of encoder / decoder layers assert hparams.num_encoder_layers and hparams.num_decoder_layers @@ -533,8 +543,9 @@ def ensure_compatible_hparams(hparams, default_hparams, hparams_path): overwritten_keys = default_config.keys() else: # For inference - overwritten_keys = ["infer_batch_size", "beam_width", "length_penalty_weight", - "sampling_temperature", "num_translations_per_input"] + overwritten_keys = ["infer_batch_size", "beam_width", + "length_penalty_weight", "sampling_temperature", + "num_translations_per_input", "infer_mode"] for key in overwritten_keys: if getattr(hparams, key) != default_config[key]: utils.print_out("# Updating hparams.%s: %s -> %s" % diff --git a/nmt/train.py b/nmt/train.py index 75432d282..bad2c7a97 100644 --- a/nmt/train.py +++ b/nmt/train.py @@ -538,7 +538,7 @@ def _sample_decode(model, global_step, sess, hparams, iterator, src_data, nmt_outputs, attention_summary = model.decode(sess) - if hparams.beam_width > 0: + if hparams.infer_mode == "beam_search": # get the top translation. 
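    # (model.decode() transposes beam-search output to
    # [beam_width, batch_size, time], so beam index 0 below is the best
    # hypothesis.)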
nmt_outputs = nmt_outputs[0] @@ -582,7 +582,8 @@ def _external_eval(model, global_step, sess, hparams, iterator, subword_option=hparams.subword_option, beam_width=hparams.beam_width, tgt_eos=hparams.eos, - decode=decode) + decode=decode, + infer_mode=hparams.infer_mode) # Save on best metrics if decode: for metric in hparams.metrics: diff --git a/nmt/utils/nmt_utils.py b/nmt/utils/nmt_utils.py index 72f71b5c2..524b293f9 100644 --- a/nmt/utils/nmt_utils.py +++ b/nmt/utils/nmt_utils.py @@ -37,7 +37,8 @@ def decode_and_evaluate(name, beam_width, tgt_eos, num_translations_per_input=1, - decode=True): + decode=True, + infer_mode="greedy"): """Decode a test set and compute a score according to the evaluation task.""" # Decode if decode: @@ -49,12 +50,15 @@ def decode_and_evaluate(name, tf.gfile.GFile(trans_file, mode="wb")) as trans_f: trans_f.write("") # Write empty string to ensure file is created. - num_translations_per_input = max( - min(num_translations_per_input, beam_width), 1) + if infer_mode == "greedy": + num_translations_per_input = 1 + elif infer_mode == "beam_search": + num_translations_per_input = min(num_translations_per_input, beam_width) + while True: try: nmt_outputs, _ = model.decode(sess) - if beam_width == 0: + if infer_mode != "beam_search" : nmt_outputs = np.expand_dims(nmt_outputs, 0) batch_size = nmt_outputs.shape[1] diff --git a/nmt/utils/standard_hparams_utils.py b/nmt/utils/standard_hparams_utils.py index f93f707af..7a675e511 100644 --- a/nmt/utils/standard_hparams_utils.py +++ b/nmt/utils/standard_hparams_utils.py @@ -101,6 +101,7 @@ def create_standard_hparams(): infer_batch_size=32, sampling_temperature=0.0, num_translations_per_input=1, + infer_mode="greedy", # Language model language_model=False, From eeed098fce18824ebd4cf3c91384cfc8181d8c34 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Wed, 24 Jan 2018 11:42:43 -0800 Subject: [PATCH 16/38] Replace compute_encoder_states with build_encoder_states (no sess.run). Add an option include_embeddings to allow for appending embedding layer in front of encoder state list. Properly handle the case when time_major=True. PiperOrigin-RevId: 183117301 --- nmt/gnmt_model.py | 6 +++--- nmt/model.py | 22 ++++++++++++++++------ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/nmt/gnmt_model.py b/nmt/gnmt_model.py index 1d3f6e68c..c2a479006 100644 --- a/nmt/gnmt_model.py +++ b/nmt/gnmt_model.py @@ -77,12 +77,12 @@ def _build_encoder(self, hparams): # Look up embedding, emp_inp: [max_time, batch_size, num_units] # when time_major = True - encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder, - source) + self.encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder, + source) # Execute _build_bidirectional_rnn from Model class bi_encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn( - inputs=encoder_emb_inp, + inputs=self.encoder_emb_inp, sequence_length=iterator.source_sequence_length, dtype=dtype, hparams=hparams, diff --git a/nmt/model.py b/nmt/model.py index 734d9967a..f01fc2540 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -611,10 +611,20 @@ def decode(self, sess): sample_words = sample_words.transpose([2, 0, 1]) return sample_words, infer_summary - def compute_encoder_states(self, sess): - """Compute encoder states. 
Return tensor [batch, length, layer, size].""" + def build_encoder_states(self, include_embeddings=False): + """Stack encoder states and return tensor [batch, length, layer, size].""" assert self.mode == tf.contrib.learn.ModeKeys.INFER - return sess.run(tf.stack(self.encoder_state_list, 2)) + if include_embeddings: + stack_state_list = tf.stack( + [self.encoder_emb_inp] + self.encoder_state_list, 2) + else: + stack_state_list = tf.stack(self.encoder_state_list, 2) + + # transform from [length, batch, ...] -> [batch, length, ...] + if self.time_major: + stack_state_list = tf.transpose(stack_state_list, [1, 0, 2, 3]) + + return stack_state_list class Model(BaseModel): @@ -637,7 +647,7 @@ def _build_encoder(self, hparams): with tf.variable_scope("encoder") as scope: dtype = scope.dtype # Look up embedding, emp_inp: [max_time, batch_size, num_units] - encoder_emb_inp = tf.nn.embedding_lookup( + self.encoder_emb_inp = tf.nn.embedding_lookup( self.embedding_encoder, source) # Encoder_outputs: [max_time, batch_size, num_units] @@ -649,7 +659,7 @@ def _build_encoder(self, hparams): encoder_outputs, encoder_state = tf.nn.dynamic_rnn( cell, - encoder_emb_inp, + self.encoder_emb_inp, dtype=dtype, sequence_length=iterator.source_sequence_length, time_major=self.time_major, @@ -662,7 +672,7 @@ def _build_encoder(self, hparams): encoder_outputs, bi_encoder_state = ( self._build_bidirectional_rnn( - inputs=encoder_emb_inp, + inputs=self.encoder_emb_inp, sequence_length=iterator.source_sequence_length, dtype=dtype, hparams=hparams, From 17c42723e7a50763621d931a31601be3853355ab Mon Sep 17 00:00:00 2001 From: Rui Zhao Date: Wed, 24 Jan 2018 20:33:15 -0800 Subject: [PATCH 17/38] Add sampled_softmax_loss and minor cleanup. Useful when vocab size is very large. PiperOrigin-RevId: 183184262 --- nmt/model.py | 88 ++++++++++++++++++++++------- nmt/model_test.py | 14 +++++ nmt/nmt.py | 4 ++ nmt/utils/standard_hparams_utils.py | 1 + 4 files changed, 88 insertions(+), 19 deletions(-) diff --git a/nmt/model.py b/nmt/model.py index f01fc2540..7b4070971 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -22,8 +22,6 @@ import collections import tensorflow as tf -from tensorflow.python.layers import core as layers_core - from . 
import model_helper from .utils import iterator_utils from .utils import misc_utils as utils @@ -89,8 +87,8 @@ def __init__(self, # Projection with tf.variable_scope(scope or "build_network"): with tf.variable_scope("decoder/output_projection"): - self.output_layer = layers_core.Dense( - hparams.tgt_vocab_size, use_bias=False, name="output_projection") + self.output_layer = tf.layers.Dense( + self.tgt_vocab_size, use_bias=False, name="output_projection") ## Train graph res = self.build_graph(hparams, scope=scope) @@ -180,12 +178,16 @@ def _set_params_initializer(self, self.num_gpus = hparams.num_gpus self.time_major = hparams.time_major self.dtype = tf.float32 + self.num_sampled_softmax = hparams.num_sampled_softmax # extra_args: to make it flexible for adding external customizable code self.single_cell_fn = None if extra_args: self.single_cell_fn = extra_args.single_cell_fn + # Set num units + self.num_units = hparams.num_units + # Set num layers self.num_encoder_layers = hparams.num_encoder_layers self.num_decoder_layers = hparams.num_decoder_layers @@ -207,8 +209,9 @@ def _set_params_initializer(self, self.global_step = tf.Variable(0, trainable=False) # Initializer + self.random_seed = hparams.random_seed initializer = model_helper.get_initializer( - hparams.init_op, hparams.random_seed, hparams.init_weight) + hparams.init_op, self.random_seed, hparams.init_weight) tf.get_variable_scope().set_initializer(initializer) # Embeddings @@ -288,8 +291,8 @@ def init_embeddings(self, hparams, scope): share_vocab=hparams.share_vocab, src_vocab_size=self.src_vocab_size, tgt_vocab_size=self.tgt_vocab_size, - src_embed_size=hparams.num_units, - tgt_embed_size=hparams.num_units, + src_embed_size=self.num_units, + tgt_embed_size=self.num_units, num_partitions=hparams.num_embeddings_partitions, src_vocab_file=hparams.src_vocab_file, tgt_vocab_file=hparams.tgt_vocab_file, @@ -358,14 +361,14 @@ def build_graph(self, hparams, scope=None): self.encoder_outputs, encoder_state = self._build_encoder(hparams) ## Decoder - logits, sample_id, final_context_state = self._build_decoder( - self.encoder_outputs, encoder_state, hparams) + logits, decoder_cell_outputs, sample_id, final_context_state = ( + self._build_decoder(self.encoder_outputs, encoder_state, hparams)) ## Loss if self.mode != tf.contrib.learn.ModeKeys.INFER: with tf.device(model_helper.get_device_str(self.num_encoder_layers - 1, self.num_gpus)): - loss = self._compute_loss(logits) + loss = self._compute_loss(logits, decoder_cell_outputs) else: loss = tf.constant(0.0) @@ -391,7 +394,7 @@ def _build_encoder_cell(self, hparams, num_layers, num_residual_layers, return model_helper.create_rnn_cell( unit_type=hparams.unit_type, - num_units=hparams.num_units, + num_units=self.num_units, num_layers=num_layers, num_residual_layers=num_residual_layers, forget_bias=hparams.forget_bias, @@ -442,6 +445,11 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): hparams, encoder_outputs, encoder_state, iterator.source_sequence_length) + # Optional ops depends on which mode we are in and which loss function we + # are using. + logits = tf.no_op() + decoder_cell_outputs = None + ## Train or eval if self.mode != tf.contrib.learn.ModeKeys.INFER: # decoder_emp_inp: [max_time, batch_size, num_units] @@ -471,6 +479,10 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): sample_id = outputs.sample_id + if self.num_sampled_softmax > 0: + # Note: this is required when using sampled_softmax_loss. 
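+          # (The pre-projection cell outputs become the `inputs` argument of
+          # tf.nn.sampled_softmax_loss in _softmax_cross_entropy_loss, which
+          # samples a subset of the vocabulary instead of projecting onto all
+          # tgt_vocab_size logits.)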
+ decoder_cell_outputs = outputs.rnn_output + # Note: there's a subtle difference here between train and inference. # We could have set output_layer when create my_decoder # and shared more code between train and inference. @@ -478,6 +490,8 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): # 10% improvements for small models & 20% for larger ones. # If memory is a concern, we should apply output_layer per timestep. logits = self.output_layer(outputs.rnn_output) + if self.num_sampled_softmax > 0: + logits = tf.no_op() # unused when using sampled softmax loss. ## Inference else: @@ -506,7 +520,7 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): helper = tf.contrib.seq2seq.SampleEmbeddingHelper( self.embedding_decoder, start_tokens, end_token, softmax_temperature=sampling_temperature, - seed=hparams.random_seed) + seed=self.random_seed) elif infer_mode == "greedy": helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( self.embedding_decoder, start_tokens, end_token) @@ -530,13 +544,12 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): scope=decoder_scope) if infer_mode == "beam_search": - logits = tf.no_op() sample_id = outputs.predicted_ids else: logits = outputs.rnn_output sample_id = outputs.sample_id - return logits, sample_id, final_context_state + return logits, decoder_cell_outputs, sample_id, final_context_state def get_max_time(self, tensor): time_axis = 0 if self.time_major else 1 @@ -559,16 +572,53 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, """ pass - def _compute_loss(self, logits): + + def _softmax_cross_entropy_loss( + self, logits, decoder_cell_outputs, labels): + """Compute softmax loss or sampled softmax loss.""" + if self.num_sampled_softmax > 0: + + is_sequence = (decoder_cell_outputs.shape.ndims == 3) + + if is_sequence: + labels = tf.reshape(labels, [-1, 1]) + inputs = tf.reshape(decoder_cell_outputs, [-1, self.num_units]) + + crossent = tf.nn.sampled_softmax_loss( + weights=tf.transpose(self.output_layer.kernel), + biases=self.output_layer.bias or tf.zeros([self.tgt_vocab_size]), + labels=labels, + inputs=inputs, + num_sampled=self.num_sampled_softmax, + num_classes=self.tgt_vocab_size, + partition_strategy='div', + seed=self.random_seed) + + if is_sequence: + if self.time_major: + crossent = tf.reshape(crossent, [-1, self.batch_size]) + else: + crossent = tf.reshape(crossent, [self.batch_size, -1]) + + else: + crossent = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=labels, logits=logits) + + return crossent + + + def _compute_loss(self, logits, decoder_cell_outputs): """Compute optimization loss.""" target_output = self.iterator.target_output if self.time_major: target_output = tf.transpose(target_output) max_time = self.get_max_time(target_output) - crossent = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=target_output, logits=logits) + + crossent = self._softmax_cross_entropy_loss( + logits, decoder_cell_outputs, target_output) + target_weights = tf.sequence_mask( - self.iterator.target_sequence_length, max_time, dtype=logits.dtype) + self.iterator.target_sequence_length, max_time, dtype=self.dtype) if self.time_major: target_weights = tf.transpose(target_weights) @@ -746,7 +796,7 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, cell = model_helper.create_rnn_cell( unit_type=hparams.unit_type, - num_units=hparams.num_units, + num_units=self.num_units, num_layers=self.num_decoder_layers, 
num_residual_layers=self.num_decoder_residual_layers, forget_bias=hparams.forget_bias, diff --git a/nmt/model_test.py b/nmt/model_test.py index 41f7b64ab..168895844 100644 --- a/nmt/model_test.py +++ b/nmt/model_test.py @@ -146,6 +146,7 @@ def setUpClass(cls): 'UniEncoderStandardAttentionArchitecture/loss': 8.8519087, 'InitializerGlorotNormal/loss': 8.9779415, 'InitializerGlorotUniform/loss': 8.7643699, + 'SampledSoftmaxLoss/loss': 5.83928, } cls.actual_eval_values = {} @@ -1016,5 +1017,18 @@ def testInitializerGlorotUniform(self): self._assertTrainStepsLoss(train_m, sess, 'InitializerGlorotUniform') + def testSampledSoftmaxLoss(self): + hparams = common_test_utils.create_test_hparams( + encoder_type='gnmt', + num_layers=4, + attention='scaled_luong', + attention_architecture='gnmt') + hparams.num_sampled_softmax = 3 + + with self.test_session() as sess: + train_m = self._createTestTrainModel(gnmt_model.GNMTModel, hparams, sess) + self._assertTrainStepsLoss(train_m, sess, + 'SampledSoftmaxLoss') + if __name__ == '__main__': tf.test.main() diff --git a/nmt/nmt.py b/nmt/nmt.py index 27ba77ed7..193b51b56 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -202,6 +202,9 @@ def add_arguments(parser): help="Limit on the size of training data (0: no limit).") parser.add_argument("--num_buckets", type=int, default=5, help="Put data into similar-length buckets.") + parser.add_argument("--num_sampled_softmax", type=int, default=0, + help=("Use sampled_softmax_loss if > 0." + "Otherwise, use full softmax loss.")) # SPM parser.add_argument("--subword_option", type=str, default="", @@ -338,6 +341,7 @@ def create_hparams(flags): warmup_scheme=flags.warmup_scheme, decay_scheme=flags.decay_scheme, colocate_gradients_with_ops=flags.colocate_gradients_with_ops, + num_sampled_softmax=flags.num_sampled_softmax, # Data constraints num_buckets=flags.num_buckets, diff --git a/nmt/utils/standard_hparams_utils.py b/nmt/utils/standard_hparams_utils.py index 7a675e511..c4c2e3329 100644 --- a/nmt/utils/standard_hparams_utils.py +++ b/nmt/utils/standard_hparams_utils.py @@ -64,6 +64,7 @@ def create_standard_hparams(): decay_scheme="luong234", colocate_gradients_with_ops=True, num_train_steps=12000, + num_sampled_softmax=0, # Data constraints num_buckets=5, From 4781773b1bf5def883510541c69538cee10dd549 Mon Sep 17 00:00:00 2001 From: Rui Zhao Date: Mon, 29 Jan 2018 12:20:06 -0800 Subject: [PATCH 18/38] Update standard hparams as num_layers is no longer a valid hparam. 
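Custom hparams JSON files that still set "num_layers" need the same split. As a
minimal sketch (illustrative only, not part of this change), a legacy config
could be migrated with something like:

    import json

    def migrate_hparams(path):
      """Split a legacy "num_layers" entry into encoder/decoder depths."""
      with open(path) as f:
        hparams = json.load(f)
      if "num_layers" in hparams:
        depth = hparams.pop("num_layers")
        # The old field set both depths at once, so reuse it for both.
        hparams.setdefault("num_encoder_layers", depth)
        hparams.setdefault("num_decoder_layers", depth)
      with open(path, "w") as f:
        json.dump(hparams, f, indent=2, sort_keys=True)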
PiperOrigin-RevId: 183706548 --- nmt/standard_hparams/iwslt15.json | 3 ++- nmt/standard_hparams/wmt16.json | 3 ++- nmt/standard_hparams/wmt16_gnmt_4_layer.json | 3 ++- nmt/standard_hparams/wmt16_gnmt_8_layer.json | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/nmt/standard_hparams/iwslt15.json b/nmt/standard_hparams/iwslt15.json index ff5f46ec6..2b658eca1 100644 --- a/nmt/standard_hparams/iwslt15.json +++ b/nmt/standard_hparams/iwslt15.json @@ -13,7 +13,8 @@ "max_gradient_norm": 5.0, "metrics": ["bleu"], "num_buckets": 5, - "num_layers": 2, + "num_encoder_layers": 2, + "num_decoder_layers": 2, "num_train_steps": 12000, "decay_scheme": "luong234", "num_units": 512, diff --git a/nmt/standard_hparams/wmt16.json b/nmt/standard_hparams/wmt16.json index 8c1cb3fb0..ba57dc5ef 100644 --- a/nmt/standard_hparams/wmt16.json +++ b/nmt/standard_hparams/wmt16.json @@ -13,7 +13,8 @@ "max_gradient_norm": 5.0, "metrics": ["bleu"], "num_buckets": 5, - "num_layers": 4, + "num_encoder_layers": 4, + "num_decoder_layers": 4, "num_train_steps": 340000, "decay_scheme": "luong10", "num_units": 1024, diff --git a/nmt/standard_hparams/wmt16_gnmt_4_layer.json b/nmt/standard_hparams/wmt16_gnmt_4_layer.json index 0031a54e9..1274f3db0 100644 --- a/nmt/standard_hparams/wmt16_gnmt_4_layer.json +++ b/nmt/standard_hparams/wmt16_gnmt_4_layer.json @@ -13,7 +13,8 @@ "max_gradient_norm": 5.0, "metrics": ["bleu"], "num_buckets": 5, - "num_layers": 4, + "num_encoder_layers": 4, + "num_decoder_layers": 4, "num_train_steps": 340000, "decay_scheme": "luong10", "num_units": 1024, diff --git a/nmt/standard_hparams/wmt16_gnmt_8_layer.json b/nmt/standard_hparams/wmt16_gnmt_8_layer.json index 438ddcf55..0d668e0dd 100644 --- a/nmt/standard_hparams/wmt16_gnmt_8_layer.json +++ b/nmt/standard_hparams/wmt16_gnmt_8_layer.json @@ -13,7 +13,8 @@ "max_gradient_norm": 5.0, "metrics": ["bleu"], "num_buckets": 5, - "num_layers": 8, + "num_encoder_layers": 8, + "num_decoder_layers": 8, "num_train_steps": 340000, "decay_scheme": "luong10", "num_units": 1024, From 005fef0983f1046cde4bb1f41fa06a78b0db25c3 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Mon, 29 Jan 2018 21:06:48 -0800 Subject: [PATCH 19/38] Add copyright text for model_helper.py PiperOrigin-RevId: 183778701 --- nmt/model_helper.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/nmt/model_helper.py b/nmt/model_helper.py index ef9e8c277..8e50b8358 100644 --- a/nmt/model_helper.py +++ b/nmt/model_helper.py @@ -1,3 +1,18 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + """Utility functions for building models.""" from __future__ import print_function From 0642c53e314bb90613e2c389f88aae31465695c2 Mon Sep 17 00:00:00 2001 From: Daniel De Freitas Adiwardana Date: Mon, 29 Jan 2018 21:41:15 -0800 Subject: [PATCH 20/38] Refactoring internal and external eval to allow injection of placeholders tensors. 
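A rough caller-side sketch of the new hooks (the placeholder and its value are
invented for illustration; only the keyword arguments are taken from the
refactored run_internal_eval below):

    # Extra feeds are merged with the dev/test file placeholders inside
    # run_internal_eval, so callers can inject additional iterator inputs.
    extra_feed = {context_placeholder: context_values}  # hypothetical names
    dev_ppl, test_ppl = train.run_internal_eval(
        eval_model, eval_sess, model_dir, hparams, summary_writer,
        dev_eval_iterator_feed_dict=dict(extra_feed),
        test_eval_iterator_feed_dict=dict(extra_feed))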
PiperOrigin-RevId: 183781004 --- nmt/train.py | 208 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 176 insertions(+), 32 deletions(-) diff --git a/nmt/train.py b/nmt/train.py index bad2c7a97..1f061486b 100644 --- a/nmt/train.py +++ b/nmt/train.py @@ -53,20 +53,49 @@ def run_sample_decode(infer_model, infer_sess, model_dir, hparams, infer_model.batch_size_placeholder, summary_writer) -def run_internal_eval( - eval_model, eval_sess, model_dir, hparams, summary_writer, - use_test_set=True): - """Compute internal evaluation (perplexity) for both dev / test.""" +def run_internal_eval(eval_model, + eval_sess, + model_dir, + hparams, + summary_writer, + use_test_set=True, + dev_eval_iterator_feed_dict=None, + test_eval_iterator_feed_dict=None): + """Compute internal evaluation (perplexity) for both dev / test. + + Computes development and testing perplexities for given model. + + Args: + eval_model: Evaluation model for which to compute perplexities. + eval_sess: Evaluation TensorFlow session. + model_dir: Directory from which to load evaluation model from. + hparams: Model hyper-parameters. + summary_writer: Summary writer for logging metrics to TensorBoard. + use_test_set: Computes testing perplexity if true; does not otherwise. + Note that the development perplexity is always computed regardless of + value of this parameter. + dev_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + development evaluation. + test_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + testing evaluation. + Returns: + Pair containing development perplexity and testing perplexity, in this + order. + """ + if dev_eval_iterator_feed_dict is None: + dev_eval_iterator_feed_dict = {} + if test_eval_iterator_feed_dict is None: + test_eval_iterator_feed_dict = {} with eval_model.graph.as_default(): loaded_eval_model, global_step = model_helper.create_or_load_model( eval_model.model, model_dir, eval_sess, "eval") dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) - dev_eval_iterator_feed_dict = { - eval_model.src_file_placeholder: dev_src_file, - eval_model.tgt_file_placeholder: dev_tgt_file - } + dev_eval_iterator_feed_dict[eval_model.src_file_placeholder] = dev_src_file + dev_eval_iterator_feed_dict[eval_model.tgt_file_placeholder] = dev_tgt_file dev_ppl = _internal_eval(loaded_eval_model, global_step, eval_sess, eval_model.iterator, dev_eval_iterator_feed_dict, @@ -75,30 +104,64 @@ def run_internal_eval( if use_test_set and hparams.test_prefix: test_src_file = "%s.%s" % (hparams.test_prefix, hparams.src) test_tgt_file = "%s.%s" % (hparams.test_prefix, hparams.tgt) - test_eval_iterator_feed_dict = { - eval_model.src_file_placeholder: test_src_file, - eval_model.tgt_file_placeholder: test_tgt_file - } + test_eval_iterator_feed_dict[ + eval_model.src_file_placeholder] = test_src_file + test_eval_iterator_feed_dict[ + eval_model.tgt_file_placeholder] = test_tgt_file test_ppl = _internal_eval(loaded_eval_model, global_step, eval_sess, eval_model.iterator, test_eval_iterator_feed_dict, summary_writer, "test") return dev_ppl, test_ppl -def run_external_eval(infer_model, infer_sess, model_dir, hparams, - summary_writer, save_best_dev=True, use_test_set=True, - avg_ckpts=False): - """Compute external evaluation (bleu, rouge, etc.) 
for both dev / test.""" +def run_external_eval(infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + save_best_dev=True, + use_test_set=True, + avg_ckpts=False, + dev_infer_iterator_feed_dict=None, + test_infer_iterator_feed_dict=None): + """Compute external evaluation for both dev / test. + + Computes development and testing external evaluation (e.g. bleu, rouge) for + given model. + + Args: + infer_model: Inference model for which to compute perplexities. + infer_sess: Inference TensorFlow session. + model_dir: Directory from which to load inference model from. + hparams: Model hyper-parameters. + summary_writer: Summary writer for logging metrics to TensorBoard. + use_test_set: Computes testing external evaluation if true; does not + otherwise. Note that the development external evaluation is always + computed regardless of value of this parameter. + dev_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + development external evaluation. + test_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + testing external evaluation. + Returns: + Triple containing development scores, testing scores and the TensorFlow + Variable for the global step number, in this order. + """ + if dev_infer_iterator_feed_dict is None: + dev_infer_iterator_feed_dict = {} + if test_infer_iterator_feed_dict is None: + test_infer_iterator_feed_dict = {} with infer_model.graph.as_default(): loaded_infer_model, global_step = model_helper.create_or_load_model( infer_model.model, model_dir, infer_sess, "infer") dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) - dev_infer_iterator_feed_dict = { - infer_model.src_placeholder: inference.load_data(dev_src_file), - infer_model.batch_size_placeholder: hparams.infer_batch_size, - } + dev_infer_iterator_feed_dict[ + infer_model.src_placeholder] = inference.load_data(dev_src_file) + dev_infer_iterator_feed_dict[ + infer_model.batch_size_placeholder] = hparams.infer_batch_size dev_scores = _external_eval( loaded_infer_model, global_step, @@ -116,10 +179,10 @@ def run_external_eval(infer_model, infer_sess, model_dir, hparams, if use_test_set and hparams.test_prefix: test_src_file = "%s.%s" % (hparams.test_prefix, hparams.src) test_tgt_file = "%s.%s" % (hparams.test_prefix, hparams.tgt) - test_infer_iterator_feed_dict = { - infer_model.src_placeholder: inference.load_data(test_src_file), - infer_model.batch_size_placeholder: hparams.infer_batch_size, - } + test_infer_iterator_feed_dict[ + infer_model.src_placeholder] = inference.load_data(test_src_file) + test_infer_iterator_feed_dict[ + infer_model.batch_size_placeholder] = hparams.infer_batch_size test_scores = _external_eval( loaded_infer_model, global_step, @@ -157,16 +220,63 @@ def run_avg_external_eval(infer_model, infer_sess, model_dir, hparams, return avg_dev_scores, avg_test_scores -def run_full_eval(model_dir, infer_model, infer_sess, eval_model, eval_sess, - hparams, summary_writer, sample_src_data, sample_tgt_data, - avg_ckpts=False): - """Wrapper for running sample_decode, internal_eval and external_eval.""" - run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, - sample_src_data, sample_tgt_data) +def run_internal_and_external_eval(model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + 
avg_ckpts=False, + dev_eval_iterator_feed_dict=None, + test_eval_iterator_feed_dict=None, + dev_infer_iterator_feed_dict=None, + test_infer_iterator_feed_dict=None): + """Compute internal evaluation (perplexity) for both dev / test. + + Computes development and testing perplexities for given model. + + Args: + model_dir: Directory from which to load models from. + infer_model: Inference model for which to compute perplexities. + infer_sess: Inference TensorFlow session. + eval_model: Evaluation model for which to compute perplexities. + eval_sess: Evaluation TensorFlow session. + hparams: Model hyper-parameters. + summary_writer: Summary writer for logging metrics to TensorBoard. + avg_ckpts: Whether to compute average external evaluation scores. + dev_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + internal development evaluation. + test_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + internal testing evaluation. + dev_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + external development evaluation. + test_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + external testing evaluation. + Returns: + Triple containing results summary, global step Tensorflow Variable and + metrics in this order. + """ dev_ppl, test_ppl = run_internal_eval( - eval_model, eval_sess, model_dir, hparams, summary_writer) + eval_model, + eval_sess, + model_dir, + hparams, + summary_writer, + dev_eval_iterator_feed_dict=dev_eval_iterator_feed_dict, + test_eval_iterator_feed_dict=test_eval_iterator_feed_dict) dev_scores, test_scores, global_step = run_external_eval( - infer_model, infer_sess, model_dir, hparams, summary_writer) + infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + dev_infer_iterator_feed_dict=dev_infer_iterator_feed_dict, + test_infer_iterator_feed_dict=test_infer_iterator_feed_dict) metrics = { "dev_ppl": dev_ppl, @@ -197,6 +307,40 @@ def run_full_eval(model_dir, infer_model, infer_sess, eval_model, eval_sess, return result_summary, global_step, metrics +def run_full_eval(model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + avg_ckpts=False): + """Wrapper for running sample_decode, internal_eval and external_eval. + + Args: + model_dir: Directory from which to load models from. + infer_model: Inference model for which to compute perplexities. + infer_sess: Inference TensorFlow session. + eval_model: Evaluation model for which to compute perplexities. + eval_sess: Evaluation TensorFlow session. + hparams: Model hyper-parameters. + summary_writer: Summary writer for logging metrics to TensorBoard. + sample_src_data: sample of source data for sample decoding. + sample_tgt_data: sample of target data for sample decoding. + avg_ckpts: Whether to compute average external evaluation scores. + Returns: + Triple containing results summary, global step Tensorflow Variable and + metrics in this order. 
+ """ + run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, + sample_src_data, sample_tgt_data) + return run_internal_and_external_eval(model_dir, infer_model, infer_sess, + eval_model, eval_sess, hparams, + summary_writer, avg_ckpts) + + def init_stats(): """Initialize statistics that we want to accumulate.""" return {"step_time": 0.0, "train_loss": 0.0, From bd936ddabed7e23e4dcdf152709f28d00beb55c9 Mon Sep 17 00:00:00 2001 From: Daniel De Freitas Adiwardana Date: Fri, 2 Feb 2018 10:34:23 -0800 Subject: [PATCH 21/38] Refactoring Model._build_encoder. - Allow the construction of encoders from sequences different from the default source sequence. - Cleanups. PiperOrigin-RevId: 184301964 --- nmt/model.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/nmt/model.py b/nmt/model.py index 7b4070971..0187fe86e 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -656,8 +656,8 @@ def decode(self, sess): # batch_size, time] when using beam search. if self.time_major: sample_words = sample_words.transpose() - elif sample_words.ndim == 3: # beam search output in [batch_size, - # time, beam_width] shape. + elif sample_words.ndim == 3: + # beam search output in [batch_size, time, beam_width] shape. sample_words = sample_words.transpose([2, 0, 1]) return sample_words, infer_summary @@ -684,34 +684,40 @@ class Model(BaseModel): and a multi-layer recurrent neural network decoder. """ - def _build_encoder(self, hparams): - """Build an encoder.""" + def _build_encoder_from_sequence(self, hparams, sequence, sequence_length): + """Build an encoder from a sequence. + + Args: + sequence: tensor with input sequence data. + sequence_length: tensor with length of the input sequence. + Returns: + encoder_outputs: RNN encoder outputs. + encoder_state: RNN encoder state. 
+ """ num_layers = self.num_encoder_layers num_residual_layers = self.num_encoder_residual_layers - iterator = self.iterator - source = iterator.source if self.time_major: - source = tf.transpose(source) + sequence = tf.transpose(sequence) with tf.variable_scope("encoder") as scope: dtype = scope.dtype # Look up embedding, emp_inp: [max_time, batch_size, num_units] - self.encoder_emb_inp = tf.nn.embedding_lookup( - self.embedding_encoder, source) + self.encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder, + sequence) # Encoder_outputs: [max_time, batch_size, num_units] if hparams.encoder_type == "uni": utils.print_out(" num_layers = %d, num_residual_layers=%d" % (num_layers, num_residual_layers)) - cell = self._build_encoder_cell( - hparams, num_layers, num_residual_layers) + cell = self._build_encoder_cell(hparams, num_layers, + num_residual_layers) encoder_outputs, encoder_state = tf.nn.dynamic_rnn( cell, self.encoder_emb_inp, dtype=dtype, - sequence_length=iterator.source_sequence_length, + sequence_length=sequence_length, time_major=self.time_major, swap_memory=True) elif hparams.encoder_type == "bi": @@ -723,7 +729,7 @@ def _build_encoder(self, hparams): encoder_outputs, bi_encoder_state = ( self._build_bidirectional_rnn( inputs=self.encoder_emb_inp, - sequence_length=iterator.source_sequence_length, + sequence_length=sequence_length, dtype=dtype, hparams=hparams, num_bi_layers=num_bi_layers, @@ -746,6 +752,11 @@ def _build_encoder(self, hparams): return encoder_outputs, encoder_state + def _build_encoder(self, hparams): + """Build encoder from source.""" + return self._build_encoder_from_sequence( + hparams, self.iterator.source, self.iterator.source_sequence_length) + def _build_bidirectional_rnn(self, inputs, sequence_length, dtype, hparams, num_bi_layers, From 781201adf218589c1f4a6d806b06be33a329610f Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Tue, 6 Feb 2018 23:24:48 -0800 Subject: [PATCH 22/38] Minor clean-ups, update docstring PiperOrigin-RevId: 184795279 --- nmt/gnmt_model.py | 1 + nmt/model.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/nmt/gnmt_model.py b/nmt/gnmt_model.py index c2a479006..5854f9900 100644 --- a/nmt/gnmt_model.py +++ b/nmt/gnmt_model.py @@ -64,6 +64,7 @@ def _build_encoder(self, hparams): # Build GNMT encoder. num_bi_layers = 1 num_uni_layers = self.num_encoder_layers - num_bi_layers + utils.print_out("# Build a GNMT encoder") utils.print_out(" num_bi_layers = %d" % num_bi_layers) utils.print_out(" num_uni_layers = %d" % num_uni_layers) diff --git a/nmt/model.py b/nmt/model.py index 0187fe86e..32fb5bce2 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -112,7 +112,7 @@ def __init__(self, params = tf.trainable_variables() # Gradients and SGD update operation for training the model. - # Arrage for the embedding vars to appear at the beginning. + # Arrange for the embedding vars to appear at the beginning. if self.mode == tf.contrib.learn.ModeKeys.TRAIN: self.learning_rate = tf.constant(hparams.learning_rate) # warm-up @@ -350,11 +350,12 @@ def build_graph(self, hparams, scope=None): attention_option is not (luong | scaled_luong | bahdanau | normed_bahdanau). """ - utils.print_out("# creating %s graph ..." % self.mode) + utils.print_out("# Creating %s graph ..." 
% self.mode) with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype): # Encoder if hparams.language_model: # no encoder for language modeling + utils.print_out(" language modeling: no encoder") self.encoder_outputs = None encoder_state = None else: @@ -572,7 +573,6 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, """ pass - def _softmax_cross_entropy_loss( self, logits, decoder_cell_outputs, labels): """Compute softmax loss or sampled softmax loss.""" @@ -591,7 +591,7 @@ def _softmax_cross_entropy_loss( inputs=inputs, num_sampled=self.num_sampled_softmax, num_classes=self.tgt_vocab_size, - partition_strategy='div', + partition_strategy="div", seed=self.random_seed) if is_sequence: @@ -606,7 +606,6 @@ def _softmax_cross_entropy_loss( return crossent - def _compute_loss(self, logits, decoder_cell_outputs): """Compute optimization loss.""" target_output = self.iterator.target_output @@ -688,11 +687,16 @@ def _build_encoder_from_sequence(self, hparams, sequence, sequence_length): """Build an encoder from a sequence. Args: + hparams: hyperparameters. sequence: tensor with input sequence data. sequence_length: tensor with length of the input sequence. + Returns: encoder_outputs: RNN encoder outputs. encoder_state: RNN encoder state. + + Raises: + ValueError: if encoder_type is neither "uni" nor "bi". """ num_layers = self.num_encoder_layers num_residual_layers = self.num_encoder_residual_layers @@ -754,6 +758,7 @@ def _build_encoder_from_sequence(self, hparams, sequence, sequence_length): def _build_encoder(self, hparams): """Build encoder from source.""" + utils.print_out("# Build a basic encoder") return self._build_encoder_from_sequence( hparams, self.iterator.source, self.iterator.source_sequence_length) From 3cd5d33d5351964a8e89423bcab7d99c47ff59c6 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Sun, 11 Feb 2018 13:16:10 -0800 Subject: [PATCH 23/38] Update vocab_utils.py to load embedding files under word2vec format. Minor updates to nmt.py to print logging info on embedding files PiperOrigin-RevId: 185313574 --- nmt/nmt.py | 6 ++++++ nmt/utils/vocab_utils.py | 20 +++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/nmt/nmt.py b/nmt/nmt.py index 193b51b56..932e4f6b9 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -503,10 +503,16 @@ def extend_hparams(hparams): tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt if tf.gfile.Exists(src_embed_file): + utils.print_out(" src_embed_file %s exist" % src_embed_file) hparams.src_embed_file = src_embed_file + else: + utils.print_out(" src_embed_file %s doesn't exist" % src_embed_file) if tf.gfile.Exists(tgt_embed_file): hparams.tgt_embed_file = tgt_embed_file + utils.print_out(" tgt_embed_file %s exist" % tgt_embed_file) + else: + utils.print_out(" tgt_embed_file %s doesn't exist" % tgt_embed_file) # Evaluation for metric in hparams.metrics: diff --git a/nmt/utils/vocab_utils.py b/nmt/utils/vocab_utils.py index d5de9a11d..cea865553 100644 --- a/nmt/utils/vocab_utils.py +++ b/nmt/utils/vocab_utils.py @@ -91,13 +91,15 @@ def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab): def load_embed_txt(embed_file): """Load embed_file into a python dictionary. - Note: the embed_file should be a Glove formated txt file. Assuming - embed_size=5, for example: + Note: the embed_file should be a Glove/word2vec formated txt file. 
Assuming + Here is an exampe assuming embed_size=5: the -0.071549 0.093459 0.023738 -0.090339 0.056123 to 0.57346 0.5417 -0.23477 -0.3624 0.4037 and 0.20327 0.47348 0.050877 0.002103 0.060547 + For word2vec format, the first line will be: . + Args: embed_file: file path to the embedding file. Returns: @@ -105,14 +107,22 @@ def load_embed_txt(embed_file): """ emb_dict = dict() emb_size = None - with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, 'rb')) as f: + + is_first_line = True + with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f: for line in f: - tokens = line.strip().split(" ") + tokens = line.rstrip().split(" ") + if is_first_line: + is_first_line = False + if len(tokens) == 2: # header line + emb_size = int(tokens[1]) + continue word = tokens[0] vec = list(map(float, tokens[1:])) emb_dict[word] = vec if emb_size: - assert emb_size == len(vec), "All embedding size should be same." + assert emb_size == len(vec), "All embedding size should be same %s." % ( + line) else: emb_size = len(vec) return emb_dict, emb_size From d2f66dff3a455caa59b71444ec036a717a1807db Mon Sep 17 00:00:00 2001 From: Rui Zhao Date: Mon, 12 Feb 2018 23:38:37 -0800 Subject: [PATCH 24/38] Allow embedding file to have some misformated entry. Simply ignore the entry that doesn't have the correct size. Handle attention_architecture == "" same as attention_architecture == "standard". Use separate embedding partitioner for encoder and decoder. PiperOrigin-RevId: 185489121 --- nmt/gnmt_model.py | 6 +++-- nmt/model.py | 3 ++- nmt/model_helper.py | 38 +++++++++++++++++++++-------- nmt/nmt.py | 17 ++++++++++--- nmt/utils/standard_hparams_utils.py | 2 ++ nmt/utils/vocab_utils.py | 6 +++-- 6 files changed, 53 insertions(+), 19 deletions(-) diff --git a/nmt/gnmt_model.py b/nmt/gnmt_model.py index 5854f9900..5de609a6a 100644 --- a/nmt/gnmt_model.py +++ b/nmt/gnmt_model.py @@ -126,7 +126,8 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, source_sequence_length): """Build a RNN cell with GNMT attention architecture.""" # Standard attention - if hparams.attention_architecture == "standard": + if (hparams.attention_architecture == "standard" or + hparams.attention_architecture == ""): return super(GNMTModel, self)._build_decoder_cell( hparams, encoder_outputs, encoder_state, source_sequence_length) @@ -205,7 +206,8 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, def _get_infer_summary(self, hparams): # Standard attention - if hparams.attention_architecture == "standard": + if (hparams.attention_architecture == "standard" or + hparams.attention_architecture == ""): return super(GNMTModel, self)._get_infer_summary(hparams) # GNMT attention diff --git a/nmt/model.py b/nmt/model.py index 32fb5bce2..96edae7d8 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -293,7 +293,8 @@ def init_embeddings(self, hparams, scope): tgt_vocab_size=self.tgt_vocab_size, src_embed_size=self.num_units, tgt_embed_size=self.num_units, - num_partitions=hparams.num_embeddings_partitions, + num_enc_partitions=hparams.num_enc_emb_partitions, + num_dec_partitions=hparams.num_dec_emb_partitions, src_vocab_file=hparams.src_vocab_file, tgt_vocab_file=hparams.tgt_vocab_file, src_embed_file=hparams.src_embed_file, diff --git a/nmt/model_helper.py b/nmt/model_helper.py index 8e50b8358..2ffdc0517 100644 --- a/nmt/model_helper.py +++ b/nmt/model_helper.py @@ -290,7 +290,8 @@ def create_emb_for_encoder_and_decoder(share_vocab, src_embed_size, tgt_embed_size, dtype=tf.float32, - num_partitions=0, + 
num_enc_partitions=0, + num_dec_partitions=0, src_vocab_file=None, tgt_vocab_file=None, src_embed_file=None, @@ -308,7 +309,10 @@ def create_emb_for_encoder_and_decoder(share_vocab, tgt_embed_size: An integer. The embedding dimension for the decoder's embedding. dtype: dtype of the embedding matrix. Default to float32. - num_partitions: number of partitions used for the embedding vars. + num_enc_partitions: number of partitions used for the encoder's embedding + vars. + num_dec_partitions: number of partitions used for the decoder's embedding + vars. scope: VariableScope for the created subgraph. Default to "embedding". Returns: @@ -319,22 +323,36 @@ def create_emb_for_encoder_and_decoder(share_vocab, ValueError: if use share_vocab but source and target have different vocab size. """ + if num_enc_partitions <= 1: + enc_partitioner = None + else: + # Note: num_partitions > 1 is required for distributed training due to + # embedding_lookup tries to colocate single partition-ed embedding variable + # with lookup ops. This may cause embedding variables being placed on worker + # jobs. + enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions) - if num_partitions <= 1: - partitioner = None + if num_dec_partitions <= 1: + dec_partitioner = None else: # Note: num_partitions > 1 is required for distributed training due to # embedding_lookup tries to colocate single partition-ed embedding variable # with lookup ops. This may cause embedding variables being placed on worker # jobs. - partitioner = tf.fixed_size_partitioner(num_partitions) + dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions) + + if src_embed_file and enc_partitioner: + raise ValueError( + "Can't set num_enc_partitions > 1 when using pretrained encoder " + "embedding") - if (src_embed_file or tgt_embed_file) and partitioner: + if tgt_embed_file and dec_partitioner: raise ValueError( - "Can't set num_partitions > 1 when using pretrained embedding") + "Can't set num_dec_partitions > 1 when using pretrained decdoer " + "embedding") with tf.variable_scope( - scope or "embeddings", dtype=dtype, partitioner=partitioner) as scope: + scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope: # Share embedding if share_vocab: if src_vocab_size != tgt_vocab_size: @@ -350,12 +368,12 @@ def create_emb_for_encoder_and_decoder(share_vocab, src_vocab_size, src_embed_size, dtype) embedding_decoder = embedding_encoder else: - with tf.variable_scope("encoder", partitioner=partitioner): + with tf.variable_scope("encoder", partitioner=enc_partitioner): embedding_encoder = _create_or_load_embed( "embedding_encoder", src_vocab_file, src_embed_file, src_vocab_size, src_embed_size, dtype) - with tf.variable_scope("decoder", partitioner=partitioner): + with tf.variable_scope("decoder", partitioner=dec_partitioner): embedding_decoder = _create_or_load_embed( "embedding_decoder", tgt_vocab_file, tgt_embed_file, tgt_vocab_size, tgt_embed_size, dtype) diff --git a/nmt/nmt.py b/nmt/nmt.py index 932e4f6b9..28dc40df3 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -492,25 +492,34 @@ def extend_hparams(hparams): _add_argument(hparams, "src_vocab_file", src_vocab_file) _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) + # Num embedding partitions + _add_argument(hparams, "num_enc_emb_partitions", hparams.num_embeddings_partitions) + _add_argument(hparams, "num_dec_emb_partitions", hparams.num_embeddings_partitions) + # Pretrained Embeddings _add_argument(hparams, "src_embed_file", "") _add_argument(hparams, "tgt_embed_file", "") 
if hparams.embed_prefix: - hparams.num_embeddings_partitions = 1 - utils.print_out( - "For pretrained embeddings, set num_embeddings_partitions to 1") src_embed_file = hparams.embed_prefix + "." + hparams.src tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt if tf.gfile.Exists(src_embed_file): utils.print_out(" src_embed_file %s exist" % src_embed_file) hparams.src_embed_file = src_embed_file + + utils.print_out( + "For pretrained embeddings, set num_enc_emb_partitions to 1") + hparams.num_enc_emb_partitions = 1 else: utils.print_out(" src_embed_file %s doesn't exist" % src_embed_file) if tf.gfile.Exists(tgt_embed_file): - hparams.tgt_embed_file = tgt_embed_file utils.print_out(" tgt_embed_file %s exist" % tgt_embed_file) + hparams.tgt_embed_file = tgt_embed_file + + utils.print_out( + "For pretrained embeddings, set num_dec_emb_partitions to 1") + hparams.num_dec_emb_partitions = 1 else: utils.print_out(" tgt_embed_file %s doesn't exist" % tgt_embed_file) diff --git a/nmt/utils/standard_hparams_utils.py b/nmt/utils/standard_hparams_utils.py index c4c2e3329..fe203b438 100644 --- a/nmt/utils/standard_hparams_utils.py +++ b/nmt/utils/standard_hparams_utils.py @@ -45,6 +45,8 @@ def create_standard_hparams(): residual=False, time_major=True, num_embeddings_partitions=0, + num_enc_emb_partitions=0, + num_dec_emb_partitions=0, # Attention mechanisms attention="scaled_luong", diff --git a/nmt/utils/vocab_utils.py b/nmt/utils/vocab_utils.py index cea865553..9771e3258 100644 --- a/nmt/utils/vocab_utils.py +++ b/nmt/utils/vocab_utils.py @@ -121,8 +121,10 @@ def load_embed_txt(embed_file): vec = list(map(float, tokens[1:])) emb_dict[word] = vec if emb_size: - assert emb_size == len(vec), "All embedding size should be same %s." % ( - line) + if emb_size != len(vec): + utils.print_out( + "Ignoring %s since embeding size is inconsistent." % word) + del emb_dict[word] else: emb_size = len(vec) return emb_dict, emb_size From 6caa9946445f8ceac736b988f36a7cd11c51a48f Mon Sep 17 00:00:00 2001 From: Anonymous Date: Sat, 17 Feb 2018 05:10:40 -0800 Subject: [PATCH 25/38] Unify reading and writing of hparams file. PiperOrigin-RevId: 186098897 --- nmt/utils/misc_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmt/utils/misc_utils.py b/nmt/utils/misc_utils.py index dc9903601..00ed4e086 100644 --- a/nmt/utils/misc_utils.py +++ b/nmt/utils/misc_utils.py @@ -102,7 +102,7 @@ def maybe_parse_standard_hparams(hparams, hparams_path): """Override hparams values with existing standard hparams config.""" if hparams_path and tf.gfile.Exists(hparams_path): print_out("# Loading standard hparams from %s" % hparams_path) - with tf.gfile.GFile(hparams_path, "r") as f: + with codecs.getreader("utf-8")(tf.gfile.GFile(hparams_path, "rb")) as f: hparams.parse_json(f.read()) return hparams From 8f20c6999018d3122fbc02bc83320153655a8c49 Mon Sep 17 00:00:00 2001 From: Rui Zhao Date: Tue, 20 Feb 2018 17:46:55 -0800 Subject: [PATCH 26/38] Colocate output_layer with last LSTM cell to improve training speed. PiperOrigin-RevId: 186391226 --- nmt/model.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/nmt/model.py b/nmt/model.py index 96edae7d8..bbad442f4 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -491,7 +491,14 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): # We chose to apply the output_layer to all timesteps for speed: # 10% improvements for small models & 20% for larger ones. # If memory is a concern, we should apply output_layer per timestep. 
- logits = self.output_layer(outputs.rnn_output) + num_layers = self.num_decoder_layers + num_gpus = self.num_gpus + device_id = num_layers if num_layers < num_gpus else (num_layers - 1) + # Colocate output layer with the last RNN cell if there is no extra GPU + # avaliable. Otherwise, put last layer on a separate GPU. + with tf.device(model_helper.get_device_str(device_id, num_gpus)): + logits = self.output_layer(outputs.rnn_output) + if self.num_sampled_softmax > 0: logits = tf.no_op() # unused when using sampled softmax loss. From 735b8b7319d777f4e2e0dd88ddb5c9edca255543 Mon Sep 17 00:00:00 2001 From: Rui Zhao Date: Mon, 2 Apr 2018 18:45:55 -0700 Subject: [PATCH 27/38] Add char-level embeddings for encoder only. PiperOrigin-RevId: 191382249 --- nmt/gnmt_model.py | 23 +++++---- nmt/model.py | 25 +++++++--- nmt/model_helper.py | 23 ++++++--- nmt/nmt.py | 15 +++++- nmt/utils/iterator_utils.py | 77 +++++++++++++++++++++++------ nmt/utils/standard_hparams_utils.py | 1 + nmt/utils/vocab_utils.py | 68 ++++++++++++++++++++++++- 7 files changed, 185 insertions(+), 47 deletions(-) diff --git a/nmt/gnmt_model.py b/nmt/gnmt_model.py index 5de609a6a..00c8bb23f 100644 --- a/nmt/gnmt_model.py +++ b/nmt/gnmt_model.py @@ -20,12 +20,10 @@ import tensorflow as tf -# TODO(rzhao): Use tf.contrib.framework.nest once 1.3 is out. -from tensorflow.python.util import nest - from . import attention_model from . import model_helper from .utils import misc_utils as utils +from .utils import vocab_utils __all__ = ["GNMTModel"] @@ -76,10 +74,8 @@ def _build_encoder(self, hparams): with tf.variable_scope("encoder") as scope: dtype = scope.dtype - # Look up embedding, emp_inp: [max_time, batch_size, num_units] - # when time_major = True - self.encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder, - source) + self.encoder_emb_inp = self.encoder_emb_lookup_fn( + self.embedding_encoder, source) # Execute _build_bidirectional_rnn from Model class bi_encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn( @@ -235,7 +231,7 @@ def __init__(self, attention_cell, cells, use_new_attention=False): def __call__(self, inputs, state, scope=None): """Run the cell with bottom layer's attention copied to all upper layers.""" - if not nest.is_sequence(state): + if not tf.contrib.framework.nest.is_sequence(state): raise ValueError( "Expected state to be a tuple of length %d, but received: %s" % (len(self.state_size), state)) @@ -281,9 +277,12 @@ def split_input(inp, out): out_dim = out.get_shape().as_list()[-1] inp_dim = inp.get_shape().as_list()[-1] return tf.split(inp, [out_dim, inp_dim - out_dim], axis=-1) - actual_inputs, _ = nest.map_structure(split_input, inputs, outputs) + actual_inputs, _ = tf.contrib.framework.nest.map_structure( + split_input, inputs, outputs) def assert_shape_match(inp, out): inp.get_shape().assert_is_compatible_with(out.get_shape()) - nest.assert_same_structure(actual_inputs, outputs) - nest.map_structure(assert_shape_match, actual_inputs, outputs) - return nest.map_structure(lambda inp, out: inp + out, actual_inputs, outputs) + tf.contrib.framework.nest.assert_same_structure(actual_inputs, outputs) + tf.contrib.framework.nest.map_structure( + assert_shape_match, actual_inputs, outputs) + return tf.contrib.framework.nest.map_structure( + lambda inp, out: inp + out, actual_inputs, outputs) diff --git a/nmt/model.py b/nmt/model.py index bbad442f4..4d777e6c7 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -20,11 +20,14 @@ import abc import collections +import numpy as np + import tensorflow 
as tf from . import model_helper from .utils import iterator_utils from .utils import misc_utils as utils +from .utils import vocab_utils utils.check_tensorflow_version() @@ -177,6 +180,11 @@ def _set_params_initializer(self, self.tgt_vocab_size = hparams.tgt_vocab_size self.num_gpus = hparams.num_gpus self.time_major = hparams.time_major + + if hparams.use_char_encode: + assert (not self.time_major), ("Can't use time major for" + " char-level inputs.") + self.dtype = tf.float32 self.num_sampled_softmax = hparams.num_sampled_softmax @@ -215,9 +223,12 @@ def _set_params_initializer(self, tf.get_variable_scope().set_initializer(initializer) # Embeddings + if extra_args and extra_args.encoder_emb_lookup_fn: + self.encoder_emb_lookup_fn = extra_args.encoder_emb_lookup_fn + else: + self.encoder_emb_lookup_fn = tf.nn.embedding_lookup self.init_embeddings(hparams, scope) - def _get_learning_rate_warmup(self, hparams): """Get learning rate warmup.""" warmup_steps = hparams.warmup_steps @@ -299,6 +310,7 @@ def init_embeddings(self, hparams, scope): tgt_vocab_file=hparams.tgt_vocab_file, src_embed_file=hparams.src_embed_file, tgt_embed_file=hparams.tgt_embed_file, + use_char_encode=hparams.use_char_encode, scope=scope,)) def _get_train_summary(self): @@ -576,8 +588,8 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, source_sequence_length: sequence length of encoder_outputs. Returns: - A tuple of a multi-layer RNN cell used by decoder - and the intial state of the decoder RNN. + A tuple of a multi-layer RNN cell used by decoder and the intial state of + the decoder RNN. """ pass @@ -690,7 +702,6 @@ class Model(BaseModel): This class implements a multi-layer recurrent neural network as encoder, and a multi-layer recurrent neural network decoder. """ - def _build_encoder_from_sequence(self, hparams, sequence, sequence_length): """Build an encoder from a sequence. @@ -714,9 +725,9 @@ def _build_encoder_from_sequence(self, hparams, sequence, sequence_length): with tf.variable_scope("encoder") as scope: dtype = scope.dtype - # Look up embedding, emp_inp: [max_time, batch_size, num_units] - self.encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder, - sequence) + + self.encoder_emb_inp = self.encoder_emb_lookup_fn( + self.embedding_encoder, sequence) # Encoder_outputs: [max_time, batch_size, num_units] if hparams.encoder_type == "uni": diff --git a/nmt/model_helper.py b/nmt/model_helper.py index 2ffdc0517..826d7e4b5 100644 --- a/nmt/model_helper.py +++ b/nmt/model_helper.py @@ -66,7 +66,7 @@ def get_device_str(device_id, num_gpus): class ExtraArgs(collections.namedtuple( "ExtraArgs", ("single_cell_fn", "model_device_fn", - "attention_mechanism_fn"))): + "attention_mechanism_fn", "encoder_emb_lookup_fn"))): pass @@ -109,7 +109,8 @@ def create_train_model( tgt_max_len=hparams.tgt_max_len, skip_count=skip_count_placeholder, num_shards=num_workers, - shard_index=jobid) + shard_index=jobid, + use_char_encode=hparams.use_char_encode) # Note: One can set model_device_fn to # `tf.train.replica_device_setter(ps_tasks)` for distributed training. 
@@ -166,7 +167,8 @@ def create_eval_model(model_creator, hparams, scope=None, extra_args=None): random_seed=hparams.random_seed, num_buckets=hparams.num_buckets, src_max_len=hparams.src_max_len_infer, - tgt_max_len=hparams.tgt_max_len_infer) + tgt_max_len=hparams.tgt_max_len_infer, + use_char_encode=hparams.use_char_encode) model = model_creator( hparams, iterator=iterator, @@ -213,7 +215,8 @@ def create_infer_model(model_creator, hparams, scope=None, extra_args=None): src_vocab_table, batch_size=batch_size_placeholder, eos=hparams.eos, - src_max_len=hparams.src_max_len_infer) + src_max_len=hparams.src_max_len_infer, + use_char_encode=hparams.use_char_encode) model = model_creator( hparams, iterator=iterator, @@ -296,6 +299,7 @@ def create_emb_for_encoder_and_decoder(share_vocab, tgt_vocab_file=None, src_embed_file=None, tgt_embed_file=None, + use_char_encode=False, scope=None): """Create embedding matrix for both encoder and decoder. @@ -368,10 +372,13 @@ def create_emb_for_encoder_and_decoder(share_vocab, src_vocab_size, src_embed_size, dtype) embedding_decoder = embedding_encoder else: - with tf.variable_scope("encoder", partitioner=enc_partitioner): - embedding_encoder = _create_or_load_embed( - "embedding_encoder", src_vocab_file, src_embed_file, - src_vocab_size, src_embed_size, dtype) + if not use_char_encode: + with tf.variable_scope("encoder", partitioner=enc_partitioner): + embedding_encoder = _create_or_load_embed( + "embedding_encoder", src_vocab_file, src_embed_file, + src_vocab_size, src_embed_size, dtype) + else: + embedding_encoder = None with tf.variable_scope("decoder", partitioner=dec_partitioner): embedding_decoder = _create_or_load_embed( diff --git a/nmt/nmt.py b/nmt/nmt.py index 28dc40df3..512b18a66 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -213,6 +213,14 @@ def add_arguments(parser): Set to bpe or spm to activate subword desegmentation.\ """) + # Experimental encoding feature. + parser.add_argument("--use_char_encode", type="bool", default=False, + help="""\ + Whether to split each word or bpe into character, and then + generate the word-level representation from the character + reprentation. 
+ """) + # Misc parser.add_argument("--num_gpus", type=int, default=1, help="Number of gpus in each worker.") @@ -366,6 +374,7 @@ def create_hparams(flags): eos=flags.eos if flags.eos else vocab_utils.EOS, subword_option=flags.subword_option, check_special_token=flags.check_special_token, + use_char_encode=flags.use_char_encode, # Misc forget_bias=flags.forget_bias, @@ -493,8 +502,10 @@ def extend_hparams(hparams): _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) # Num embedding partitions - _add_argument(hparams, "num_enc_emb_partitions", hparams.num_embeddings_partitions) - _add_argument(hparams, "num_dec_emb_partitions", hparams.num_embeddings_partitions) + _add_argument( + hparams, "num_enc_emb_partitions", hparams.num_embeddings_partitions) + _add_argument( + hparams, "num_dec_emb_partitions", hparams.num_embeddings_partitions) # Pretrained Embeddings _add_argument(hparams, "src_embed_file", "") diff --git a/nmt/utils/iterator_utils.py b/nmt/utils/iterator_utils.py index 623bf461a..31efb11ff 100644 --- a/nmt/utils/iterator_utils.py +++ b/nmt/utils/iterator_utils.py @@ -19,6 +19,9 @@ import tensorflow as tf +from ..utils import vocab_utils + + __all__ = ["BatchedInput", "get_iterator", "get_infer_iterator"] @@ -35,17 +38,34 @@ def get_infer_iterator(src_dataset, src_vocab_table, batch_size, eos, - src_max_len=None): - src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32) + src_max_len=None, + use_char_encode=False): + if use_char_encode: + src_eos_id = vocab_utils.EOS_CHAR_ID + else: + src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32) src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values) if src_max_len: src_dataset = src_dataset.map(lambda src: src[:src_max_len]) - # Convert the word strings to ids - src_dataset = src_dataset.map( - lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32)) + + if use_char_encode: + # Convert the word strings to character ids + src_dataset = src_dataset.map( + lambda src: tf.reshape(vocab_utils.tokens_to_bytes(src), [-1])) + else: + # Convert the word strings to ids + src_dataset = src_dataset.map( + lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32)) + # Add in the word counts. - src_dataset = src_dataset.map(lambda src: (src, tf.size(src))) + if use_char_encode: + src_dataset = src_dataset.map( + lambda src: (src, + tf.to_int32( + tf.size(src) / vocab_utils.DEFAULT_CHAR_MAXLEN))) + else: + src_dataset = src_dataset.map(lambda src: (src, tf.size(src))) def batching_func(x): return x.padded_batch( @@ -91,10 +111,16 @@ def get_iterator(src_dataset, skip_count=None, num_shards=1, shard_index=0, - reshuffle_each_iteration=True): + reshuffle_each_iteration=True, + use_char_encode=False): if not output_buffer_size: output_buffer_size = batch_size * 1000 - src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32) + + if use_char_encode: + src_eos_id = vocab_utils.EOS_CHAR_ID + else: + src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32) + tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32) tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32) @@ -124,12 +150,21 @@ def get_iterator(src_dataset, src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (src, tgt[:tgt_max_len]), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) + # Convert the word strings to ids. Word strings that are not in the # vocab get the lookup table's default_value integer. 
- src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32), - tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)), - num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) + if use_char_encode: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt: (tf.reshape(vocab_utils.tokens_to_bytes(src), [-1]), + tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)), + num_parallel_calls=num_parallel_calls) + else: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32), + tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)), + num_parallel_calls=num_parallel_calls) + + src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size) # Create a tgt_input prefixed with and a tgt_output suffixed with . src_tgt_dataset = src_tgt_dataset.map( lambda src, tgt: (src, @@ -137,10 +172,20 @@ def get_iterator(src_dataset, tf.concat((tgt, [tgt_eos_id]), 0)), num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) # Add in sequence lengths. - src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt_in, tgt_out: ( - src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in)), - num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) + if use_char_encode: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt_in, tgt_out: ( + src, tgt_in, tgt_out, + tf.to_int32(tf.size(src) / vocab_utils.DEFAULT_CHAR_MAXLEN), + tf.size(tgt_in)), + num_parallel_calls=num_parallel_calls) + else: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt_in, tgt_out: ( + src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in)), + num_parallel_calls=num_parallel_calls) + + src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size) # Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...) def batching_func(x): diff --git a/nmt/utils/standard_hparams_utils.py b/nmt/utils/standard_hparams_utils.py index fe203b438..2729cf662 100644 --- a/nmt/utils/standard_hparams_utils.py +++ b/nmt/utils/standard_hparams_utils.py @@ -80,6 +80,7 @@ def create_standard_hparams(): sos="", eos="", subword_option="", + use_char_encode=False, check_special_token=True, # Misc diff --git a/nmt/utils/vocab_utils.py b/nmt/utils/vocab_utils.py index 9771e3258..5063bf2ef 100644 --- a/nmt/utils/vocab_utils.py +++ b/nmt/utils/vocab_utils.py @@ -27,12 +27,76 @@ from ..utils import misc_utils as utils - +# word level special token UNK = "" SOS = "" EOS = "" UNK_ID = 0 +# char ids 0-255 come from utf-8 encoding bytes +# assign 256-300 to special chars +BOS_CHAR_ID = 256 # +EOS_CHAR_ID = 257 # +BOW_CHAR_ID = 258 # +EOW_CHAR_ID = 259 # +PAD_CHAR_ID = 260 # + +DEFAULT_CHAR_MAXLEN = 50 # max number of chars for each word. + + +def _string_to_bytes(text, max_length): + """Given string and length, convert to byte seq of at most max_length. + + This process mimics docqa/elmo's preprocessing: + https://github.com/allenai/document-qa/blob/master/docqa/elmo/data.py + + Note that we make use of BOS_CHAR_ID and EOS_CHAR_ID in iterator_utils.py & + our usage differs from docqa/elmo. + + Args: + text: tf.string tensor of shape [] + max_length: max number of chars for each word. + + Returns: + A tf.int32 tensor of the byte encoded text. 
+ """ + byte_ids = tf.to_int32(tf.decode_raw(text, tf.uint8)) + byte_ids = byte_ids[:max_length - 2] + padding = tf.fill([max_length - tf.shape(byte_ids)[0] - 2], PAD_CHAR_ID) + byte_ids = tf.concat( + [[BOW_CHAR_ID], byte_ids, [EOW_CHAR_ID], padding], axis=0) + tf.logging.info(byte_ids) + + byte_ids = tf.reshape(byte_ids, [max_length]) + tf.logging.info(byte_ids.get_shape().as_list()) + return byte_ids + 1 + + +def tokens_to_bytes(tokens): + """Given a sequence of strings, map to sequence of bytes. + + Args: + tokens: A tf.string tensor + + Returns: + A tensor of shape words.shape + [bytes_per_word] containing byte versions + of each word. + """ + bytes_per_word = DEFAULT_CHAR_MAXLEN + with tf.device("/cpu:0"): + tf.assert_rank(tokens, 1) + shape = tf.shape(tokens) + tf.logging.info(tokens) + tokens_flat = tf.reshape(tokens, [-1]) + as_bytes_flat = tf.map_fn( + fn=lambda x: _string_to_bytes(x, max_length=bytes_per_word), + elems=tokens_flat, + dtype=tf.int32, + back_prop=False) + tf.logging.info(as_bytes_flat) + as_bytes = tf.reshape(as_bytes_flat, [shape[0], bytes_per_word]) + return as_bytes + def load_vocab(vocab_file): vocab = [] @@ -91,7 +155,7 @@ def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab): def load_embed_txt(embed_file): """Load embed_file into a python dictionary. - Note: the embed_file should be a Glove/word2vec formated txt file. Assuming + Note: the embed_file should be a Glove/word2vec formatted txt file. Assuming Here is an exampe assuming embed_size=5: the -0.071549 0.093459 0.023738 -0.090339 0.056123 From a35b1e344da4a4efb747128ff7b50be999804f3e Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Wed, 4 Apr 2018 18:10:16 -0700 Subject: [PATCH 28/38] Refactored *model.py files. Added a implicit flag extract_encoder_layers to get intermediate layers from GNMT models and skip decoder. 
PiperOrigin-RevId: 191678516 --- nmt/attention_model.py | 5 -- nmt/gnmt_model.py | 111 ++++++++++++++++++++++--------- nmt/model.py | 148 ++++++++++++++++++++++------------------- 3 files changed, 159 insertions(+), 105 deletions(-) diff --git a/nmt/attention_model.py b/nmt/attention_model.py index 38120c3cb..d262b8e8e 100644 --- a/nmt/attention_model.py +++ b/nmt/attention_model.py @@ -63,10 +63,6 @@ def __init__(self, scope=scope, extra_args=extra_args) - if self.mode == tf.contrib.learn.ModeKeys.INFER: - self.infer_summary = self._get_infer_summary(hparams) - - def _prepare_beam_search_decoder_inputs( self, beam_width, memory, source_sequence_length, encoder_state): memory = tf.contrib.seq2seq.tile_batch( @@ -78,7 +74,6 @@ def _prepare_beam_search_decoder_inputs( batch_size = self.batch_size * beam_width return memory, source_sequence_length, encoder_state, batch_size - def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, source_sequence_length): """Build a RNN cell with attention mechanism that can be used by decoder.""" diff --git a/nmt/gnmt_model.py b/nmt/gnmt_model.py index 00c8bb23f..468a5d00c 100644 --- a/nmt/gnmt_model.py +++ b/nmt/gnmt_model.py @@ -41,6 +41,9 @@ def __init__(self, reverse_target_vocab_table=None, scope=None, extra_args=None): + self.is_gnmt_attention = ( + hparams.attention_architecture in ["gnmt", "gnmt_v2"]) + super(GNMTModel, self).__init__( hparams=hparams, mode=mode, @@ -87,43 +90,88 @@ def _build_encoder(self, hparams): num_bi_residual_layers=0, # no residual connection ) - uni_cell = model_helper.create_rnn_cell( - unit_type=hparams.unit_type, - num_units=hparams.num_units, - num_layers=num_uni_layers, - num_residual_layers=self.num_encoder_residual_layers, - forget_bias=hparams.forget_bias, - dropout=hparams.dropout, - num_gpus=self.num_gpus, - base_gpu=1, - mode=self.mode, - single_cell_fn=self.single_cell_fn) - - # encoder_outputs: size [max_time, batch_size, num_units] - # when time_major = True - encoder_outputs, encoder_state = tf.nn.dynamic_rnn( - uni_cell, - bi_encoder_outputs, - dtype=dtype, - sequence_length=iterator.source_sequence_length, - time_major=self.time_major) + # Build unidirectional layers + if self.extract_encoder_layers: + encoder_state, encoder_outputs = self._build_individual_encoder_layers( + bi_encoder_outputs, num_uni_layers, dtype, hparams) + else: + encoder_state, encoder_outputs = self._build_all_encoder_layers( + bi_encoder_outputs, num_uni_layers, dtype, hparams) - # Pass all encoder state except the first bi-directional layer's state to - # decoder. 
+ # Pass all encoder states to the decoder + # except the first bi-directional layer encoder_state = (bi_encoder_state[1],) + ( (encoder_state,) if num_uni_layers == 1 else encoder_state) + return encoder_outputs, encoder_state + + def _build_all_encoder_layers(self, bi_encoder_outputs, + num_uni_layers, dtype, hparams): + """Build encoder layers all at once.""" + uni_cell = model_helper.create_rnn_cell( + unit_type=hparams.unit_type, + num_units=hparams.num_units, + num_layers=num_uni_layers, + num_residual_layers=self.num_encoder_residual_layers, + forget_bias=hparams.forget_bias, + dropout=hparams.dropout, + num_gpus=self.num_gpus, + base_gpu=1, + mode=self.mode, + single_cell_fn=self.single_cell_fn) + encoder_outputs, encoder_state = tf.nn.dynamic_rnn( + uni_cell, + bi_encoder_outputs, + dtype=dtype, + sequence_length=self.iterator.source_sequence_length, + time_major=self.time_major) + # Use the top layer for now self.encoder_state_list = [encoder_outputs] - return encoder_outputs, encoder_state + return encoder_state, encoder_outputs + + def _build_individual_encoder_layers(self, bi_encoder_outputs, + num_uni_layers, dtype, hparams): + """Run each of the encoder layer separately, not used in general seq2seq.""" + uni_cell_lists = model_helper._cell_list( + unit_type=hparams.unit_type, + num_units=hparams.num_units, + num_layers=num_uni_layers, + num_residual_layers=self.num_encoder_residual_layers, + forget_bias=hparams.forget_bias, + dropout=hparams.dropout, + num_gpus=self.num_gpus, + base_gpu=1, + mode=self.mode, + single_cell_fn=self.single_cell_fn) + + encoder_inp = bi_encoder_outputs + encoder_states = [] + self.encoder_state_list = [bi_encoder_outputs[:, :, :hparams.num_units], + bi_encoder_outputs[:, :, hparams.num_units:]] + with tf.variable_scope("rnn/multi_rnn_cell"): + for i, cell in enumerate(uni_cell_lists): + with tf.variable_scope("cell_%d" % i) as scope: + encoder_inp, encoder_state = tf.nn.dynamic_rnn( + cell, + encoder_inp, + dtype=dtype, + sequence_length=self.iterator.source_sequence_length, + time_major=self.time_major, + scope=scope) + encoder_states.append(encoder_state) + self.encoder_state_list.append(encoder_inp) + + encoder_state = tuple(encoder_states) + encoder_outputs = self.encoder_state_list[-1] + return encoder_state, encoder_outputs def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, source_sequence_length): """Build a RNN cell with GNMT attention architecture.""" # Standard attention - if (hparams.attention_architecture == "standard" or - hparams.attention_architecture == ""): + if not self.is_gnmt_attention: return super(GNMTModel, self)._build_decoder_cell( hparams, encoder_outputs, encoder_state, source_sequence_length) @@ -201,16 +249,13 @@ def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, return cell, decoder_initial_state def _get_infer_summary(self, hparams): - # Standard attention - if (hparams.attention_architecture == "standard" or - hparams.attention_architecture == ""): - return super(GNMTModel, self)._get_infer_summary(hparams) - - # GNMT attention if hparams.infer_mode == "beam_search": return tf.no_op() - return attention_model._create_attention_images_summary( - self.final_context_state[0]) + elif self.is_gnmt_attention: + return attention_model._create_attention_images_summary( + self.final_context_state[0]) + else: + return super(GNMTModel, self)._get_infer_summary(hparams) class GNMTAttentionMultiCell(tf.nn.rnn_cell.MultiRNNCell): diff --git a/nmt/model.py b/nmt/model.py index 
4d777e6c7..113c17320 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -87,80 +87,19 @@ def __init__(self, source_vocab_table, target_vocab_table, scope, extra_args) - # Projection - with tf.variable_scope(scope or "build_network"): - with tf.variable_scope("decoder/output_projection"): - self.output_layer = tf.layers.Dense( - self.tgt_vocab_size, use_bias=False, name="output_projection") + # Not used in general seq2seq models; when True, ignore decoder & training + self.extract_encoder_layers = (hasattr(hparams, "extract_encoder_layers") + and hparams.extract_encoder_layers) - ## Train graph + # Train graph res = self.build_graph(hparams, scope=scope) - if self.mode == tf.contrib.learn.ModeKeys.TRAIN: - self.train_loss = res[1] - self.word_count = tf.reduce_sum( - self.iterator.source_sequence_length) + tf.reduce_sum( - self.iterator.target_sequence_length) - elif self.mode == tf.contrib.learn.ModeKeys.EVAL: - self.eval_loss = res[1] - elif self.mode == tf.contrib.learn.ModeKeys.INFER: - self.infer_logits, _, self.final_context_state, self.sample_id = res - self.sample_words = reverse_target_vocab_table.lookup( - tf.to_int64(self.sample_id)) - - if self.mode != tf.contrib.learn.ModeKeys.INFER: - ## Count the number of predicted words for compute ppl. - self.predict_count = tf.reduce_sum( - self.iterator.target_sequence_length) - - params = tf.trainable_variables() - - # Gradients and SGD update operation for training the model. - # Arrange for the embedding vars to appear at the beginning. - if self.mode == tf.contrib.learn.ModeKeys.TRAIN: - self.learning_rate = tf.constant(hparams.learning_rate) - # warm-up - self.learning_rate = self._get_learning_rate_warmup(hparams) - # decay - self.learning_rate = self._get_learning_rate_decay(hparams) - - # Optimizer - if hparams.optimizer == "sgd": - opt = tf.train.GradientDescentOptimizer(self.learning_rate) - elif hparams.optimizer == "adam": - opt = tf.train.AdamOptimizer(self.learning_rate) - else: - raise ValueError("Unknown optimizer type %s" % hparams.optimizer) - - # Gradients - gradients = tf.gradients( - self.train_loss, - params, - colocate_gradients_with_ops=hparams.colocate_gradients_with_ops) - - clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip( - gradients, max_gradient_norm=hparams.max_gradient_norm) - self.grad_norm_summary = grad_norm_summary - self.grad_norm = grad_norm - - self.update = opt.apply_gradients( - zip(clipped_grads, params), global_step=self.global_step) - - # Summary - self.train_summary = self._get_train_summary() - elif self.mode == tf.contrib.learn.ModeKeys.INFER: - self.infer_summary = self._get_infer_summary(hparams) + if not self.extract_encoder_layers: + self._set_train_or_infer(res, reverse_target_vocab_table, hparams) # Saver self.saver = tf.train.Saver( tf.global_variables(), max_to_keep=hparams.num_keep_ckpts) - # Print trainable variables - utils.print_out("# Trainable variables") - utils.print_out("Format: , , <(soft) device placement>") - for param in params: - utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()), - param.op.device)) - def _set_params_initializer(self, hparams, mode, @@ -229,6 +168,70 @@ def _set_params_initializer(self, self.encoder_emb_lookup_fn = tf.nn.embedding_lookup self.init_embeddings(hparams, scope) + def _set_train_or_infer(self, res, reverse_target_vocab_table, hparams): + """Set up training and inference.""" + if self.mode == tf.contrib.learn.ModeKeys.TRAIN: + self.train_loss = res[1] + self.word_count = tf.reduce_sum( + 
self.iterator.source_sequence_length) + tf.reduce_sum( + self.iterator.target_sequence_length) + elif self.mode == tf.contrib.learn.ModeKeys.EVAL: + self.eval_loss = res[1] + elif self.mode == tf.contrib.learn.ModeKeys.INFER: + self.infer_logits, _, self.final_context_state, self.sample_id = res + self.sample_words = reverse_target_vocab_table.lookup( + tf.to_int64(self.sample_id)) + + if self.mode != tf.contrib.learn.ModeKeys.INFER: + ## Count the number of predicted words for compute ppl. + self.predict_count = tf.reduce_sum( + self.iterator.target_sequence_length) + + params = tf.trainable_variables() + + # Gradients and SGD update operation for training the model. + # Arrange for the embedding vars to appear at the beginning. + if self.mode == tf.contrib.learn.ModeKeys.TRAIN: + self.learning_rate = tf.constant(hparams.learning_rate) + # warm-up + self.learning_rate = self._get_learning_rate_warmup(hparams) + # decay + self.learning_rate = self._get_learning_rate_decay(hparams) + + # Optimizer + if hparams.optimizer == "sgd": + opt = tf.train.GradientDescentOptimizer(self.learning_rate) + elif hparams.optimizer == "adam": + opt = tf.train.AdamOptimizer(self.learning_rate) + else: + raise ValueError("Unknown optimizer type %s" % hparams.optimizer) + + # Gradients + gradients = tf.gradients( + self.train_loss, + params, + colocate_gradients_with_ops=hparams.colocate_gradients_with_ops) + + clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip( + gradients, max_gradient_norm=hparams.max_gradient_norm) + self.grad_norm_summary = grad_norm_summary + self.grad_norm = grad_norm + + self.update = opt.apply_gradients( + zip(clipped_grads, params), global_step=self.global_step) + + # Summary + self.train_summary = self._get_train_summary() + elif self.mode == tf.contrib.learn.ModeKeys.INFER: + self.infer_summary = self._get_infer_summary(hparams) + + # Print trainable variables + utils.print_out("# Trainable variables") + utils.print_out("Format: , , <(soft) device placement>") + for param in params: + utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()), + param.op.device)) + def _get_learning_rate_warmup(self, hparams): """Get learning rate warmup.""" warmup_steps = hparams.warmup_steps @@ -365,6 +368,13 @@ def build_graph(self, hparams, scope=None): """ utils.print_out("# Creating %s graph ..." % self.mode) + # Projection + if not self.extract_encoder_layers: + with tf.variable_scope(scope or "build_network"): + with tf.variable_scope("decoder/output_projection"): + self.output_layer = tf.layers.Dense( + self.tgt_vocab_size, use_bias=False, name="output_projection") + with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype): # Encoder if hparams.language_model: # no encoder for language modeling @@ -374,6 +384,10 @@ def build_graph(self, hparams, scope=None): else: self.encoder_outputs, encoder_state = self._build_encoder(hparams) + # Skip decoder if extracting only encoder layers + if self.extract_encoder_layers: + return + ## Decoder logits, decoder_cell_outputs, sample_id, final_context_state = ( self._build_decoder(self.encoder_outputs, encoder_state, hparams)) From 6853265ff18d40498855a5231974b77c5ee7bbb0 Mon Sep 17 00:00:00 2001 From: Anonymous Date: Thu, 5 Apr 2018 03:43:45 -0700 Subject: [PATCH 29/38] Pretty print hparams when writing file. 
PiperOrigin-RevId: 191720585 --- nmt/utils/misc_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmt/utils/misc_utils.py b/nmt/utils/misc_utils.py index 00ed4e086..540f44100 100644 --- a/nmt/utils/misc_utils.py +++ b/nmt/utils/misc_utils.py @@ -112,7 +112,7 @@ def save_hparams(out_dir, hparams): hparams_file = os.path.join(out_dir, "hparams") print_out(" saving hparams to %s" % hparams_file) with codecs.getwriter("utf-8")(tf.gfile.GFile(hparams_file, "wb")) as f: - f.write(hparams.to_json()) + f.write(hparams.to_json(indent=4, sort_keys=True)) def debug_tensor(s, msg=None, summarize=10): From 2411c44c71c5492ed6357a2e8e4f903abc2629e0 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Thu, 5 Apr 2018 15:01:17 -0700 Subject: [PATCH 30/38] Remove hparams.num_layers PiperOrigin-RevId: 191804041 --- nmt/nmt.py | 10 +++++----- nmt/utils/standard_hparams_utils.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/nmt/nmt.py b/nmt/nmt.py index 512b18a66..c3c209129 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -321,7 +321,6 @@ def create_hparams(flags): # Networks num_units=flags.num_units, - num_layers=flags.num_layers, # Compatible num_encoder_layers=(flags.num_encoder_layers or flags.num_layers), num_decoder_layers=(flags.num_decoder_layers or flags.num_layers), dropout=flags.dropout, @@ -555,10 +554,11 @@ def ensure_compatible_hparams(hparams, default_hparams, hparams_path): default_hparams = utils.maybe_parse_standard_hparams( default_hparams, hparams_path) # Set num encoder/decoder layers (for old checkpoints) - if not hasattr(hparams, "num_encoder_layers"): - hparams.add_hparam("num_encoder_layers", hparams.num_layers) - if not hasattr(hparams, "num_decoder_layers"): - hparams.add_hparam("num_decoder_layers", hparams.num_layers) + if hasattr(hparams, "num_layers"): + if not hasattr(hparams, "num_encoder_layers"): + hparams.add_hparam("num_encoder_layers", hparams.num_layers) + if not hasattr(hparams, "num_decoder_layers"): + hparams.add_hparam("num_decoder_layers", hparams.num_layers) # For compatible reason, if there are new fields in default_hparams, # we add them to the current hparams diff --git a/nmt/utils/standard_hparams_utils.py b/nmt/utils/standard_hparams_utils.py index 2729cf662..c47a6f6b3 100644 --- a/nmt/utils/standard_hparams_utils.py +++ b/nmt/utils/standard_hparams_utils.py @@ -36,7 +36,6 @@ def create_standard_hparams(): # Networks num_units=512, - num_layers=2, num_encoder_layers=2, num_decoder_layers=2, dropout=0.2, From 9494594194892badf2ee510ef278500770c7c71c Mon Sep 17 00:00:00 2001 From: Rui Zhao Date: Fri, 11 May 2018 11:30:05 -0700 Subject: [PATCH 31/38] Remove unused standard hparams. PiperOrigin-RevId: 196283435 --- nmt/standard_hparams/wmt16_gnmt_8_layer.json | 1 - 1 file changed, 1 deletion(-) diff --git a/nmt/standard_hparams/wmt16_gnmt_8_layer.json b/nmt/standard_hparams/wmt16_gnmt_8_layer.json index 0d668e0dd..f3b217b5d 100644 --- a/nmt/standard_hparams/wmt16_gnmt_8_layer.json +++ b/nmt/standard_hparams/wmt16_gnmt_8_layer.json @@ -23,7 +23,6 @@ "share_vocab": false, "subword_option": "bpe", "sos": "", - "source_reverse": false, "src_max_len": 50, "src_max_len_infer": null, "steps_per_external_eval": null, From 3128daca8b71744d43b93ebc5f0914e09c35ffe1 Mon Sep 17 00:00:00 2001 From: Anonymous Date: Wed, 4 Jul 2018 07:33:52 -0700 Subject: [PATCH 32/38] Fix typo. 
PiperOrigin-RevId: 203278814 --- nmt/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmt/inference.py b/nmt/inference.py index b0759a205..517aba52c 100644 --- a/nmt/inference.py +++ b/nmt/inference.py @@ -234,7 +234,7 @@ def multi_worker_inference(infer_model, for worker_id in range(num_workers): worker_infer_done = "%s_done_%d" % (inference_output_file, worker_id) while not tf.gfile.Exists(worker_infer_done): - utils.print_out(" waitting job %d to complete." % worker_id) + utils.print_out(" waiting job %d to complete." % worker_id) time.sleep(10) with codecs.getreader("utf-8")( From eb88c1863ec34075f8b5151a6f3ace3506367505 Mon Sep 17 00:00:00 2001 From: Daniel De Freitas Adiwardana Date: Fri, 20 Jul 2018 16:22:19 -0700 Subject: [PATCH 33/38] [NMT] Adding support for sharded train sets. PiperOrigin-RevId: 205470789 --- nmt/model_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nmt/model_helper.py b/nmt/model_helper.py index 826d7e4b5..65e111414 100644 --- a/nmt/model_helper.py +++ b/nmt/model_helper.py @@ -91,8 +91,8 @@ def create_train_model( src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables( src_vocab_file, tgt_vocab_file, hparams.share_vocab) - src_dataset = tf.data.TextLineDataset(src_file) - tgt_dataset = tf.data.TextLineDataset(tgt_file) + src_dataset = tf.data.TextLineDataset(tf.gfile.Glob(src_file)) + tgt_dataset = tf.data.TextLineDataset(tf.gfile.Glob(tgt_file)) skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64) iterator = iterator_utils.get_iterator( From 7932a5930b5b5d6cb7b179c0fab3fcaf51a747d3 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Thu, 2 Aug 2018 15:52:41 -0700 Subject: [PATCH 34/38] Decouple model loading from inference code PiperOrigin-RevId: 207180389 --- nmt/inference.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/nmt/inference.py b/nmt/inference.py index 517aba52c..2cbef07c2 100644 --- a/nmt/inference.py +++ b/nmt/inference.py @@ -95,6 +95,16 @@ def get_model_creator(hparams): return model_creator +def start_sess_and_load_model(infer_model, ckpt_path): + """Start session and load model.""" + sess = tf.Session( + graph=infer_model.graph, config=utils.get_config_proto()) + with infer_model.graph.as_default(): + loaded_infer_model = model_helper.load_model( + infer_model.model, ckpt_path, sess, "infer") + return sess, loaded_infer_model + + def inference(ckpt_path, inference_input_file, inference_output_file, @@ -108,27 +118,32 @@ def inference(ckpt_path, model_creator = get_model_creator(hparams) infer_model = model_helper.create_infer_model(model_creator, hparams, scope) + sess, loaded_infer_model = start_sess_and_load_model(infer_model, ckpt_path) if num_workers == 1: single_worker_inference( + sess, infer_model, - ckpt_path, + loaded_infer_model, inference_input_file, inference_output_file, hparams) else: multi_worker_inference( + sess, infer_model, - ckpt_path, + loaded_infer_model, inference_input_file, inference_output_file, hparams, num_workers=num_workers, jobid=jobid) + sess.close() -def single_worker_inference(infer_model, - ckpt_path, +def single_worker_inference(sess, + infer_model, + loaded_infer_model, inference_input_file, inference_output_file, hparams): @@ -138,10 +153,7 @@ def single_worker_inference(infer_model, # Read data infer_data = load_data(inference_input_file, hparams) - with tf.Session( - graph=infer_model.graph, config=utils.get_config_proto()) as sess: - loaded_infer_model = 
model_helper.load_model( - infer_model.model, ckpt_path, sess, "infer") + with infer_model.graph.as_default(): sess.run( infer_model.iterator.initializer, feed_dict={ @@ -174,8 +186,9 @@ def single_worker_inference(infer_model, infer_mode=hparams.infer_mode) -def multi_worker_inference(infer_model, - ckpt_path, +def multi_worker_inference(sess, + infer_model, + loaded_infer_model, inference_input_file, inference_output_file, hparams, @@ -198,10 +211,7 @@ def multi_worker_inference(infer_model, end_position = min(start_position + load_per_worker, total_load) infer_data = infer_data[start_position:end_position] - with tf.Session( - graph=infer_model.graph, config=utils.get_config_proto()) as sess: - loaded_infer_model = model_helper.load_model( - infer_model.model, ckpt_path, sess, "infer") + with infer_model.graph.as_default(): sess.run(infer_model.iterator.initializer, { infer_model.src_placeholder: infer_data, From 471e48483388dec04628b1481153cb6fc46f2892 Mon Sep 17 00:00:00 2001 From: Anonymous Date: Mon, 6 Aug 2018 14:30:43 -0700 Subject: [PATCH 35/38] Use LooseVersion for version check PiperOrigin-RevId: 207608855 --- nmt/utils/misc_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nmt/utils/misc_utils.py b/nmt/utils/misc_utils.py index 540f44100..63dc5a69c 100644 --- a/nmt/utils/misc_utils.py +++ b/nmt/utils/misc_utils.py @@ -23,6 +23,7 @@ import os import sys import time +from distutils import version import numpy as np import tensorflow as tf @@ -30,7 +31,8 @@ def check_tensorflow_version(): min_tf_version = "1.4.0-dev20171024" - if tf.__version__ < min_tf_version: + if (version.LooseVersion(tf.__version__) < + version.LooseVersion(min_tf_version)): raise EnvironmentError("Tensorflow version must >= %s" % min_tf_version) From 1355e32ebeabf7affde76e59d1291142c1fda718 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Sat, 11 Aug 2018 12:52:53 -0700 Subject: [PATCH 36/38] Minor improvements and fixes PiperOrigin-RevId: 208349749 --- nmt/model.py | 6 +++++- nmt/nmt.py | 3 ++- nmt/utils/common_test_utils.py | 2 +- nmt/utils/nmt_utils.py | 4 ++-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/nmt/model.py b/nmt/model.py index 113c17320..e0c4f4e03 100644 --- a/nmt/model.py +++ b/nmt/model.py @@ -521,7 +521,7 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): num_gpus = self.num_gpus device_id = num_layers if num_layers < num_gpus else (num_layers - 1) # Colocate output layer with the last RNN cell if there is no extra GPU - # avaliable. Otherwise, put last layer on a separate GPU. + # available. Otherwise, put last layer on a separate GPU. 
with tf.device(model_helper.get_device_str(device_id, num_gpus)): logits = self.output_layer(outputs.rnn_output) @@ -533,6 +533,10 @@ def _build_decoder(self, encoder_outputs, encoder_state, hparams): infer_mode = hparams.infer_mode start_tokens = tf.fill([self.batch_size], tgt_sos_id) end_token = tgt_eos_id + utils.print_out( + " decoder: infer_mode=%sbeam_width=%d, length_penalty=%f" % ( + infer_mode, hparams.beam_width, hparams.length_penalty_weight)) + if infer_mode == "beam_search": beam_width = hparams.beam_width length_penalty_weight = hparams.length_penalty_weight diff --git a/nmt/nmt.py b/nmt/nmt.py index c3c209129..080d077cd 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -549,10 +549,11 @@ def extend_hparams(hparams): return hparams -def ensure_compatible_hparams(hparams, default_hparams, hparams_path): +def ensure_compatible_hparams(hparams, default_hparams, hparams_path=""): """Make sure the loaded hparams is compatible with new changes.""" default_hparams = utils.maybe_parse_standard_hparams( default_hparams, hparams_path) + # Set num encoder/decoder layers (for old checkpoints) if hasattr(hparams, "num_layers"): if not hasattr(hparams, "num_encoder_layers"): diff --git a/nmt/utils/common_test_utils.py b/nmt/utils/common_test_utils.py index 28d4681db..68ff209f9 100644 --- a/nmt/utils/common_test_utils.py +++ b/nmt/utils/common_test_utils.py @@ -73,7 +73,7 @@ def create_test_hparams(unit_type="lstm", # Misc standard_hparams.forget_bias = 0.0 standard_hparams.random_seed = 3 - language_model=False + standard_hparams.language_model = False # Vocab standard_hparams.src_vocab_size = 5 diff --git a/nmt/utils/nmt_utils.py b/nmt/utils/nmt_utils.py index 524b293f9..2115de942 100644 --- a/nmt/utils/nmt_utils.py +++ b/nmt/utils/nmt_utils.py @@ -42,7 +42,7 @@ def decode_and_evaluate(name, """Decode a test set and compute a score according to the evaluation task.""" # Decode if decode: - utils.print_out(" decoding to output %s." 
% trans_file) + utils.print_out(" decoding to output %s" % trans_file) start_time = time.time() num_sentences = 0 @@ -58,7 +58,7 @@ def decode_and_evaluate(name, while True: try: nmt_outputs, _ = model.decode(sess) - if infer_mode != "beam_search" : + if infer_mode != "beam_search": nmt_outputs = np.expand_dims(nmt_outputs, 0) batch_size = nmt_outputs.shape[1] From b278487980832417ad8ac701c672b5c3dc7fa553 Mon Sep 17 00:00:00 2001 From: Thang Luong Date: Thu, 23 Aug 2018 22:49:33 -0700 Subject: [PATCH 37/38] Add INFERENCE_KEYS and make extend_hparams() in nmt.py more robust PiperOrigin-RevId: 210055078 --- nmt/nmt.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/nmt/nmt.py b/nmt/nmt.py index 080d077cd..f5823d893 100644 --- a/nmt/nmt.py +++ b/nmt/nmt.py @@ -35,6 +35,11 @@ FLAGS = None +INFERENCE_KEYS = ["src_max_len_infer", "tgt_max_len_infer", "subword_option", + "infer_batch_size", "beam_width", + "length_penalty_weight", "sampling_temperature", + "num_translations_per_input", "infer_mode"] + def add_arguments(parser): """Build ArgumentParser.""" @@ -456,7 +461,7 @@ def extend_hparams(hparams): num_decoder_residual_layers) # Language modeling - if hparams.language_model: + if getattr(hparams, "language_model", None): hparams.attention = "" hparams.attention_architecture = "" hparams.pass_hidden_state = False @@ -474,10 +479,11 @@ def extend_hparams(hparams): raise ValueError("hparams.vocab_prefix must be provided.") # Source vocab + check_special_token = getattr(hparams, "check_special_token", True) src_vocab_size, src_vocab_file = vocab_utils.check_vocab( src_vocab_file, hparams.out_dir, - check_special_token=hparams.check_special_token, + check_special_token=check_special_token, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) @@ -491,7 +497,7 @@ def extend_hparams(hparams): tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab( tgt_vocab_file, hparams.out_dir, - check_special_token=hparams.check_special_token, + check_special_token=check_special_token, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) @@ -501,15 +507,14 @@ def extend_hparams(hparams): _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) # Num embedding partitions - _add_argument( - hparams, "num_enc_emb_partitions", hparams.num_embeddings_partitions) - _add_argument( - hparams, "num_dec_emb_partitions", hparams.num_embeddings_partitions) + num_embeddings_partitions = getattr(hparams, "num_embeddings_partitions", 0) + _add_argument(hparams, "num_enc_emb_partitions", num_embeddings_partitions) + _add_argument(hparams, "num_dec_emb_partitions", num_embeddings_partitions) # Pretrained Embeddings _add_argument(hparams, "src_embed_file", "") _add_argument(hparams, "tgt_embed_file", "") - if hparams.embed_prefix: + if getattr(hparams, "embed_prefix", None): src_embed_file = hparams.embed_prefix + "." + hparams.src tgt_embed_file = hparams.embed_prefix + "." 
+ hparams.tgt @@ -540,7 +545,7 @@ def extend_hparams(hparams): _add_argument(hparams, "best_" + metric, 0, update=False) _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir) - if hparams.avg_ckpts: + if getattr(hparams, "avg_ckpts", None): best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric) tf.gfile.MakeDirs(best_metric_dir) _add_argument(hparams, "avg_best_" + metric, 0, update=False) @@ -570,13 +575,12 @@ def ensure_compatible_hparams(hparams, default_hparams, hparams_path=""): hparams.add_hparam(key, default_config[key]) # Update all hparams' keys if override_loaded_hparams=True - if default_hparams.override_loaded_hparams: + if getattr(default_hparams, "override_loaded_hparams", None): overwritten_keys = default_config.keys() else: # For inference - overwritten_keys = ["infer_batch_size", "beam_width", - "length_penalty_weight", "sampling_temperature", - "num_translations_per_input", "infer_mode"] + overwritten_keys = INFERENCE_KEYS + for key in overwritten_keys: if getattr(hparams, key) != default_config[key]: utils.print_out("# Updating hparams.%s: %s -> %s" % From c045042b85672dd8c29697a826c0249fe9ed326f Mon Sep 17 00:00:00 2001 From: Scarlat Tiberiu Date: Wed, 17 Oct 2018 17:39:40 +0300 Subject: [PATCH 38/38] Added beam_search as default inference mode in hparams. --- nmt/standard_hparams/wmt16_gnmt_4_layer.json | 1 + nmt/standard_hparams/wmt16_gnmt_8_layer.json | 1 + 2 files changed, 2 insertions(+) diff --git a/nmt/standard_hparams/wmt16_gnmt_4_layer.json b/nmt/standard_hparams/wmt16_gnmt_4_layer.json index 1274f3db0..8f36e4133 100644 --- a/nmt/standard_hparams/wmt16_gnmt_4_layer.json +++ b/nmt/standard_hparams/wmt16_gnmt_4_layer.json @@ -31,6 +31,7 @@ "tgt_max_len_infer": null, "time_major": true, "unit_type": "lstm", + "infer_mode": "beam_search", "beam_width": 10, "length_penalty_weight": 1.0 } diff --git a/nmt/standard_hparams/wmt16_gnmt_8_layer.json b/nmt/standard_hparams/wmt16_gnmt_8_layer.json index f3b217b5d..b96ec8782 100644 --- a/nmt/standard_hparams/wmt16_gnmt_8_layer.json +++ b/nmt/standard_hparams/wmt16_gnmt_8_layer.json @@ -31,6 +31,7 @@ "tgt_max_len_infer": null, "time_major": true, "unit_type": "lstm", + "infer_mode": "beam_search", "beam_width": 10, "length_penalty_weight": 1.0 }
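A few illustrative sketches of behaviors introduced by the patches above follow. None of this code is part of the repository; all helper names are made up for illustration.

[PATCH 23/38] and [PATCH 24/38] change nmt/utils/vocab_utils.load_embed_txt so it accepts word2vec-format files (which carry a "<vocab_size> <embed_size>" header line) in addition to GloVe-format files, and so it skips entries whose vector length disagrees with the inferred embedding size instead of asserting. A minimal pure-Python sketch of that behavior, assuming a plain local file rather than tf.gfile:

    import codecs

    def load_embed_txt_sketch(embed_file):
      """Returns (word -> vector dict, embedding size); skips bad entries."""
      emb_dict = {}
      emb_size = None
      is_first_line = True
      with codecs.open(embed_file, "r", "utf-8") as f:
        for line in f:
          tokens = line.rstrip().split(" ")
          if is_first_line:
            is_first_line = False
            if len(tokens) == 2:  # word2vec header: "<vocab_size> <embed_size>"
              emb_size = int(tokens[1])
              continue
          word = tokens[0]
          vec = [float(t) for t in tokens[1:]]
          if emb_size is None:
            emb_size = len(vec)
          elif len(vec) != emb_size:
            continue  # misformatted entry: ignore it rather than assert
          emb_dict[word] = vec
      return emb_dict, emb_size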
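[PATCH 27/38] adds a character-level encoding path for the encoder: each source token becomes a fixed-length vector of byte ids (DEFAULT_CHAR_MAXLEN = 50), wrapped in begin-of-word/end-of-word markers, padded, and shifted by +1. A NumPy analogue of vocab_utils._string_to_bytes / tokens_to_bytes, for illustration only:

    import numpy as np

    BOW_CHAR_ID, EOW_CHAR_ID, PAD_CHAR_ID = 258, 259, 260
    DEFAULT_CHAR_MAXLEN = 50

    def string_to_bytes_sketch(token, max_length=DEFAULT_CHAR_MAXLEN):
      # utf-8 bytes of the token, truncated to leave room for <bow>/<eow>.
      byte_ids = list(token.encode("utf-8"))[:max_length - 2]
      padding = [PAD_CHAR_ID] * (max_length - len(byte_ids) - 2)
      ids = [BOW_CHAR_ID] + byte_ids + [EOW_CHAR_ID] + padding
      return np.array(ids, dtype=np.int32) + 1  # shift so id 0 stays unused

    def tokens_to_bytes_sketch(tokens):
      # Shape [len(tokens), DEFAULT_CHAR_MAXLEN], matching the TF version.
      return np.stack([string_to_bytes_sketch(t) for t in tokens])

    print(tokens_to_bytes_sketch(["the", "cat"]).shape)  # (2, 50)

This is also why get_iterator in the same patch divides tf.size(src) by vocab_utils.DEFAULT_CHAR_MAXLEN to recover the source length in words: the flattened character tensor carries exactly 50 ids per token.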
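[PATCH 35/38] replaces the plain string comparison in check_tensorflow_version with distutils LooseVersion. The motivation shows in a two-line example (illustrative, not from the repository): lexicographic comparison orders "1.10.0" before "1.4.0", so the old check would wrongly reject TensorFlow 1.10 and newer, while LooseVersion compares numeric components.

    from distutils import version

    print("1.10.0" < "1.4.0")                                              # True  (lexicographic, wrong)
    print(version.LooseVersion("1.10.0") < version.LooseVersion("1.4.0"))  # False (numeric, right)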
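[PATCH 37/38] makes extend_hparams robust to hparams files saved by older versions of the code by reading optional fields with getattr and a default instead of plain attribute access. A minimal sketch of the pattern (the Hparams stand-in class here is hypothetical, not tf.contrib.training.HParams):

    class Hparams(object):
      """Stand-in for an hparams object loaded from an old out_dir."""
      pass

    old = Hparams()  # older runs never saved these fields
    check_special_token = getattr(old, "check_special_token", True)
    num_embeddings_partitions = getattr(old, "num_embeddings_partitions", 0)
    language_model = getattr(old, "language_model", None)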