pytorch/caffe2/python/examples/seq2seq_util.py
Deepak Gopinath 57ecd20197 seq2seq open source implementation
Summary:
OSS implementation of seq2seq model in Caffe2. The script uses Seq2SeqModelCaffe2 class to build and run the model. It takes in training data in the form of text file with one sentence in each line, builds a vocabulary, generates batches based on batch size and runs the net for a configurable number of epochs. It prints total scalar loss at the end of each epoch.

All FBLearner and neural_mt type system dependencies have been removed. Unimplemented and unnecessary methods have been removed to make the script simpler.
fblearner/flow/projects/langtech/translation/neural_mt/model_util_caffe2.py has been moved to caffe2/caffe2/python/examples/seq2seq_util.py and remains unchanged

Potential TODOs:
  - Get the model running in GPU. Only GatherOp does not have a corresponding GPU implementation. Try adding CopyGPUToCPU before and CopyCPUToGPU after Gather, and use CUDA DeviceOption.
  - Add evaluation on test data with suitable metric (perplexity? bleu?)

Reviewed By: urikz

Differential Revision: D4653333

fbshipit-source-id: 1c7d970ebc86afe23fad4d48854296bf54eb0f77
2017-03-09 16:18:08 -08:00

157 lines
4.7 KiB
Python

""" A bunch of util functions to build Seq2Seq models with Caffe2."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, recurrent
from caffe2.python.cnn import CNNModelHelper
class ModelHelper(CNNModelHelper):
    """CNNModelHelper specialization that also tracks non-trainable params.

    Seq2Seq models need some parameters that are materialized by the init
    net but must be excluded from gradient computation; this helper keeps
    those in a separate ``non_trainable_params`` list.
    """

    def __init__(self, init_params=True):
        super(ModelHelper, self).__init__(
            order='NCHW',  # this is only relevant for convolutional networks
            init_params=init_params,
        )
        # Parameters that are part of the model but receive no gradients.
        self.non_trainable_params = []

    def AddParam(self, name, init=None, init_value=None, trainable=True):
        """Adds a parameter to the model's net and its initializer if needed.

        Args:
            name: blob name for the parameter.
            init: a tuple (<initialization_op_name>, <initialization_op_kwargs>)
            init_value: int, float or str. Can be used instead of `init` as a
                simple constant initializer.
            trainable: bool, whether to compute gradient for this param or not.

        Returns:
            A reference to the created (or externally declared) parameter blob.
        """
        if init_value is not None:
            assert init is None, 'Specify either init or init_value, not both'
            # isinstance is the idiomatic type check (type(...) in [...] was
            # used before and also rejected subclasses).
            assert isinstance(init_value, (int, float, str)), \
                'init_value must be an int, float or str'
            init = ('ConstantFill', dict(
                shape=[1],
                value=init_value,
            ))
        if self.init_params:
            # Look up the initializer op by name on the init net, e.g.
            # param_init_net.ConstantFill([], name, **kwargs). Using the
            # getattr builtin instead of calling __getattr__ directly.
            param = getattr(self.param_init_net, init[0])(
                [],
                name,
                **init[1]
            )
        else:
            # Parameters are expected to be fed externally (e.g. a loaded
            # checkpoint), so just declare the blob as an external input.
            param = self.net.AddExternalInput(name)
        if trainable:
            self.params.append(param)
        else:
            self.non_trainable_params.append(param)
        return param
def rnn_unidirectional_encoder(
    model,
    embedded_inputs,
    input_lengths,
    initial_hidden_state,
    initial_cell_state,
    embedding_size,
    encoder_num_units,
    use_attention
):
    """Builds a single forward-direction LSTM encoder.

    Returns:
        (outputs, final_hidden_state, final_cell_state) blob references
        produced by recurrent.LSTM.
    """
    with core.NameScope('', reset=True):
        # With attention the decoder consumes the per-step outputs, so only
        # those need gradients; otherwise gradients flow through the final
        # hidden and cell states instead.
        grad_outputs = [0] if use_attention else [1, 3]
        lstm_results = recurrent.LSTM(
            model=model,
            input_blob=embedded_inputs,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            dim_in=embedding_size,
            dim_out=encoder_num_units,
            scope='encoder',
            outputs_with_grads=grad_outputs,
        )
        outputs, final_hidden_state, _, final_cell_state = lstm_results
    return outputs, final_hidden_state, final_cell_state
def rnn_bidirectional_encoder(
    model,
    embedded_inputs,
    input_lengths,
    initial_hidden_state,
    initial_cell_state,
    embedding_size,
    encoder_num_units,
    use_attention
):
    """Builds a bidirectional LSTM encoder (forward plus backward pass).

    Both directions share the same initial states; their outputs and final
    states are concatenated along axis 2, so downstream consumers see
    2 * encoder_num_units hidden units.

    Returns:
        (outputs, final_hidden_state, final_cell_state) blob references.
    """
    grad_outputs = [0] if use_attention else [1, 3]
    with core.NameScope('', reset=True):
        # Forward direction reads the inputs as-is.
        fw_out, fw_hidden, _, fw_cell = recurrent.LSTM(
            model=model,
            input_blob=embedded_inputs,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            dim_in=embedding_size,
            dim_out=encoder_num_units,
            scope='forward_encoder',
            outputs_with_grads=grad_outputs,
        )
        # Backward direction runs over the time-reversed sequence.
        # ReversePackedSegs respects per-example lengths, so padding is
        # left untouched.
        reversed_embedded_inputs = model.net.ReversePackedSegs(
            [embedded_inputs, input_lengths],
            ['reversed_embedded_inputs'],
        )
        bw_out, bw_hidden, _, bw_cell = recurrent.LSTM(
            model=model,
            input_blob=reversed_embedded_inputs,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            dim_in=embedding_size,
            dim_out=encoder_num_units,
            scope='backward_encoder',
            outputs_with_grads=grad_outputs,
        )
        # Undo the reversal so backward outputs line up in time with the
        # forward ones.
        bw_out = model.net.ReversePackedSegs(
            [bw_out, input_lengths],
            ['outputs_bw'],
        )
        # Fuse the two directions along the hidden-unit axis.
        outputs, _ = model.net.Concat(
            [fw_out, bw_out],
            ['outputs', 'outputs_dim'],
            axis=2,
        )
        final_hidden_state, _ = model.net.Concat(
            [fw_hidden, bw_hidden],
            ['final_hidden_state', 'final_hidden_state_dim'],
            axis=2,
        )
        final_cell_state, _ = model.net.Concat(
            [fw_cell, bw_cell],
            ['final_cell_state', 'final_cell_state_dim'],
            axis=2,
        )
    return outputs, final_hidden_state, final_cell_state