from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import collections
import logging
import math
import time

import numpy as np

from itertools import izip

from caffe2.python import core, workspace, recurrent
from caffe2.python.examples import seq2seq_util


logger = logging.getLogger(__name__)

Batch = collections.namedtuple('Batch', [
    'encoder_inputs',
    'encoder_lengths',
    'decoder_inputs',
    'decoder_lengths',
    'targets',
    'target_weights',
])

_PAD_ID = 0
_GO_ID = 1
_EOS_ID = 2
EOS = '<EOS>'
UNK = '<UNK>'
GO = '<GO>'
PAD = '<PAD>'
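
# NOTE: the special-token IDs above correspond to the insertion order used by
# gen_vocab() below (PAD -> 0, GO -> 1, EOS -> 2, UNK -> 3), so the constants
# stay consistent with any vocabulary generated by this script.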


def prepare_batch(batch):
    encoder_lengths = [len(entry[0]) for entry in batch]
    max_encoder_length = max(encoder_lengths)
    decoder_lengths = []
    max_decoder_length = max([len(entry[1]) for entry in batch])

    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_targets = []
    batch_target_weights = []

    for source_seq, target_seq in batch:
        encoder_pads = (
            [_PAD_ID] * (max_encoder_length - len(source_seq))
        )
        batch_encoder_inputs.append(
            list(reversed(source_seq)) + encoder_pads
        )

        decoder_pads = (
            [_PAD_ID] * (max_decoder_length - len(target_seq))
        )
        target_seq_with_go_token = [_GO_ID] + target_seq
        decoder_lengths.append(len(target_seq_with_go_token))
        batch_decoder_inputs.append(target_seq_with_go_token + decoder_pads)

        target_seq_with_eos = target_seq + [_EOS_ID]
        targets = target_seq_with_eos + decoder_pads
        batch_targets.append(targets)

        if len(source_seq) + len(target_seq) == 0:
            target_weights = [0] * len(targets)
        else:
            target_weights = [
                1 if target != _PAD_ID else 0
                for target in targets
            ]
        batch_target_weights.append(target_weights)

    return Batch(
        encoder_inputs=np.array(
            batch_encoder_inputs,
            dtype=np.int32,
        ).transpose(),
        encoder_lengths=np.array(encoder_lengths, dtype=np.int32),
        decoder_inputs=np.array(
            batch_decoder_inputs,
            dtype=np.int32,
        ).transpose(),
        decoder_lengths=np.array(decoder_lengths, dtype=np.int32),
        targets=np.array(
            batch_targets,
            dtype=np.int32,
        ).transpose(),
        target_weights=np.array(
            batch_target_weights,
            dtype=np.float32,
        ).transpose(),
    )
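
# A minimal sketch of the layout prepare_batch() produces; the token ID lists
# below are made-up numberized sentences, not data from any real corpus:
#
#   batch = [([4, 5, 6], [7, 8]), ([9], [10, 11, 12])]
#   b = prepare_batch(batch)
#   # b.encoder_inputs  -> shape (max_encoder_length, batch_size) = (3, 2),
#   #                      each source reversed and right-padded with _PAD_ID
#   # b.decoder_inputs  -> targets prefixed with _GO_ID, padded to equal length
#   # b.targets         -> targets suffixed with _EOS_ID, padded with _PAD_ID
#   # b.target_weights  -> 1.0 for real tokens, 0.0 at padded positions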


class Seq2SeqModelCaffe2:

    def _embedding_encoder(
        self,
        model,
        encoder_type,
        encoder_params,
        inputs,
        input_lengths,
        vocab_size,
        embedding_size,
        use_attention,
    ):
        # Initialize the word embeddings that will be learned during training.
        sqrt3 = math.sqrt(3)
        encoder_embeddings = model.param_init_net.UniformFill(
            [],
            'encoder_embeddings',
            shape=[vocab_size, embedding_size],
            min=-sqrt3,
            max=sqrt3,
        )
        model.params.append(encoder_embeddings)

        # Convert inputs to embedded inputs by the embeddings above.
        embedded_encoder_inputs = model.net.Gather(
            [encoder_embeddings, inputs],
            ['embedded_encoder_inputs'],
        )

        if encoder_type == 'rnn':
            encoder_num_units = (
                encoder_params['encoder_layer_configs'][0]['num_units']
            )

            encoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'encoder_initial_cell_state',
                shape=[encoder_num_units],
                value=0.0,
            )

            encoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'encoder_initial_hidden_state',
                shape=[encoder_num_units],
                value=0.0,
            )

            # Choose the corresponding RNN encoder function.
            if encoder_params['use_bidirectional_encoder']:
                rnn_encoder_func = seq2seq_util.rnn_bidirectional_encoder
            else:
                rnn_encoder_func = seq2seq_util.rnn_unidirectional_encoder

            (
                encoder_outputs,
                final_encoder_hidden_state,
                final_encoder_cell_state,
            ) = rnn_encoder_func(
                model,
                embedded_encoder_inputs,
                input_lengths,
                encoder_initial_hidden_state,
                encoder_initial_cell_state,
                embedding_size,
                encoder_num_units,
                use_attention,
            )

        else:
            raise ValueError('Unsupported encoder type {}'.format(encoder_type))

        return (
            encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
        )

    def output_projection(
        self,
        model,
        decoder_outputs,
        decoder_output_size,
        target_vocab_size,
        decoder_softmax_size,
    ):
        if decoder_softmax_size is not None:
            decoder_outputs = model.FC(
                decoder_outputs,
                'decoder_outputs_scaled',
                dim_in=decoder_output_size,
                dim_out=decoder_softmax_size,
            )
            decoder_output_size = decoder_softmax_size

        output_projection_w = model.param_init_net.XavierFill(
            [],
            'output_projection_w',
            shape=[target_vocab_size, decoder_output_size],
        )

        output_projection_b = model.param_init_net.XavierFill(
            [],
            'output_projection_b',
            shape=[target_vocab_size],
        )
        model.params.extend([
            output_projection_w,
            output_projection_b,
        ])
        output_logits = model.net.FC(
            [
                decoder_outputs,
                output_projection_w,
                output_projection_b,
            ],
            ['output_logits'],
        )
        return output_logits

    def _build_model(
        self,
        init_params,
    ):
        model = seq2seq_util.ModelHelper(
            init_params=init_params,
        )

        self.encoder_inputs = model.net.AddExternalInput('encoder_inputs')
        self.encoder_lengths = model.net.AddExternalInput('encoder_lengths')
        self.decoder_inputs = model.net.AddExternalInput('decoder_inputs')
        self.decoder_lengths = model.net.AddExternalInput('decoder_lengths')
        self.targets = model.net.AddExternalInput('targets')
        self.target_weights = model.net.AddExternalInput('target_weights')

        optimizer_params = self.model_params['optimizer_params']
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular']

        self.learning_rate = model.AddParam(
            name='learning_rate',
            init_value=float(optimizer_params['learning_rate']),
            trainable=False,
        )
        self.global_step = model.AddParam(
            name='global_step',
            init_value=0,
            trainable=False,
        )
        self.start_time = model.AddParam(
            name='start_time',
            init_value=time.time(),
            trainable=False,
        )

        # At most one GPU and single-layer encoder/decoder are the only
        # supported configurations.
        assert self.num_gpus < 2
        assert len(self.encoder_params['encoder_layer_configs']) == 1
        assert len(self.model_params['decoder_layer_configs']) == 1

        encoder_num_units = (
            self.encoder_params['encoder_layer_configs'][0]['num_units']
        )
        decoder_num_units = (
            self.model_params['decoder_layer_configs'][0]['num_units']
        )

        (
            encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
        ) = self._embedding_encoder(
            model=model,
            encoder_type=self.encoder_type,
            encoder_params=self.encoder_params,
            inputs=self.encoder_inputs,
            input_lengths=self.encoder_lengths,
            vocab_size=self.source_vocab_size,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
        )

        # For a bidirectional RNN, the number of units doubles after encoding.
        if (
            self.encoder_type == 'rnn' and
            self.encoder_params['use_bidirectional_encoder']
        ):
            encoder_num_units *= 2

        if attention_type == 'none':
            decoder_initial_hidden_state = model.FC(
                final_encoder_hidden_state,
                'decoder_initial_hidden_state',
                encoder_num_units,
                decoder_num_units,
                axis=2,
            )
            decoder_initial_cell_state = model.FC(
                final_encoder_cell_state,
                'decoder_initial_cell_state',
                encoder_num_units,
                decoder_num_units,
                axis=2,
            )
        else:
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            initial_attention_weighted_encoder_context = (
                model.param_init_net.ConstantFill(
                    [],
                    'initial_attention_weighted_encoder_context',
                    shape=[encoder_num_units],
                    value=0.0,
                )
            )

        sqrt3 = math.sqrt(3)
        decoder_embeddings = model.AddParam(
            name='decoder_embeddings',
            init=('UniformFill', dict(
                shape=[
                    self.target_vocab_size,
                    self.model_params['decoder_embedding_size'],
                ],
                min=-sqrt3,
                max=sqrt3,
            )),
        )

        embedded_decoder_inputs = model.net.Gather(
            [decoder_embeddings, self.decoder_inputs],
            ['embedded_decoder_inputs'],
        )
        # seq_len x batch_size x decoder_embedding_size
        with core.NameScope('', reset=True):
            if attention_type == 'none':
                decoder_outputs, _, _, _ = recurrent.LSTM(
                    model=model,
                    input_blob=embedded_decoder_inputs,
                    seq_lengths=self.decoder_lengths,
                    initial_states=(
                        decoder_initial_hidden_state,
                        decoder_initial_cell_state,
                    ),
                    dim_in=self.model_params['decoder_embedding_size'],
                    dim_out=decoder_num_units,
                    scope='decoder',
                    outputs_with_grads=[0],
                )
                decoder_output_size = decoder_num_units
            else:
                (
                    decoder_outputs, _, _, _,
                    attention_weighted_encoder_contexts, _
                ) = recurrent.LSTMWithAttention(
                    model=model,
                    decoder_inputs=embedded_decoder_inputs,
                    decoder_input_lengths=self.decoder_lengths,
                    initial_decoder_hidden_state=decoder_initial_hidden_state,
                    initial_decoder_cell_state=decoder_initial_cell_state,
                    initial_attention_weighted_encoder_context=(
                        initial_attention_weighted_encoder_context
                    ),
                    encoder_output_dim=encoder_num_units,
                    encoder_outputs=encoder_outputs,
                    decoder_input_dim=self.model_params['decoder_embedding_size'],
                    decoder_state_dim=decoder_num_units,
                    scope='decoder',
                    outputs_with_grads=[0, 4],
                )
                decoder_outputs, _ = model.net.Concat(
                    [decoder_outputs, attention_weighted_encoder_contexts],
                    [
                        'states_and_context_combination',
                        '_states_and_context_combination_concat_dims',
                    ],
                    axis=2,
                )
                decoder_output_size = decoder_num_units + encoder_num_units

        # We do softmax over the whole sequence:
        # (max_length in the batch * batch_size) x decoder embedding size.
        # -1 because we don't know max_length yet.
        decoder_outputs_flattened, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = self.output_projection(
            model=model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        targets, _ = model.net.Reshape(
            [self.targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [self.target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )

        output_probs, loss_per_word = model.net.SoftmaxWithLoss(
            [output_logits, targets, target_weights],
            ['OutputProbs', 'loss_per_word'],
        )

        num_words = model.net.ReduceFrontSum(
            target_weights,
            'num_words',
        )
        self.total_loss_scalar = model.net.Mul(
            [loss_per_word, num_words],
            'total_loss_scalar',
        )
        self.forward_net = model.net.Clone(
            name=model.net.Name() + '_forward_only',
        )
        # Print the loss only in the forward net, which evaluates it after
        # every epoch.
        self.forward_net.Print([self.total_loss_scalar], [])

        # Note: average over batch.
        # It is tricky because of two problems:
        # 1. ReduceFrontSum from a 1-D tensor returns a 0-D tensor.
        # 2. If you want to multiply a 0-D by a 1-D tensor
        #    (by scalar batch_size_inverse_tensor),
        #    you need to use broadcasting. But gradient propagation
        #    is broken for ops with broadcasting.
        # total_loss_scalar, _ = model.net.Reshape(
        #     [total_loss_scalar],
        #     [total_loss_scalar, 'total_loss_scalar_old_shape'],
        #     shape=[1],
        # )
        batch_size_inverse_tensor = (
            model.param_init_net.ConstantFill(
                [],
                'batch_size_tensor',
                shape=[],
                value=1.0 / self.batch_size,
            )
        )
        total_loss_scalar_average = model.net.Mul(
            [self.total_loss_scalar, batch_size_inverse_tensor],
            ['total_loss_scalar_average'],
        )

        model.AddGradientOperators([
            total_loss_scalar_average,
        ])
        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('All trainable variables: ')

        for param in model.params:
            if param not in model.param_to_grad:
                continue
            param_grad = model.param_to_grad[param]
            if isinstance(param_grad, core.GradientSlice):
                # Sparse gradient: clip the gradient values, then apply
                # param = param - learning_rate * grad at the touched rows only.
                param_grad_values = param_grad.values
                param_grad_values = model.net.Clip(
                    [param_grad_values],
                    [param_grad_values],
                    min=0.0,
                    max=float(self.model_params['max_grad_value']),
                )
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad_values,
                        model.net.Negative(
                            [self.learning_rate],
                            'negative_learning_rate',
                        ),
                    ],
                    param,
                )
            else:
                # Dense gradient: clip, then param = param - learning_rate * grad.
                param_grad = model.net.Clip(
                    [param_grad],
                    [param_grad],
                    min=0.0,
                    max=float(self.model_params['max_grad_value']),
                )
                model.net.WeightedSum(
                    [
                        param,
                        ONE,
                        param_grad,
                        model.net.Negative(
                            [self.learning_rate],
                            'negative_learning_rate',
                        ),
                    ],
                    param,
                )
        self.model = model

    def _init_model(self):
        workspace.RunNetOnce(self.model.param_init_net)

        def create_net(net):
            workspace.CreateNet(
                net,
                input_blobs=map(str, net.external_inputs),
            )

        create_net(self.model.net)
        create_net(self.forward_net)

    def __init__(
        self,
        model_params,
        source_vocab_size,
        target_vocab_size,
        num_gpus=1,
        num_cpus=1,
    ):
        self.model_params = model_params
        self.encoder_type = 'rnn'
        self.encoder_params = model_params['encoder_type']
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.num_gpus = num_gpus
        self.num_cpus = num_cpus
        self.batch_size = model_params['batch_size']

        workspace.GlobalInit([
            'caffe2',
            # NOTE: modify log level for debugging purposes
            '--caffe2_log_level=0',
            # NOTE: modify log level for debugging purposes
            '--v=0',
            # Fail gracefully if one of the threads fails
            '--caffe2_handle_executor_threads_exceptions=1',
            '--caffe2_mkl_num_threads=' + str(self.num_cpus),
        ])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        workspace.ResetWorkspace()

    def initialize_from_scratch(self):
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Start')
        self._build_model(init_params=True)
        self._init_model()
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Finish')

    def get_current_step(self):
        return workspace.FetchBlob(self.global_step)[0]

    def inc_current_step(self):
        workspace.FeedBlob(
            self.global_step,
            np.array([self.get_current_step() + 1]),
        )

    def step(
        self,
        batch,
        forward_only,
    ):
        batch_obj = prepare_batch(batch)
        for batch_obj_name, batch_obj_value in izip(Batch._fields, batch_obj):
            workspace.FeedBlob(batch_obj_name, batch_obj_value)

        if forward_only:
            workspace.RunNet(self.forward_net)
        else:
            workspace.RunNet(self.model.net)
            self.inc_current_step()

        return workspace.FetchBlob(self.total_loss_scalar)


def gen_vocab(corpus, unk_threshold):
    vocab = collections.defaultdict(lambda: len(vocab))
    freqs = collections.defaultdict(lambda: 0)
    # Register the special tokens first so that their IDs stay consistent
    # with the module-level _PAD_ID, _GO_ID and _EOS_ID constants.
    vocab[PAD]
    vocab[GO]
    vocab[EOS]
    vocab[UNK]

    with open(corpus) as f:
        for sentence in f:
            tokens = sentence.strip().split()
            for token in tokens:
                freqs[token] += 1
    for token, freq in freqs.items():
        if freq > unk_threshold:
            # TODO: Add reverse lookup dict when it becomes necessary
            vocab[token]

    return vocab
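
# A minimal sketch of how the defaultdict numbering above assigns IDs, assuming
# a corpus whose only token over the threshold is the (hypothetical) word
# 'hello'; the corpus path is a placeholder:
#
#   vocab = gen_vocab('corpus.txt', unk_threshold=50)
#   # vocab[PAD] == 0, vocab[GO] == 1, vocab[EOS] == 2, vocab[UNK] == 3
#   # vocab['hello'] == 4; any unseen token queried later would silently get
#   # the next free ID, which is why lookups fall back to UNK in gen_batches.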


def gen_batches(source_corpus, target_corpus, source_vocab, target_vocab,
                batch_size):
    batches = []
    with open(source_corpus) as source, open(target_corpus) as target:
        elems = 0
        batch = []
        for source_sentence, target_sentence in zip(source, target):

            # Convert sentence into list of vocabulary indices
            def get_numberized_sentence(sentence, vocab):
                sentence_with_id = []
                for token in sentence.strip().split():
                    if token in vocab:
                        sentence_with_id.append(vocab[token])
                    else:
                        sentence_with_id.append(vocab[UNK])
                sentence_with_id.append(vocab[EOS])
                return sentence_with_id

            batch.append(tuple([
                get_numberized_sentence(sentence, vocab)
                for vocab, sentence in zip(
                    [source_vocab, target_vocab],
                    [source_sentence, target_sentence],
                )
            ]))
            elems += 1
            if elems >= batch_size:
                batches.append(batch)
                batch = []
                elems = 0
        # Pad the last, partial batch by repeating its final example so that
        # every batch has exactly batch_size entries.
        if len(batch) > 0:
            while len(batch) < batch_size:
                batch.append(batch[-1])
            assert len(batch) == batch_size
            batches.append(batch)
    return batches


def run_seq2seq_model(args, model_params=None):
    source_vocab = gen_vocab(args.source_corpus, args.unk_threshold)
    target_vocab = gen_vocab(args.target_corpus, args.unk_threshold)
    batches = gen_batches(args.source_corpus, args.target_corpus, source_vocab,
                          target_vocab, model_params['batch_size'])

    logger.info('Source vocab size {}'.format(len(source_vocab)))
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    batches_eval = gen_batches(args.source_corpus_eval, args.target_corpus_eval,
                               source_vocab, target_vocab,
                               model_params['batch_size'])

    with Seq2SeqModelCaffe2(
        model_params=model_params,
        source_vocab_size=len(source_vocab),
        target_vocab_size=len(target_vocab),
        num_gpus=0,
    ) as model_obj:
        model_obj.initialize_from_scratch()
        for _ in range(args.epochs):
            for batch in batches:
                model_obj.step(
                    batch=batch,
                    forward_only=False,
                )
            model_obj.step(
                # Merge all eval batches for the scalar loss computation.
                batch=[entry for batch in batches_eval for entry in batch],
                forward_only=True,
            )


def run_seq2seq_rnn_unidirection_with_no_attention(args):
    run_seq2seq_model(args, model_params=dict(
        attention='none',
        decoder_layer_configs=[
            dict(
                num_units=args.decoder_cell_num_units,
            ),
        ],
        encoder_type=dict(
            encoder_layer_configs=[
                dict(
                    num_units=args.encoder_cell_num_units,
                ),
            ],
            use_bidirectional_encoder=args.use_bidirectional_encoder,
        ),
        batch_size=args.batch_size,
        optimizer_params=dict(
            learning_rate=args.learning_rate,
        ),
        encoder_embedding_size=args.encoder_embedding_size,
        decoder_embedding_size=args.decoder_embedding_size,
        decoder_softmax_size=args.decoder_softmax_size,
        max_grad_value=args.max_grad_value,
    ))


def main():
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Training'
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to source corpus in a text file format. Each '
                        'line in the file should contain a single sentence',
                        required=True)
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to target corpus in a text file format',
                        required=True)
    parser.add_argument('--batch-size', type=int, default=500,
                        help='Training batch size')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of iterations over training data')
    parser.add_argument('--learning-rate', type=float, default=0.01,
                        help='Learning rate')
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Threshold frequency under which token becomes '
                        'labelled unknown token')
    parser.add_argument('--max-grad-value', type=float, default=0.1,
                        help='Max clip value of gradients at the end of each '
                        'backward pass')
    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set flag to use bidirectional recurrent network '
                        'in encoder')
    parser.add_argument('--source-corpus-eval', type=str, default=None,
                        help='Path to source corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--target-corpus-eval', type=str, default=None,
                        help='Path to target corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--encoder-cell-num-units', type=int, default=25,
                        help='Number of cell units in the encoder layer')
    parser.add_argument('--decoder-cell-num-units', type=int, default=50,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--encoder-embedding-size', type=int, default=25,
                        help='Size of embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=25,
                        help='Size of embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=25,
                        help='Size of softmax layer in the decoder')

    args = parser.parse_args()

    run_seq2seq_rnn_unidirection_with_no_attention(args)


if __name__ == '__main__':
    main()
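
# Example invocation (a sketch; the script filename and all corpus paths below
# are placeholders for your own tokenized, one-sentence-per-line files):
#
#   python seq2seq.py \
#       --source-corpus train.src --target-corpus train.tgt \
#       --source-corpus-eval valid.src --target-corpus-eval valid.tgt \
#       --batch-size 32 --epochs 5 --use-bidirectional-encoder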