pytorch/caffe2/python/lstm_benchmark.py
Aapo Kyrola 9cb901caf0 Forward-only RNNs
Summary:
Added a forward_only option to recurrent_net and the RNNCells. If it is set, the backward_step_net is not passed to the operator.
When backward_step_net is not available, the operator knows it is in forward-only mode and, instead of creating a workspace for
each step, cycles through a single private workspace.

Note: we could avoid doing a lot of work in the recurrent.py:recurrent_network call when the backward step is not needed, but doing
that cleanly requires more refactoring than I wanted to do now. So for now we still create the backward step nets etc., we just don't pass them to the op.

This can be used to create more efficient inference models. You can also sanitize existing inference nets and remove the backward_step_net
argument to get the same benefit.

Reviewed By: salexspb

Differential Revision: D4916482

fbshipit-source-id: c99b93c9cb897c32b0f449253f7f6d6a942618ad
2017-04-24 15:52:27 -07:00
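
The forward_only flag is the knob this commit adds. For reference, here is a minimal, hypothetical sketch of wiring an inference-only LSTM with it, mirroring the rnn_cell.LSTM call used in the benchmark file below (the model name, blob names, and dimensions are illustrative placeholders, not part of this change):

    from caffe2.python import cnn, rnn_cell

    model = cnn.CNNModelHelper(name="lstm_inference")
    seq_lengths, hidden_init, cell_init, input_blob = \
        model.net.AddExternalInputs(
            'seq_lengths', 'hidden_init', 'cell_init', 'input_data',
        )

    # With forward_only=True no backward_step_net is attached, so the
    # recurrent net operator cycles through a single private workspace
    # instead of keeping one workspace per timestep.
    output, last_hidden, _, last_state = rnn_cell.LSTM(
        model=model,
        input_blob=input_blob,
        seq_lengths=seq_lengths,
        initial_states=(hidden_init, cell_init),
        dim_in=40,
        dim_out=40,
        scope="lstm1",
        memory_optimization=False,
        forward_only=True,
    )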

## @package lstm_benchmark
# Module caffe2.python.lstm_benchmark
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.proto import caffe2_pb2
from caffe2.python import cnn, workspace, core, utils, rnn_cell

import argparse
import numpy as np
import time
import logging

logging.basicConfig()
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)


def generate_data(T, shape, num_labels):
    '''
    Fill a queue with input data
    '''
    log.info("Generating T={} sequence batches".format(T))

    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = generate_input_init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )
    workspace.RunNetOnce(generate_input_init_net)

    generate_input_net = core.Net('generate_input')
    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])
    np.random.seed(2603)

    for t in range(T):
        if (t % 50 == 0):
            print("Generating data {}/{}".format(t, T))
        # Randomize the seqlength
        random_shape = [np.random.randint(1, shape[0])] + shape[1:]
        X = np.random.rand(*random_shape).astype(np.float32)
        batch_size = random_shape[1]
        L = num_labels * batch_size
        labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(generate_input_net.Proto())

    log.info("Finished data generation")
    return queue, label_queue


def create_model(args, queue, label_queue, input_shape):
    model = cnn.CNNModelHelper(name="LSTM_bench")
    seq_lengths, hidden_init, cell_init, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )

    input_blob = model.DequeueBlobs(queue, "input_data")
    labels = model.DequeueBlobs(label_queue, "label")

    if args.implementation == "own":
        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
        )
    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=1,
        )
    else:
        assert False, "Unknown implementation"

    weights = model.UniformFill(labels, "weights")
    softmax, loss = model.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # carry states over
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_hidden, cell_init)

    workspace.FeedBlob(hidden_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    workspace.FeedBlob(cell_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    return model, output


def Caffe2LSTM(args):
    T = args.data_size // args.batch_size

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue, label_queue = generate_data(T // args.seq_length,
                                       input_blob_shape,
                                       args.hidden_dim)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    last_time = time.time()
    start_time = last_time
    num_iters = T // args.seq_length
    entries_per_iter = args.seq_length * args.batch_size

    # Run the benchmark
    log.info("------ Starting benchmark ------")
    for iteration in range(0, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info("Iter: {} / {}. Entries Per Second: {}k.".format(
            iteration,
            num_iters,
            entries_per_iter * iters_once / (new_time - last_time) // 1000,
        ))
        last_time = new_time

    log.info("Done. Total EPS: {}k".format(
        entries_per_iter * num_iters / (time.time() - start_time) // 1000,
    ))

    if (args.gpu):
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if (stats['max_total'] != stats['total']):
            log.warning(
                "Max usage differs from current total usage: {} > {}".
                format(stats['max_total'], stats['total'])
            )
            log.warning("This means that costly deallocations occurred.")


@utils.debug
def Benchmark(args):
    Caffe2LSTM(args)


def GetArgumentParser():
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=40,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="The batch size.",
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Max sequence length",
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=10000000,
        help="Number of data points to generate",
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=100,
        help="Number of iterations between progress reports",
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn' or 'own'",
    )
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use the memory-optimized LSTM or not",
    )
    parser.add_argument(
        "--forward_only",
        action="store_true",
        help="Whether to run only the forward pass",
    )
    return parser


if __name__ == '__main__':
    args = GetArgumentParser().parse_args()

    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_gpu_memory_tracking=1'])

    device = core.DeviceOption(
        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)

    with core.DeviceScope(device):
        Benchmark(args)