pytorch/caffe2/python/lstm_benchmark.py
Aapo Kyrola f84e5360cc LSTM benchmark (Caffe2 RNN based)
Summary: Just generate some random data and put it through an LSTM (Caffe2 RNN based), using its own output as the gradient value, for benchmark purposes. With the default parameters provided in this diff it fits my dev GPU's memory, and I got about 300k entries per second processed. The entries are split into blocks of seq_length * batch_size. Each entry is of size hidden_dim: the LSTM takes a hidden_dim-sized input and produces an output of the same size.

Reviewed By: salexspb

Differential Revision: D4605815

fbshipit-source-id: dd529302a0a93e8711784c67e4c777c8d6a8cdf4
2017-02-28 23:17:26 -08:00

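'''
Benchmark for the Caffe2 RNN-based LSTM: generates random data and pushes
it through an LSTM, using the net's own output as the gradient value (no
loss is computed; the point is purely to measure throughput). Entries are
processed in blocks of seq_length * batch_size; each entry is of size
input_dim, and the LSTM produces hidden_dim-sized outputs.
'''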

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.proto import caffe2_pb2
from caffe2.python import cnn, workspace, core, utils, recurrent

import argparse
import numpy as np
import time
import logging

logging.basicConfig()
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)


def generate_data(T, shape):
    '''
    Fill a queue with input data
    '''
    log.info("Generating T={} sequence batches".format(T))

    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    workspace.RunNetOnce(generate_input_init_net)

    generate_input_net = core.Net('generate_input')
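    # The scratch blob is refilled with uniform random data and re-enqueued
    # each time this net runs; running the net T times below fills the queue
    # with T input batches.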
    scratch = generate_input_net.UniformFill([], ["input_scratch"], shape=shape)
    generate_input_net.EnqueueBlobs([queue, scratch], [scratch])

    workspace.CreateNet(generate_input_net)
    workspace.RunNet(generate_input_net.Proto().name, T)
    log.info("Finished data generation")
    return queue


def create_model(args, queue):
    model = cnn.CNNModelHelper(name="LSTM_bench")

    seq_lengths, hidden_init, cell_init, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )

    input_blob = model.DequeueBlobs(queue, "input_data")

    all_hidden, last_hidden, _, last_state = recurrent.LSTM(
        model=model,
        input_blob=input_blob,
        seq_lengths=seq_lengths,
        initial_states=(hidden_init, cell_init),
        dim_in=args.input_dim,
        dim_out=args.hidden_dim,
        scope="lstm1",
    )

    model.AddGradientOperators([all_hidden])
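    # Note: there is no loss blob; the backward pass is seeded from the LSTM
    # output itself (per the commit summary, the output serves as its own
    # gradient value), which is enough to exercise it for benchmarking.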

    # Carry the recurrent states over between iterations.
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_state, cell_init)

    # Zero initial states, shaped [1, batch_size, hidden_dim].
    workspace.FeedBlob(hidden_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    workspace.FeedBlob(cell_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    return model


def Caffe2LSTM(args):
    T = args.data_size // args.batch_size

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue = generate_data(T // args.seq_length, input_blob_shape)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model = create_model(args, queue)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    last_time = time.time()
    start_time = last_time
    num_iters = T // args.seq_length
    entries_per_iter = args.seq_length * args.batch_size
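    # Each iteration consumes one batch of seq_length * batch_size entries;
    # with the defaults (batch_size=256, seq_length=20, data_size=10000000)
    # that is 5120 entries per iteration over roughly 1953 iterations.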

    # Run the benchmark.
    log.info("------ Starting benchmark ------")
    for iteration in range(0, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info("Iter: {} / {}. Entries Per Second: {}k".format(
            iteration,
            num_iters,
            entries_per_iter * iters_once / (new_time - last_time) // 1000,
        ))
        last_time = new_time

    log.info("Done. Total EPS: {}k".format(
        entries_per_iter * num_iters / (time.time() - start_time) // 1000
    ))


@utils.debug
def Benchmark(args):
    Caffe2LSTM(args)


def GetArgumentParser():
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=40,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="The batch size.",
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Sequence length",
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=10000000,
        help="Number of data points to generate",
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=100,
        help="Number of iterations between progress reports",
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    return parser


if __name__ == '__main__':
    args = GetArgumentParser().parse_args()

    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])

    device = core.DeviceOption(
        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)

    with core.DeviceScope(device):
        Benchmark(args)
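
# Example invocation (illustrative; flags as defined above):
#   python caffe2/python/lstm_benchmark.py --gpu --batch_size=256 --seq_length=20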