Summary: Added a forward_only option to recurrent_net and the RNNCells. When it is set, the backward_step_net is not passed to the operator. When the backward_step_net is not available, the operator knows it is in forward-only mode and does not create a workspace for each step, but instead cycles through a single private workspace.

Note: we could avoid doing a lot of the work in the recurrent.py:recurrent_network call when the backward step is not needed, but doing that nicely requires more refactoring than I wanted to do now. Thus, we still create the backward step nets etc., but just don't pass them to the op.

This can be used to create more efficient inference models. You can also sanitize existing inference nets by removing the backward_step_net argument to get the same benefits.

Reviewed By: salexspb

Differential Revision: D4916482

fbshipit-source-id: c99b93c9cb897c32b0f449253f7f6d6a942618ad
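For context, a minimal sketch of how the new option is used; it mirrors the rnn_cell.LSTM call in create_model below, and model, input_blob, seq_lengths, hidden_init, and cell_init are placeholders the caller must set up:

    # Build an LSTM for inference only: with forward_only=True the
    # backward_step_net is not passed to the RecurrentNetwork operator,
    # so it cycles through a single private workspace instead of
    # creating one workspace per step.
    output, last_hidden, _, last_state = rnn_cell.LSTM(
        model=model,
        input_blob=input_blob,
        seq_lengths=seq_lengths,
        initial_states=(hidden_init, cell_init),
        dim_in=40,
        dim_out=40,
        scope="lstm1",
        forward_only=True,
    )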
## @package lstm_benchmark
# Module caffe2.python.lstm_benchmark
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.proto import caffe2_pb2
from caffe2.python import cnn, workspace, core, utils, rnn_cell

import argparse
import numpy as np
import time

import logging

logging.basicConfig()
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)

def generate_data(T, shape, num_labels):
    '''
    Fill a queue with input data
    '''
    log.info("Generating T={} sequence batches".format(T))

    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = generate_input_init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )

    workspace.RunNetOnce(generate_input_init_net)
    generate_input_net = core.Net('generate_input')

    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])
    np.random.seed(2603)

    for t in range(T):
        if t % 50 == 0:
            print("Generating data {}/{}".format(t, T))
        # Randomize the sequence length
        random_shape = [np.random.randint(1, shape[0])] + shape[1:]
        X = np.random.rand(*random_shape).astype(np.float32)
        batch_size = random_shape[1]
        L = num_labels * batch_size
        labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(generate_input_net.Proto())

    log.info("Finished data generation")

    return queue, label_queue

def create_model(args, queue, label_queue, input_shape):
    model = cnn.CNNModelHelper(name="LSTM_bench")
    seq_lengths, hidden_init, cell_init, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )
    input_blob = model.DequeueBlobs(queue, "input_data")
    labels = model.DequeueBlobs(label_queue, "label")

    if args.implementation == "own":
        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
        )
    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=1,
        )
    else:
        assert False, "Unknown implementation"

    weights = model.UniformFill(labels, "weights")
    softmax, loss = model.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # Carry states over between iterations. last_hidden also stands in
    # for cell_init, since the cuDNN branch above does not return the
    # final cell state.
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_hidden, cell_init)

    workspace.FeedBlob(hidden_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    workspace.FeedBlob(cell_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    return model, output

def Caffe2LSTM(args):
    T = args.data_size // args.batch_size

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue, label_queue = generate_data(T // args.seq_length,
                                       input_blob_shape,
                                       args.hidden_dim)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    last_time = time.time()
    start_time = last_time
    num_iters = T // args.seq_length
    entries_per_iter = args.seq_length * args.batch_size

    # Run the benchmark
    log.info("------ Starting benchmark ------")
    for iteration in range(0, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info("Iter: {} / {}. Entries Per Second: {}k.".format(
            iteration,
            num_iters,
            entries_per_iter * iters_once / (new_time - last_time) // 1000,
        ))
        last_time = new_time

    log.info("Done. Total EPS: {}k".format(
        entries_per_iter * num_iters / (time.time() - start_time) // 1000,
    ))

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if stats['max_total'] != stats['total']:
            log.warning(
                "Max usage differs from current total usage: {} > {}".
                format(stats['max_total'], stats['total'])
            )
            log.warning("This means that costly deallocations occurred.")

@utils.debug
def Benchmark(args):
    Caffe2LSTM(args)

def GetArgumentParser():
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=40,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="The batch size.",
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Max sequence length",
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=10000000,
        help="Number of data points to generate",
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=100,
        help="Number of iterations between progress reports",
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn' or 'own'",
    )
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use the memory-optimized LSTM or not",
    )
    parser.add_argument(
        "--forward_only",
        action="store_true",
        help="Whether to run only the forward pass",
    )

    return parser

if __name__ == '__main__':
    args = GetArgumentParser().parse_args()

    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_gpu_memory_tracking=1'])

    device = core.DeviceOption(
        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)

    with core.DeviceScope(device):
        Benchmark(args)
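For reference, a typical invocation of the benchmark in the new forward-only mode might look like this (assuming the file is run as a module; all flags are defined in GetArgumentParser above):

    python -m caffe2.python.lstm_benchmark --gpu --forward_only

Dropping --forward_only adds the gradient operators back, so the two runs can be compared to measure the inference-only speedup.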