mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: We should resize the workspace-vector only when it increases. Otherwise we end up destroying and recreating workspaces constantly if sequence length varies. Modified the lstm_benchmark test to randomize sequence length. This provides big perf improvement to machine translation pipeline. Look at the recurrent network op runtimes. WITH: I0328 12:17:54.073976 492094 prof_dag_net.cc:156] 136.271 ms/iter ( 120.987 ms/iter) RecurrentNetwork I0328 12:17:54.073982 492094 prof_dag_net.cc:156] 190.074 ms/iter ( 156.828 ms/iter) RecurrentNetworkGradient WITHOUT: I0328 12:25:17.658206 518884 prof_dag_net.cc:156] 375.369 ms/iter ( 249.268 ms/iter) RecurrentNetwork I0328 12:25:17.658211 518884 prof_dag_net.cc:156] 278.892 ms/iter ( 227.29 ms/iter) RecurrentNetworkGradient With LSTM benchmark, get about 2x speedup Reviewed By: jamesr66a Differential Revision: D4789354 fbshipit-source-id: ad72f61974e35b0474abcacdc466ae9c6b4eb0ff
183 lines
4.9 KiB
Python
183 lines
4.9 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
from caffe2.proto import caffe2_pb2
|
|
from caffe2.python import cnn, workspace, core, utils, recurrent
|
|
|
|
import argparse
|
|
import numpy as np
|
|
import time
|
|
|
|
import logging
|
|
|
|
logging.basicConfig()
|
|
log = logging.getLogger("lstm_bench")
|
|
log.setLevel(logging.DEBUG)
|
|
|
|
|
|
def generate_data(T, shape):
    '''
    Fill a queue with T batches of randomly-sized input data.

    Args:
        T: number of sequence batches to generate and enqueue.
        shape: [max_seq_length, batch_size, input_dim]. The first dimension
            of each enqueued blob is randomized in [1, max_seq_length) so the
            benchmark exercises variable sequence lengths.

    Returns:
        The BlobsQueue ("inputqueue") holding the generated batches.
    '''
    log.info("Generating T={} sequence batches".format(T))

    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    workspace.RunNetOnce(generate_input_init_net)

    generate_input_net = core.Net('generate_input')
    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    # Fixed seed so every benchmark run sees the same sequence lengths/data.
    np.random.seed(2603)

    # Loop-invariant: convert the net to a proto once, not once per batch.
    enqueue_proto = generate_input_net.Proto()
    max_seq_length = shape[0]
    for t in range(T):
        if (t % 50 == 0):
            print("Generating data {}/{}".format(t, T))
        # Randomize the seqlength. np.random.randint requires low < high, so
        # fall back to a fixed length of 1 when max_seq_length <= 1.
        seq_length = (
            np.random.randint(1, max_seq_length) if max_seq_length > 1 else 1
        )
        random_shape = [seq_length] + shape[1:]
        X = np.random.rand(*random_shape).astype(np.float32)
        workspace.FeedBlob("scratch", X)
        workspace.RunNetOnce(enqueue_proto)
    log.info("Finished data generation")
    return queue
def create_model(args, queue):
    """
    Build the single-layer LSTM benchmark model.

    Args:
        args: parsed command-line namespace (uses input_dim, hidden_dim,
            batch_size).
        queue: BlobsQueue produced by generate_data(); each dequeued blob is
            one input batch.

    Returns:
        A CNNModelHelper wrapping the forward LSTM, its gradient operators,
        and the state carry-over copies.
    """
    model = cnn.CNNModelHelper(name="LSTM_bench")
    seq_lengths, hidden_init, cell_init, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )
    input_blob = model.DequeueBlobs(queue, "input_data")
    all_hidden, last_hidden, _, last_state = recurrent.LSTM(
        model=model,
        input_blob=input_blob,
        seq_lengths=seq_lengths,
        initial_states=(hidden_init, cell_init),
        dim_in=args.input_dim,
        dim_out=args.hidden_dim,
        scope="lstm1",
    )

    model.AddGradientOperators([all_hidden])

    # Carry states over between runs: the final hidden state seeds
    # hidden_init and the final *cell* state seeds cell_init.
    # Bug fix: last_hidden was previously copied into cell_init as well,
    # leaving the returned last_state unused.
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_state, cell_init)

    # Zero-initialize both recurrent states: [num_layers=1, batch, hidden].
    workspace.FeedBlob(hidden_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    workspace.FeedBlob(cell_init, np.zeros(
        [1, args.batch_size, args.hidden_dim], dtype=np.float32
    ))
    return model
def Caffe2LSTM(args):
    """
    Run the Caffe2 LSTM benchmark: pre-generate input batches, build the
    model, then time net execution and log entries-per-second throughput.
    """
    num_entries = args.data_size // args.batch_size
    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue = generate_data(num_entries // args.seq_length, input_blob_shape)

    # Every sequence in the batch is announced at the maximum length.
    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model = create_model(args, queue)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    num_iters = num_entries // args.seq_length
    entries_per_iter = args.seq_length * args.batch_size
    start_time = time.time()
    last_time = start_time

    # Run the Benchmark
    log.info("------ Starting benchmark ------")
    iteration = 0
    while iteration < num_iters:
        # Run up to iters_to_report iterations between progress reports.
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info("Iter: {} / {}. Entries Per Second: {}k".format(
            iteration,
            num_iters,
            entries_per_iter * iters_once / (new_time - last_time) // 1000
        ))
        last_time = new_time
        iteration += args.iters_to_report

    log.info("Done. Total EPS: {}k".format(
        entries_per_iter * num_iters / (time.time() - start_time) // 1000
    ))
@utils.debug
def Benchmark(args):
    """Benchmark entry point: dispatch to the Caffe2 LSTM benchmark."""
    return Caffe2LSTM(args)
def GetArgumentParser():
    """Build the command-line parser for the LSTM benchmark.

    Returns:
        argparse.ArgumentParser exposing --hidden_dim, --input_dim,
        --batch_size, --seq_length, --data_size, --iters_to_report
        (all ints with defaults) plus the boolean --gpu flag.
    """
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    # (flag, default, help) table for every integer-valued option.
    int_options = [
        ("--hidden_dim", 40, "Hidden dimension"),
        ("--input_dim", 40, "Input dimension"),
        ("--batch_size", 256, "The batch size."),
        ("--seq_length", 20, "Max sequence length"),
        ("--data_size", 10000000, "Number of data points to generate"),
        ("--iters_to_report", 100, "Number of iteration to report progress"),
    ]
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)

    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )

    return parser
if __name__ == '__main__':
    args = GetArgumentParser().parse_args()

    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])

    # Pick CUDA or CPU device 0 from the --gpu flag and run the whole
    # benchmark under that device scope.
    device_option = core.DeviceOption(
        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)

    with core.DeviceScope(device_option):
        Benchmark(args)