## @package lstm_benchmark
# Module caffe2.python.lstm_benchmark

from caffe2.proto import caffe2_pb2
from caffe2.python import workspace, core, utils, rnn_cell, model_helper
from caffe2.python import recurrent

import argparse
import numpy as np
import time
import logging

logging.basicConfig()
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)


def generate_data(T, shape, num_labels, fixed_shape):
    '''
    Fill a queue with T batches of input data (and a parallel queue of
    label batches), so the benchmark loop is not bottlenecked on data
    generation.
    '''
    log.info("Generating T={} sequence batches".format(T))

    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = generate_input_init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )
    workspace.RunNetOnce(generate_input_init_net)

    generate_input_net = core.Net('generate_input')
    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])

    np.random.seed(2603)
    entry_counts = []
    for t in range(T):
        if (t % (max(10, T // 10)) == 0):
            print("Generating data {}/{}".format(t, T))
        # Randomize the seqlength (unless fixed_shape is requested);
        # the first batch always uses the full shape.
        random_shape = (
            [np.random.randint(1, shape[0])] + shape[1:]
            if t > 0 and not fixed_shape else shape
        )
        X = np.random.rand(*random_shape).astype(np.float32)
        batch_size = random_shape[1]
        L = num_labels * batch_size
        labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)

        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(generate_input_net.Proto())
        entry_counts.append(random_shape[0] * random_shape[1])

    log.info("Finished data generation")
    return queue, label_queue, entry_counts


def create_model(args, queue, label_queue, input_shape):
    model = model_helper.ModelHelper(name="LSTM_bench")
    seq_lengths, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'target',
        )

    input_blob = model.net.DequeueBlobs(queue, "input_data")
    labels = model.net.DequeueBlobs(label_queue, "label")

    init_blobs = []
    if args.implementation in ["own", "static", "static_dag"]:
        T = None
        if "static" in args.implementation:
            assert args.fixed_shape, \
                "Random input length is not static RNN compatible"
            T = args.seq_length
            print("Using static RNN of size {}".format(T))

        for i in range(args.num_layers):
            hidden_init, cell_init = model.net.AddExternalInputs(
                "hidden_init_{}".format(i),
                "cell_init_{}".format(i)
            )
            init_blobs.extend([hidden_init, cell_init])

        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=[args.hidden_dim] * args.num_layers,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
            drop_states=True,
            return_last_layer_only=True,
            static_rnn_unroll_size=T,
        )

        if "dag" in args.implementation:
            print("Using DAG net type")
            model.net.Proto().type = 'dag'
            model.net.Proto().num_workers = 4
    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=args.num_layers,
        )
    else:
        assert False, "Unknown implementation"

    weights = model.net.UniformFill(labels, "weights")
    softmax, loss = model.net.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # Carry states over between iterations by copying the last hidden
    # state into the init blobs, which are zero-initialized here.
    for init_blob in init_blobs:
        model.net.Copy(last_hidden, init_blob)

        sz = args.hidden_dim
        if args.implementation == "cudnn":
            sz *= args.num_layers
        workspace.FeedBlob(init_blob, np.zeros(
            [1, args.batch_size, sz], dtype=np.float32
        ))

    if args.rnn_executor:
        for op in model.net.Proto().op:
            if op.type.startswith('RecurrentNetwork'):
                recurrent.set_rnn_executor_config(
                    op,
                    num_threads=args.rnn_executor_num_threads,
                    max_cuda_streams=args.rnn_executor_max_cuda_streams,
                )
    return model, output


def Caffe2LSTM(args):
    T = args.data_size // args.batch_size

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue, label_queue, entry_counts = generate_data(T // args.seq_length,
                                                     input_blob_shape,
                                                     args.hidden_dim,
                                                     args.fixed_shape)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    num_iters = T // args.seq_length
    total_iters = 0

    # Run the benchmark
    log.info("------ Warming up ------")
    workspace.RunNet(model.net.Proto().name)

    if (args.gpu):
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))

    log.info("------ Starting benchmark ------")
    start_time = time.time()
    last_time = time.time()
    for iteration in range(1, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        total_iters += iters_once
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info(
            "Iter: {} / {}. Entries Per Second: {}k.".format(
                iteration,
                num_iters,
                # "// 100 / 10" truncates entries/sec to one decimal
                # place, reported in thousands.
                np.sum(entry_counts[iteration:iteration + iters_once]) /
                (new_time - last_time) // 100 / 10,
            )
        )
        last_time = new_time

    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
        " (with RNN executor)" if args.rnn_executor else "",
    ))

    if (args.gpu):
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if (stats['max_total'] != stats['total']):
            log.warning(
                "Max usage differs from current total usage: {} > {}".
                format(stats['max_total'], stats['total'])
            )
            log.warning("This means that costly deallocations occurred.")

    return time.time() - start_time


@utils.debug
def Benchmark(args):
    return Caffe2LSTM(args)


def GetArgumentParser():
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=800,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=128,
        help="The batch size."
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Max sequence length"
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=1000000,
        help="Number of data points to generate"
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=20,
        help="Number of iterations between progress reports"
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn', 'own', 'static' or 'static_dag'",
    )
    parser.add_argument(
        "--fixed_shape",
        action="store_true",
        help=("Keep the shape of input batches fixed instead of "
              "randomizing the sequence length. "
              "Static RNN requires fixed shape"),
    )
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use memory optimized LSTM or not",
    )
    parser.add_argument(
        "--forward_only",
        action="store_true",
        help="Whether to run only forward pass"
    )
    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        help="Number of LSTM layers. All output dimensions are going to be "
             "of hidden_dim size",
    )
    parser.add_argument(
        "--rnn_executor",
        action="store_true",
        help="Whether to use RNN executor"
    )
    parser.add_argument(
        "--rnn_executor_num_threads",
        type=int,
        default=None,
        help="Number of threads used by CPU RNN executor"
    )
    parser.add_argument(
        "--rnn_executor_max_cuda_streams",
        type=int,
        default=None,
        help="Maximum number of CUDA streams used by RNN executor on GPU"
    )
    return parser


if __name__ == '__main__':
    args, extra_args = GetArgumentParser().parse_known_args()

    rnn_executor_opt = 1 if args.rnn_executor else 0

    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_rnn_executor={}'.format(rnn_executor_opt),
        '--caffe2_gpu_memory_tracking=1',
    ] + extra_args)

    device = core.DeviceOption(
        workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 4)

    with core.DeviceScope(device):
        Benchmark(args)
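
# Example invocation (a sketch only; the flag values below are
# illustrative choices, not recommended or tuned settings):
#
#   python lstm_benchmark.py --gpu --implementation cudnn \
#       --seq_length 32 --batch_size 64 --hidden_dim 512 --num_layers 2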