mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/47767

This diff implements running benchmarks on mobile on top of the operator_benchmark framework. It does so in a few steps:
1. Create a scripted module from an existing benchmark case.
2. Run the mobile-specific optimization pass on the scripted module.
3. Run the scripted module on AiBench by calling its Python API.

A small change to the way a benchmark case is written is introduced so that local and mobile runs can share the same interface: inputs are passed as arguments of the `forward` function, so that the mobile optimization pass can run successfully (otherwise everything would be optimized away by constant propagation).

Test Plan:

## local op_bench run

buck run caffe2/benchmarks/operator_benchmark:benchmark_all_test -- --iterations 1 --warmup_iterations 1
buck run caffe2/benchmarks/operator_benchmark:benchmark_all_test -- --iterations 1 --warmup_iterations 1 --use_jit

Exceptions: the `py_module` op in `FakeQuantizePerTensorBaseOpBenchmark` and `FakeQuantizePerChannelBaseOpBenchmark` fails under JIT mode. These tests also failed in the base version:

```
RuntimeError: Module 'FakeQuantizePerChannelOpBenchmark' has no attribute 'op_func' (This function exists as an attribute on the Python module, but we failed to compile it to a TorchScript function. The error stack is reproduced here:
Python builtin <built-in method apply of FunctionMeta object at 0x619000c652a0> is currently not supported in Torchscript:
  File "/data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/pt/quantization_test#link-tree/quantization_test.py", line 260
        quant_min: int, quant_max: int
    ):
        return _LearnableFakeQuantizePerChannelOp.apply(input, scale, zero_point, axis, quant_min, quant_max, 1.0)
               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
:
  File "/data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/pt/quantization_test#link-tree/quantization_test.py", line 313
        axis: int, quant_min: int, quant_max: int
    ):
        return self.op_func(input, scale, zero_point, axis, quant_min, quant_max)
               ~~~~~~~~~~~~ <--- HERE
```

`_consume_op` typing mismatch: chunk, split, qobserver, sort in qunary. These will be fixed in D24774105.

## OSS test

python3 -m benchmark_all_test --iterations 1 --warmup_iterations 1 --use_jit
python3 -m benchmark_all_test --iterations 1 --warmup_iterations 1

## saved module graph

```
module __torch__.mobile_benchmark_utils.OpBenchmarkMobile {
  parameters {
  }
  attributes {
    training = True
    num_iters = 1
    benchmark = <__torch__.pt.add_test.___torch_mangle_4.AddBenchmark object at 0x6070001b8b50>
  }
  methods {
    method forward {
      graph(%self : __torch__.mobile_benchmark_utils.OpBenchmarkMobile):
        %12 : None = prim::Constant() # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/mobile_benchmark_utils.py:9:4
        %4 : bool = prim::Constant[value=1]() # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/mobile_benchmark_utils.py:10:8
        %1 : int = prim::GetAttr[name="num_iters"](%self)
         = prim::Loop(%1, %4) # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/mobile_benchmark_utils.py:10:8
          block0(%i : int):
            %6 : __torch__.pt.add_test.___torch_mangle_4.AddBenchmark = prim::GetAttr[name="benchmark"](%self)
            %7 : __torch__.pt.add_test.___torch_mangle_4.AddBenchmark = prim::GetAttr[name="benchmark"](%self)
            %self.inputs_tuple : (Float(1, 1, 1, strides=[1, 1, 1], requires_grad=0, device=cpu), Float(1, 1, 1, strides=[1, 1, 1], requires_grad=0, device=cpu)) = prim::Constant[value=({0.48884}, {0.809042})]()
            %9 : Tensor, %10 : Tensor = prim::TupleUnpack(%self.inputs_tuple)
            %23 : int = prim::Constant[value=1]()
            %24 : Tensor = aten::add(%9, %10, %23) # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/pt/add_test.py:39:15
            -> (%4)
        return (%12)
    }
  }
  submodules {
    module __torch__.pt.add_test.___torch_mangle_4.AddBenchmark {
      parameters {
      }
      attributes {
        mobile_optimized = True
      }
      methods {
        method forward {
          graph(%self : __torch__.pt.add_test.___torch_mangle_4.AddBenchmark,
                %input_one.1 : Tensor,
                %input_two.1 : Tensor):
            %3 : int = prim::Constant[value=1]()
            %4 : Tensor = aten::add(%input_one.1, %input_two.1, %3) # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/pt/add_test.py:39:15
            return (%4)
        }
        method get_inputs {
          graph(%self : __torch__.pt.add_test.___torch_mangle_4.AddBenchmark):
            %self.inputs_tuple : (Float(1, 1, 1, strides=[1, 1, 1], requires_grad=0, device=cpu), Float(1, 1, 1, strides=[1, 1, 1], requires_grad=0, device=cpu)) = prim::Constant[value=({0.48884}, {0.809042})]()
            return (%self.inputs_tuple)
        }
      }
      submodules {
      }
    }
  }
}
```

Reviewed By: kimishpatel

Differential Revision: D24322214

fbshipit-source-id: 335317eca4f40c4083883eb41dc47caf25cbdfd1
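To make the three steps concrete, here is a minimal, hypothetical sketch of the workflow. The class and attribute names (`AddBenchmark`, `get_inputs`, `num_iters`, `OpBenchmarkMobile`) mirror the saved module graph above but are illustrative only, not the actual operator_benchmark or AiBench code; the key point is that `forward` takes its inputs as arguments, so constant propagation cannot fold the benchmarked op away.

```
# Minimal sketch of the mobile benchmark workflow described above; names are
# illustrative, not the actual operator_benchmark/AiBench implementation.
import torch
from torch.utils.mobile_optimizer import optimize_for_mobile


class AddBenchmark(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inputs_tuple = (torch.rand(1, 1, 1), torch.rand(1, 1, 1))

    def get_inputs(self):
        return self.inputs_tuple

    def forward(self, input_one, input_two):
        # Inputs arrive as arguments, so the add cannot be constant-folded away.
        return torch.add(input_one, input_two)


class OpBenchmarkMobile(torch.nn.Module):
    def __init__(self, benchmark, num_iters=1):
        super().__init__()
        self.benchmark = benchmark
        self.num_iters = num_iters

    def forward(self):
        # Run the wrapped benchmark case num_iters times, as in the saved graph.
        for _ in range(self.num_iters):
            input_one, input_two = self.benchmark.get_inputs()
            self.benchmark(input_one, input_two)


# 1. create a scripted module from the benchmark case
scripted = torch.jit.script(OpBenchmarkMobile(AddBenchmark()).eval())
# 2. run the mobile-specific optimization pass
optimized = optimize_for_mobile(scripted)
# 3. save the optimized module; AiBench would then load and run it on device
optimized.save("add_benchmark_mobile.pt")
```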
155 lines
4.3 KiB
Python
import argparse

import torch

import benchmark_core
import benchmark_utils

"""Performance microbenchmarks' main binary.

This is the main function for running performance microbenchmark tests.
It also registers existing benchmark tests via Python module imports.
"""
parser = argparse.ArgumentParser(
    description="Run microbenchmarks.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

def parse_args():
    parser.add_argument(
        '--tag_filter',
        help='tag_filter can be used to run the shapes which match the tag (all is used to run all the shapes)',
        default='short')

    # This option is used to filter test cases to run.
    parser.add_argument(
        '--operators',
        help='Filter tests based on comma-delimited list of operators to test',
        default=None)

    parser.add_argument(
        '--operator_range',
        help='Filter tests based on operator_range (e.g. a-c or b,c-d)',
        default=None)

    parser.add_argument(
        '--test_name',
        help='Run tests that have the provided test_name',
        default=None)

    parser.add_argument(
        '--list_ops',
        help='List operators without running them',
        action='store_true')

    parser.add_argument(
        '--list_tests',
        help='List all test cases without running them',
        action='store_true')

    parser.add_argument(
        "--iterations",
        help="Repeat each operator for the number of iterations",
        type=int
    )

    parser.add_argument(
        "--num_runs",
        help="Run each test num_runs times. Each run executes an operator for <--iterations> iterations",
        type=int,
        default=1,
    )

    parser.add_argument(
        "--min_time_per_test",
        help="Set the minimum time (unit: seconds) to run each test",
        type=int,
        default=0,
    )

    parser.add_argument(
        "--warmup_iterations",
        help="Number of iterations to ignore before measuring performance",
        default=100,
        type=int
    )

    parser.add_argument(
        "--omp_num_threads",
        help="Number of OpenMP threads used in PyTorch/Caffe2 runtime",
        default=None,
        type=int
    )

    parser.add_argument(
        "--mkl_num_threads",
        help="Number of MKL threads used in PyTorch/Caffe2 runtime",
        default=None,
        type=int
    )

    parser.add_argument(
        "--ai_pep_format",
        type=benchmark_utils.str2bool,
        nargs='?',
        const=True,
        default=False,
        help="Print result when running on AI-PEP"
    )

    parser.add_argument(
        "--use_jit",
        type=benchmark_utils.str2bool,
        nargs='?',
        const=True,
        default=False,
        help="Run operators with PyTorch JIT mode"
    )

    parser.add_argument(
        "--forward_only",
        type=benchmark_utils.str2bool,
        nargs='?',
        const=True,
        default=False,
        help="Only run the forward path of operators"
    )

    parser.add_argument(
        '--framework',
        help='Comma-delimited list of frameworks to test (Caffe2, PyTorch)',
        default="Caffe2,PyTorch")

    parser.add_argument(
        '--device',
        help='Run tests on the provided architecture (cpu, cuda)',
        default='None')

    args, _ = parser.parse_known_args()

    if args.omp_num_threads:
        # benchmark_utils.set_omp_threads sets the env variable OMP_NUM_THREADS
        # which doesn't have any impact as C2 init logic has already been called
        # before setting the env var.

        # In general, OMP_NUM_THREADS (and other OMP env variables) needs to be set
        # before the program is started.
        # From Chapter 4 in OMP standard: https://www.openmp.org/wp-content/uploads/openmp-4.5.pdf
        # "Modifications to the environment variables after the program has started,
        # even if modified by the program itself, are ignored by the OpenMP implementation"
        benchmark_utils.set_omp_threads(args.omp_num_threads)
        if benchmark_utils.is_pytorch_enabled(args.framework):
            torch.set_num_threads(args.omp_num_threads)
    if args.mkl_num_threads:
        benchmark_utils.set_mkl_threads(args.mkl_num_threads)

    return args


def main():
    args = parse_args()
    benchmark_core.BenchmarkRunner(args).run()


if __name__ == "__main__":
    main()
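The boolean flags (`--ai_pep_format`, `--use_jit`, `--forward_only`) combine `type=benchmark_utils.str2bool` with `nargs='?'` and `const=True`, so they can be passed bare (`--use_jit`) or with an explicit value (`--use_jit false`). The converter lives in `benchmark_utils`, which is not shown here; a minimal sketch of what such an argparse converter typically looks like, assuming the usual idiom:

```
# Hypothetical sketch of an argparse str2bool converter; the actual
# benchmark_utils.str2bool may differ in detail.
import argparse

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
```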