Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51093

Operator-level benchmarks comparing eager-mode PyTorch to NNC-generated fused kernels. We wouldn't normally see these ops in isolation, but the comparison points out where NNC is falling short (or doing well). I threw in a composed hardswish for fun, because it's my favorite activation function.

Notably, it exposes a bug in our build process that's preventing vectorization from using `sleef`, so we're using scalar calls to libm with predictably lousy performance. Fix incoming.

This benchmark is similar to the pure NNC approach in `microbenchmarks.py`, but will include the overhead of dispatching the fused kernel through TorchScript.

ghstack-source-id: 120403675

Test Plan: times are total seconds for 100 iterations on a 1024x1024 float32 tensor, single-threaded (see the script below).
```
op                        eager        nnc    speedup
hardswish                 0.187      0.051       3.70
hardswish                 0.052      0.052       1.00
sigmoid                   0.148      1.177       0.13
reciprocal                0.049      0.050       0.98
neg                       0.038      0.037       1.02
relu                      0.037      0.036       1.03
isnan                     0.119      0.020       5.86
log                       0.082      1.330       0.06
log10                     0.148      1.848       0.08
log1p                     0.204      1.413       0.14
log2                      0.285      1.167       0.24
exp                       0.063      1.123       0.06
expm1                     0.402      1.417       0.28
erf                       0.167      0.852       0.20
erfc                      0.181      1.098       0.16
cos                       0.124      0.793       0.16
sin                       0.126      0.838       0.15
tan                       0.285      1.777       0.16
acos                      0.144      1.358       0.11
asin                      0.126      1.193       0.11
cosh                      0.384      1.761       0.22
sinh                      0.390      2.279       0.17
atan                      0.240      1.564       0.15
tanh                      0.320      2.259       0.14
sqrt                      0.043      0.069       0.63
rsqrt                     0.118      0.117       1.01
abs                       0.038      0.037       1.03
ceil                      0.038      0.038       1.01
floor                     0.039      0.039       1.00
round                     0.039      0.292       0.13
trunc                     0.040      0.036       1.12
lgamma                    2.045      2.721       0.75
```

Reviewed By: zheng-xq

Differential Revision: D26069791

fbshipit-source-id: 236e7287ba1b3f67fdcb938949a92bbbdfa13dba
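Since the speedups above only materialize when the fused kernel is actually dispatched, here is a minimal sketch (not part of this diff) of one way to confirm the TensorExpr fuser kicked in. It assumes the profiling executor is active (the default) and uses the `torch.jit.last_executed_optimized_graph` debugging helper; a fused subgraph should show up as a `prim::TensorExprGroup` node. The benchmark script itself follows below.
```
import torch

torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._debug_set_fusion_group_inlining(False)

def hardswish(x):
    return x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0

x = torch.rand(1024, 1024)
traced = torch.jit.trace(hardswish, (x,))

# The profiling executor only optimizes after it has observed a few runs.
traced(x)
traced(x)

# If CPU fusion worked, the optimized graph contains a prim::TensorExprGroup node.
graph = torch.jit.last_executed_optimized_graph()
assert "TensorExprGroup" in str(graph), "no fused kernel was produced"
print(graph)
```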
```
import timeit

import torch

# Enable CPU fusion so TorchScript hands fusible subgraphs to NNC, disable
# fusion-group inlining so the fused kernel is actually exercised, and pin
# to a single thread for stable measurements.
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._debug_set_fusion_group_inlining(False)
torch.set_num_threads(1)

# Hand-composed hardswish, written out so the fuser sees the whole expression.
def hardswish(x):
    return x * torch.clamp(x + 3.0, 0.0, 6.0) / 6.0

# Ops to compare. hardswish appears twice: the composed version above and the
# fused ATen kernel.
unary_ops = [
    hardswish,
    torch._C._nn.hardswish,
    torch.sigmoid,
    torch.reciprocal,
    torch.neg,
    torch.relu,
    torch.isnan,
    torch.log,
    torch.log10,
    torch.log1p,
    torch.log2,
    torch.exp,
    torch.expm1,
    torch.erf,
    torch.erfc,
    torch.cos,
    torch.sin,
    torch.tan,
    torch.acos,
    torch.asin,
    torch.cosh,
    torch.sinh,
    torch.atan,
    torch.tanh,
    torch.sqrt,
    torch.rsqrt,
    torch.abs,
    torch.ceil,
    torch.floor,
    torch.round,
    torch.trunc,
    torch.lgamma,
]

print("{:20s} {:>10s} {:>10s} {:>10s}".format("op", "eager", "nnc", "speedup"))
|
|
|
|
for op in unary_ops:
    x = torch.rand((1024, 1024))
    # Trace through a lambda so every op takes the same TorchScript dispatch
    # path; example inputs are passed as a tuple, hence (x,).
    traced = torch.jit.trace(lambda x: op(x), (x,))

    # Warmup: give the profiling executor a few runs to compile the fused kernel.
    warmup_iters = 8
    for _ in range(warmup_iters):
        op(x)
        traced(x)

    # Validate that the fused kernel matches eager mode.
    torch.testing.assert_allclose(op(x), traced(x))

    # Benchmark: total seconds for bench_iters runs of each variant.
    bench_iters = 100
    teager = timeit.timeit(stmt="op(x)", globals=globals(), number=bench_iters)
    tjit = timeit.timeit(stmt="traced(x)", globals=globals(), number=bench_iters)
    print(f"{op.__name__:20s} {teager:10.3f} {tjit:10.3f} {teager / tjit:10.2f}")
```
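One possible refinement, not in the original script: `timeit.timeit` reports a single total, so run-to-run noise flows straight into the speedup column. Using `timeit.repeat` and keeping the fastest run is a common way to stabilize the numbers; a sketch of the drop-in change to the benchmark step:
```
# Hypothetical variant of the timing step: repeat each measurement and keep
# the minimum to reduce scheduling noise.
teager = min(timeit.repeat(stmt="op(x)", globals=globals(), number=bench_iters, repeat=5))
tjit = min(timeit.repeat(stmt="traced(x)", globals=globals(), number=bench_iters, repeat=5))
```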