mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
In this approach, we are catching any lane within a wave that is doing fastatomics to the same destination address and computing the sum on the CU. This is leading to 3x improvement in scatter_add performance and 2x improvement in index_select.
scatter_add performance on MI300x:
dtype|Baseline (before optimizations)|opportunistic fastatomics
-------|----------------------------------|----------------------------------
f32|1.389425039|0.430447996
fp16|2.195472956|0.779729486
bf16|2.194051027|0.784599513
Using the following reproducer
```
import torch
import triton
def main():
dtype = torch.float32
dim = 1305301
a = torch.rand(100, device="cuda", dtype=dtype)
index = torch.randint(0, 100, (dim,), device="cuda")
src = torch.rand(dim, device="cuda", dtype=dtype)
print("=" * 20)
print(
triton.testing.do_bench(
lambda: a.scatter_add(0, index, src),
return_mode="median",
)
)
print("=" * 20)
if __name__ == "__main__":
main()
```
co-authored by: @amd-hhashemi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/146264
Approved by: https://github.com/jeffdaily, https://github.com/mxz297
Co-authored-by: Hashem Hashemi <hashem.hashemi@amd.com>
|
||
|---|---|---|
| .. | ||
| _strobelight | ||
| _sympy | ||
| backcompat | ||
| benchmark | ||
| bottleneck | ||
| data | ||
| hipify | ||
| jit | ||
| model_dump | ||
| serialization | ||
| tensorboard | ||
| viz | ||
| __init__.py | ||
| _appending_byte_serializer.py | ||
| _backport_slots.py | ||
| _config_module.py | ||
| _config_typing.pyi | ||
| _content_store.py | ||
| _contextlib.py | ||
| _cpp_embed_headers.py | ||
| _cpp_extension_versioner.py | ||
| _cxx_pytree.py | ||
| _device.py | ||
| _exposed_in.py | ||
| _filelock.py | ||
| _foreach_utils.py | ||
| _freeze.py | ||
| _functools.py | ||
| _get_clean_triton.py | ||
| _import_utils.py | ||
| _mode_utils.py | ||
| _ordered_set.py | ||
| _python_dispatch.py | ||
| _pytree.py | ||
| _stats.py | ||
| _thunk.py | ||
| _traceback.py | ||
| _triton.py | ||
| _typing_utils.py | ||
| _zip.py | ||
| backend_registration.py | ||
| bundled_inputs.py | ||
| checkpoint.py | ||
| collect_env.py | ||
| cpp_backtrace.py | ||
| cpp_extension.py | ||
| deterministic.py | ||
| dlpack.py | ||
| file_baton.py | ||
| flop_counter.py | ||
| hooks.py | ||
| mkldnn.py | ||
| mobile_optimizer.py | ||
| model_zoo.py | ||
| module_tracker.py | ||
| show_pickle.py | ||
| throughput_benchmark.py | ||
| weak.py | ||