use libdevice for tanh (#90889)

Per title.
I see slight perf differences with this implementation: standalone tanh is slightly slower for a tensor of 4000000 elements (20.4 us instead of 19.4 us); other sizes are within noise.
@bertmaher could you check if it affects your benchmarks?
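
For reference, a minimal sketch of how such a standalone-tanh timing could be reproduced (the function name, the use of `torch._dynamo.optimize("inductor")`, and `torch.utils.benchmark` are illustrative choices, not necessarily the setup behind the numbers above):

```python
import torch
from torch.utils import benchmark

# compile a standalone tanh through TorchInductor (API spelling current at the time of this PR)
@torch._dynamo.optimize("inductor")
def standalone_tanh(x):
    return torch.tanh(x)

x = torch.randn(4_000_000, device="cuda")
standalone_tanh(x)  # warm-up call so compilation is not included in the timing

timer = benchmark.Timer(
    stmt="standalone_tanh(x)",
    globals={"standalone_tanh": standalone_tanh, "x": x},
)
print(timer.timeit(100))  # with this PR, the generated Triton kernel calls tl.libdevice.tanh
```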

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90889
Approved by: https://github.com/bertmaher, https://github.com/anijain2305
Natalia Gimelshein 2022-12-20 02:11:53 +00:00 committed by PyTorch MergeBot
parent 30edd39bdc
commit 0148809131
4 changed files with 13 additions and 5 deletions


@@ -416,6 +416,10 @@ class CppOverrides(OpOverrides):
     def expm1(x):
         return f"std::expm1({x})"
 
+    @staticmethod
+    def tanh(x):
+        return f"std::tanh({x})"
+
     @staticmethod
     def signbit(x):
         return f"std::signbit({x})"


@@ -239,6 +239,10 @@ class TritonOverrides(OpOverrides):
     def expm1(x):
         return f"tl.libdevice.expm1({x})"
 
+    @staticmethod
+    def tanh(x):
+        return f"tl.libdevice.tanh({x})"
+
     @staticmethod
     def sigmoid(x):
         return f"tl.sigmoid({x})"

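For context, `tl.libdevice` was the namespace exposing CUDA libdevice math functions in the Triton version Inductor targeted at the time (it has moved in later Triton releases). A hand-written sketch of the kind of elementwise kernel this override ends up emitting, with hypothetical pointer/size names, might look like:

```python
import triton
import triton.language as tl

@triton.jit
def tanh_kernel(in_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    # one program handles BLOCK contiguous elements
    offsets = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    x = tl.load(in_ptr + offsets, mask=mask)
    # direct libdevice call instead of the former exp-based decomposition
    tl.store(out_ptr + offsets, tl.libdevice.tanh(x), mask=mask)
```
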

@@ -126,11 +126,6 @@ def clamp(x, min=None, max=None):
     return x
 
 
-@register_decomposition([aten.tanh])
-def tanh(x):
-    return 2.0 / (1.0 + torch.exp(-2.0 * x)) - 1.0
-
-
 # TorchInductor-only decomposition. It should not be taken to core.
 # See https://github.com/pytorch/torchdynamo/pull/1120
 @register_decomposition([aten.floor_divide.default])

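The removed decomposition relied on the identity tanh(x) = 2 / (1 + exp(-2x)) - 1; a quick illustrative check of that identity (not part of the PR):

```python
import torch

x = torch.randn(1000, dtype=torch.float64)
decomposed = 2.0 / (1.0 + torch.exp(-2.0 * x)) - 1.0
assert torch.allclose(decomposed, torch.tanh(x))
```

With the decomposition removed, `aten.tanh` instead goes through the pointwise lowering registered below, which dispatches to the `std::tanh` / `tl.libdevice.tanh` overrides added above.
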

@@ -3621,6 +3621,11 @@ register_pointwise(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
 )
+register_pointwise(
+    aten.tanh,
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+)
 register_pointwise(
     aten.log,
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,