From 798a6d2be17e03593a103edd54c1c4ea4b779e74 Mon Sep 17 00:00:00 2001
From: Aidyn-A
Date: Sat, 25 Oct 2025 10:40:59 +0000
Subject: [PATCH] [Inductor][Autotune] Gracefully restart the autotune process
 after ULF failure (#166073)

This PR partially fixes https://github.com/pytorch/torchtitan/issues/1791,
as it works only with the `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1` setting.

The core of the problem: in `max-autotune` mode, Inductor runs multiple
benchmarks to determine the best config. If one of these benchmarks fails
with `cudaErrorLaunchFailure`, all subsequent CUDA calls within the same
process will fail as well, including the rest of the benchmarks.

The solution: restart the child process gracefully and continue benchmarking.
Unfortunately, if autotuning is done in the main process, the whole program
falls into an unrecoverable state; in that case, the only way to execute
successfully is to prevent the ULF in the first place. Here is some info from
the [CUDA documentation](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html):

> cudaErrorLaunchFailure = 719
> An exception occurred on the device while executing a kernel. ... This
> leaves the process in an inconsistent state and any further CUDA work will
> return the same error. To continue using CUDA, the process must be
> terminated and relaunched.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166073
Approved by: https://github.com/syed-ahmed, https://github.com/drisspg
---
 torch/_inductor/autotune_process.py | 14 +++++++++++++-
 torch/_inductor/select_algorithm.py |  8 +++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py
index a504b54f132..4b6d9e7347b 100644
--- a/torch/_inductor/autotune_process.py
+++ b/torch/_inductor/autotune_process.py
@@ -231,6 +231,13 @@ class TuningProcess:
             self.process.kill()
         self.close()
 
+    def restart(self) -> None:
+        """
+        Gracefully restarts the child process.
+        """
+        self.shutdown(wait=True)
+        self.start()
+
 
 class TuningProcessPool:
     """
@@ -311,11 +318,16 @@ class TuningProcessPool:
             )
             # Set to INF so this choice will be ignored
             return float("inf")
-        except Exception:
+        except Exception as process_exception:
             warnings.warn(
                 f"Failed to benchmark choice '{choice}'. It will be ignored. "
                 "Please debug the root cause in case the choice can bring perf gains."
             )
+            # An unspecified launch failure (cudaErrorLaunchFailure) corrupts the
+            # CUDA context, making it unrecoverable. All subsequent CUDA calls will
+            # fail as well. The process must be restarted to restore CUDA functionality.
+            if "cudaErrorLaunchFailure" in str(process_exception):
+                process.restart()
             # Set to INF so this choice will be ignored
             return float("inf")
         finally:
diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py
index ee09993caee..46438e7b989 100644
--- a/torch/_inductor/select_algorithm.py
+++ b/torch/_inductor/select_algorithm.py
@@ -3287,9 +3287,11 @@ class AlgorithmSelectorCache(PersistentCache):
                 msg = str(e)
                 if "invalid argument" in msg:
                     msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
-                else:
-                    if "illegal memory access" in msg:
-                        msg += "\n\nEither error in template or triton bug.\n"
+                elif "illegal memory access" in msg:
+                    msg += "\n\nEither error in template or triton bug.\n"
+                elif "unspecified launch failure" in msg:
+                    msg += "\n\nAn unrecoverable unspecified launch failure was caught during autotuning."
+                    msg += "\nPlease try re-running with TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1.\n\n"
 
                 if isinstance(choice, CUDATemplateCaller):
                     log.debug(