From 850ba8c96d174c1328b9a91c4581525d84e0514b Mon Sep 17 00:00:00 2001
From: mansiag05
Date: Mon, 20 Oct 2025 23:03:43 +0000
Subject: [PATCH] [Code Clean] Clean asserts in torch/autograd. (#165627)

Replaces 78 assert statements across 10 files in torch.autograd with explicit
if-checks raising AssertionError, so the checks cannot be disabled by the
Python -O flag. This ensures error checking remains active in optimized
builds.

Partially fixes #164878

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165627
Approved by: https://github.com/albanD
---
 torch/autograd/__init__.py          |  24 ++++---
 torch/autograd/_functions/tensor.py |   5 +-
 torch/autograd/function.py          |   9 +--
 torch/autograd/functional.py        |  42 +++++++----
 torch/autograd/grad_mode.py         |   3 +-
 torch/autograd/gradcheck.py         | 107 ++++++++++++++++++----------
 torch/autograd/graph.py             |  24 ++++---
 torch/autograd/profiler.py          |  70 ++++++++++--------
 torch/autograd/profiler_legacy.py   |  39 ++++++----
 torch/autograd/profiler_util.py     |  73 +++++++++++++------
 10 files changed, 261 insertions(+), 135 deletions(-)

diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index bca0ce12890..c0a8d30df32 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -113,7 +113,8 @@ def _make_grads( # circular import from torch.nested._internal.nested_tensor import NestedTensor - assert isinstance(out, torch.Tensor) + if not isinstance(out, torch.Tensor): + raise AssertionError("Expected output to be a torch.Tensor") out_dtype = out.dtype out_is_nested = out.is_nested out_is_cpp_nested = out_is_nested and not isinstance(out, NestedTensor) @@ -129,13 +130,15 @@ def _make_grads( # singleton int to represent jagged dimension, so that size() call # on nested tensor works. if out_is_cpp_nested: - assert isinstance(out, torch.Tensor) + if not isinstance(out, torch.Tensor): + raise AssertionError("Expected output to be a torch.Tensor.") shape_matches = torch.is_same_size(out, first_grad) else: # We need to do a regular size check, without going through # the operator, to be able to handle unbacked symints # (expect_true ensures we can deal with unbacked) - assert out_size is not None + if out_size is None: + raise AssertionError("Expected out_size to be set.") shape_matches = expect_true(sym_eq(out_size, first_grad.size())) if not shape_matches: @@ -191,10 +194,12 @@ def _make_grads( elif grad is None: if isinstance(out, graph.GradientEdge) or out.requires_grad: # type: ignore[attr-defined] if isinstance(out, graph.GradientEdge): - assert out_size is not None + if out_size is None: + raise AssertionError("Expected out_size to be set.") out_numel_is_1 = all(o == 1 for o in out_size) else: - assert isinstance(out, torch.Tensor) + if not isinstance(out, torch.Tensor): + raise AssertionError("Expected output to be a torch.Tensor") out_numel_is_1 = out.numel() == 1 if not out_numel_is_1: raise RuntimeError( @@ -207,8 +212,10 @@ def _make_grads( ) raise RuntimeError(msg) if isinstance(out, graph.GradientEdge): - assert out_size is not None - assert out_device is not None + if out_size is None: + raise AssertionError("Expected out_size to be set.") + if out_device is None: + raise AssertionError("Expected out_device to be set.") new_grads.append( torch.ones( out_size, @@ -217,7 +224,8 @@ def _make_grads( ) ) else: - assert isinstance(out, torch.Tensor) + if not isinstance(out, torch.Tensor): + raise AssertionError("Expected output to be a torch.Tensor") new_grads.append( torch.ones_like(out, memory_format=torch.preserve_format) )
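For context on the motivation stated above: when CPython runs with the -O flag, __debug__ is False and assert statements are compiled out, so assert-based argument checks silently stop running, whereas an explicit if/raise is ordinary code that executes in every mode. The sketch below is hypothetical example code, not taken from this patch; it shows the before/after pattern and can be run both as "python demo.py" and "python -O demo.py" to observe the difference.

def check_with_assert(grad_numel: int, expected_numel: int) -> None:
    # Old style: stripped entirely under "python -O", so the check never fires there.
    assert grad_numel == expected_numel, (
        f"Expected grad_output to have {expected_numel} elements, but got {grad_numel}"
    )


def check_with_raise(grad_numel: int, expected_numel: int) -> None:
    # New style used throughout this patch: an ordinary branch that survives -O
    # and raises the same exception type (AssertionError) that the assert did.
    if grad_numel != expected_numel:
        raise AssertionError(
            f"Expected grad_output to have {expected_numel} elements, but got {grad_numel}"
        )


if __name__ == "__main__":
    for check in (check_with_assert, check_with_raise):
        try:
            check(3, 4)
            outcome = "no error"
        except AssertionError as exc:
            outcome = f"AssertionError: {exc}"
        # python demo.py    -> both checks report an AssertionError
        # python -O demo.py -> __debug__ is False, so only check_with_raise still fires
        print(f"{check.__name__}: {outcome} (__debug__={__debug__})")

Keeping AssertionError as the raised type (rather than switching to ValueError or RuntimeError) preserves the exception type callers may already expect from the original asserts; only the dependence on __debug__ is removed.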
diff --git a/torch/autograd/_functions/tensor.py b/torch/autograd/_functions/tensor.py index 3cbe61261a2..cfa6cd23323 100644 --- a/torch/autograd/_functions/tensor.py +++ b/torch/autograd/_functions/tensor.py @@ -65,5 +65,8 @@ class Resize(Function): @staticmethod # pyrefly: ignore # bad-override def backward(ctx, grad_output): - assert grad_output.numel() == ctx.numel + if grad_output.numel() != ctx.numel: + raise AssertionError( + f"Expected grad_output to have {ctx.numel} elements, but got {grad_output.numel()}" + ) return grad_output.contiguous().view(ctx.input_sizes), None diff --git a/torch/autograd/function.py b/torch/autograd/function.py index d25d93d7274..70c6cad4e99 100644 --- a/torch/autograd/function.py +++ b/torch/autograd/function.py @@ -146,10 +146,11 @@ class FunctionCtx: """ for tensor in tensors: - assert isinstance(tensor, torch.Tensor) or tensor is None, ( - "save_for_forward expects all arguments to be tensors; you should " - "save non-tensors as attributes on ctx." - ) + if not (isinstance(tensor, torch.Tensor) or tensor is None): + raise AssertionError( + "save_for_forward expects all arguments to be tensors; you should " + "save non-tensors as attributes on ctx." + ) self.saved_for_forward = tensors diff --git a/torch/autograd/functional.py b/torch/autograd/functional.py index 09ced2e03f7..e8bce9ed7c5 100644 --- a/torch/autograd/functional.py +++ b/torch/autograd/functional.py @@ -54,7 +54,8 @@ def _tuple_postprocess(res, to_unpack): # - invert _as_tuple when res should match the inp given to _as_tuple # - optionally remove nesting of two tuples created by multiple calls to _as_tuple if isinstance(to_unpack, tuple): - assert len(to_unpack) == 2 + if len(to_unpack) != 2: + raise AssertionError("Expected to_unpack tuple to have exactly 2 elements") if not to_unpack[1]: res = tuple(el[0] for el in res) if not to_unpack[0]: @@ -174,11 +175,17 @@ def _autograd_grad( ): # Version of autograd.grad that accepts `None` in outputs and do not compute gradients for them. # This has the extra constraint that inputs has to be a tuple - assert isinstance(outputs, tuple) + if not isinstance(outputs, tuple): + raise AssertionError("Expected outputs to be a tuple") if grad_outputs is None: grad_outputs = (None,) * len(outputs) - assert isinstance(grad_outputs, tuple) - assert len(outputs) == len(grad_outputs) + if not isinstance(grad_outputs, tuple): + raise AssertionError("Expected grad_outputs to be a tuple") + if len(outputs) != len(grad_outputs): + raise AssertionError( + f"Expected outputs and grad_outputs to have the same length, " + f"but got {len(outputs)} and {len(grad_outputs)}" + ) new_outputs: tuple[torch.Tensor, ...] = () new_grad_outputs: tuple[torch.Tensor, ...] = () @@ -489,8 +496,13 @@ def _construct_standard_basis_for( # See NOTE: [Computing jacobian with vmap and grad for multiple tensors] # for context behind this function. All the pre-conditions are guarded for # in torch.autograd.functional.jacobian. 
- assert len(tensors) == len(tensor_numels) - assert len(tensors) > 0 + if len(tensors) != len(tensor_numels): + raise AssertionError( + f"Expected tensors and tensor_numels to have the same length, " + f"but got {len(tensors)} and {len(tensor_numels)}" + ) + if len(tensors) == 0: + raise AssertionError("Expected at least one tensor") total_numel = sum(tensor_numels) chunks = tuple( tensor.new_zeros(total_numel, tensor_numel) @@ -664,11 +676,12 @@ def jacobian( >>> jac.shape torch.Size([4, 2, 4, 2]) """ - assert strategy in ("forward-mode", "reverse-mode"), ( - 'Expected strategy to be either "forward-mode" or "reverse-mode". Hint: If your ' - 'function has more outputs than inputs, "forward-mode" tends to be more performant. ' - 'Otherwise, prefer to use "reverse-mode".' - ) + if strategy not in ("forward-mode", "reverse-mode"): + raise AssertionError( + 'Expected strategy to be either "forward-mode" or "reverse-mode". Hint: If your ' + 'function has more outputs than inputs, "forward-mode" tends to be more performant. ' + 'Otherwise, prefer to use "reverse-mode".' + ) if strategy == "forward-mode": if create_graph: raise NotImplementedError( @@ -932,10 +945,13 @@ def hessian( [0., 6.]]))) """ is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "hessian") - assert outer_jacobian_strategy in ( + if outer_jacobian_strategy not in ( "forward-mode", "reverse-mode", - ), 'Expected strategy to be either "forward-mode" or "reverse-mode".' + ): + raise AssertionError( + 'Expected strategy to be either "forward-mode" or "reverse-mode".' + ) def ensure_single_output_function(*inp): out = func(*inp) diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index 9ea049d7165..1004b3b9631 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -408,7 +408,8 @@ class _unsafe_preserve_version_counter(_DecoratorContextManager): def __init__(self, tensors: Union[torch.Tensor, tuple[torch.Tensor, ...]]) -> None: self.tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tensors - assert isinstance(self.tensors, tuple) + if not isinstance(self.tensors, tuple): + raise AssertionError("Expected tensors to be a tuple") self.prev_versions = tuple(t._version for t in self.tensors) def __enter__(self) -> None: diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 91b6273dda0..674e42b34ad 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -363,8 +363,15 @@ def _compute_numerical_gradient(fn, entry, v, norm_v, nbhd_checks_fn): # sparse compressed tensors don't implement sub/add/copy_ # yet. However, in non-masked semantics context entry and v # have the same sparse indices ... - assert entry.layout == v.layout, (entry.layout, v.layout) - assert entry._nnz() == v._nnz(), (entry._nnz(), v._nnz(), entry.shape) + if entry.layout != v.layout: + raise AssertionError( + f"Expected entry and v to have the same layout, but got {entry.layout} and {v.layout}" + ) + if entry._nnz() != v._nnz(): + raise AssertionError( + f"Expected entry and v to have the same nnz, but got {entry._nnz()} and {v._nnz()} " + f"with entry shape {entry.shape}" + ) # ... 
the finite differencing can be performed on values only: entry = entry.values() v = v.values() @@ -403,13 +410,15 @@ def _compute_numerical_jvps_wrt_specific_input( jvp_fn(delta[1] * 1j) if isinstance(delta, tuple) else jvp_fn(delta * 1j) ) for ds_dx, ds_dy in zip(ds_dx_tup, ds_dy_tup): - assert not ds_dx.is_complex() + if ds_dx.is_complex(): + raise AssertionError("Expected ds_dx to be real-valued, not complex") # conjugate wirtinger derivative conj_w_d = ds_dx + ds_dy * 1j jvps.append(conj_w_d) else: for ds_dx in ds_dx_tup: # R -> R or (R -> C for the forward AD case) - assert is_forward_ad or not ds_dx.is_complex() + if not is_forward_ad and ds_dx.is_complex(): + raise AssertionError("Expected ds_dx to be real-valued, not complex.") jvps.append(ds_dx) return jvps @@ -456,16 +465,18 @@ def _check_outputs_same_dtype_and_shape(output1, output2, eps, idx=None) -> None # Check that the returned outputs don't have different dtype or shape when you # perturb the input on_index = f"on index {idx} " if idx is not None else "" - assert output1.shape == output2.shape, ( - f"Expected `func` to return outputs with the same shape" - f" when inputs are perturbed {on_index}by {eps}, but got:" - f" shapes {output1.shape} and {output2.shape}." - ) - assert output1.dtype == output2.dtype, ( - f"Expected `func` to return outputs with the same dtype" - f" when inputs are perturbed {on_index}by {eps}, but got:" - f" dtypes {output1.dtype} and {output2.dtype}." - ) + if output1.shape != output2.shape: + raise AssertionError( + f"Expected `func` to return outputs with the same shape" + f" when inputs are perturbed {on_index}by {eps}, but got:" + f" shapes {output1.shape} and {output2.shape}." + ) + if output1.dtype != output2.dtype: + raise AssertionError( + f"Expected `func` to return outputs with the same dtype" + f" when inputs are perturbed {on_index}by {eps}, but got:" + f" dtypes {output1.dtype} and {output2.dtype}." + ) def get_numerical_jacobian_wrt_specific_input( @@ -478,7 +489,8 @@ def get_numerical_jacobian_wrt_specific_input( # is equivalent to a single col of the Jacobian matrix of fn. 
jacobian_cols: dict[int, list[torch.Tensor]] = {} input = inputs[input_idx] if input is None else input - assert input.requires_grad + if not input.requires_grad: + raise AssertionError("Expected input to have requires_grad=True") for x, idx, d_idx in _iter_tensor(input): wrapped_fn = _with_prepare_inputs(fn, inputs, input_idx, x) input_to_perturb = x[idx] @@ -687,7 +699,11 @@ def _get_numerical_vJu( # Filter out the Ju for non floating point outputs filtered_Ju = [] func_out = _as_tuple(func_out) - assert len(all_Ju) == len(func_out) + if len(all_Ju) != len(func_out): + raise AssertionError( + f"Expected all_Ju and func_out to have the same length, " + f"but got {len(all_Ju)} and {len(func_out)}" + ) for Ju, output in zip(all_Ju, func_out): if _is_float_or_complex_tensor(output): filtered_Ju.append(Ju) @@ -734,7 +750,11 @@ def _stack_and_check_tensors( out_jacobian[:, j].zero_() else: dense = tensor.to_dense() if tensor.layout != torch.strided else tensor - assert out_jacobian[:, j].numel() == dense.numel() + if out_jacobian[:, j].numel() != dense.numel(): + raise AssertionError( + f"Expected out_jacobian column to have {dense.numel()} elements, " + f"but got {out_jacobian[:, j].numel()}" + ) out_jacobian[:, j] = dense.reshape(-1) return out_jacobians, correct_grad_sizes, correct_grad_types @@ -1061,7 +1081,8 @@ Expected: def _test_batched_grad_forward_ad(func, inputs) -> bool: fwAD = torch.autograd.forward_ad # To avoid early import issues (do we need this?) - assert isinstance(inputs, tuple) + if not isinstance(inputs, tuple): + raise AssertionError("Expected inputs to be a tuple") for input_idx, current_input in enumerate(inputs): if not (is_tensor_like(current_input) and current_input.requires_grad): @@ -1641,7 +1662,10 @@ def _slow_gradcheck( def _dot_with_type_promotion(u, v): - assert u.dim() == 1 and v.dim() == 1 + if u.dim() != 1 or v.dim() != 1: + raise AssertionError( + f"Expected u and v to be 1D tensors, but got dims {u.dim()} and {v.dim()}" + ) return (u * v).sum() @@ -1908,7 +1932,8 @@ def _fast_gradcheck( ) # TODO: replicate https://github.com/pytorch/pytorch/pull/77743 for fast gradcheck as well if use_forward_ad: - assert all_v is None + if all_v is not None: + raise AssertionError("Expected all_v to be None.") analytical_vJu = _get_analytical_jacobian_forward_ad( func, inputs, @@ -2036,15 +2061,18 @@ def gradcheck( ``True`` if all differences satisfy allclose condition """ - assert check_forward_ad or check_backward_ad, ( - "Expected at least one of check_forward_ad or check_backward_ad to be True" - ) - assert not (check_batched_grad and not check_backward_ad), ( - "Setting check_batched_grad=True requires check_backward_ad to be True" - ) - assert not (check_batched_forward_grad and not check_forward_ad), ( - "Setting check_batched_forward_grad=True requires check_forward_ad to be True" - ) + if not (check_forward_ad or check_backward_ad): + raise AssertionError( + "Expected at least one of check_forward_ad or check_backward_ad to be True" + ) + if check_batched_grad and not check_backward_ad: + raise AssertionError( + "Setting check_batched_grad=True requires check_backward_ad to be True" + ) + if check_batched_forward_grad and not check_forward_ad: + raise AssertionError( + "Setting check_batched_forward_grad=True requires check_forward_ad to be True" + ) args = locals().copy() args.pop("raise_exception") if not raise_exception: @@ -2189,15 +2217,18 @@ def gradgradcheck( Returns: True if all differences satisfy allclose condition """ - assert check_fwd_over_rev or 
check_rev_over_rev, ( - "Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True" - ) - assert not (check_undefined_grad and not check_rev_over_rev), ( - "Setting check_undefined_grad=True requires check_rev_over_rev to be True" - ) - assert not (check_batched_grad and not check_rev_over_rev), ( - "Setting check_batched_grad=True requires check_rev_over_rev to be True" - ) + if not (check_fwd_over_rev or check_rev_over_rev): + raise AssertionError( + "Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True" + ) + if check_undefined_grad and not check_rev_over_rev: + raise AssertionError( + "Setting check_undefined_grad=True requires check_rev_over_rev to be True" + ) + if check_batched_grad and not check_rev_over_rev: + raise AssertionError( + "Setting check_batched_grad=True requires check_rev_over_rev to be True" + ) # TODO: do we want to test this too? # assert not (check_batched_forward_grad and not check_fwd_over_rev), ( # "Setting check_batched_forward_grad=True requires check_fwd_over_rev to be True") diff --git a/torch/autograd/graph.py b/torch/autograd/graph.py index 7fcc5e4b876..f7615db03ed 100644 --- a/torch/autograd/graph.py +++ b/torch/autograd/graph.py @@ -187,7 +187,8 @@ def _get_grad_fn_or_grad_acc(t: Union[torch.Tensor, "GradientEdge"]) -> Node: node = t.view_as(t).grad_fn.next_functions[0][0] # type: ignore[union-attr] else: node = t.grad_fn - assert node is not None + if node is None: + raise AssertionError("Expected gradient function to be set") return node @@ -528,9 +529,10 @@ def register_multi_grad_hook( def inner_hook(grad: torch.Tensor) -> None: nonlocal count, nb_calls, buffer, fn id = torch._C._current_graph_task_id() - assert id != -1, ( - "expected this hook to be called inside a backward call" - ) + if id == -1: + raise AssertionError( + "expected this hook to be called inside a backward call" + ) count[id] = count.get(id, 0) # pyrefly: ignore # unsupported-operation buffer[id] = buffer.get(id, [None] * len_tensors) @@ -546,7 +548,8 @@ def register_multi_grad_hook( buffer[id][idx] = grad - assert nb_calls is not None + if nb_calls is None: + raise AssertionError("Expected nb_calls to be set") if curr_count == nb_calls - 1: fn = cast(Callable[[Sequence[Optional[torch.Tensor]]], None], fn) fn(buffer[id]) @@ -566,7 +569,10 @@ def register_multi_grad_hook( def wrapped_fn(grad: torch.Tensor) -> None: nonlocal ran_hook id = torch._C._current_graph_task_id() - assert id != -1, "expected this hook to be called inside a backward call" + if id == -1: + raise AssertionError( + "expected this hook to be called inside a backward call" + ) with lock: prev, ran_hook[id] = ran_hook[id], True if prev: @@ -662,11 +668,13 @@ class _swap_with_cloned(saved_tensors_hooks): "Trying to backward outside of the 'allow_mutation_on_saved_tensors' context" "in which the graph was originally recorded." 
) - assert _allow_mutation_on_saved_tensors_enabled, error_msg + if not _allow_mutation_on_saved_tensors_enabled: + raise AssertionError(error_msg) if handle in ctx.cloned: res = ctx.cloned[handle] else: - assert handle in ctx.original, error_msg + if handle not in ctx.original: + raise AssertionError(error_msg) res = ctx.original[handle] return res diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index cdab6259d85..5c478e514d0 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -255,9 +255,10 @@ class profile: self.custom_trace_id_callback = custom_trace_id_callback self.trace_id = "" if not self.use_cpu: - assert use_kineto, ( - "Device-only events supported only with Kineto (use_kineto=True)" - ) + if not use_kineto: + raise AssertionError( + "Device-only events supported only with Kineto (use_kineto=True)" + ) if self.use_device is not None: VALID_DEVICE_OPTIONS = ["cuda", "xpu", "mtia", "hpu"] @@ -289,40 +290,44 @@ class profile: self.profiler_kind = ProfilerState.KINETO if self.use_device == "cuda": if not use_kineto or ProfilerActivity.CUDA not in _supported_activities(): - assert self.use_cpu, "Legacy CUDA profiling requires use_cpu=True" + if not self.use_cpu: + raise AssertionError("Legacy CUDA profiling requires use_cpu=True") self.profiler_kind = ProfilerState.KINETO_GPU_FALLBACK else: self.kineto_activities.add(ProfilerActivity.CUDA) elif self.use_device == "xpu": - assert use_kineto and ProfilerActivity.XPU in _supported_activities(), ( - "Legacy XPU profiling is not supported. Requires use_kineto=True on XPU devices." - ) + if not (use_kineto and ProfilerActivity.XPU in _supported_activities()): + raise AssertionError( + "Legacy XPU profiling is not supported. Requires use_kineto=True on XPU devices." + ) self.kineto_activities.add(ProfilerActivity.XPU) elif self.use_device == "mtia": - assert use_kineto and ProfilerActivity.MTIA in _supported_activities(), ( - "Legacy MTIA profiling is not supported. Requires use_kineto=True on MTIA devices." - ) + if not (use_kineto and ProfilerActivity.MTIA in _supported_activities()): + raise AssertionError( + "Legacy MTIA profiling is not supported. Requires use_kineto=True on MTIA devices." + ) self.kineto_activities.add(ProfilerActivity.MTIA) elif self.use_device == "hpu": - assert use_kineto and ProfilerActivity.HPU in _supported_activities(), ( - "Legacy HPU profiling is not supported. Requires use_kineto=True on HPU devices." - ) + if not (use_kineto and ProfilerActivity.HPU in _supported_activities()): + raise AssertionError( + "Legacy HPU profiling is not supported. Requires use_kineto=True on HPU devices." 
+ ) self.kineto_activities.add(ProfilerActivity.HPU) elif self.use_device is not None and self.use_device != "privateuseone": if ( not use_kineto or ProfilerActivity.PrivateUse1 not in _supported_activities() ): - assert self.use_cpu, ( - "Legacy custombackend profiling requires use_cpu=True" - ) + if not self.use_cpu: + raise AssertionError( + "Legacy custombackend profiling requires use_cpu=True" + ) self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1_FALLBACK else: self.kineto_activities.add(ProfilerActivity.PrivateUse1) - assert len(self.kineto_activities) > 0, ( - "No activities specified for the profiler" - ) + if len(self.kineto_activities) == 0: + raise AssertionError("No activities specified for the profiler") def default_trace_id(self): # Generate a UUID @@ -472,7 +477,8 @@ class profile: top_level_events_only=False, ): self._ensure_function_events() - assert self._function_events is not None + if self._function_events is None: + raise AssertionError("Expected profiling results") return self._function_events.table( sort_by=sort_by, row_limit=row_limit, @@ -500,8 +506,10 @@ class profile: def export_stacks(self, path: str, metric: str = "self_cpu_time_total"): self._ensure_function_events() - assert self._function_events is not None, "Expected profiling results" - assert self.with_stack, "export_stacks() requires with_stack=True" + if self._function_events is None: + raise AssertionError("Expected profiling results") + if not self.with_stack: + raise AssertionError("export_stacks() requires with_stack=True") return self._function_events.export_stacks(path, metric) def toggle_collection_dynamic( @@ -519,7 +527,8 @@ class profile: group_by_overload_name=False, ): self._ensure_function_events() - assert self._function_events is not None, "Expected profiling results" + if self._function_events is None: + raise AssertionError("Expected profiling results") return self._function_events.key_averages( group_by_input_shape, group_by_stack_n, group_by_overload_name ) @@ -528,7 +537,8 @@ class profile: def total_average(self): self._ensure_function_events() - assert self._function_events is not None, "Expected profiling results" + if self._function_events is None: + raise AssertionError("Expected profiling results") return self._function_events.total_average() total_average.__doc__ = EventList.total_average.__doc__ @@ -540,7 +550,8 @@ class profile: The total time is a sum of all self times across all the events. 
""" self._ensure_function_events() - assert self._function_events is not None + if self._function_events is None: + raise AssertionError("Expected profiling results") return self._function_events.self_cpu_time_total def _parse_kineto_results(self, result: _ProfilerResult): @@ -796,7 +807,8 @@ class record_function(_ContextDecorator): # Local variable is needed by TorchScript to refine Optional[T] to T record = self.record - assert record is not None + if record is None: + raise AssertionError("Expected record to be set") # TODO: Too slow with __torch_function__ handling enabled # See https://github.com/pytorch/pytorch/issues/76410 @@ -833,7 +845,8 @@ class record_function(_ContextDecorator): # Local variable is needed by TorchScript to refine Optional[T] to T record = self.record - assert record is not None + if record is None: + raise AssertionError("Expected record to be set") # TODO: Too slow with __torch_function__ handling enabled # See https://github.com/pytorch/pytorch/issues/76410 @@ -1124,7 +1137,8 @@ def parse_nvprof_trace(path): for row in conn.execute(kernel_query): unique.see(row["marker_id"], row["runtime_id"]) # 211 is cudaKernelLaunch for cuda >= 9.2 - assert row["cbid"] == 211 + if row["cbid"] != 211: + raise AssertionError(f"Expected cbid to be 211, but got {row['cbid']}") evt = functions_map[row["marker_id"]] evt.append_kernel( row["kernel_name"], 0, row["kernel_end"] - row["kernel_start"] diff --git a/torch/autograd/profiler_legacy.py b/torch/autograd/profiler_legacy.py index 43942d3eb38..17d84debe12 100644 --- a/torch/autograd/profiler_legacy.py +++ b/torch/autograd/profiler_legacy.py @@ -137,7 +137,8 @@ class profile: top_level_events_only=False, ): self._check_finish() - assert self.function_events is not None + if self.function_events is None: + raise AssertionError("Expected profiling results") return self.function_events.table( sort_by=sort_by, row_limit=row_limit, @@ -152,27 +153,32 @@ class profile: def export_chrome_trace(self, path): self._check_finish() - assert self.function_events is not None + if self.function_events is None: + raise AssertionError("Expected profiling results") return self.function_events.export_chrome_trace(path) export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__ def export_stacks(self, path: str, metric: str = "self_cpu_time_total"): self._check_finish() - assert self.function_events is not None, "Expected profiling results" - assert self.with_stack, "export_stacks() requires with_stack=True" + if self.function_events is None: + raise AssertionError("Expected profiling results") + if not self.with_stack: + raise AssertionError("export_stacks() requires with_stack=True") return self.function_events.export_stacks(path, metric) def key_averages(self, group_by_input_shape=False, group_by_stack_n=0): self._check_finish() - assert self.function_events is not None, "Expected profiling results" + if self.function_events is None: + raise AssertionError("Expected profiling results") return self.function_events.key_averages(group_by_input_shape, group_by_stack_n) key_averages.__doc__ = EventList.key_averages.__doc__ def total_average(self): self._check_finish() - assert self.function_events is not None, "Expected profiling results" + if self.function_events is None: + raise AssertionError("Expected profiling results") return self.function_events.total_average() total_average.__doc__ = EventList.total_average.__doc__ @@ -181,7 +187,8 @@ class profile: def self_cpu_time_total(self): """Return CPU time as the sum of self times across 
all events.""" self._check_finish() - assert self.function_events is not None + if self.function_events is None: + raise AssertionError("Expected profiling results") return self.function_events.self_cpu_time_total @@ -199,7 +206,8 @@ def _parse_legacy_records(thread_records): if start_record is None and name == "__start_profile": start_record = record - assert start_record is not None and not start_record.is_remote() + if start_record is None or start_record.is_remote(): + raise AssertionError("Expected a valid local start_record") for thread_record_list in thread_records: # accumulated memory allocations per handle @@ -233,10 +241,11 @@ def _parse_legacy_records(thread_records): cpu_memory_allocs[record_key] = 0 cuda_memory_allocs[record_key] = 0 elif record.kind() == "pop": - assert ( - record_key in range_starts - ), f"""Expected record with key {record_key} to exist in range_starts. - This means that the pop event did not have a corresponding push.""" + if record_key not in range_starts: + raise AssertionError( + f"Expected record with key {record_key} to exist in range_starts. " + "This means that the pop event did not have a corresponding push." + ) start = range_starts[record_key] @@ -282,7 +291,11 @@ def _parse_legacy_records(thread_records): elif record.kind() == "memory_alloc": num_open_handles_cpu = len(cpu_memory_allocs) num_open_handles_cuda = len(cuda_memory_allocs) - assert num_open_handles_cpu == num_open_handles_cuda + if num_open_handles_cpu != num_open_handles_cuda: + raise AssertionError( + f"Expected CPU and CUDA memory allocation handles to match, " + f"but got {num_open_handles_cpu} CPU and {num_open_handles_cuda} CUDA" + ) for handle in cpu_memory_allocs.keys(): cpu_memory_allocs[handle] += record.cpu_memory_usage() for handle in cuda_memory_allocs.keys(): diff --git a/torch/autograd/profiler_util.py b/torch/autograd/profiler_util.py index ff156b95bc0..530937928b8 100644 --- a/torch/autograd/profiler_util.py +++ b/torch/autograd/profiler_util.py @@ -130,9 +130,10 @@ class EventList(list): current_events.pop() else: parent.append_cpu_child(event) - assert event.cpu_parent is None, ( - f"There is already a CPU parent event for {event.key}" - ) + if event.cpu_parent is not None: + raise AssertionError( + f"There is already a CPU parent event for {event.key}" + ) event.set_cpu_parent(parent) break @@ -157,7 +158,10 @@ class EventList(list): for evt in self: p = bw_parent(evt) if p is not None: - assert p.fwd_thread is not None + if p.fwd_thread is None: + raise AssertionError( + "Expected fwd_thread to be set for backward parent" + ) t = (p.sequence_nr, p.fwd_thread) evt.stack = fwd_stacks.get(t, []) @@ -322,7 +326,10 @@ class EventList(list): Returns: An EventList containing FunctionEventAvg objects. 
""" - assert self._tree_built + if not self._tree_built: + raise AssertionError( + "Expected tree to be built before calling key_averages" + ) stats: dict[tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg) def get_key( @@ -392,7 +399,8 @@ def _format_time(time_us): def _format_time_share(time_us, total_time_us): """Define how to format time in FunctionEvent.""" if total_time_us == 0: - assert time_us == 0, f"Expected time_us == 0 but got {time_us}" + if time_us != 0: + raise AssertionError(f"Expected time_us == 0 but got {time_us}") return "NaN" return f"{time_us * 100.0 / total_time_us:.2f}%" @@ -537,7 +545,8 @@ class FunctionEvent(FormattedTimesMixin): self.metadata_json = metadata_json def append_kernel(self, name, device, duration): - assert self.device_type == DeviceType.CPU + if self.device_type != DeviceType.CPU: + raise AssertionError("Expected device_type to be CPU") self.kernels.append(Kernel(name, device, duration)) def append_cpu_child(self, child): @@ -546,9 +555,12 @@ class FunctionEvent(FormattedTimesMixin): One is supposed to append only direct children to the event to have correct self cpu time being reported. """ - assert self.device_type == DeviceType.CPU - assert isinstance(child, FunctionEvent) - assert child.device_type == DeviceType.CPU + if self.device_type != DeviceType.CPU: + raise AssertionError("Expected device_type to be CPU") + if not isinstance(child, FunctionEvent): + raise AssertionError("Expected child to be a FunctionEvent") + if child.device_type != DeviceType.CPU: + raise AssertionError("Expected child device_type to be CPU") self.cpu_children.append(child) def set_cpu_parent(self, parent): @@ -558,9 +570,12 @@ class FunctionEvent(FormattedTimesMixin): the child's range interval is completely inside the parent's. We use this connection to determine the event is from top-level op or not. 
""" - assert self.device_type == DeviceType.CPU - assert isinstance(parent, FunctionEvent) - assert parent.device_type == DeviceType.CPU + if self.device_type != DeviceType.CPU: + raise AssertionError("Expected device_type to be CPU") + if not isinstance(parent, FunctionEvent): + raise AssertionError("Expected parent to be a FunctionEvent") + if parent.device_type != DeviceType.CPU: + raise AssertionError("Expected parent device_type to be CPU") self.cpu_parent = parent # Note: async events don't have children, are not used when computing 'self' @@ -618,12 +633,15 @@ class FunctionEvent(FormattedTimesMixin): # each legacy cpu events has a single (fake) kernel return sum(kinfo.duration for kinfo in self.kernels) else: - assert self.device_type in [ + if self.device_type not in [ DeviceType.CUDA, DeviceType.PrivateUse1, DeviceType.MTIA, DeviceType.HPU, - ] + ]: + raise AssertionError( + f"Expected device_type to be CUDA, PrivateUse1, MTIA, or HPU, but got {self.device_type}" + ) return self.time_range.elapsed_us() @property @@ -643,12 +661,15 @@ class FunctionEvent(FormattedTimesMixin): child.device_time_total for child in self.cpu_children ) else: - assert self.device_type in [ + if self.device_type not in [ DeviceType.CUDA, DeviceType.PrivateUse1, DeviceType.MTIA, DeviceType.HPU, - ] + ]: + raise AssertionError( + f"Expected device_type to be CUDA, PrivateUse1, MTIA, or HPU, but got {self.device_type}" + ) return self.device_time_total @property @@ -726,8 +747,14 @@ class FunctionEventAvg(FormattedTimesMixin): self.use_device = other.use_device self.is_user_annotation = other.is_user_annotation - assert isinstance(other, (FunctionEvent, FunctionEventAvg)) - assert other.key == self.key + if not isinstance(other, (FunctionEvent, FunctionEventAvg)): + raise AssertionError( + "Expected other to be a FunctionEvent or FunctionEventAvg" + ) + if other.key != self.key: + raise AssertionError( + f"Expected keys to match, but got {other.key} vs {self.key}" + ) self.cpu_time_total += other.cpu_time_total self.device_time_total += other.device_time_total @@ -974,10 +1001,14 @@ def _build_table( "TFLOPs", "PFLOPs", ] - assert flops > 0 + if flops <= 0: + raise AssertionError(f"Expected flops to be positive, but got {flops}") # pyrefly: ignore # no-matching-overload log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1))) - assert log_flops >= 0 and log_flops < len(flop_headers) + if not (log_flops >= 0 and log_flops < len(flop_headers)): + raise AssertionError( + f"Expected log_flops to be in range [0, {len(flop_headers)}), but got {log_flops}" + ) return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)]) add_column(name_column_width)