Fix flake8 B028 warnings (#166224)
This PR fixes flake8 B028 warnings by specifying `stacklevel=2` in `warnings.warn` calls. With `stacklevel=2`, a warning is attributed to the caller of the PyTorch API rather than to the line inside PyTorch that emits it, so users get more contextual information about where the warning originates.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166224 Approved by: https://github.com/ezyang
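As background, here is a minimal sketch of what `stacklevel=2` changes in practice; the `library_api`/`user_code` names are made up for illustration and are not part of the patch:

import warnings


def library_api():
    # stacklevel=1 (the default) attributes the warning to this line inside the library.
    warnings.warn("library_api is deprecated")
    # stacklevel=2 attributes the warning to the caller of library_api(),
    # which is usually the user code that actually needs to change.
    warnings.warn("library_api is deprecated", stacklevel=2)


def user_code():
    library_api()  # with stacklevel=2, the reported file/line is this call site


user_code()

Flake8-bugbear's B028 check flags `warnings.warn` calls that omit an explicit `stacklevel`, which is why each call touched in the diff below gains a trailing `stacklevel=2,` argument.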
parent f863550192
commit a60d9e1f6d
@@ -2653,7 +2653,8 @@ def compile(
     if torch.compiler.is_exporting():
         warnings.warn(
             "You are calling torch.compile inside torch.export region. "
-            "To capture an useful graph, we will implicitly switch to torch.compile(backend=eager)"
+            "To capture an useful graph, we will implicitly switch to torch.compile(backend=eager)",
+            stacklevel=2,
         )
         from torch._higher_order_ops.utils import setup_compilation_env
@@ -55,6 +55,7 @@ def warn_deprecated():
         "torch._custom_op is deprecated and will be removed in PyTorch 2.6, please "
         "use the equivalent torch.library API instead.",
         DeprecationWarning,
+        stacklevel=2,
     )
@@ -704,7 +704,8 @@ class TS2FXGraphConverter:
         # In a sense, the converter now becomes an stateful interpreter
         warnings.warn(
             "Converting aten::append.t, which is a inplace mutation of the list. "
-            "This makes the converter non-functional: the result depends on the order of the append nodes being converter!"
+            "This makes the converter non-functional: the result depends on the order of the append nodes being converter!",
+            stacklevel=2,
         )

         args = tuple(self.get_fx_value_by_ir_value(inp) for inp in node.inputs())

@@ -1471,7 +1472,8 @@ DEBUG: (TORCH_LOGS="+export" <cmd>), additionally
         for k, tensor in self.ts_model.state_dict().items():  # type: ignore[union-attr]
             if k not in ep.state_dict:
                 warnings.warn(
-                    f"Manually populate {k} into state_dict ExportedProgram, but it is never used by the ExportedProgram."
+                    f"Manually populate {k} into state_dict ExportedProgram, but it is never used by the ExportedProgram.",
+                    stacklevel=2,
                 )
                 ep.state_dict[k] = tensor
@@ -51,7 +51,8 @@ def _generate_inputs_for_submodules(
         model(*args, **kwargs)
     except Exception as e:
         warnings.warn(
-            f"Failed to generate submodule inputs because of the following error:\n{e}"
+            f"Failed to generate submodule inputs because of the following error:\n{e}",
+            stacklevel=2,
         )
     finally:
         for h in handles:
@@ -321,5 +321,6 @@ def _detect_attribute_assignment(mod: torch.nn.Module):
         warnings.warn(
             f"The tensor {noun} {', '.join(assigned_tensor_attributes)} {verb} assigned during export. "
             "Such attributes must be registered as buffers using the `register_buffer` API "
-            "(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer)."
+            "(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer).",
+            stacklevel=2,
         )
@@ -137,7 +137,8 @@ def call_func_at_runtime_with_args(
         warnings.warn(
             "Your compiler for AOTAutograd is returning a function that doesn't take boxed arguments. "
             "Please wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. "
-            "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale."
+            "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.",
+            stacklevel=2,
         )
         out = normalize_as_list(f(*args))
         return out
@@ -518,7 +518,8 @@ def do_auto_functionalize(
     if "self" in unwrapped_kwargs or "self_" in unwrapped_kwargs:
         warnings.warn(
             "Using `self` or `self_` as an argument in the definition of custom ops may lead to ambiguous parsing. "
-            "Please consider using a different name for this argument to avoid potential issues."
+            "Please consider using a different name for this argument to avoid potential issues.",
+            stacklevel=2,
         )
     with ctx.redispatch_to_next():
         unwrapped_outs = auto_functionalized(

@@ -691,7 +692,8 @@ def do_auto_functionalize_v2(
     if "self" in unwrapped_kwargs or "self_" in unwrapped_kwargs:
         warnings.warn(
             "Using `self` or `self_` as an argument in the definition of custom ops may lead to ambiguous parsing. "
-            "Please consider using a different name for this argument to avoid potential issues."
+            "Please consider using a different name for this argument to avoid potential issues.",
+            stacklevel=2,
         )
     all_basis_unwrapped = ctx.unwrap_tensors(all_bases)
@@ -196,7 +196,8 @@ class BaseHOP(HigherOrderOperator, abc.ABC):
                 "Aliasing is not supported for HOP subgraph.\n"
                 f"{subgraph.print_readable(print_output=False)}\n"
                 f"Alias info: inp-inp alias: {inp_inp_alias}, inp-out alias: {inp_out_alias}, out-out alias{out_out_alias}"
-                f"This may lead to silent incorrectness."
+                f"This may lead to silent incorrectness.",
+                stacklevel=2,
             )

             schema_gen = HopSchemaGenerator(self)
@@ -177,6 +177,7 @@ def cond(
             "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches."
             " If you want torch.cond to preserve two branches, please make the predicate a boolean tensor or a SymBool.",
             UserWarning,
+            stacklevel=2,
         )
         # This is the eager case. We can just run the true or false branch.
         if pred:
@@ -859,6 +859,7 @@ def ignore(drop=False, **kwargs):
         warnings.warn(
             "ignore(drop_on_export=True) has been deprecated. TorchScript will now drop the function "
             "call on compilation. Use torch.jit.unused now. {}",
+            stacklevel=2,
             category=FutureWarning,
         )

@@ -867,6 +868,7 @@ def ignore(drop=False, **kwargs):
         warnings.warn(
             "ignore(True) has been deprecated. TorchScript will now drop the function "
             "call on compilation. Use torch.jit.unused now. {}",
+            stacklevel=2,
             category=FutureWarning,
         )

@@ -992,7 +994,8 @@ def _check_overload_body(func):
         # Parsing the function definition can raise an OSError if source is unavailable.
         # Since this is just an initial check, just raise a warning if this is the case.
         warnings.warn(
-            f"Unable to retrieve source for @torch.jit._overload function: {func}."
+            f"Unable to retrieve source for @torch.jit._overload function: {func}.",
+            stacklevel=2,
         )
         return

@@ -1385,7 +1388,8 @@ def check_empty_containers(obj) -> None:
             "calling torch.jit.isinstance in eager mode. For "
             "example, List[int] would become list and "
             "therefore falsely return True for List[float] or"
-            " List[str]."
+            " List[str].",
+            stacklevel=2,
         )
@@ -2137,7 +2137,8 @@ def alert_not_deterministic(caller: str):
            f"{caller} does not have a deterministic implementation, but you set "
            f"'torch.use_deterministic_algorithms(True, warn_only=True)'. "
            f"You can file an issue at https://github.com/pytorch/pytorch/issues "
-           f"to help us prioritize adding deterministic support for this operation."
+           f"to help us prioritize adding deterministic support for this operation.",
+           stacklevel=2,
        )
    else:
        torch._check(
@@ -180,7 +180,7 @@ def _resize_output_check(out: TensorLikeType, shape: ShapeType):
         "be resized unless they have zero elements. "
         "You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0)."
     )
-    warnings.warn(msg)
+    warnings.warn(msg, stacklevel=2)
     return True
@@ -3729,7 +3729,8 @@ def istft(
     if end > expected_output_signal_len:
         warnings.warn(
             "The length of signal is shorter than the length parameter. Result is being "
-            + "padded with zeros in the tail. Please check your center and hop_length settings"
+            + "padded with zeros in the tail. Please check your center and hop_length settings",
+            stacklevel=2,
         )
         y = aten.constant_pad_nd(y, (0, end - expected_output_signal_len), 0)
     return y
@@ -405,7 +405,8 @@ class FunctionalTensorMode(TorchDispatchMode):
                 warnings.warn(
                     f"At pre-dispatch tracing, we assume that any custom op marked with "
                     f"CompositeImplicitAutograd and have functional schema are safe to not decompose. "
-                    f"Found {func} to be one such op."
+                    f"Found {func} to be one such op.",
+                    stacklevel=2,
                 )
                 return False
         return True
@@ -350,7 +350,8 @@ class Tensor(torch._C.TensorBase):
            # hypothesis is that no one cares for meta tensors.
            if skip_data:
                warnings.warn(
-                   "Serializing tensors on the meta device under skip_data context manager is a no-op"
+                   "Serializing tensors on the meta device under skip_data context manager is a no-op",
+                   stacklevel=2,
                )
            arg_meta = (
                self.dtype,

@@ -1033,7 +1034,7 @@ class Tensor(torch._C.TensorBase):
    def resize(self, *sizes):
        if has_torch_function_unary(self):
            return handle_torch_function(Tensor.resize, (self,), self, *sizes)
-       warnings.warn("non-inplace resize is deprecated")
+       warnings.warn("non-inplace resize is deprecated", stacklevel=2)
        from torch.autograd._functions import Resize

        return Resize.apply(self, sizes)

@@ -1041,7 +1042,7 @@ class Tensor(torch._C.TensorBase):
    def resize_as(self, tensor):
        if has_torch_function_variadic(self, tensor):
            return handle_torch_function(Tensor.resize_as, (self, tensor), self, tensor)
-       warnings.warn("non-inplace resize_as is deprecated")
+       warnings.warn("non-inplace resize_as is deprecated", stacklevel=2)
        from torch.autograd._functions import Resize

        return Resize.apply(self, tensor.size())
@@ -118,7 +118,7 @@ def _get_async_or_non_blocking(function_name, non_blocking, kwargs):
        message = "{}() got an unexpected keyword argument '{}'"
        argument = list(kwargs.keys()).pop()
        raise TypeError(message.format(function_name, argument))
-   warnings.warn("'async' is deprecated; use 'non_blocking'")
+   warnings.warn("'async' is deprecated; use 'non_blocking'", stacklevel=2)
    return kwargs["async"]
@@ -555,7 +555,8 @@ class Unpickler:
                    f"Detected pickle protocol {self.proto} in the checkpoint, which was "
                    "not the default pickle protocol used by `torch.load` (2). The weights_only "
                    "Unpickler might not support all instructions implemented by this protocol, "
-                   "please file an issue for adding support if you encounter this."
+                   "please file an issue for adding support if you encounter this.",
+                   stacklevel=2,
                )
            elif key[0] == STOP[0]:
                rc = self.stack.pop()
@@ -267,7 +267,8 @@ class autocast:
            and torch.cuda.amp.common.amp_definitely_not_available()
        ):
            warnings.warn(
-               "User provided device_type of 'cuda', but CUDA is not available. Disabling"
+               "User provided device_type of 'cuda', but CUDA is not available. Disabling",
+               stacklevel=2,
            )
            enabled = False
        if cache_enabled is not None:

@@ -281,42 +282,42 @@ class autocast:
                error_message += (
                    ", ".join(str(dtype) for dtype in supported_dtype) + " currently."
                )
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        elif self.device == "mtia":
            supported_dtype = [torch.bfloat16, torch.float16]
            if self.fast_dtype not in supported_dtype:
                error_message = "In MTIA autocast, but the target dtype is not supported. Disabling autocast.\n"
                error_message += "MTIA Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently."
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        elif self.device == "maia":
            supported_dtype = [torch.bfloat16, torch.float16]
            if self.fast_dtype not in supported_dtype:
                error_message = "In MAIA autocast, but the target dtype is not supported. Disabling autocast.\n"
                error_message += "MAIA Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently."
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        elif self.device == "xpu":
            supported_dtype = [torch.bfloat16, torch.float16]
            if self.fast_dtype not in supported_dtype:
                error_message = "In XPU autocast, but the target dtype is not supported. Disabling autocast.\n"
                error_message += "XPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently."
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        elif self.device == "ipu":
            supported_dtypes = [torch.bfloat16, torch.float16]
            if self.fast_dtype not in supported_dtypes:
                error_message = "In IPU autocast, but the target dtype is not supported. Disabling autocast.\n"
                error_message += "IPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently."
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        elif self.device == "hpu":
            supported_dtype = [torch.bfloat16, torch.float16]
            if self.fast_dtype not in supported_dtype:
                error_message = "In HPU autocast, but the target dtype is not supported. Disabling autocast.\n"
                error_message += "HPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently."
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        elif self.device == self.custom_backend_name:
            supported_dtype = self.custom_device_mod.get_amp_supported_dtype()

@@ -326,7 +327,7 @@ class autocast:
                error_message += (
                    ", ".join(str(dtype) for dtype in supported_dtype) + " currently."
                )
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        elif self.device == "cuda":
            if (

@@ -344,7 +345,7 @@ class autocast:
                    "In MPS autocast, but the target dtype is not supported. Disabling autocast.\n"
                    "MPS Autocast only supports dtype of torch.bfloat16 and torch.float16 currently."
                )
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
            elif self.fast_dtype == torch.bfloat16:
                if not torch.backends.mps.is_macos_or_newer(14, 0):

@@ -352,7 +353,7 @@ class autocast:
                    "In MPS autocast, but the target dtype torch.bfloat16 is not supported "
                    "on macOS versions below 14. Disabling autocast."
                )
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        elif self.device == "xla":
            supported_dtype = [torch.float16, torch.bfloat16]

@@ -361,7 +362,7 @@ class autocast:
                error_message += (
                    "XLA Autocast only supports dtype of torch.bfloat16 currently."
                )
-               warnings.warn(error_message)
+               warnings.warn(error_message, stacklevel=2)
                enabled = False
        self._enabled = enabled
@@ -422,6 +422,7 @@ class GradScaler:
                    "optimizer. In the near future GradScaler registers `grad_scale: Tensor` and "
                    "`found_inf: Tensor` to the passed optimizer and let the optimizer use them directly.",
                    FutureWarning,
+                   stacklevel=2,
                )
                kwargs_.update({"grad_scaler": self})
            else:
@@ -469,14 +469,16 @@ class LSTM(torch.nn.Module):
            warnings.warn(
                "dropout option for quantizable LSTM is ignored. "
                "If you are training, please, use nn.LSTM version "
-               "followed by `prepare` step."
+               "followed by `prepare` step.",
+               stacklevel=2,
            )
            if num_layers == 1:
                warnings.warn(
                    "dropout option adds dropout after all but last "
                    "recurrent layer, so non-zero dropout expects "
                    f"num_layers greater than 1, but got dropout={dropout} "
-                   f"and num_layers={num_layers}"
+                   f"and num_layers={num_layers}",
+                   stacklevel=2,
                )

        layers = [
@ -68,7 +68,8 @@ class Conv1d(nnq.Conv1d):
|
|||
reduce_range=True,
|
||||
):
|
||||
warnings.warn(
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950
|
||||
stacklevel=2,
|
||||
)
|
||||
factory_kwargs = {"device": device, "dtype": dtype}
|
||||
kernel_size = _single(kernel_size)
|
||||
|
|
@ -155,7 +156,8 @@ class Conv2d(nnq.Conv2d):
|
|||
):
|
||||
warnings.warn(
|
||||
f"The current implementation of the {self._get_name()} module "
|
||||
"has poor numerical accuracy and its use is not recommended"
|
||||
"has poor numerical accuracy and its use is not recommended",
|
||||
stacklevel=2,
|
||||
)
|
||||
factory_kwargs = {"device": device, "dtype": dtype}
|
||||
kernel_size = _pair(kernel_size)
|
||||
|
|
@ -239,7 +241,8 @@ class Conv3d(nnq.Conv3d):
|
|||
dtype=None,
|
||||
):
|
||||
warnings.warn(
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950
|
||||
stacklevel=2,
|
||||
)
|
||||
assert padding_mode != "reflect", "Conv3d does not support reflection padding"
|
||||
factory_kwargs = {"device": device, "dtype": dtype}
|
||||
|
|
@ -330,7 +333,8 @@ class ConvTranspose1d(nnq.ConvTranspose1d):
|
|||
dtype=None,
|
||||
):
|
||||
warnings.warn(
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950
|
||||
stacklevel=2,
|
||||
)
|
||||
factory_kwargs = {"device": device, "dtype": dtype}
|
||||
super().__init__(
|
||||
|
|
@ -412,7 +416,8 @@ class ConvTranspose2d(nnq.ConvTranspose2d):
|
|||
dtype=None,
|
||||
):
|
||||
warnings.warn(
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950
|
||||
stacklevel=2,
|
||||
)
|
||||
factory_kwargs = {"device": device, "dtype": dtype}
|
||||
super().__init__(
|
||||
|
|
@ -494,7 +499,8 @@ class ConvTranspose3d(nnq.ConvTranspose3d):
|
|||
dtype=None,
|
||||
):
|
||||
warnings.warn(
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950
|
||||
f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950
|
||||
stacklevel=2,
|
||||
)
|
||||
factory_kwargs = {"device": device, "dtype": dtype}
|
||||
super().__init__(
|
||||
|
|
|
|||
|
|
@ -136,7 +136,8 @@ class RNNBase(torch.nn.Module):
|
|||
"dropout option adds dropout after all but last "
|
||||
"recurrent layer, so non-zero dropout expects "
|
||||
f"num_layers greater than 1, but got dropout={dropout} and "
|
||||
f"num_layers={num_layers}"
|
||||
f"num_layers={num_layers}",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if mode == "LSTM":
|
||||
|
|
|
|||
|
|
@ -724,7 +724,8 @@ def upsample(input, size=None, scale_factor=None, mode="nearest", align_corners=
|
|||
affects the outputs.
|
||||
"""
|
||||
warnings.warn(
|
||||
"nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead."
|
||||
"nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return interpolate(input, size, scale_factor, mode, align_corners)
|
||||
|
||||
|
|
@ -749,7 +750,8 @@ def upsample_bilinear(input, size=None, scale_factor=None):
|
|||
"""
|
||||
# DeprecationWarning is ignored by default
|
||||
warnings.warn(
|
||||
"nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead."
|
||||
"nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return interpolate(input, size, scale_factor, mode="bilinear", align_corners=True)
|
||||
|
||||
|
|
@ -774,6 +776,7 @@ def upsample_nearest(input, size=None, scale_factor=None):
|
|||
"""
|
||||
# DeprecationWarning is ignored by default
|
||||
warnings.warn(
|
||||
"nn.quantized.functional.upsample_nearest is deprecated. Use nn.quantized.functional.interpolate instead."
|
||||
"nn.quantized.functional.upsample_nearest is deprecated. Use nn.quantized.functional.interpolate instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return interpolate(input, size, scale_factor, mode="nearest")
|
||||
|
|
|
|||
|
|
@ -322,7 +322,8 @@ class PReLU(torch.nn.Module):
|
|||
observer(float_wt)
|
||||
if observer.dtype != torch.quint8:
|
||||
warn(
|
||||
f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}"
|
||||
f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}",
|
||||
stacklevel=2,
|
||||
)
|
||||
wt_scale, wt_zp = observer.calculate_qparams()
|
||||
qweight = torch.quantize_per_tensor(
|
||||
|
|
@ -339,7 +340,8 @@ class PReLU(torch.nn.Module):
|
|||
observer(float_wt)
|
||||
if observer.dtype != torch.quint8:
|
||||
warn(
|
||||
f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}"
|
||||
f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}",
|
||||
stacklevel=2,
|
||||
)
|
||||
wt_scale, wt_zp = observer.calculate_qparams()
|
||||
qweight = torch.quantize_per_tensor(
|
||||
|
|
|
|||
|
|
@ -213,7 +213,8 @@ class ActivationSparsifier:
|
|||
|
||||
if name in self.data_groups: # unregister layer if already present
|
||||
warnings.warn(
|
||||
"layer already attached to the sparsifier, deregistering the layer and registering with new config"
|
||||
"layer already attached to the sparsifier, deregistering the layer and registering with new config",
|
||||
stacklevel=2,
|
||||
)
|
||||
self.unregister_layer(name=name)
|
||||
|
||||
|
|
|
|||
|
|
@ -158,6 +158,7 @@ class BaseDataScheduler:
|
|||
"initialization. Please, make sure to call `data_sparsifier.step()` before "
|
||||
"`scheduler.step()`.",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
# Just check if there were two first scheduler.step() calls before sparsifier.step()
|
||||
|
|
@ -167,6 +168,7 @@ class BaseDataScheduler:
|
|||
"You have to make sure you run the data_sparsifier.step() BEFORE any "
|
||||
"calls to the scheduler.step().",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
self._step_count += 1
|
||||
|
||||
|
|
|
|||
|
|
@ -105,7 +105,8 @@ class BaseDataSparsifier(base_sparsifier.BaseSparsifier):
|
|||
if name in self.state:
|
||||
# If the named data already exists - replace
|
||||
warnings.warn(
|
||||
"Replacing existing data of the same name. - Did you mean a different name?"
|
||||
"Replacing existing data of the same name. - Did you mean a different name?",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
# reuse old config
|
||||
|
|
|
|||
|
|
@ -74,6 +74,7 @@ class StepSLScheduler(BaseDataScheduler):
|
|||
"To get the last learning rate computed by the scheduler, "
|
||||
"please use `get_last_lr()`.",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
data_groups = self.data_sparsifier.data_groups
|
||||
if (self.last_epoch == 0) or (self.last_epoch % self.step_size != 0):
|
||||
|
|
|
|||
|
|
@ -92,7 +92,8 @@ class BaseScheduler:
|
|||
if not self._get_sl_called_within_step:
|
||||
warnings.warn(
|
||||
"To get the last sparsity level computed by the scheduler, "
|
||||
"please use `get_last_sl()`."
|
||||
"please use `get_last_sl()`.",
|
||||
stacklevel=2,
|
||||
)
|
||||
raise NotImplementedError
|
||||
|
||||
|
|
@ -124,6 +125,7 @@ class BaseScheduler:
|
|||
"initialization. Please, make sure to call `sparsifier.step()` before "
|
||||
"`scheduler.step()`.",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
# Just check if there were two first scheduler.step() calls before sparsifier.step()
|
||||
|
|
@ -133,6 +135,7 @@ class BaseScheduler:
|
|||
"You have to make sure you run the sparsifier.step() BEFORE any "
|
||||
"calls to the scheduler.step().",
|
||||
UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
self._step_count += 1
|
||||
|
||||
|
|
|
|||
|
|
@ -90,7 +90,8 @@ class CubicSL(BaseScheduler):
|
|||
if not self._get_sl_called_within_step:
|
||||
warnings.warn(
|
||||
"To get the last sparsity level computed by the scheduler, "
|
||||
"please use `get_last_sl()`."
|
||||
"please use `get_last_sl()`.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return [
|
||||
self.sparsity_compute_fn(
|
||||
|
|
|
|||
|
|
@ -56,7 +56,8 @@ class LambdaSL(BaseScheduler):
|
|||
if not self._get_sl_called_within_step:
|
||||
warnings.warn(
|
||||
"To get the last sparsity level computed by the scheduler, "
|
||||
"please use `get_last_sl()`."
|
||||
"please use `get_last_sl()`.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return [
|
||||
base_sl * lmbda(self.last_epoch)
|
||||
|
|
|
|||
|
|
@ -121,7 +121,8 @@ class _InputEqualizationObserver(nn.Module):
|
|||
):
|
||||
warnings.warn(
|
||||
"Must call calculate_equalization_scale before calling calculate_scaled_minmax. "
|
||||
+ "Will not scale the next quantization observer."
|
||||
+ "Will not scale the next quantization observer.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return None, None
|
||||
|
||||
|
|
@ -226,7 +227,8 @@ def calculate_equalization_scale(
|
|||
):
|
||||
warnings.warn(
|
||||
"Must run observer before calling calculate_equalization_scale. "
|
||||
+ "Returning default equalization scale torch.tensor(1)."
|
||||
+ "Returning default equalization scale torch.tensor(1).",
|
||||
stacklevel=2,
|
||||
)
|
||||
return torch.tensor(1)
|
||||
|
||||
|
|
|
|||
|
|
@ -597,7 +597,8 @@ def _maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph) -> No
|
|||
_maybe_recursive_remove_dequantize(arg_element, node, graph)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"Unsupported node type in recursive remove dequantize: {type(arg)}"
|
||||
f"Unsupported node type in recursive remove dequantize: {type(arg)}",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -1197,7 +1198,8 @@ def convert(
|
|||
_maybe_recursive_remove_dequantize(output, return_node, model.graph)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"Unsupported node type for output_quantized_idxs: {type(output)}"
|
||||
f"Unsupported node type for output_quantized_idxs: {type(output)}",
|
||||
stacklevel=2,
|
||||
)
|
||||
elif node.op == "call_module":
|
||||
mod = _get_module(node, modules)
|
||||
|
|
|
|||
|
|
@ -1055,7 +1055,9 @@ def _maybe_insert_input_equalization_observers_for_node(
|
|||
return
|
||||
|
||||
if is_branch:
|
||||
warnings.warn(f"Cannot equalize {node} because it is part of a branch.")
|
||||
warnings.warn(
|
||||
f"Cannot equalize {node} because it is part of a branch.", stacklevel=2
|
||||
)
|
||||
return
|
||||
|
||||
new_args = []
|
||||
|
|
|
|||
|
|
@ -890,7 +890,8 @@ def _qconfig_satisfies_dtype_config_constraints(
|
|||
if backend_quant_min is not None and backend_quant_max is not None:
|
||||
if app_quant_min is None or app_quant_max is None:
|
||||
warnings.warn(
|
||||
f"QConfig {debug_string} must specify 'quant_min' and 'quant_max', ignoring {qconfig}"
|
||||
f"QConfig {debug_string} must specify 'quant_min' and 'quant_max', ignoring {qconfig}",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
elif app_quant_min < backend_quant_min or app_quant_max > backend_quant_max:
|
||||
|
|
@ -898,20 +899,23 @@ def _qconfig_satisfies_dtype_config_constraints(
|
|||
f"QConfig {debug_string} quantization range must fall within the backend's:\n"
|
||||
f"QConfig range = ({app_quant_min}, {app_quant_max}), "
|
||||
f"BackendConfig range = ({backend_quant_min}, {backend_quant_max}), "
|
||||
f"ignoring {qconfig}"
|
||||
f"ignoring {qconfig}",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
# check scale min
|
||||
if backend_scale_min is not None:
|
||||
if app_scale_min is None:
|
||||
warnings.warn(
|
||||
f"QConfig {debug_string} must specify 'eps', ignoring {qconfig}"
|
||||
f"QConfig {debug_string} must specify 'eps', ignoring {qconfig}",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
if app_scale_min < backend_scale_min:
|
||||
warnings.warn(
|
||||
f"QConfig {debug_string} eps ({app_scale_min}) must be greater than or equal to "
|
||||
f"the backend's min scale value ({backend_scale_min}), ignoring {qconfig}"
|
||||
f"the backend's min scale value ({backend_scale_min}), ignoring {qconfig}",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
# check fixed scale and zero point
|
||||
|
|
@ -935,7 +939,8 @@ def _qconfig_satisfies_dtype_config_constraints(
|
|||
) and not isinstance(activation_post_process, FixedQParamsFakeQuantize):
|
||||
warnings.warn(
|
||||
f"QConfig must specify a FixedQParamsObserver or a FixedQParamsFakeQuantize "
|
||||
f"for fixed qparams ops, ignoring {qconfig}.\n{suggestion_str}"
|
||||
f"for fixed qparams ops, ignoring {qconfig}.\n{suggestion_str}",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
if (
|
||||
|
|
@ -945,7 +950,8 @@ def _qconfig_satisfies_dtype_config_constraints(
|
|||
warnings.warn(
|
||||
f"QConfig fixed scale ({observer.scale}) and zero point ({observer.zero_point}) "
|
||||
f"do not match the backend's ({backend_scale_exact_match} and {backend_zero_point_exact_match}), "
|
||||
f"ignoring {qconfig}.\n{suggestion_str}"
|
||||
f"ignoring {qconfig}.\n{suggestion_str}",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -245,7 +245,8 @@ class UniformQuantizationObserverBase(ObserverBase):
|
|||
if reduce_range:
|
||||
warnings.warn(
|
||||
"Please use quant_min and quant_max to specify the range for observers. \
|
||||
reduce_range will be deprecated in a future release of PyTorch."
|
||||
reduce_range will be deprecated in a future release of PyTorch.",
|
||||
stacklevel=2,
|
||||
)
|
||||
self.reduce_range = reduce_range
|
||||
self.register_buffer("eps", torch.tensor([eps], **factory_kwargs))
|
||||
|
|
@ -829,7 +830,8 @@ class PerChannelMinMaxObserver(UniformQuantizationObserverBase):
|
|||
self.max_val.resize_(val.shape)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"Observer load_from_state_dict got unexpected name {name}"
|
||||
f"Observer load_from_state_dict got unexpected name {name}",
|
||||
stacklevel=2,
|
||||
)
|
||||
# For torchscript module we need to update the attributes here since we do not
|
||||
# call the `_load_from_state_dict` function defined module.py
|
||||
|
|
@ -840,7 +842,8 @@ class PerChannelMinMaxObserver(UniformQuantizationObserverBase):
|
|||
self.max_val.copy_(val)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"Observer load_from_state_dict got unexpected name {name}"
|
||||
f"Observer load_from_state_dict got unexpected name {name}",
|
||||
stacklevel=2,
|
||||
)
|
||||
elif strict:
|
||||
missing_keys.append(key)
|
||||
|
|
@ -1289,7 +1292,9 @@ class HistogramObserver(UniformQuantizationObserverBase):
|
|||
# want to make our quantization range infinite
|
||||
# and in practice those values will be clamped
|
||||
if x_min == -torch.inf or x_max == torch.inf:
|
||||
warnings.warn("torch.inf detected in input tensor, ignoring input")
|
||||
warnings.warn(
|
||||
"torch.inf detected in input tensor, ignoring input", stacklevel=2
|
||||
)
|
||||
x = x[x.abs() != torch.inf]
|
||||
if x.numel() == 0:
|
||||
return x_orig
|
||||
|
|
@ -1345,7 +1350,8 @@ class HistogramObserver(UniformQuantizationObserverBase):
|
|||
if is_uninitialized:
|
||||
warnings.warn(
|
||||
"must run observer before calling calculate_qparams.\
|
||||
Returning default scale and zero point "
|
||||
Returning default scale and zero point ",
|
||||
stacklevel=2,
|
||||
)
|
||||
return torch.tensor([1.0], device=self.min_val.device.type), torch.tensor(
|
||||
[0], device=self.min_val.device.type
|
||||
|
|
@ -1509,7 +1515,8 @@ class PlaceholderObserver(ObserverBase):
|
|||
warnings.warn(
|
||||
"Please use `is_dynamic` instead of `compute_dtype`. \
|
||||
`compute_dtype` will be deprecated in a future release \
|
||||
of PyTorch."
|
||||
of PyTorch.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
|
|
|
|||
|
|
@ -292,7 +292,8 @@ def get_default_qconfig(backend="x86", version=0):
|
|||
if not torch.cpu._is_vnni_supported():
|
||||
warnings.warn(
|
||||
"Default qconfig of oneDNN backend with reduce_range of false may have accuracy issues "
|
||||
"on CPU without Vector Neural Network Instruction support."
|
||||
"on CPU without Vector Neural Network Instruction support.",
|
||||
stacklevel=2,
|
||||
)
|
||||
qconfig = QConfig(
|
||||
activation=HistogramObserver.with_args(reduce_range=False),
|
||||
|
|
|
|||
|
|
@ -392,7 +392,8 @@ def prepare(
|
|||
warnings.warn(
|
||||
"None of the submodule got qconfig applied. Make sure you "
|
||||
"passed correct configuration through `qconfig_dict` or "
|
||||
"by assigning the `.qconfig` attribute directly on submodules"
|
||||
"by assigning the `.qconfig` attribute directly on submodules",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
_add_observer_(
|
||||
|
|
|
|||
|
|
@ -372,6 +372,7 @@ def _config_checker(method: Callable) -> Callable:
|
|||
if quantizer._need_skip_config(quantization_config):
|
||||
warnings.warn(
|
||||
f"Skip the quantization config for {name}.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return quantizer
|
||||
return method(quantizer, name, quantization_config)
|
||||
|
|
@ -464,7 +465,10 @@ class X86InductorQuantizer(Quantizer):
|
|||
current_mode.qat_state is not None
|
||||
and current_mode.qat_state != quantization_config.is_qat
|
||||
):
|
||||
warnings.warn("Mixed QAT and Non-QAT quantization config is not supported.")
|
||||
warnings.warn(
|
||||
"Mixed QAT and Non-QAT quantization config is not supported.",
|
||||
stacklevel=2,
|
||||
)
|
||||
need_skip = True
|
||||
if current_mode.dynamic_state is not None:
|
||||
input_activation_spec = quantization_config.input_activation
|
||||
|
|
@ -473,14 +477,15 @@ class X86InductorQuantizer(Quantizer):
|
|||
and current_mode.dynamic_state != input_activation_spec.is_dynamic
|
||||
):
|
||||
warnings.warn(
|
||||
"Mixed dynamic and static quantization config is not supported."
|
||||
"Mixed dynamic and static quantization config is not supported.",
|
||||
stacklevel=2,
|
||||
)
|
||||
need_skip = True
|
||||
return need_skip
|
||||
|
||||
def set_global(self, quantization_config: QuantizationConfig):
|
||||
if self._need_skip_config(quantization_config):
|
||||
warnings.warn("Skip the global quantization config.")
|
||||
warnings.warn("Skip the global quantization config.", stacklevel=2)
|
||||
return self
|
||||
self.global_config = quantization_config
|
||||
return self
|
||||
|
|
@ -489,7 +494,8 @@ class X86InductorQuantizer(Quantizer):
|
|||
if not isinstance(self.global_config, QuantizationConfig):
|
||||
warnings.warn(
|
||||
"The global_config for X86InductorQuantizer is currently invalid. \
|
||||
Please ensure that you use set_global to establish the global quantization configuration."
|
||||
Please ensure that you use set_global to establish the global quantization configuration.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return self.global_config
|
||||
|
||||
|
|
@ -508,7 +514,8 @@ class X86InductorQuantizer(Quantizer):
|
|||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"function: Unable to customize quantization config for {function_type} by X86InductorQuantizer."
|
||||
f"function: Unable to customize quantization config for {function_type} by X86InductorQuantizer.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return self
|
||||
|
||||
|
|
@ -525,7 +532,8 @@ class X86InductorQuantizer(Quantizer):
|
|||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"Module: Unable to customize quantization config for {module_type} by X86InductorQuantizer."
|
||||
f"Module: Unable to customize quantization config for {module_type} by X86InductorQuantizer.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return self
|
||||
|
||||
|
|
@ -551,7 +559,8 @@ class X86InductorQuantizer(Quantizer):
|
|||
self.operator_type_qconfig[operator_type] = quantization_config
|
||||
else:
|
||||
warnings.warn(
|
||||
f"operator: Unable to quantize {operator} by X86InductorQuantizer."
|
||||
f"operator: Unable to quantize {operator} by X86InductorQuantizer.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return self
|
||||
|
||||
|
|
@ -1317,7 +1326,8 @@ class X86InductorQuantizer(Quantizer):
|
|||
if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check):
|
||||
if quantization_config is not None:
|
||||
warnings.warn(
|
||||
f"The input of maxpool2d is not quantized, skip annotate maxpool2d with config {quantization_config}."
|
||||
f"The input of maxpool2d is not quantized, skip annotate maxpool2d with config {quantization_config}.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return
|
||||
|
||||
|
|
|
|||
|
|
@ -427,7 +427,8 @@ def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool:
|
|||
if min_val.numel() == 0 or max_val.numel() == 0:
|
||||
warnings.warn(
|
||||
"must run observer before calling calculate_qparams. "
|
||||
+ "Returning default values."
|
||||
+ "Returning default values.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
|
||||
|
|
@ -435,7 +436,8 @@ def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool:
|
|||
if min_val == float("inf") and max_val == float("-inf"):
|
||||
warnings.warn(
|
||||
"must run observer before calling calculate_qparams. "
|
||||
+ "Returning default values."
|
||||
+ "Returning default values.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
return False
|
||||
|
|
@ -806,7 +808,8 @@ def _assert_and_get_unique_device(module: torch.nn.Module) -> Any:
|
|||
"""
|
||||
if {torch.device("cpu"), torch.device("meta")} == devices:
|
||||
warnings.warn(
|
||||
"Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We Select 'cpu'."
|
||||
"Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We Select 'cpu'.",
|
||||
stacklevel=2,
|
||||
)
|
||||
devices = {torch.device("cpu")}
|
||||
""
|
||||
|
|
|
|||
|
|
@ -944,7 +944,8 @@ def _check_inputs(tupled_inputs) -> bool:
|
|||
f"Input #{idx} requires gradient and "
|
||||
"is not a double precision floating point or complex. "
|
||||
"This check will likely fail if all the inputs are "
|
||||
"not of double precision floating point or complex. "
|
||||
"not of double precision floating point or complex. ",
|
||||
stacklevel=2,
|
||||
)
|
||||
if inp.is_sparse:
|
||||
content = inp._values()
|
||||
|
|
@ -1325,7 +1326,8 @@ def _test_undefined_backward_mode(func, outputs, inputs) -> bool:
|
|||
"Backwards compatibility: New undefined gradient support checking "
|
||||
"feature is enabled by default, but it may break existing callers "
|
||||
"of this function. If this is true for you, you can call this "
|
||||
'function with "check_undefined_grad=False" to disable the feature'
|
||||
'function with "check_undefined_grad=False" to disable the feature',
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def check_undefined_grad_support(output_to_check):
|
||||
|
|
|
|||
|
|
@ -265,22 +265,24 @@ class profile:
|
|||
if _get_privateuse1_backend_name() != "privateuseone":
|
||||
VALID_DEVICE_OPTIONS.append(_get_privateuse1_backend_name())
|
||||
if self.use_device not in VALID_DEVICE_OPTIONS:
|
||||
warn(f"The {self.use_device} is not a valid device option.")
|
||||
warn(
|
||||
f"The {self.use_device} is not a valid device option.", stacklevel=2
|
||||
)
|
||||
self.use_device = None
|
||||
|
||||
if self.use_device == "cuda" and not torch.cuda.is_available():
|
||||
warn("CUDA is not available, disabling CUDA profiling")
|
||||
warn("CUDA is not available, disabling CUDA profiling", stacklevel=2)
|
||||
self.use_cuda = False
|
||||
self.use_device = None
|
||||
|
||||
if self.use_device == "xpu" and not torch.xpu.is_available():
|
||||
warn("XPU is not available, disabling XPU profiling")
|
||||
warn("XPU is not available, disabling XPU profiling", stacklevel=2)
|
||||
self.use_device = None
|
||||
|
||||
if self.use_device == "hpu" and not (
|
||||
hasattr(torch, "hpu") and torch.hpu.is_available()
|
||||
):
|
||||
warn("HPU is not available, disabling HPU profiling")
|
||||
warn("HPU is not available, disabling HPU profiling", stacklevel=2)
|
||||
self.use_device = None
|
||||
|
||||
self.kineto_activities = set()
|
||||
|
|
@ -1224,7 +1226,8 @@ class KinetoStepTracker:
|
|||
if delta > 1:
|
||||
warn(
|
||||
"Profiler step count has increased more than 1 - "
|
||||
f"current_step = {cls._current_step} step dict = {cls._step_dict}"
|
||||
f"current_step = {cls._current_step} step dict = {cls._step_dict}",
|
||||
stacklevel=2,
|
||||
)
|
||||
for _ in range(delta):
|
||||
_kineto_step()
|
||||
|
|
|
|||
|
|
@ -118,7 +118,8 @@ def is_acceptable(tensor):
|
|||
if not is_available():
|
||||
warnings.warn(
|
||||
"PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild "
|
||||
"PyTorch making sure the library is visible to the build system."
|
||||
"PyTorch making sure the library is visible to the build system.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
if not _init():
|
||||
|
|
@ -127,7 +128,8 @@ def is_acceptable(tensor):
|
|||
libpath={"darwin": "DYLD_LIBRARY_PATH", "win32": "PATH"}.get(
|
||||
sys.platform, "LD_LIBRARY_PATH"
|
||||
)
|
||||
)
|
||||
),
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -293,7 +293,8 @@ def _check_capability():
|
|||
min_arch % 10,
|
||||
max_arch // 10,
|
||||
max_arch % 10,
|
||||
)
|
||||
),
|
||||
stacklevel=2,
|
||||
)
|
||||
matched_arches = ""
|
||||
for arch, arch_info in CUDA_ARCHES_SUPPORTED.items():
|
||||
|
|
@ -303,7 +304,9 @@ def _check_capability():
|
|||
):
|
||||
matched_arches += f" {arch}"
|
||||
if matched_arches != "":
|
||||
warnings.warn(matched_cuda_warn.format(matched_arches))
|
||||
warnings.warn(
|
||||
matched_cuda_warn.format(matched_arches), stacklevel=2
|
||||
)
|
||||
|
||||
|
||||
def _check_cubins():
|
||||
|
|
@ -328,7 +331,8 @@ If you want to use the {} GPU with PyTorch, please check the instructions at htt
|
|||
warnings.warn(
|
||||
incompatible_device_warn.format(
|
||||
device_name, capability, " ".join(arch_list), device_name
|
||||
)
|
||||
),
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -818,7 +822,9 @@ def _raw_device_count_amdsmi() -> int:
|
|||
try:
|
||||
amdsmi.amdsmi_init()
|
||||
except amdsmi.AmdSmiException as e:
|
||||
warnings.warn(f"Can't initialize amdsmi - Error code: {e.err_code}")
|
||||
warnings.warn(
|
||||
f"Can't initialize amdsmi - Error code: {e.err_code}", stacklevel=2
|
||||
)
|
||||
return -1
|
||||
socket_handles = amdsmi.amdsmi_get_processor_handles()
|
||||
return len(socket_handles)
|
||||
|
|
@ -831,12 +837,12 @@ def _raw_device_count_nvml() -> int:
|
|||
nvml_h = CDLL("libnvidia-ml.so.1")
|
||||
rc = nvml_h.nvmlInit()
|
||||
if rc != 0:
|
||||
warnings.warn("Can't initialize NVML")
|
||||
warnings.warn("Can't initialize NVML", stacklevel=2)
|
||||
return -1
|
||||
dev_count = c_int(-1)
|
||||
rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
|
||||
if rc != 0:
|
||||
warnings.warn("Can't get nvml device count")
|
||||
warnings.warn("Can't get nvml device count", stacklevel=2)
|
||||
return -1
|
||||
del nvml_h
|
||||
return dev_count.value
|
||||
|
|
@ -850,27 +856,27 @@ def _raw_device_uuid_amdsmi() -> Optional[list[str]]:
|
|||
try:
|
||||
amdsmi.amdsmi_init()
|
||||
except amdsmi.AmdSmiException:
|
||||
warnings.warn("Can't initialize amdsmi")
|
||||
warnings.warn("Can't initialize amdsmi", stacklevel=2)
|
||||
return None
|
||||
try:
|
||||
socket_handles = amdsmi.amdsmi_get_processor_handles()
|
||||
dev_count = len(socket_handles)
|
||||
except amdsmi.AmdSmiException:
|
||||
warnings.warn("Can't get amdsmi device count")
|
||||
warnings.warn("Can't get amdsmi device count", stacklevel=2)
|
||||
return None
|
||||
uuids: list[str] = []
|
||||
for idx in range(dev_count):
|
||||
try:
|
||||
handler = amdsmi.amdsmi_get_processor_handles()[idx]
|
||||
except amdsmi.AmdSmiException:
|
||||
warnings.warn("Cannot get amd device handler")
|
||||
warnings.warn("Cannot get amd device handler", stacklevel=2)
|
||||
return None
|
||||
try:
|
||||
uuid = amdsmi.amdsmi_get_gpu_asic_info(handler)["asic_serial"][
|
||||
2:
|
||||
] # Removes 0x prefix from serial
|
||||
except amdsmi.AmdSmiException:
|
||||
warnings.warn("Cannot get uuid for amd device")
|
||||
warnings.warn("Cannot get uuid for amd device", stacklevel=2)
|
||||
return None
|
||||
uuids.append(
|
||||
str(uuid).lower()
|
||||
|
|
@ -885,25 +891,25 @@ def _raw_device_uuid_nvml() -> Optional[list[str]]:
|
|||
nvml_h = CDLL("libnvidia-ml.so.1")
|
||||
rc = nvml_h.nvmlInit()
|
||||
if rc != 0:
|
||||
warnings.warn("Can't initialize NVML")
|
||||
warnings.warn("Can't initialize NVML", stacklevel=2)
|
||||
return None
|
||||
dev_count = c_int(-1)
|
||||
rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
|
||||
if rc != 0:
|
||||
warnings.warn("Can't get nvml device count")
|
||||
warnings.warn("Can't get nvml device count", stacklevel=2)
|
||||
return None
|
||||
uuids: list[str] = []
|
||||
for idx in range(dev_count.value):
|
||||
dev_id = c_void_p()
|
||||
rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
|
||||
if rc != 0:
|
||||
warnings.warn("Can't get device handle")
|
||||
warnings.warn("Can't get device handle", stacklevel=2)
|
||||
return None
|
||||
buf_len = 96
|
||||
buf = create_string_buffer(buf_len)
|
||||
rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
|
||||
if rc != 0:
|
||||
warnings.warn("Can't get device UUID")
|
||||
warnings.warn("Can't get device UUID", stacklevel=2)
|
||||
return None
|
||||
uuids.append(buf.raw.decode("ascii").strip("\0"))
|
||||
del nvml_h
|
||||
|
|
|
|||
|
|
@ -492,6 +492,7 @@ def reset_max_memory_allocated(device: "Device" = None) -> None:
|
|||
"torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, "
|
||||
"which resets /all/ peak memory stats.",
|
||||
FutureWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return reset_peak_memory_stats(device=device)
|
||||
|
||||
|
|
@ -518,6 +519,7 @@ def reset_max_memory_cached(device: "Device" = None) -> None:
|
|||
"torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
|
||||
"which resets /all/ peak memory stats.",
|
||||
FutureWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return reset_peak_memory_stats(device=device)
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ SUM = 0 # ncclRedOp_t
|
|||
|
||||
def is_available(tensors):
|
||||
if not hasattr(torch._C, "_nccl_all_reduce"):
|
||||
warnings.warn("PyTorch is not compiled with NCCL support")
|
||||
warnings.warn("PyTorch is not compiled with NCCL support", stacklevel=2)
|
||||
return False
|
||||
|
||||
devices = set()
|
||||
|
|
|
|||
|
|
@ -626,7 +626,8 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None:
|
|||
else:
|
||||
warnings.warn(
|
||||
"Offline tuning is not supported for this GEMM. Use online tuning instead. "
|
||||
+ f"Skipped tuning for: {untuned_gemm[1]}"
|
||||
+ f"Skipped tuning for: {untuned_gemm[1]}",
|
||||
stacklevel=2,
|
||||
)
|
||||
return
|
||||
|
||||
|
|
@ -644,7 +645,8 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None:
|
|||
if m == 1 or n == 1 or k == 1:
|
||||
warnings.warn(
|
||||
"Offline tuning is not support for this GEMM. Use online tuning instead. "
|
||||
+ f"Skipped tuning for: {untuned_gemm[1]}"
|
||||
+ f"Skipped tuning for: {untuned_gemm[1]}",
|
||||
stacklevel=2,
|
||||
)
|
||||
return
|
||||
|
||||
|
|
@ -747,7 +749,7 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None:
|
|||
matA = matA.t()
|
||||
torch.nn.functional.linear(X, matA, bias)
|
||||
else:
|
||||
warnings.warn(f"error: unknown op {op_sig}")
|
||||
warnings.warn(f"error: unknown op {op_sig}", stacklevel=2)
|
||||
|
||||
|
||||
def _check_tuning_assertions() -> None:
|
||||
|
|
@ -756,7 +758,7 @@ def _check_tuning_assertions() -> None:
|
|||
"""
|
||||
|
||||
if is_enabled() is False:
|
||||
warnings.warn("TunableOp was disabled. Trying to enable now.")
|
||||
warnings.warn("TunableOp was disabled. Trying to enable now.", stacklevel=2)
|
||||
enable(True)
|
||||
assert is_enabled() is True
|
||||
assert tuning_is_enabled() is True
|
||||
|
|
|
|||
|
|
@ -23,7 +23,8 @@ try:
|
|||
from torch.compiler import is_dynamo_compiling as is_torchdynamo_compiling
|
||||
except Exception:
|
||||
warnings.warn(
|
||||
"Unable to import torchdynamo util `is_torchdynamo_compiling`, so won't support torchdynamo correctly"
|
||||
"Unable to import torchdynamo util `is_torchdynamo_compiling`, so won't support torchdynamo correctly",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def is_torchdynamo_compiling(): # type: ignore[misc]
|
||||
|
|
|
|||
|
|
@ -470,7 +470,8 @@ class ShardedTensor(ShardedTensorBase):
|
|||
src = shard.tensor.flatten()
|
||||
if src.nelement() == 0:
|
||||
warnings.warn(
|
||||
"Gathering a tensor with zero elements on rank " + str(rank)
|
||||
"Gathering a tensor with zero elements on rank " + str(rank),
|
||||
stacklevel=2,
|
||||
)
|
||||
continue
|
||||
shard_offset = shard_placement[shard.metadata][1]
|
||||
|
|
@ -671,7 +672,8 @@ class ShardedTensor(ShardedTensorBase):
|
|||
if device_to.index != current_idx:
|
||||
warnings.warn(
|
||||
"ShardedTensor.to only move tensor to its current device"
|
||||
"If you want to put to different device, use `reshard` instead."
|
||||
"If you want to put to different device, use `reshard` instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
device_to = torch.device(current_idx)
|
||||
|
||||
|
|
|
|||
|
|
@ -182,7 +182,8 @@ class ModTracker:
|
|||
warnings.formatwarning = custom_formatwarning
|
||||
warnings.warn(
|
||||
"The module hierarchy tracking maybe be messed up."
|
||||
" Please file a bug to PyTorch, if it is the case."
|
||||
" Please file a bug to PyTorch, if it is the case.",
|
||||
stacklevel=2,
|
||||
)
|
||||
if name not in self.parents:
|
||||
self._active_module_cnt[name] = 1
|
||||
|
|
|
|||
|
|
@ -257,7 +257,8 @@ class Join:
|
|||
f"{self._rank} has at least {WARN_THRESHOLD} "
|
||||
f"fewer inputs than other currently-active ranks. "
|
||||
"This level of skew could lead to performance "
|
||||
"degradation during training."
|
||||
"degradation during training.",
|
||||
stacklevel=2,
|
||||
)
|
||||
# Shadow the all-reduce in non-joined processes
|
||||
num_nonjoined_procs = self._get_num_nonjoined_procs()
|
||||
|
|
|
|||
|
|
@ -101,7 +101,8 @@ class PeriodicModelAverager(ModelAverager):
|
|||
"When period is 1, no need to use model averaging because the communication cost "
|
||||
"of all-reducing parameters will be no less than the cost of all-reducing gradients "
|
||||
"by DistributedDataParallel in the backward pass. Therefore, only "
|
||||
"DistributedDataParallel should be used for this case."
|
||||
"DistributedDataParallel should be used for this case.",
|
||||
stacklevel=2,
|
||||
)
|
||||
self.period = period
|
||||
|
||||
|
|
|
|||
|
|
@ -114,7 +114,8 @@ class HierarchicalModelAverager(averagers.ModelAverager):
|
|||
"no need to use model averaging because the communication cost "
|
||||
"of all-reducing parameters will be no less than the cost of all-reducing gradients "
|
||||
"by DistributedDataParallel in the backward pass. Therefore, only "
|
||||
"DistributedDataParallel should be used for this case."
|
||||
"DistributedDataParallel should be used for this case.",
|
||||
stacklevel=2,
|
||||
)
|
||||
overall_group_size = dist.get_world_size(group=self.process_group)
|
||||
if list(period_group_size_dict.values())[-1] != overall_group_size:
|
||||
|
|
|
|||
|
|
@ -660,7 +660,8 @@ class _FileSystemWriter(StorageWriter):
|
|||
warnings.warn(
|
||||
f"Detected an existing checkpoint in {self.path}, overwriting since {self.overwrite=}."
|
||||
" Past version 2.5 of PyTorch, `overwrite` will default to False. Set this variable to True to"
|
||||
" maintain this functionality or False to raise when an existing checkpoint is found."
|
||||
" maintain this functionality or False to raise when an existing checkpoint is found.",
|
||||
stacklevel=2,
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(f"Checkpoint already exists and {self.overwrite=}.")
|
||||
|
|
|
|||
|
|
@ -290,6 +290,7 @@ def _verify_options(
|
|||
"will be removed in 2.5. This feature can be achieved by manually "
|
||||
"filtering out the state_dict returned from get_state_dict.",
|
||||
FutureWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
if optim_only and not optims:
|
||||
raise RuntimeError(
|
||||
|
|
@ -1234,6 +1235,7 @@ def _unflatten_model_state_dict(
|
|||
"feature, please preprocessing the model_state_dict to achieve the "
|
||||
"same functionality.",
|
||||
FutureWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
cast_state_dict = cast(dict[nn.Module, dict[str, ValueType]], state_dict)
|
||||
new_state_dict: dict[str, ValueType] = {}
|
||||
|
|
|
|||
|
|
@ -158,7 +158,8 @@ def load(
|
|||
no_dist = no_dist or (not dist.is_available()) or (not dist.is_initialized())
|
||||
if no_dist:
|
||||
warnings.warn(
|
||||
"torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to load in a single process."
|
||||
"torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to load in a single process.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
with _profile():
|
||||
|
|
@ -365,7 +366,8 @@ def _load_state_dict_from_keys(
|
|||
no_dist = not (dist.is_available() and dist.is_initialized())
|
||||
if no_dist:
|
||||
warnings.warn(
|
||||
"torch.distributed is unavailable or uninitialized, assuming the intent is to load in a single process."
|
||||
"torch.distributed is unavailable or uninitialized, assuming the intent is to load in a single process.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
storage_reader = cast(
|
||||
|
|
|
|||
|
|
@ -182,7 +182,8 @@ def save(
|
|||
no_dist = no_dist or (not dist.is_available()) or (not dist.is_initialized())
|
||||
if no_dist:
|
||||
warnings.warn(
|
||||
"torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to save in a single process."
|
||||
"torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to save in a single process.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
with _profile():
|
||||
|
|
@ -414,7 +415,8 @@ def _save_state_dict(
|
|||
warnings.warn(
|
||||
"The function definition for SavePlanner.set_up_planner has been updated"
|
||||
" to include the storage_meta argument. Please update your implementation"
|
||||
" to include this parameter."
|
||||
" to include this parameter.",
|
||||
stacklevel=2,
|
||||
)
|
||||
planner.set_up_planner(state_dict, distW.is_coordinator) # type: ignore[call-arg, arg-type]
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -461,7 +461,8 @@ def _api_bc_check(func):
|
|||
if len(args) == 2:
|
||||
warnings.warn(
|
||||
f"The argument order of {func.__name__} has been changed. "
|
||||
"Please check the document to avoid future breakages."
|
||||
"Please check the document to avoid future breakages.",
|
||||
stacklevel=2,
|
||||
)
|
||||
sig = inspect.signature(func)
|
||||
kwonlyargs = [
|
||||
|
|
|
|||
|
|
@ -85,7 +85,8 @@ else:
|
|||
# We keep this function for backward compatibility.
|
||||
warnings.warn(
|
||||
"This get_root_mesh API will be deprecated soon."
|
||||
"Please use `get_root_mesh` inside DeviceMesh instead."
|
||||
"Please use `get_root_mesh` inside DeviceMesh instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
if not device_mesh:
|
||||
return device_mesh
|
||||
|
|
@ -108,7 +109,8 @@ else:
|
|||
) -> list["DeviceMesh"]:
|
||||
warnings.warn(
|
||||
"This _get_all_submeshes API will be deprecated soon."
|
||||
"Please use `_get_all_submeshes` inside DeviceMesh instead."
|
||||
"Please use `_get_all_submeshes` inside DeviceMesh instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return device_mesh._get_all_submeshes(mesh_dim_name)
|
||||
|
||||
|
|
@ -329,7 +331,8 @@ else:
|
|||
"It is recommended to set the current device for the process BEFORE the DeviceMesh initialization so that "
|
||||
"the underlying communicator (i.e. NCCL) can be initialized properly. "
|
||||
"Given that the current process has no default device selected, DeviceMesh will use a heuristic to set the "
|
||||
"device_id via `global_rank % num_devices_per_host`, assuming homogeneous hardware cluster. "
|
||||
"device_id via `global_rank % num_devices_per_host`, assuming homogeneous hardware cluster. ",
|
||||
stacklevel=2,
|
||||
)
|
||||
# heuristic to set the current cuda/cuda-like device base on num of gpu devices available in each host
|
||||
# NOTE: This device selection would only work for homogeneous hardware.
|
||||
|
|
@ -766,7 +769,8 @@ else:
|
|||
warnings.warn(
|
||||
"You are attempting to slice a submesh from another submesh. While we support this operation, "
|
||||
"it is users' responsibility to ensure that the submesh is consistently sliced across all ranks. "
|
||||
"If not, this may result in some ranks receiving the submesh while others encounter errors."
|
||||
"If not, this may result in some ranks receiving the submesh while others encounter errors.",
|
||||
stacklevel=2,
|
||||
)
|
||||
slice_from_root = False
|
||||
|
||||
|
|
@ -803,7 +807,8 @@ else:
|
|||
elif name in flatten_name_to_root_layout:
|
||||
warnings.warn(
|
||||
"Slicing a flattened dim from root mesh will be deprecated in PT 2.11. "
|
||||
"Users need to bookkeep the flattened mesh directly. "
|
||||
"Users need to bookkeep the flattened mesh directly. ",
|
||||
stacklevel=2,
|
||||
)
|
||||
layout_sliced.append(flatten_name_to_root_layout[name])
|
||||
|
||||
|
|
|
|||
|
|
@ -352,7 +352,8 @@ class Backend(str): # noqa: SLOT000
|
|||
warnings.warn(
|
||||
f"Device capability of {name} unspecified, assuming `cpu` and "
|
||||
"`cuda` or `xpu`. Please specify it via the `devices` argument of "
|
||||
"`register_backend`."
|
||||
"`register_backend`.",
|
||||
stacklevel=2,
|
||||
)
|
||||
Backend.backend_capability[name.lower()] = (
|
||||
["cpu", "cuda", "xpu"] if torch.xpu.is_available() else ["cpu", "cuda"]
|
||||
|
|
@ -427,7 +428,8 @@ class BackendConfig:
|
|||
warnings.warn(
|
||||
f"Device capability of {backend} unknown, assuming `cpu` and "
|
||||
"`cuda`. You can specify it in `device:backend` format in "
|
||||
"`init_process_group` call."
|
||||
"`init_process_group` call.",
|
||||
stacklevel=2,
|
||||
)
|
||||
backend_val = Backend(backend)
|
||||
self.device_backend_map = {
|
||||
|
|
@ -751,7 +753,8 @@ def _get_default_timeout(backend: Backend) -> timedelta:
|
|||
# TODO moco benchmark on CPU initializes pgnccl backend today, triggered this assert in CI before it was
|
||||
# changed to be a warning. We should fix the moco model.
|
||||
warnings.warn(
|
||||
"Attempted to get default timeout for nccl backend, but NCCL support is not compiled"
|
||||
"Attempted to get default timeout for nccl backend, but NCCL support is not compiled",
|
||||
stacklevel=2,
|
||||
)
|
||||
return default_pg_timeout
|
||||
return default_pg_nccl_timeout
|
||||
|
|
@ -802,6 +805,7 @@ def _get_object_coll_device(group: Optional[ProcessGroup] = None) -> str:
|
|||
f"You are using a Backend {type(group)} as a ProcessGroup. "
|
||||
"This usage is deprecated since PyTorch 2.0. Please use a public API "
|
||||
"of PyTorch Distributed instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
# Provide backward compatibility to cases where `group` passed in is
|
||||
# actually a Backend (like `ProcessGroupGloo`) rather than a
|
||||
|
|
@ -868,7 +872,8 @@ def _get_pg_default_device(group: Optional[ProcessGroup] = None) -> torch.device
|
|||
"backward-compatiblity reason. If you need to find a device for object "
|
||||
"collectives, please use `_get_object_coll_device`. If you need to query "
|
||||
"the device types supported by group, please use "
|
||||
"`_device_capability(group)`. "
|
||||
"`_device_capability(group)`. ",
|
||||
stacklevel=2,
|
||||
)
|
||||
group = group or _get_default_group()
|
||||
|
||||
|
|
@ -910,7 +915,8 @@ def _get_pg_default_device(group: Optional[ProcessGroup] = None) -> torch.device
|
|||
warnings.warn(
|
||||
"Multiple backends are registered with this ProcessGroup. We cannot "
|
||||
f"determine which one is the default. Returning {rv}. "
|
||||
"Please consider using other APIs."
|
||||
"Please consider using other APIs.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return rv
|
||||
|
||||
|
|
@ -1010,7 +1016,8 @@ def _warn_not_in_group(op_name) -> None:
|
|||
global_rank = -1 if GroupMember.WORLD is None else GroupMember.WORLD.rank()
|
||||
warnings.warn(
|
||||
f"Running {op_name} on global rank {global_rank} which does not "
|
||||
"belong to the given group."
|
||||
"belong to the given group.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -1557,7 +1564,9 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) ->
|
|||
elif is_gloo_available() and isinstance(backend, ProcessGroupGloo):
|
||||
backends.add(backend) # type: ignore[arg-type]
|
||||
if len(backends) == 0:
|
||||
warnings.warn("Set timeout is now only supported for either nccl or gloo.")
|
||||
warnings.warn(
|
||||
"Set timeout is now only supported for either nccl or gloo.", stacklevel=2
|
||||
)
|
||||
for backend in backends:
|
||||
backend._set_default_timeout(timeout)
|
||||
|
||||
|
|
@ -1758,7 +1767,8 @@ def init_process_group(
|
|||
warnings.warn(
|
||||
f"For MPI backend, world_size ({world_size}) and rank ({rank}) "
|
||||
"are ignored since they are assigned by the "
|
||||
"MPI runtime."
|
||||
"MPI runtime.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
default_pg, _ = _new_process_group_helper(
|
||||
|
|
@ -2038,7 +2048,8 @@ def _new_process_group_helper(
|
|||
if backend_options._timeout != timeout:
|
||||
warnings.warn(
|
||||
"backend_options._timeout was specified, "
|
||||
"but timeout kwarg has a default value that will always override it. "
|
||||
"but timeout kwarg has a default value that will always override it. ",
|
||||
stacklevel=2,
|
||||
)
|
||||
else:
|
||||
# default backend_options for NCCL
|
||||
|
|
@ -2259,7 +2270,8 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
|
|||
if pg in _world.pg_coalesce_state.keys():
|
||||
warnings.warn(
|
||||
"Some coalesced collectives haven't been launched when "
|
||||
"ProcessGroup is destroyed. They will be cleaned."
|
||||
"ProcessGroup is destroyed. They will be cleaned.",
|
||||
stacklevel=2,
|
||||
)
|
||||
del _world.pg_coalesce_state[pg]
|
||||
|
||||
|
|
@ -2349,7 +2361,8 @@ def _abort_process_group(group: Optional[ProcessGroup] = None):
|
|||
if pg in _world.pg_coalesce_state.keys():
|
||||
warnings.warn(
|
||||
"Some coalesced collectives haven't been launched when "
|
||||
"ProcessGroup is aborted. They will be cleaned."
|
||||
"ProcessGroup is aborted. They will be cleaned.",
|
||||
stacklevel=2,
|
||||
)
|
||||
del _world.pg_coalesce_state[pg]
|
||||
|
||||
|
|
@ -4919,7 +4932,8 @@ def barrier(
|
|||
if group.rank() == 0:
|
||||
warnings.warn( # warn only once
|
||||
"barrier(): using the device under current context. "
|
||||
"You can specify `device_id` in `init_process_group` to mute this warning."
|
||||
"You can specify `device_id` in `init_process_group` to mute this warning.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
work = group.barrier(opts=opts)
|
||||
|
|
@ -5001,6 +5015,7 @@ def monitored_barrier(
|
|||
warnings.warn(
|
||||
"Please specify timeout arg as a timedelta. "
|
||||
f"Converting current value of {timeout} assuming it represents seconds",
|
||||
stacklevel=2,
|
||||
)
|
||||
timeout = timedelta(seconds=timeout)
|
||||
|
||||
|
|
|
|||
|
|
@ -106,6 +106,7 @@ class WorkerSpec:
|
|||
warnings.warn(
|
||||
"WorkerSpec.fn will be deprecated,"
|
||||
" please use WorkerSpec.entrypoint instead",
|
||||
stacklevel=2,
|
||||
category=DeprecationWarning,
|
||||
)
|
||||
self.entrypoint = self.fn
|
||||
|
|
|
|||
|
|
@ -52,7 +52,9 @@ class ErrorHandler:
|
|||
try:
|
||||
faulthandler.enable(all_threads=True)
|
||||
except Exception as e:
|
||||
warnings.warn(f"Unable to enable fault handler. {type(e).__name__}: {e}")
|
||||
warnings.warn(
|
||||
f"Unable to enable fault handler. {type(e).__name__}: {e}", stacklevel=2
|
||||
)
|
||||
|
||||
def _write_error_file(self, file_path: str, error_msg: str) -> None:
|
||||
"""Write error message to the file."""
|
||||
|
|
@ -60,7 +62,9 @@ class ErrorHandler:
|
|||
with open(file_path, "w") as fp:
|
||||
fp.write(error_msg)
|
||||
except Exception as e:
|
||||
warnings.warn(f"Unable to write error to file. {type(e).__name__}: {e}")
|
||||
warnings.warn(
|
||||
f"Unable to write error to file. {type(e).__name__}: {e}", stacklevel=2
|
||||
)
|
||||
|
||||
def record_exception(self, e: BaseException) -> None:
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -65,5 +65,6 @@ def _derive_module_name(depth: int = 1) -> Optional[str]:
|
|||
warnings.warn(
|
||||
f"Error deriving logger module name, using <None>. Exception: {e}",
|
||||
RuntimeWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -336,7 +336,8 @@ def _get_param_to_fqns(
|
|||
warnings.warn(
|
||||
"FlatParameter is being traversed more than once. "
|
||||
"This case should only happen when using "
|
||||
"DistributedModelParallel with FullyShardedDataParallel."
|
||||
"DistributedModelParallel with FullyShardedDataParallel.",
|
||||
stacklevel=2,
|
||||
)
|
||||
param_to_fqns[param] = global_fqns
|
||||
elif not dedup_shared_params:
|
||||
|
|
|
|||
|
|
@ -299,7 +299,8 @@ class _ExecOrderData:
|
|||
warnings.warn(
|
||||
"Forward order differs from that of the first iteration "
|
||||
f"on rank {self.rank}. Collectives are unchecked and may "
|
||||
f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}"
|
||||
f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}",
|
||||
stacklevel=2,
|
||||
)
|
||||
self.warn_status = _ExecOrderWarnStatus.WARNING
|
||||
self.current_order_index += 1
|
||||
|
|
|
|||
|
|
@ -1585,7 +1585,8 @@ class FlatParamHandle:
|
|||
warnings.warn(
|
||||
f"[Rank {self.rank}] Only some but not all ranks have a "
|
||||
"`None` `FlatParameter` gradient, so FSDP is using zeros to "
|
||||
"approximate those ranks' sharded gradients being `None`"
|
||||
"approximate those ranks' sharded gradients being `None`",
|
||||
stacklevel=2,
|
||||
)
|
||||
flat_param._saved_grad_shard = None # type: ignore[assignment]
|
||||
sharded_grad = torch.zeros(flat_param._sharded_size, device=self.device) # type: ignore[attr-defined]
|
||||
|
|
@ -2434,7 +2435,8 @@ class FlatParamHandle:
|
|||
f"[Rank {rank}] {'Parameter' if is_param else 'Gradient'} needs "
|
||||
f"writeback in {self._training_state}\n"
|
||||
f"expected shape={expected_shape} shape={src_shape} "
|
||||
f"expected device={dst_tensor.device} device={src_device}"
|
||||
f"expected device={dst_tensor.device} device={src_device}",
|
||||
stacklevel=2,
|
||||
)
|
||||
if src_tensor is not None and src_tensor.shape != expected_shape:
|
||||
# NOTE: Gradient shape mismatch is not possible in practice since
|
||||
|
|
|
|||
|
|
@ -431,7 +431,8 @@ def _init_core_state(
|
|||
warnings.warn(
|
||||
"FSDP is switching to use `NO_SHARD` instead of "
|
||||
f"{sharding_strategy or ShardingStrategy.FULL_SHARD} since "
|
||||
"the world size is 1."
|
||||
"the world size is 1.",
|
||||
stacklevel=2,
|
||||
)
|
||||
sharding_strategy = ShardingStrategy.NO_SHARD
|
||||
elif sharding_strategy == ShardingStrategy.NO_SHARD:
|
||||
|
|
@ -704,7 +705,8 @@ def _get_ignored_modules(
|
|||
warnings.warn(
|
||||
"Trying to ignore the top-level module passed into the FSDP "
|
||||
"constructor itself will result in all parameters being "
|
||||
f"ignored and is not well-supported: {module}"
|
||||
f"ignored and is not well-supported: {module}",
|
||||
stacklevel=2,
|
||||
)
|
||||
# Include nested FSDP modules' ignored modules
|
||||
for submodule in root_module.modules():
|
||||
|
|
@ -847,7 +849,8 @@ def _get_device_from_device_id(
|
|||
f"FSDP will use the current device {device_handle.current_device()}. "
|
||||
f"If this is incorrect, please explicitly call `torch.{device.type}.set_device()` "
|
||||
"before FSDP initialization or pass in the explicit device "
|
||||
"index as the `device_id` argument."
|
||||
"index as the `device_id` argument.",
|
||||
stacklevel=2,
|
||||
)
|
||||
device = torch.device(device_handle.current_device())
|
||||
return device
|
||||
|
|
@ -929,7 +932,8 @@ def _materialize_meta_module(
|
|||
warnings.warn(
|
||||
"Unable to call `reset_parameters()` for module on meta "
|
||||
f"device with error {str(e)}. Please ensure that your module of"
|
||||
f"type {type(module)} implements a `reset_parameters()` method." # type: ignore[possibly-undefined]
|
||||
f"type {type(module)} implements a `reset_parameters()` method.",
|
||||
stacklevel=2, # type: ignore[possibly-undefined]
|
||||
)
|
||||
raise e
|
||||
|
||||
|
|
@ -1049,7 +1053,8 @@ def _warn_cpu_init():
|
|||
"recommend passing in the `device_id` argument for FSDP to move "
|
||||
"`module` to GPU for the sharding initialization. `module` must also "
|
||||
"be on GPU device to work with the `sync_module_states=True` flag "
|
||||
"since that requires GPU communication."
|
||||
"since that requires GPU communication.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -506,7 +506,8 @@ def _flatten_optim_state_dict(
|
|||
flat_osd_state[key] = copy.deepcopy(state)
|
||||
else:
|
||||
warnings.warn(
|
||||
f"optim_state[{key}] is not on rank{fsdp_state.rank}."
|
||||
f"optim_state[{key}] is not on rank{fsdp_state.rank}.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
else:
|
||||
|
|
@ -2051,7 +2052,8 @@ def _optim_state_dict(
|
|||
"most cases, this is a user-defined state that is not "
|
||||
"associated with any particular parameter. Another possible "
|
||||
"case is this state is managed by TorchRec. Otherwise, there may "
|
||||
" be a mismatched assumption of optim_state_dict of this mode."
|
||||
" be a mismatched assumption of optim_state_dict of this mode.",
|
||||
stacklevel=2,
|
||||
)
|
||||
fsdp_osd_state[key] = value
|
||||
|
||||
|
|
|
|||
|
|
@ -337,7 +337,8 @@ def _full_post_state_dict_hook(
|
|||
"This may mean that this state_dict entry could point to invalid "
|
||||
"memory regions after returning from state_dict() call if this "
|
||||
"parameter is managed by FSDP. Please check clone "
|
||||
f"implementation of {fqn}. Error: {str(e)}"
|
||||
f"implementation of {fqn}. Error: {str(e)}",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
return _common_unshard_post_state_dict_hook(
|
||||
|
|
@ -708,7 +709,8 @@ def _post_state_dict_hook(
|
|||
context = _replace_with_full_state_dict_type(fsdp_state)
|
||||
warnings.warn(
|
||||
"When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will "
|
||||
"be returned."
|
||||
"be returned.",
|
||||
stacklevel=2,
|
||||
)
|
||||
else:
|
||||
context = contextlib.nullcontext()
|
||||
|
|
@ -770,7 +772,8 @@ def _pre_state_dict_hook(
|
|||
context = _replace_with_full_state_dict_type(fsdp_state)
|
||||
warnings.warn(
|
||||
"When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will "
|
||||
"be returned."
|
||||
"be returned.",
|
||||
stacklevel=2,
|
||||
)
|
||||
else:
|
||||
_set_use_dtensor(fsdp_state)
|
||||
|
|
@ -824,7 +827,8 @@ def _pre_load_state_dict_hook(
|
|||
context = _replace_with_full_state_dict_type(fsdp_state)
|
||||
warnings.warn(
|
||||
"When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will"
|
||||
"be returned."
|
||||
"be returned.",
|
||||
stacklevel=2,
|
||||
)
|
||||
else:
|
||||
_set_use_dtensor(fsdp_state)
|
||||
|
|
@ -861,7 +865,8 @@ def _post_load_state_dict_hook(
|
|||
context = _replace_with_full_state_dict_type(fsdp_state)
|
||||
warnings.warn(
|
||||
"When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will"
|
||||
"be returned."
|
||||
"be returned.",
|
||||
stacklevel=2,
|
||||
)
|
||||
else:
|
||||
context = contextlib.nullcontext()
|
||||
|
|
|
|||
|
|
@ -153,7 +153,8 @@ def _validate_unshard_params_args(
|
|||
"offload_to_cpu=True and rank0_only=False may result in the"
|
||||
"unsharded parameters being redundantly copied to CPU memory for "
|
||||
"GPUs sharing the same CPU memory, which risks CPU OOM. We "
|
||||
"recommend using offload_to_cpu=True with rank0_only=True."
|
||||
"recommend using offload_to_cpu=True with rank0_only=True.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -120,7 +120,8 @@ def _warn_on_overridden_mixed_precision(
|
|||
"Both mixed precision and an auto_wrap_policy were specified to FSDP, "
|
||||
f"where the wrapped module has submodules of type:\n{overridden_module_classes}\n"
|
||||
"These modules will be wrapped as separate FSDP instacnes with mixed "
|
||||
"precision disabled."
|
||||
"precision disabled.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -172,7 +173,7 @@ def _validate_frozen_params(
|
|||
f"The following parameters have requires_grad=False:\n{frozen_param_fqns}"
|
||||
)
|
||||
if use_orig_params:
|
||||
warnings.warn(msg)
|
||||
warnings.warn(msg, stacklevel=2)
|
||||
else:
|
||||
raise ValueError(msg)
|
||||
|
||||
|
|
|
|||
|
|
@ -680,6 +680,7 @@ class FullyShardedDataParallel(nn.Module, _FSDPState):
|
|||
"#torch.distributed.checkpoint.state_dict.get_state_dict ."
|
||||
"Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .",
|
||||
FutureWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
_state_dict_type_to_config = {
|
||||
StateDictType.FULL_STATE_DICT: FullStateDictConfig,
|
||||
|
|
@ -1208,7 +1209,8 @@ class FullyShardedDataParallel(nn.Module, _FSDPState):
|
|||
warnings.warn(
|
||||
f"Called FSDP.clip_grad_norm_() on rank {self.rank} with no "
|
||||
"gradients -- returning the total norm in the default dtype "
|
||||
f"{total_norm.dtype}"
|
||||
f"{total_norm.dtype}",
|
||||
stacklevel=2,
|
||||
) # warn since this is generally unexpected
|
||||
return total_norm
|
||||
total_norm_dtype = functools.reduce(
|
||||
|
|
|
|||
|
|
@ -87,7 +87,8 @@ class _NamedOptimizer(optim.Optimizer):
|
|||
else:
|
||||
warnings.warn(
|
||||
"Since we pass in param_groups, we will use param_groups to "
|
||||
"initialize the optimizer, not all parameters of the module."
|
||||
"initialize the optimizer, not all parameters of the module.",
|
||||
stacklevel=2,
|
||||
)
|
||||
param_to_key = {param: key for key, param in self.named_parameters.items()} # type: ignore[misc, has-type]
|
||||
ordered_param_keys = []
|
||||
|
|
|
|||
|
|
@ -92,7 +92,8 @@ class PostLocalSGDOptimizer(torch.optim.Optimizer):
|
|||
else:
|
||||
warnings.warn(
|
||||
"Loaded state dict does not contain a step counter for an averager. "
|
||||
"Setting step counter to 0."
|
||||
"Setting step counter to 0.",
|
||||
stacklevel=2,
|
||||
)
|
||||
self.averager.step = 0
|
||||
|
||||
|
|
|
|||
|
|
@ -513,7 +513,8 @@ class OpDispatcher:
|
|||
"Found a non-scalar tensor with numel=1 and ndim!=0, "
|
||||
"we are implicitly creating a replicated DTensor for it. "
|
||||
"However, please consider changing it to a scalar tensor "
|
||||
"or explicitly create a DTensor under distributed environment."
|
||||
"or explicitly create a DTensor under distributed environment.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if tensor_arg.numel() == 1 or self._allow_implicit_replication:
|
||||
|
|
|
|||
|
|
@ -43,7 +43,8 @@ def is_rng_supported_mesh(device_mesh: DeviceMesh) -> bool:
|
|||
else:
|
||||
# TODO: Logs way too much
|
||||
warnings.warn(
|
||||
f"DTensor random operators may not have complete support on {device_mesh.device_type} device mesh"
|
||||
f"DTensor random operators may not have complete support on {device_mesh.device_type} device mesh",
|
||||
stacklevel=2,
|
||||
)
|
||||
return False
|
||||
|
||||
|
|
@ -72,7 +73,8 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None:
|
|||
if not is_rng_supported_mesh(device_mesh):
|
||||
warnings.warn(
|
||||
"DTensor manual_seed() may not have complete support "
|
||||
f"on {device_mesh.device_type} device mesh"
|
||||
f"on {device_mesh.device_type} device mesh",
|
||||
stacklevel=2,
|
||||
)
|
||||
return
|
||||
|
||||
|
|
|
|||
|
|
@ -74,7 +74,8 @@ def parallelize_module( # type: ignore[return]
|
|||
if parallelize_plan is None:
|
||||
warnings.warn(
|
||||
"No parallelize_plan is provided and auto-parallel is not supported "
|
||||
"at the moment, so this parallelize_module call will do nothing."
|
||||
"at the moment, so this parallelize_module call will do nothing.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return module
|
||||
|
||||
|
|
@ -108,7 +109,8 @@ def parallelize_module( # type: ignore[return]
|
|||
warnings.warn(
|
||||
f"Parallelize plan key '{module_path}' could not be resolved: "
|
||||
f"no submodule matching token '{token}' in module {module}, "
|
||||
f"skipping this plan entry."
|
||||
f"skipping this plan entry.",
|
||||
stacklevel=2,
|
||||
)
|
||||
continue
|
||||
|
||||
|
|
|
|||
|
|
@ -62,7 +62,8 @@ class Distribution:
|
|||
warnings.warn(
|
||||
f"{self.__class__} does not define `arg_constraints`. "
|
||||
+ "Please set `arg_constraints = {}` or initialize the distribution "
|
||||
+ "with `validate_args=False` to turn off validation."
|
||||
+ "with `validate_args=False` to turn off validation.",
|
||||
stacklevel=2,
|
||||
)
|
||||
for param, constraint in arg_constraints.items():
|
||||
if constraints.is_dependent(constraint):
|
||||
|
|
@ -313,7 +314,8 @@ class Distribution:
|
|||
warnings.warn(
|
||||
f"{self.__class__} does not define `support` to enable "
|
||||
+ "sample validation. Please initialize the distribution with "
|
||||
+ "`validate_args=False` to turn off validation."
|
||||
+ "`validate_args=False` to turn off validation.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return
|
||||
assert support is not None
|
||||
|
|
|
|||
|
|
@ -133,6 +133,7 @@ def _dispatch_kl(type_p, type_q):
|
|||
f"Ambiguous kl_divergence({type_p.__name__}, {type_q.__name__}). "
|
||||
f"Please register_kl({left_p.__name__}, {right_q.__name__})",
|
||||
RuntimeWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return left_fun
|
||||
|
||||
|
|
|
|||
|
|
@ -127,7 +127,8 @@ class Wishart(ExponentialFamily):
|
|||
|
||||
if self.df.lt(event_shape[-1]).any():
|
||||
warnings.warn(
|
||||
"Low df values detected. Singular samples are highly likely to occur for ndim - 1 < df < ndim."
|
||||
"Low df values detected. Singular samples are highly likely to occur for ndim - 1 < df < ndim.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
super().__init__(batch_shape, event_shape, validate_args=validate_args)
|
||||
|
|
@ -279,7 +280,7 @@ class Wishart(ExponentialFamily):
|
|||
else:
|
||||
# More optimized version with data-dependent control flow.
|
||||
if is_singular.any():
|
||||
warnings.warn("Singular sample detected.")
|
||||
warnings.warn("Singular sample detected.", stacklevel=2)
|
||||
|
||||
for _ in range(max_try_correction):
|
||||
sample_new = self._bartlett_sampling(is_singular[is_singular].shape)
|
||||
|
|
|
|||
|
|
@ -500,10 +500,10 @@ def load(
|
|||
if file_info.filename == "serialized_exported_program.json":
|
||||
serialized_exported_program = file_content
|
||||
elif file_info.filename == "serialized_state_dict.json":
|
||||
warnings.warn("This version of file is deprecated")
|
||||
warnings.warn("This version of file is deprecated", stacklevel=2)
|
||||
serialized_state_dict = file_content
|
||||
elif file_info.filename == "serialized_constants.json":
|
||||
warnings.warn("This version of file is deprecated")
|
||||
warnings.warn("This version of file is deprecated", stacklevel=2)
|
||||
serialized_constants = file_content
|
||||
elif file_info.filename == "serialized_state_dict.pt":
|
||||
serialized_state_dict = file_content
|
||||
|
|
|
|||
|
|
@ -2113,7 +2113,7 @@ def _export_for_training(
|
|||
if torch._export.config.error_on_lifted_constant_tensors:
|
||||
raise RuntimeError(error_msg)
|
||||
else:
|
||||
warnings.warn(error_msg)
|
||||
warnings.warn(error_msg, stacklevel=2)
|
||||
|
||||
export_graph_signature = export_artifact.aten.sig
|
||||
|
||||
|
|
@ -2189,7 +2189,8 @@ def _export_for_training(
|
|||
f"This is likely result of torch.export.export not being able to track side effects "
|
||||
f"that is happening outside of model scope.\n\n"
|
||||
f"Leaked tensors:\n {leak_details}\n\n"
|
||||
f"Alternatively, please file a bug report to PyTorch team for further debugging help."
|
||||
f"Alternatively, please file a bug report to PyTorch team for further debugging help.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
del legit_leak
|
||||
|
|
|
|||
|
|
@ -530,7 +530,8 @@ def _create_stateful_graph_module(
|
|||
f"A model attribute `{constant_fqn}` requires gradient. "
|
||||
f"but it's not properly registered as a parameter. "
|
||||
f"torch.export will detach it and treat it as a constant tensor "
|
||||
f"but please register it as parameter instead."
|
||||
f"but please register it as parameter instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
detached_buffer = buffer.detach()
|
||||
original_tensor_to_detached_tensor[buffer] = detached_buffer
|
||||
|
|
@ -549,7 +550,8 @@ def _create_stateful_graph_module(
|
|||
f"A model attribute `{const_name}` requires gradient "
|
||||
f"but it's not properly registered as a parameter. "
|
||||
f"torch.export will detach it and treat it as a constant tensor "
|
||||
f"but please register it as parameter instead."
|
||||
f"but please register it as parameter instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
if value in original_tensor_to_detached_tensor:
|
||||
value = original_tensor_to_detached_tensor[value]
|
||||
|
|
|
|||
|
|
@ -1684,7 +1684,8 @@ def _create_graph_module_for_export(root, graph):
|
|||
"Unable to execute the generated python source code from "
|
||||
"the graph. The graph module will no longer be directly callable, "
|
||||
"but you can still run the ExportedProgram, and if needed, you can "
|
||||
"run the graph module eagerly using torch.fx.Interpreter."
|
||||
"run the graph module eagerly using torch.fx.Interpreter.",
|
||||
stacklevel=2,
|
||||
)
|
||||
gm = torch.fx.GraphModule(root, torch.fx.Graph())
|
||||
gm._graph = graph
|
||||
|
|
|
|||
|
|
@ -108,7 +108,8 @@ def get_complete(
|
|||
|
||||
warnings.warn(
|
||||
"No complete tensor found in the group! Returning the first one. "
|
||||
"This may cause issues when your weights are not on CPU."
|
||||
"This may cause issues when your weights are not on CPU.",
|
||||
stacklevel=2,
|
||||
)
|
||||
assert len(group) > 0
|
||||
return next(iter(group))
|
||||
|
|
|
|||
torch/hub.py
|
|
@ -279,7 +279,8 @@ def _get_cache_or_reload(
|
|||
f"The ref {ref} is ambiguous. Perhaps it is both a tag and a branch in the repo? "
|
||||
"Torchhub will now assume that it's a branch. "
|
||||
"You can disambiguate tags and branches by explicitly passing refs/heads/branch_name or "
|
||||
"refs/tags/tag_name as the ref. That might require using skip_validation=True."
|
||||
"refs/tags/tag_name as the ref. That might require using skip_validation=True.",
|
||||
stacklevel=2,
|
||||
)
|
||||
disambiguated_branch_ref = f"refs/heads/{ref}"
|
||||
url = _git_archive_link(
|
||||
|
|
@ -338,7 +339,8 @@ def _check_repo_is_trusted(
|
|||
"trust_repo=False) and a command prompt will appear asking for an explicit confirmation of trust, "
|
||||
f"or {calling_fn}(..., trust_repo=True), which will assume that the prompt is to be answered with "
|
||||
f"'yes'. You can also use {calling_fn}(..., trust_repo='check') which will only prompt for "
|
||||
f"confirmation if the repo is not already trusted. This will eventually be the default behaviour"
|
||||
f"confirmation if the repo is not already trusted. This will eventually be the default behaviour",
|
||||
stacklevel=2,
|
||||
)
|
||||
return
|
||||
|
||||
|
|
@ -406,7 +408,9 @@ def get_dir() -> str:
|
|||
"""
|
||||
# Issue warning to move data if old env is set
|
||||
if os.getenv("TORCH_HUB"):
|
||||
warnings.warn("TORCH_HUB is deprecated, please use env TORCH_HOME instead")
|
||||
warnings.warn(
|
||||
"TORCH_HUB is deprecated, please use env TORCH_HOME instead", stacklevel=2
|
||||
)
|
||||
|
||||
if _hub_dir is not None:
|
||||
return _hub_dir
|
||||
|
|
@ -853,7 +857,8 @@ def load_state_dict_from_url(
|
|||
# Issue warning to move data if old env is set
|
||||
if os.getenv("TORCH_MODEL_ZOO"):
|
||||
warnings.warn(
|
||||
"TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead"
|
||||
"TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if model_dir is None:
|
||||
|
|
|
|||
|
|
@ -257,7 +257,7 @@ class strict_fusion:
|
|||
|
||||
def __init__(self) -> None:
|
||||
if not torch._jit_internal.is_scripting():
|
||||
warnings.warn("Only works in script mode")
|
||||
warnings.warn("Only works in script mode", stacklevel=2)
|
||||
|
||||
def __enter__(self):
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -180,7 +180,8 @@ class AttributeTypeIsSupportedChecker(ast.NodeVisitor):
|
|||
"instance-level annotations on empty non-base "
|
||||
"types in `__init__`. Instead, either 1) use a "
|
||||
"type annotation in the class body, or 2) wrap "
|
||||
"the type in `torch.jit.Attribute`."
|
||||
"the type in `torch.jit.Attribute`.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def visit_Call(self, node):
|
||||
|
|
@ -245,5 +246,6 @@ class AttributeTypeIsSupportedChecker(ast.NodeVisitor):
|
|||
"instance-level annotations on empty non-base "
|
||||
"types in `__init__`. Instead, either 1) use a "
|
||||
"type annotation in the class body, or 2) wrap "
|
||||
"the type in `torch.jit.Attribute`."
|
||||
"the type in `torch.jit.Attribute`.",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -48,7 +48,9 @@ def signatures_match(decomposition_sig, torch_op_sig):
|
|||
inspect_empty = inspect._empty # type: ignore[attr-defined]
|
||||
for field in ["name", "annotation"]:
|
||||
if field == "name" and decomp_param.name == "self":
|
||||
warnings.warn("PyTorch uses 'input' instead of 'self' on public api")
|
||||
warnings.warn(
|
||||
"PyTorch uses 'input' instead of 'self' on public api", stacklevel=2
|
||||
)
|
||||
|
||||
if getattr(decomp_param, field) != getattr(op_param, field):
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -309,7 +309,8 @@ def infer_concrete_type_builder(nn_module, share_types=True):
|
|||
|
||||
warnings.warn(
|
||||
f"'{name}' was found in ScriptModule constants, "
|
||||
f" but it is a non-constant {hint}. Consider removing it."
|
||||
f" but it is a non-constant {hint}. Consider removing it.",
|
||||
stacklevel=2,
|
||||
)
|
||||
continue
|
||||
if not hasattr(nn_module, name):
|
||||
|
|
@ -318,7 +319,8 @@ def infer_concrete_type_builder(nn_module, share_types=True):
|
|||
warnings.warn(
|
||||
f"'{name}' was found in ScriptModule constants, "
|
||||
"but was not actually set in __init__. "
|
||||
"Consider removing it."
|
||||
"Consider removing it.",
|
||||
stacklevel=2,
|
||||
)
|
||||
continue
|
||||
value = getattr(nn_module, name)
|
||||
|
|
|
|||
|
|
@ -775,6 +775,7 @@ if _enabled:
|
|||
"Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \
|
||||
https://docs.pytorch.org/executorch/stable/getting-started.html",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return self._c._save_for_mobile(*args, **kwargs)
|
||||
|
||||
|
|
@ -787,6 +788,7 @@ if _enabled:
|
|||
"Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \
|
||||
https://docs.pytorch.org/executorch/stable/getting-started.html",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return self._c._save_to_buffer_for_mobile(*args, **kwargs)
|
||||
|
||||
|
|
@ -1165,7 +1167,8 @@ def _script_impl(
|
|||
warnings.warn(
|
||||
"Warning: monkeytype is not installed. Please install https://github.com/Instagram/MonkeyType "
|
||||
"to enable Profile-Directed Typing in TorchScript. Refer to "
|
||||
"https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. "
|
||||
"https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. ",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if isinstance(obj, torch.nn.Module):
|
||||
|
|
|
|||
|
|
@ -686,7 +686,8 @@ def _trace_impl(
|
|||
# it is hard to trace it because the forward method on ScriptModule is already defined, so it
|
||||
# would result in an error.
|
||||
warnings.warn(
|
||||
"The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is."
|
||||
"The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return func
|
||||
|
||||
|
|
|
|||
|
|
@ -389,7 +389,8 @@ def is_tensor(ann):
|
|||
warnings.warn(
|
||||
"TorchScript will treat type annotations of Tensor "
|
||||
"dtype-specific subtypes as if they are normal Tensors. "
|
||||
"dtype constraints are not enforced in compilation either."
|
||||
"dtype constraints are not enforced in compilation either.",
|
||||
stacklevel=2,
|
||||
)
|
||||
return True
|
||||
|
||||
|
|
|
|||
|
|
@ -44,7 +44,8 @@ def _apply_docstring_templates(func: Callable[_P, _T]) -> Callable[_P, _T]:
|
|||
warnings.warn(
|
||||
f"No documentation string available for {func.__name__}."
|
||||
" PyTorch team should run `python tools/update_masked_docs.py`"
|
||||
" to generate the missing docstrings."
|
||||
" to generate the missing docstrings.",
|
||||
stacklevel=2,
|
||||
)
|
||||
else:
|
||||
func.__doc__ = doc_string
|
||||
|
|
|
|||
|
|
@ -322,7 +322,7 @@ class MaskedTensor(torch.Tensor):
|
|||
"In the case that the semantics for the operator are not trivial, it would be appreciated "
|
||||
"to also include a proposal for the semantics."
|
||||
)
|
||||
warnings.warn(msg)
|
||||
warnings.warn(msg, stacklevel=2)
|
||||
return NotImplemented
|
||||
|
||||
def __lt__(self, other):
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ def _torch_reduce_dim(fn):
|
|||
"In the case that the semantics for the operator are not trivial, it would be appreciated "
|
||||
"to also include a proposal for the semantics."
|
||||
)
|
||||
warnings.warn(msg)
|
||||
warnings.warn(msg, stacklevel=2)
|
||||
return NotImplemented
|
||||
if not is_masked_tensor(self):
|
||||
raise TypeError("Input to reduce_dim must be a MaskedTensor")
|
||||
|
|
|
|||
|
|
@ -223,7 +223,9 @@ class ProcessContext:
|
|||
|
||||
class SpawnContext(ProcessContext):
|
||||
def __init__(self, processes, error_files):
|
||||
warnings.warn("SpawnContext is renamed to ProcessContext since 1.4 release.")
|
||||
warnings.warn(
|
||||
"SpawnContext is renamed to ProcessContext since 1.4 release.", stacklevel=2
|
||||
)
|
||||
super().__init__(processes, error_files)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ def get_enum(reduction: str) -> int:
|
|||
elif reduction == "elementwise_mean":
|
||||
warnings.warn(
|
||||
"reduction='elementwise_mean' is deprecated. "
|
||||
"Please use reduction='mean' instead."
|
||||
"Please use reduction='mean' instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
ret = 1
|
||||
elif reduction == "sum":
|
||||
|
|
@ -48,7 +49,7 @@ def legacy_get_string(
|
|||
else:
|
||||
ret = "none"
|
||||
if emit_warning:
|
||||
warnings.warn(warning.format(ret))
|
||||
warnings.warn(warning.format(ret), stacklevel=2)
|
||||
return ret
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -60,10 +60,10 @@ def _raise_kernel_warnings(params: SDPAParams) -> None:
|
|||
"""
|
||||
if WARN_FOR_UNFUSED_KERNELS:
|
||||
if not can_use_efficient_attention(params):
|
||||
warn("Efficient attention can't be used because:")
|
||||
warn("Efficient attention can't be used because:", stacklevel=2)
|
||||
can_use_efficient_attention(params, True)
|
||||
if not can_use_flash_attention(params):
|
||||
warn("Flash attention can't be used because:")
|
||||
warn("Flash attention can't be used because:", stacklevel=2)
|
||||
can_use_flash_attention(params, True)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -134,7 +134,8 @@ class CausalBias(torch.Tensor):
|
|||
self.seq_len_kv = seq_len_kv
|
||||
if seq_len_q > seq_len_kv and variant == CausalVariant.LOWER_RIGHT:
|
||||
warn(
|
||||
"Lower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!"
|
||||
"Lower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!",
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
def _upper_left(self, device: torch.device) -> torch.Tensor:
|
||||
|
|
|
|||
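Note on the pattern applied throughout this diff: each warnings.warn call gains an explicit stacklevel=2, which is what the flake8 B028 check addressed by this commit asks for. With the default stacklevel=1 the warning is attributed to the line inside PyTorch that emits it; with stacklevel=2 it is attributed to the user code that called the PyTorch API. A minimal sketch, using a hypothetical module and function names that are not part of this diff:

    # library.py -- hypothetical module, for illustration only
    import warnings

    def old_style():
        # Default stacklevel=1: the reported location is this line in library.py.
        warnings.warn("old_style() is deprecated", DeprecationWarning)

    def new_style():
        # stacklevel=2: the reported location is the caller's file and line,
        # mirroring the change made to the PyTorch warnings in this commit.
        warnings.warn("new_style() is deprecated", DeprecationWarning, stacklevel=2)

Calling both functions from a small script and running it with python -W always shows library.py as the source of the first warning but the caller's own line for the second, which is the extra caller context the B028 rule is meant to surface.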