diff --git a/torch/__init__.py b/torch/__init__.py index 95f55ae5878..78a4c220a05 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -2653,7 +2653,8 @@ def compile( if torch.compiler.is_exporting(): warnings.warn( "You are calling torch.compile inside torch.export region. " - "To capture an useful graph, we will implicitly switch to torch.compile(backend=eager)" + "To capture a useful graph, we will implicitly switch to torch.compile(backend=eager)", + stacklevel=2, ) from torch._higher_order_ops.utils import setup_compilation_env diff --git a/torch/_custom_op/impl.py b/torch/_custom_op/impl.py index bcc0193fb88..1398f808da2 100644 --- a/torch/_custom_op/impl.py +++ b/torch/_custom_op/impl.py @@ -55,6 +55,7 @@ def warn_deprecated(): "torch._custom_op is deprecated and will be removed in PyTorch 2.6, please " "use the equivalent torch.library API instead.", DeprecationWarning, + stacklevel=2, ) diff --git a/torch/_export/converter.py b/torch/_export/converter.py index e2a3be17118..1a928f011bb 100644 --- a/torch/_export/converter.py +++ b/torch/_export/converter.py @@ -704,7 +704,8 @@ class TS2FXGraphConverter: # In a sense, the converter now becomes an stateful interpreter warnings.warn( "Converting aten::append.t, which is a inplace mutation of the list. " - "This makes the converter non-functional: the result depends on the order of the append nodes being converter!" + "This makes the converter non-functional: the result depends on the order of the append nodes being converted!", + stacklevel=2, ) args = tuple(self.get_fx_value_by_ir_value(inp) for inp in node.inputs()) @@ -1471,7 +1472,8 @@ DEBUG: (TORCH_LOGS="+export" ), additionally for k, tensor in self.ts_model.state_dict().items(): # type: ignore[union-attr] if k not in ep.state_dict: warnings.warn( - f"Manually populate {k} into state_dict ExportedProgram, but it is never used by the ExportedProgram." + f"Manually populate {k} into state_dict ExportedProgram, but it is never used by the ExportedProgram.", + stacklevel=2, ) ep.state_dict[k] = tensor diff --git a/torch/_export/tools.py b/torch/_export/tools.py index 0007de25d3e..b254fd62e3b 100644 --- a/torch/_export/tools.py +++ b/torch/_export/tools.py @@ -51,7 +51,8 @@ def _generate_inputs_for_submodules( model(*args, **kwargs) except Exception as e: warnings.warn( - f"Failed to generate submodule inputs because of the following error:\n{e}" + f"Failed to generate submodule inputs because of the following error:\n{e}", + stacklevel=2, ) finally: for h in handles: diff --git a/torch/_functorch/_aot_autograd/frontend_utils.py b/torch/_functorch/_aot_autograd/frontend_utils.py index a75863cd739..01ae9509cb6 100644 --- a/torch/_functorch/_aot_autograd/frontend_utils.py +++ b/torch/_functorch/_aot_autograd/frontend_utils.py @@ -321,5 +321,6 @@ def _detect_attribute_assignment(mod: torch.nn.Module): warnings.warn( f"The tensor {noun} {', '.join(assigned_tensor_attributes)} {verb} assigned during export. " "Such attributes must be registered as buffers using the `register_buffer` API " "(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer)."
+ "(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer).", + stacklevel=2, ) diff --git a/torch/_functorch/_aot_autograd/utils.py b/torch/_functorch/_aot_autograd/utils.py index eae75e06a42..2676042198d 100644 --- a/torch/_functorch/_aot_autograd/utils.py +++ b/torch/_functorch/_aot_autograd/utils.py @@ -137,7 +137,8 @@ def call_func_at_runtime_with_args( warnings.warn( "Your compiler for AOTAutograd is returning a function that doesn't take boxed arguments. " "Please wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. " - "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale." + "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.", + stacklevel=2, ) out = normalize_as_list(f(*args)) return out diff --git a/torch/_higher_order_ops/auto_functionalize.py b/torch/_higher_order_ops/auto_functionalize.py index cca12066bc3..9639a8b68d4 100644 --- a/torch/_higher_order_ops/auto_functionalize.py +++ b/torch/_higher_order_ops/auto_functionalize.py @@ -518,7 +518,8 @@ def do_auto_functionalize( if "self" in unwrapped_kwargs or "self_" in unwrapped_kwargs: warnings.warn( "Using `self` or `self_` as an argument in the definition of custom ops may lead to ambiguous parsing. " - "Please consider using a different name for this argument to avoid potential issues." + "Please consider using a different name for this argument to avoid potential issues.", + stacklevel=2, ) with ctx.redispatch_to_next(): unwrapped_outs = auto_functionalized( @@ -691,7 +692,8 @@ def do_auto_functionalize_v2( if "self" in unwrapped_kwargs or "self_" in unwrapped_kwargs: warnings.warn( "Using `self` or `self_` as an argument in the definition of custom ops may lead to ambiguous parsing. " - "Please consider using a different name for this argument to avoid potential issues." + "Please consider using a different name for this argument to avoid potential issues.", + stacklevel=2, ) all_basis_unwrapped = ctx.unwrap_tensors(all_bases) diff --git a/torch/_higher_order_ops/base_hop.py b/torch/_higher_order_ops/base_hop.py index a7647c70984..37c5d320928 100644 --- a/torch/_higher_order_ops/base_hop.py +++ b/torch/_higher_order_ops/base_hop.py @@ -196,7 +196,8 @@ class BaseHOP(HigherOrderOperator, abc.ABC): "Aliasing is not supported for HOP subgraph.\n" f"{subgraph.print_readable(print_output=False)}\n" f"Alias info: inp-inp alias: {inp_inp_alias}, inp-out alias: {inp_out_alias}, out-out alias{out_out_alias}" - f"This may lead to silent incorrectness." + f"This may lead to silent incorrectness.", + stacklevel=2, ) schema_gen = HopSchemaGenerator(self) diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index 9379111d689..c06ee8fcad4 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -177,6 +177,7 @@ def cond( "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches." " If you want torch.cond to preserve two branches, please make the predicate a boolean tensor or a SymBool.", UserWarning, + stacklevel=2, ) # This is the eager case. We can just run the true or false branch. if pred: diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 192f969e5c6..31f4e181b65 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -859,6 +859,7 @@ def ignore(drop=False, **kwargs): warnings.warn( "ignore(drop_on_export=True) has been deprecated. 
TorchScript will now drop the function " "call on compilation. Use torch.jit.unused now. {}", + stacklevel=2, category=FutureWarning, ) @@ -867,6 +868,7 @@ def ignore(drop=False, **kwargs): warnings.warn( "ignore(True) has been deprecated. TorchScript will now drop the function " "call on compilation. Use torch.jit.unused now. {}", + stacklevel=2, category=FutureWarning, ) @@ -992,7 +994,8 @@ def _check_overload_body(func): # Parsing the function definition can raise an OSError if source is unavailable. # Since this is just an initial check, just raise a warning if this is the case. warnings.warn( - f"Unable to retrieve source for @torch.jit._overload function: {func}." + f"Unable to retrieve source for @torch.jit._overload function: {func}.", + stacklevel=2, ) return @@ -1385,7 +1388,8 @@ def check_empty_containers(obj) -> None: "calling torch.jit.isinstance in eager mode. For " "example, List[int] would become list and " "therefore falsely return True for List[float] or" - " List[str]." + " List[str].", + stacklevel=2, ) diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py index 2afb2331048..c53c4d0d608 100644 --- a/torch/_prims_common/__init__.py +++ b/torch/_prims_common/__init__.py @@ -2137,7 +2137,8 @@ def alert_not_deterministic(caller: str): f"{caller} does not have a deterministic implementation, but you set " f"'torch.use_deterministic_algorithms(True, warn_only=True)'. " f"You can file an issue at https://github.com/pytorch/pytorch/issues " - f"to help us prioritize adding deterministic support for this operation." + f"to help us prioritize adding deterministic support for this operation.", + stacklevel=2, ) else: torch._check( diff --git a/torch/_prims_common/wrappers.py b/torch/_prims_common/wrappers.py index 23e242290d9..8f6b7e5f1a5 100644 --- a/torch/_prims_common/wrappers.py +++ b/torch/_prims_common/wrappers.py @@ -180,7 +180,7 @@ def _resize_output_check(out: TensorLikeType, shape: ShapeType): "be resized unless they have zero elements. " "You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0)." ) - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) return True diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 9e70cdf9a9e..6dcee880deb 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -3729,7 +3729,8 @@ def istft( if end > expected_output_signal_len: warnings.warn( "The length of signal is shorter than the length parameter. Result is being " - + "padded with zeros in the tail. Please check your center and hop_length settings" + + "padded with zeros in the tail. Please check your center and hop_length settings", + stacklevel=2, ) y = aten.constant_pad_nd(y, (0, end - expected_output_signal_len), 0) return y diff --git a/torch/_subclasses/functional_tensor.py b/torch/_subclasses/functional_tensor.py index 83d0afb837b..208f48da361 100644 --- a/torch/_subclasses/functional_tensor.py +++ b/torch/_subclasses/functional_tensor.py @@ -405,7 +405,8 @@ class FunctionalTensorMode(TorchDispatchMode): warnings.warn( f"At pre-dispatch tracing, we assume that any custom op marked with " f"CompositeImplicitAutograd and have functional schema are safe to not decompose. " - f"Found {func} to be one such op." 
+ f"Found {func} to be one such op.", + stacklevel=2, ) return False return True diff --git a/torch/_tensor.py b/torch/_tensor.py index 165fd6ba7e1..f020b733094 100644 --- a/torch/_tensor.py +++ b/torch/_tensor.py @@ -350,7 +350,8 @@ class Tensor(torch._C.TensorBase): # hypothesis is that no one cares for meta tensors. if skip_data: warnings.warn( - "Serializing tensors on the meta device under skip_data context manager is a no-op" + "Serializing tensors on the meta device under skip_data context manager is a no-op", + stacklevel=2, ) arg_meta = ( self.dtype, @@ -1033,7 +1034,7 @@ class Tensor(torch._C.TensorBase): def resize(self, *sizes): if has_torch_function_unary(self): return handle_torch_function(Tensor.resize, (self,), self, *sizes) - warnings.warn("non-inplace resize is deprecated") + warnings.warn("non-inplace resize is deprecated", stacklevel=2) from torch.autograd._functions import Resize return Resize.apply(self, sizes) @@ -1041,7 +1042,7 @@ class Tensor(torch._C.TensorBase): def resize_as(self, tensor): if has_torch_function_variadic(self, tensor): return handle_torch_function(Tensor.resize_as, (self, tensor), self, tensor) - warnings.warn("non-inplace resize_as is deprecated") + warnings.warn("non-inplace resize_as is deprecated", stacklevel=2) from torch.autograd._functions import Resize return Resize.apply(self, tensor.size()) diff --git a/torch/_utils.py b/torch/_utils.py index 991e543e7a5..01cf9d39318 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -118,7 +118,7 @@ def _get_async_or_non_blocking(function_name, non_blocking, kwargs): message = "{}() got an unexpected keyword argument '{}'" argument = list(kwargs.keys()).pop() raise TypeError(message.format(function_name, argument)) - warnings.warn("'async' is deprecated; use 'non_blocking'") + warnings.warn("'async' is deprecated; use 'non_blocking'", stacklevel=2) return kwargs["async"] diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py index d33c10ed384..1ac9d2046f2 100644 --- a/torch/_weights_only_unpickler.py +++ b/torch/_weights_only_unpickler.py @@ -555,7 +555,8 @@ class Unpickler: f"Detected pickle protocol {self.proto} in the checkpoint, which was " "not the default pickle protocol used by `torch.load` (2). The weights_only " "Unpickler might not support all instructions implemented by this protocol, " - "please file an issue for adding support if you encounter this." + "please file an issue for adding support if you encounter this.", + stacklevel=2, ) elif key[0] == STOP[0]: rc = self.stack.pop() diff --git a/torch/amp/autocast_mode.py b/torch/amp/autocast_mode.py index c23058dc336..5b4666fcb28 100644 --- a/torch/amp/autocast_mode.py +++ b/torch/amp/autocast_mode.py @@ -267,7 +267,8 @@ class autocast: and torch.cuda.amp.common.amp_definitely_not_available() ): warnings.warn( - "User provided device_type of 'cuda', but CUDA is not available. Disabling" + "User provided device_type of 'cuda', but CUDA is not available. Disabling", + stacklevel=2, ) enabled = False if cache_enabled is not None: @@ -281,42 +282,42 @@ class autocast: error_message += ( ", ".join(str(dtype) for dtype in supported_dtype) + " currently." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "mtia": supported_dtype = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtype: error_message = "In MTIA autocast, but the target dtype is not supported. 
Disabling autocast.\n" error_message += "MTIA Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "maia": supported_dtype = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtype: error_message = "In MAIA autocast, but the target dtype is not supported. Disabling autocast.\n" error_message += "MAIA Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "xpu": supported_dtype = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtype: error_message = "In XPU autocast, but the target dtype is not supported. Disabling autocast.\n" error_message += "XPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "ipu": supported_dtypes = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtypes: error_message = "In IPU autocast, but the target dtype is not supported. Disabling autocast.\n" error_message += "IPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "hpu": supported_dtype = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtype: error_message = "In HPU autocast, but the target dtype is not supported. Disabling autocast.\n" error_message += "HPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == self.custom_backend_name: supported_dtype = self.custom_device_mod.get_amp_supported_dtype() @@ -326,7 +327,7 @@ class autocast: error_message += ( ", ".join(str(dtype) for dtype in supported_dtype) + " currently." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "cuda": if ( @@ -344,7 +345,7 @@ class autocast: "In MPS autocast, but the target dtype is not supported. Disabling autocast.\n" "MPS Autocast only supports dtype of torch.bfloat16 and torch.float16 currently." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.fast_dtype == torch.bfloat16: if not torch.backends.mps.is_macos_or_newer(14, 0): @@ -352,7 +353,7 @@ class autocast: "In MPS autocast, but the target dtype torch.bfloat16 is not supported " "on macOS versions below 14. Disabling autocast." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "xla": supported_dtype = [torch.float16, torch.bfloat16] @@ -361,7 +362,7 @@ class autocast: error_message += ( "XLA Autocast only supports dtype of torch.bfloat16 currently." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False self._enabled = enabled diff --git a/torch/amp/grad_scaler.py b/torch/amp/grad_scaler.py index 54314b034d1..506c2cf9016 100644 --- a/torch/amp/grad_scaler.py +++ b/torch/amp/grad_scaler.py @@ -422,6 +422,7 @@ class GradScaler: "optimizer. 
In the near future GradScaler registers `grad_scale: Tensor` and " "`found_inf: Tensor` to the passed optimizer and let the optimizer use them directly.", FutureWarning, + stacklevel=2, ) kwargs_.update({"grad_scaler": self}) else: diff --git a/torch/ao/nn/quantizable/modules/rnn.py b/torch/ao/nn/quantizable/modules/rnn.py index f13bb9b1a16..9b807e82ddb 100644 --- a/torch/ao/nn/quantizable/modules/rnn.py +++ b/torch/ao/nn/quantizable/modules/rnn.py @@ -469,14 +469,16 @@ class LSTM(torch.nn.Module): warnings.warn( "dropout option for quantizable LSTM is ignored. " "If you are training, please, use nn.LSTM version " - "followed by `prepare` step." + "followed by `prepare` step.", + stacklevel=2, ) if num_layers == 1: warnings.warn( "dropout option adds dropout after all but last " "recurrent layer, so non-zero dropout expects " f"num_layers greater than 1, but got dropout={dropout} " - f"and num_layers={num_layers}" + f"and num_layers={num_layers}", + stacklevel=2, ) layers = [ diff --git a/torch/ao/nn/quantized/dynamic/modules/conv.py b/torch/ao/nn/quantized/dynamic/modules/conv.py index 1f8a65fe9d6..0c8785da899 100644 --- a/torch/ao/nn/quantized/dynamic/modules/conv.py +++ b/torch/ao/nn/quantized/dynamic/modules/conv.py @@ -68,7 +68,8 @@ class Conv1d(nnq.Conv1d): reduce_range=True, ): warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} kernel_size = _single(kernel_size) @@ -155,7 +156,8 @@ class Conv2d(nnq.Conv2d): ): warnings.warn( f"The current implementation of the {self._get_name()} module " - "has poor numerical accuracy and its use is not recommended" + "has poor numerical accuracy and its use is not recommended", + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} kernel_size = _pair(kernel_size) @@ -239,7 +241,8 @@ class Conv3d(nnq.Conv3d): dtype=None, ): warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) assert padding_mode != "reflect", "Conv3d does not support reflection padding" factory_kwargs = {"device": device, "dtype": dtype} @@ -330,7 +333,8 @@ class ConvTranspose1d(nnq.ConvTranspose1d): dtype=None, ): warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} super().__init__( @@ -412,7 +416,8 @@ class ConvTranspose2d(nnq.ConvTranspose2d): dtype=None, ): warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} super().__init__( @@ -494,7 +499,8 @@ class ConvTranspose3d(nnq.ConvTranspose3d): dtype=None, ): 
warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} super().__init__( diff --git a/torch/ao/nn/quantized/dynamic/modules/rnn.py b/torch/ao/nn/quantized/dynamic/modules/rnn.py index fb5371ea4a4..4c2b43189c3 100644 --- a/torch/ao/nn/quantized/dynamic/modules/rnn.py +++ b/torch/ao/nn/quantized/dynamic/modules/rnn.py @@ -136,7 +136,8 @@ class RNNBase(torch.nn.Module): "dropout option adds dropout after all but last " "recurrent layer, so non-zero dropout expects " f"num_layers greater than 1, but got dropout={dropout} and " - f"num_layers={num_layers}" + f"num_layers={num_layers}", + stacklevel=2, ) if mode == "LSTM": diff --git a/torch/ao/nn/quantized/functional.py b/torch/ao/nn/quantized/functional.py index 51a2f4905c2..30994b2921b 100644 --- a/torch/ao/nn/quantized/functional.py +++ b/torch/ao/nn/quantized/functional.py @@ -724,7 +724,8 @@ def upsample(input, size=None, scale_factor=None, mode="nearest", align_corners= affects the outputs. """ warnings.warn( - "nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead." + "nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead.", + stacklevel=2, ) return interpolate(input, size, scale_factor, mode, align_corners) @@ -749,7 +750,8 @@ def upsample_bilinear(input, size=None, scale_factor=None): """ # DeprecationWarning is ignored by default warnings.warn( - "nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead." + "nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead.", + stacklevel=2, ) return interpolate(input, size, scale_factor, mode="bilinear", align_corners=True) @@ -774,6 +776,7 @@ def upsample_nearest(input, size=None, scale_factor=None): """ # DeprecationWarning is ignored by default warnings.warn( - "nn.quantized.functional.upsample_nearest is deprecated. Use nn.quantized.functional.interpolate instead." + "nn.quantized.functional.upsample_nearest is deprecated. 
Use nn.quantized.functional.interpolate instead.", + stacklevel=2, ) return interpolate(input, size, scale_factor, mode="nearest") diff --git a/torch/ao/nn/quantized/modules/activation.py b/torch/ao/nn/quantized/modules/activation.py index 67b69eb7390..c8c1571bda3 100644 --- a/torch/ao/nn/quantized/modules/activation.py +++ b/torch/ao/nn/quantized/modules/activation.py @@ -322,7 +322,8 @@ class PReLU(torch.nn.Module): observer(float_wt) if observer.dtype != torch.quint8: warn( - f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}" + f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}", + stacklevel=2, ) wt_scale, wt_zp = observer.calculate_qparams() qweight = torch.quantize_per_tensor( @@ -339,7 +340,8 @@ class PReLU(torch.nn.Module): observer(float_wt) if observer.dtype != torch.quint8: warn( - f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}" + f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}", + stacklevel=2, ) wt_scale, wt_zp = observer.calculate_qparams() qweight = torch.quantize_per_tensor( diff --git a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py index 4330b0e2425..d536245b0e9 100644 --- a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py +++ b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py @@ -213,7 +213,8 @@ class ActivationSparsifier: if name in self.data_groups: # unregister layer if already present warnings.warn( - "layer already attached to the sparsifier, deregistering the layer and registering with new config" + "layer already attached to the sparsifier, deregistering the layer and registering with new config", + stacklevel=2, ) self.unregister_layer(name=name) diff --git a/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py b/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py index 672903e8f05..c2f48abfc9d 100644 --- a/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py +++ b/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py @@ -158,6 +158,7 @@ class BaseDataScheduler: "initialization. Please, make sure to call `data_sparsifier.step()` before " "`scheduler.step()`.", UserWarning, + stacklevel=2, ) # Just check if there were two first scheduler.step() calls before sparsifier.step() @@ -167,6 +168,7 @@ class BaseDataScheduler: "You have to make sure you run the data_sparsifier.step() BEFORE any " "calls to the scheduler.step().", UserWarning, + stacklevel=2, ) self._step_count += 1 diff --git a/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py b/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py index 3dea01586a2..0db7becdda5 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py @@ -105,7 +105,8 @@ class BaseDataSparsifier(base_sparsifier.BaseSparsifier): if name in self.state: # If the named data already exists - replace warnings.warn( - "Replacing existing data of the same name. - Did you mean a different name?" + "Replacing existing data of the same name. 
- Did you mean a different name?", + stacklevel=2, ) # reuse old config diff --git a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py index 5a36e13c7b4..9447e3331c2 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py @@ -74,6 +74,7 @@ class StepSLScheduler(BaseDataScheduler): "To get the last learning rate computed by the scheduler, " "please use `get_last_lr()`.", UserWarning, + stacklevel=2, ) data_groups = self.data_sparsifier.data_groups if (self.last_epoch == 0) or (self.last_epoch % self.step_size != 0): diff --git a/torch/ao/pruning/scheduler/base_scheduler.py b/torch/ao/pruning/scheduler/base_scheduler.py index f602028d475..ac8916713da 100644 --- a/torch/ao/pruning/scheduler/base_scheduler.py +++ b/torch/ao/pruning/scheduler/base_scheduler.py @@ -92,7 +92,8 @@ class BaseScheduler: if not self._get_sl_called_within_step: warnings.warn( "To get the last sparsity level computed by the scheduler, " - "please use `get_last_sl()`." + "please use `get_last_sl()`.", + stacklevel=2, ) raise NotImplementedError @@ -124,6 +125,7 @@ class BaseScheduler: "initialization. Please, make sure to call `sparsifier.step()` before " "`scheduler.step()`.", UserWarning, + stacklevel=2, ) # Just check if there were two first scheduler.step() calls before sparsifier.step() @@ -133,6 +135,7 @@ class BaseScheduler: "You have to make sure you run the sparsifier.step() BEFORE any " "calls to the scheduler.step().", UserWarning, + stacklevel=2, ) self._step_count += 1 diff --git a/torch/ao/pruning/scheduler/cubic_scheduler.py b/torch/ao/pruning/scheduler/cubic_scheduler.py index 45985a8bbc5..d4706900762 100644 --- a/torch/ao/pruning/scheduler/cubic_scheduler.py +++ b/torch/ao/pruning/scheduler/cubic_scheduler.py @@ -90,7 +90,8 @@ class CubicSL(BaseScheduler): if not self._get_sl_called_within_step: warnings.warn( "To get the last sparsity level computed by the scheduler, " - "please use `get_last_sl()`." + "please use `get_last_sl()`.", + stacklevel=2, ) return [ self.sparsity_compute_fn( diff --git a/torch/ao/pruning/scheduler/lambda_scheduler.py b/torch/ao/pruning/scheduler/lambda_scheduler.py index 7c0e8088890..d9b6cb0a4d9 100644 --- a/torch/ao/pruning/scheduler/lambda_scheduler.py +++ b/torch/ao/pruning/scheduler/lambda_scheduler.py @@ -56,7 +56,8 @@ class LambdaSL(BaseScheduler): if not self._get_sl_called_within_step: warnings.warn( "To get the last sparsity level computed by the scheduler, " - "please use `get_last_sl()`." + "please use `get_last_sl()`.", + stacklevel=2, ) return [ base_sl * lmbda(self.last_epoch) diff --git a/torch/ao/quantization/fx/_equalize.py b/torch/ao/quantization/fx/_equalize.py index 71563c236aa..37b72c372e2 100644 --- a/torch/ao/quantization/fx/_equalize.py +++ b/torch/ao/quantization/fx/_equalize.py @@ -121,7 +121,8 @@ class _InputEqualizationObserver(nn.Module): ): warnings.warn( "Must call calculate_equalization_scale before calling calculate_scaled_minmax. " - + "Will not scale the next quantization observer." + + "Will not scale the next quantization observer.", + stacklevel=2, ) return None, None @@ -226,7 +227,8 @@ def calculate_equalization_scale( ): warnings.warn( "Must run observer before calling calculate_equalization_scale. " - + "Returning default equalization scale torch.tensor(1)." 
+ + "Returning default equalization scale torch.tensor(1).", + stacklevel=2, ) return torch.tensor(1) diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py index cde3a92987c..6ad8433230f 100644 --- a/torch/ao/quantization/fx/convert.py +++ b/torch/ao/quantization/fx/convert.py @@ -597,7 +597,8 @@ def _maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph) -> No _maybe_recursive_remove_dequantize(arg_element, node, graph) else: warnings.warn( - f"Unsupported node type in recursive remove dequantize: {type(arg)}" + f"Unsupported node type in recursive remove dequantize: {type(arg)}", + stacklevel=2, ) @@ -1197,7 +1198,8 @@ def convert( _maybe_recursive_remove_dequantize(output, return_node, model.graph) else: warnings.warn( - f"Unsupported node type for output_quantized_idxs: {type(output)}" + f"Unsupported node type for output_quantized_idxs: {type(output)}", + stacklevel=2, ) elif node.op == "call_module": mod = _get_module(node, modules) diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py index 4ea44181e96..e3561965daf 100644 --- a/torch/ao/quantization/fx/prepare.py +++ b/torch/ao/quantization/fx/prepare.py @@ -1055,7 +1055,9 @@ def _maybe_insert_input_equalization_observers_for_node( return if is_branch: - warnings.warn(f"Cannot equalize {node} because it is part of a branch.") + warnings.warn( + f"Cannot equalize {node} because it is part of a branch.", stacklevel=2 + ) return new_args = [] diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py index 287b30c0bb8..232d4456843 100644 --- a/torch/ao/quantization/fx/utils.py +++ b/torch/ao/quantization/fx/utils.py @@ -890,7 +890,8 @@ def _qconfig_satisfies_dtype_config_constraints( if backend_quant_min is not None and backend_quant_max is not None: if app_quant_min is None or app_quant_max is None: warnings.warn( - f"QConfig {debug_string} must specify 'quant_min' and 'quant_max', ignoring {qconfig}" + f"QConfig {debug_string} must specify 'quant_min' and 'quant_max', ignoring {qconfig}", + stacklevel=2, ) return False elif app_quant_min < backend_quant_min or app_quant_max > backend_quant_max: @@ -898,20 +899,23 @@ def _qconfig_satisfies_dtype_config_constraints( f"QConfig {debug_string} quantization range must fall within the backend's:\n" f"QConfig range = ({app_quant_min}, {app_quant_max}), " f"BackendConfig range = ({backend_quant_min}, {backend_quant_max}), " - f"ignoring {qconfig}" + f"ignoring {qconfig}", + stacklevel=2, ) return False # check scale min if backend_scale_min is not None: if app_scale_min is None: warnings.warn( - f"QConfig {debug_string} must specify 'eps', ignoring {qconfig}" + f"QConfig {debug_string} must specify 'eps', ignoring {qconfig}", + stacklevel=2, ) return False if app_scale_min < backend_scale_min: warnings.warn( f"QConfig {debug_string} eps ({app_scale_min}) must be greater than or equal to " - f"the backend's min scale value ({backend_scale_min}), ignoring {qconfig}" + f"the backend's min scale value ({backend_scale_min}), ignoring {qconfig}", + stacklevel=2, ) return False # check fixed scale and zero point @@ -935,7 +939,8 @@ def _qconfig_satisfies_dtype_config_constraints( ) and not isinstance(activation_post_process, FixedQParamsFakeQuantize): warnings.warn( f"QConfig must specify a FixedQParamsObserver or a FixedQParamsFakeQuantize " - f"for fixed qparams ops, ignoring {qconfig}.\n{suggestion_str}" + f"for fixed qparams ops, ignoring {qconfig}.\n{suggestion_str}", + stacklevel=2, ) 
return False if ( @@ -945,7 +950,8 @@ def _qconfig_satisfies_dtype_config_constraints( warnings.warn( f"QConfig fixed scale ({observer.scale}) and zero point ({observer.zero_point}) " f"do not match the backend's ({backend_scale_exact_match} and {backend_zero_point_exact_match}), " - f"ignoring {qconfig}.\n{suggestion_str}" + f"ignoring {qconfig}.\n{suggestion_str}", + stacklevel=2, ) return False return True diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py index 160738c93ee..06d1a3fd717 100644 --- a/torch/ao/quantization/observer.py +++ b/torch/ao/quantization/observer.py @@ -245,7 +245,8 @@ class UniformQuantizationObserverBase(ObserverBase): if reduce_range: warnings.warn( "Please use quant_min and quant_max to specify the range for observers. \ - reduce_range will be deprecated in a future release of PyTorch." + reduce_range will be deprecated in a future release of PyTorch.", + stacklevel=2, ) self.reduce_range = reduce_range self.register_buffer("eps", torch.tensor([eps], **factory_kwargs)) @@ -829,7 +830,8 @@ class PerChannelMinMaxObserver(UniformQuantizationObserverBase): self.max_val.resize_(val.shape) else: warnings.warn( - f"Observer load_from_state_dict got unexpected name {name}" + f"Observer load_from_state_dict got unexpected name {name}", + stacklevel=2, ) # For torchscript module we need to update the attributes here since we do not # call the `_load_from_state_dict` function defined module.py @@ -840,7 +842,8 @@ class PerChannelMinMaxObserver(UniformQuantizationObserverBase): self.max_val.copy_(val) else: warnings.warn( - f"Observer load_from_state_dict got unexpected name {name}" + f"Observer load_from_state_dict got unexpected name {name}", + stacklevel=2, ) elif strict: missing_keys.append(key) @@ -1289,7 +1292,9 @@ class HistogramObserver(UniformQuantizationObserverBase): # want to make our quantization range infinite # and in practice those values will be clamped if x_min == -torch.inf or x_max == torch.inf: - warnings.warn("torch.inf detected in input tensor, ignoring input") + warnings.warn( + "torch.inf detected in input tensor, ignoring input", stacklevel=2 + ) x = x[x.abs() != torch.inf] if x.numel() == 0: return x_orig @@ -1345,7 +1350,8 @@ class HistogramObserver(UniformQuantizationObserverBase): if is_uninitialized: warnings.warn( "must run observer before calling calculate_qparams.\ - Returning default scale and zero point " + Returning default scale and zero point ", + stacklevel=2, ) return torch.tensor([1.0], device=self.min_val.device.type), torch.tensor( [0], device=self.min_val.device.type @@ -1509,7 +1515,8 @@ class PlaceholderObserver(ObserverBase): warnings.warn( "Please use `is_dynamic` instead of `compute_dtype`. \ `compute_dtype` will be deprecated in a future release \ - of PyTorch." + of PyTorch.", + stacklevel=2, ) def forward(self, x): diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py index 623fd12434a..c3d9f773390 100644 --- a/torch/ao/quantization/qconfig.py +++ b/torch/ao/quantization/qconfig.py @@ -292,7 +292,8 @@ def get_default_qconfig(backend="x86", version=0): if not torch.cpu._is_vnni_supported(): warnings.warn( "Default qconfig of oneDNN backend with reduce_range of false may have accuracy issues " - "on CPU without Vector Neural Network Instruction support." 
+ "on CPU without Vector Neural Network Instruction support.", + stacklevel=2, ) qconfig = QConfig( activation=HistogramObserver.with_args(reduce_range=False), diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py index 5a0037b6620..3c53876081e 100644 --- a/torch/ao/quantization/quantize.py +++ b/torch/ao/quantization/quantize.py @@ -392,7 +392,8 @@ def prepare( warnings.warn( "None of the submodule got qconfig applied. Make sure you " "passed correct configuration through `qconfig_dict` or " - "by assigning the `.qconfig` attribute directly on submodules" + "by assigning the `.qconfig` attribute directly on submodules", + stacklevel=2, ) _add_observer_( diff --git a/torch/ao/quantization/quantizer/x86_inductor_quantizer.py b/torch/ao/quantization/quantizer/x86_inductor_quantizer.py index db47aa04790..c6fed271a3a 100644 --- a/torch/ao/quantization/quantizer/x86_inductor_quantizer.py +++ b/torch/ao/quantization/quantizer/x86_inductor_quantizer.py @@ -372,6 +372,7 @@ def _config_checker(method: Callable) -> Callable: if quantizer._need_skip_config(quantization_config): warnings.warn( f"Skip the quantization config for {name}.", + stacklevel=2, ) return quantizer return method(quantizer, name, quantization_config) @@ -464,7 +465,10 @@ class X86InductorQuantizer(Quantizer): current_mode.qat_state is not None and current_mode.qat_state != quantization_config.is_qat ): - warnings.warn("Mixed QAT and Non-QAT quantization config is not supported.") + warnings.warn( + "Mixed QAT and Non-QAT quantization config is not supported.", + stacklevel=2, + ) need_skip = True if current_mode.dynamic_state is not None: input_activation_spec = quantization_config.input_activation @@ -473,14 +477,15 @@ class X86InductorQuantizer(Quantizer): and current_mode.dynamic_state != input_activation_spec.is_dynamic ): warnings.warn( - "Mixed dynamic and static quantization config is not supported." + "Mixed dynamic and static quantization config is not supported.", + stacklevel=2, ) need_skip = True return need_skip def set_global(self, quantization_config: QuantizationConfig): if self._need_skip_config(quantization_config): - warnings.warn("Skip the global quantization config.") + warnings.warn("Skip the global quantization config.", stacklevel=2) return self self.global_config = quantization_config return self @@ -489,7 +494,8 @@ class X86InductorQuantizer(Quantizer): if not isinstance(self.global_config, QuantizationConfig): warnings.warn( "The global_config for X86InductorQuantizer is currently invalid. \ - Please ensure that you use set_global to establish the global quantization configuration." + Please ensure that you use set_global to establish the global quantization configuration.", + stacklevel=2, ) return self.global_config @@ -508,7 +514,8 @@ class X86InductorQuantizer(Quantizer): ) else: warnings.warn( - f"function: Unable to customize quantization config for {function_type} by X86InductorQuantizer." + f"function: Unable to customize quantization config for {function_type} by X86InductorQuantizer.", + stacklevel=2, ) return self @@ -525,7 +532,8 @@ class X86InductorQuantizer(Quantizer): ) else: warnings.warn( - f"Module: Unable to customize quantization config for {module_type} by X86InductorQuantizer." 
+ f"Module: Unable to customize quantization config for {module_type} by X86InductorQuantizer.", + stacklevel=2, ) return self @@ -551,7 +559,8 @@ class X86InductorQuantizer(Quantizer): self.operator_type_qconfig[operator_type] = quantization_config else: warnings.warn( - f"operator: Unable to quantize {operator} by X86InductorQuantizer." + f"operator: Unable to quantize {operator} by X86InductorQuantizer.", + stacklevel=2, ) return self @@ -1317,7 +1326,8 @@ class X86InductorQuantizer(Quantizer): if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check): if quantization_config is not None: warnings.warn( - f"The input of maxpool2d is not quantized, skip annotate maxpool2d with config {quantization_config}." + f"The input of maxpool2d is not quantized, skip annotate maxpool2d with config {quantization_config}.", + stacklevel=2, ) return diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py index 1874dc6e20b..63c635565c4 100644 --- a/torch/ao/quantization/utils.py +++ b/torch/ao/quantization/utils.py @@ -427,7 +427,8 @@ def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool: if min_val.numel() == 0 or max_val.numel() == 0: warnings.warn( "must run observer before calling calculate_qparams. " - + "Returning default values." + + "Returning default values.", + stacklevel=2, ) return False @@ -435,7 +436,8 @@ def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool: if min_val == float("inf") and max_val == float("-inf"): warnings.warn( "must run observer before calling calculate_qparams. " - + "Returning default values." + + "Returning default values.", + stacklevel=2, ) return False @@ -806,7 +808,8 @@ def _assert_and_get_unique_device(module: torch.nn.Module) -> Any: """ if {torch.device("cpu"), torch.device("meta")} == devices: warnings.warn( - "Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We Select 'cpu'." + "Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We Select 'cpu'.", + stacklevel=2, ) devices = {torch.device("cpu")} "" diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 674e42b34ad..956075590bb 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -944,7 +944,8 @@ def _check_inputs(tupled_inputs) -> bool: f"Input #{idx} requires gradient and " "is not a double precision floating point or complex. " "This check will likely fail if all the inputs are " - "not of double precision floating point or complex. " + "not of double precision floating point or complex. ", + stacklevel=2, ) if inp.is_sparse: content = inp._values() @@ -1325,7 +1326,8 @@ def _test_undefined_backward_mode(func, outputs, inputs) -> bool: "Backwards compatibility: New undefined gradient support checking " "feature is enabled by default, but it may break existing callers " "of this function. 
If this is true for you, you can call this " - 'function with "check_undefined_grad=False" to disable the feature' + 'function with "check_undefined_grad=False" to disable the feature', + stacklevel=2, ) def check_undefined_grad_support(output_to_check): diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 5c478e514d0..de821b7513d 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -265,22 +265,24 @@ class profile: if _get_privateuse1_backend_name() != "privateuseone": VALID_DEVICE_OPTIONS.append(_get_privateuse1_backend_name()) if self.use_device not in VALID_DEVICE_OPTIONS: - warn(f"The {self.use_device} is not a valid device option.") + warn( + f"The {self.use_device} is not a valid device option.", stacklevel=2 + ) self.use_device = None if self.use_device == "cuda" and not torch.cuda.is_available(): - warn("CUDA is not available, disabling CUDA profiling") + warn("CUDA is not available, disabling CUDA profiling", stacklevel=2) self.use_cuda = False self.use_device = None if self.use_device == "xpu" and not torch.xpu.is_available(): - warn("XPU is not available, disabling XPU profiling") + warn("XPU is not available, disabling XPU profiling", stacklevel=2) self.use_device = None if self.use_device == "hpu" and not ( hasattr(torch, "hpu") and torch.hpu.is_available() ): - warn("HPU is not available, disabling HPU profiling") + warn("HPU is not available, disabling HPU profiling", stacklevel=2) self.use_device = None self.kineto_activities = set() @@ -1224,7 +1226,8 @@ class KinetoStepTracker: if delta > 1: warn( "Profiler step count has increased more than 1 - " - f"current_step = {cls._current_step} step dict = {cls._step_dict}" + f"current_step = {cls._current_step} step dict = {cls._step_dict}", + stacklevel=2, ) for _ in range(delta): _kineto_step() diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py index 907b6d0b862..3423490d514 100644 --- a/torch/backends/cudnn/__init__.py +++ b/torch/backends/cudnn/__init__.py @@ -118,7 +118,8 @@ def is_acceptable(tensor): if not is_available(): warnings.warn( "PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild " - "PyTorch making sure the library is visible to the build system." 
+ "PyTorch making sure the library is visible to the build system.", + stacklevel=2, ) return False if not _init(): @@ -127,7 +128,8 @@ def is_acceptable(tensor): libpath={"darwin": "DYLD_LIBRARY_PATH", "win32": "PATH"}.get( sys.platform, "LD_LIBRARY_PATH" ) - ) + ), + stacklevel=2, ) return False return True diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index bb4a1e29dae..1d7155a1a61 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -293,7 +293,8 @@ def _check_capability(): min_arch % 10, max_arch // 10, max_arch % 10, - ) + ), + stacklevel=2, ) matched_arches = "" for arch, arch_info in CUDA_ARCHES_SUPPORTED.items(): @@ -303,7 +304,9 @@ def _check_capability(): ): matched_arches += f" {arch}" if matched_arches != "": - warnings.warn(matched_cuda_warn.format(matched_arches)) + warnings.warn( + matched_cuda_warn.format(matched_arches), stacklevel=2 + ) def _check_cubins(): @@ -328,7 +331,8 @@ If you want to use the {} GPU with PyTorch, please check the instructions at htt warnings.warn( incompatible_device_warn.format( device_name, capability, " ".join(arch_list), device_name - ) + ), + stacklevel=2, ) @@ -818,7 +822,9 @@ def _raw_device_count_amdsmi() -> int: try: amdsmi.amdsmi_init() except amdsmi.AmdSmiException as e: - warnings.warn(f"Can't initialize amdsmi - Error code: {e.err_code}") + warnings.warn( + f"Can't initialize amdsmi - Error code: {e.err_code}", stacklevel=2 + ) return -1 socket_handles = amdsmi.amdsmi_get_processor_handles() return len(socket_handles) @@ -831,12 +837,12 @@ def _raw_device_count_nvml() -> int: nvml_h = CDLL("libnvidia-ml.so.1") rc = nvml_h.nvmlInit() if rc != 0: - warnings.warn("Can't initialize NVML") + warnings.warn("Can't initialize NVML", stacklevel=2) return -1 dev_count = c_int(-1) rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) if rc != 0: - warnings.warn("Can't get nvml device count") + warnings.warn("Can't get nvml device count", stacklevel=2) return -1 del nvml_h return dev_count.value @@ -850,27 +856,27 @@ def _raw_device_uuid_amdsmi() -> Optional[list[str]]: try: amdsmi.amdsmi_init() except amdsmi.AmdSmiException: - warnings.warn("Can't initialize amdsmi") + warnings.warn("Can't initialize amdsmi", stacklevel=2) return None try: socket_handles = amdsmi.amdsmi_get_processor_handles() dev_count = len(socket_handles) except amdsmi.AmdSmiException: - warnings.warn("Can't get amdsmi device count") + warnings.warn("Can't get amdsmi device count", stacklevel=2) return None uuids: list[str] = [] for idx in range(dev_count): try: handler = amdsmi.amdsmi_get_processor_handles()[idx] except amdsmi.AmdSmiException: - warnings.warn("Cannot get amd device handler") + warnings.warn("Cannot get amd device handler", stacklevel=2) return None try: uuid = amdsmi.amdsmi_get_gpu_asic_info(handler)["asic_serial"][ 2: ] # Removes 0x prefix from serial except amdsmi.AmdSmiException: - warnings.warn("Cannot get uuid for amd device") + warnings.warn("Cannot get uuid for amd device", stacklevel=2) return None uuids.append( str(uuid).lower() @@ -885,25 +891,25 @@ def _raw_device_uuid_nvml() -> Optional[list[str]]: nvml_h = CDLL("libnvidia-ml.so.1") rc = nvml_h.nvmlInit() if rc != 0: - warnings.warn("Can't initialize NVML") + warnings.warn("Can't initialize NVML", stacklevel=2) return None dev_count = c_int(-1) rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) if rc != 0: - warnings.warn("Can't get nvml device count") + warnings.warn("Can't get nvml device count", stacklevel=2) return None uuids: list[str] = [] for idx in 
range(dev_count.value): dev_id = c_void_p() rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) if rc != 0: - warnings.warn("Can't get device handle") + warnings.warn("Can't get device handle", stacklevel=2) return None buf_len = 96 buf = create_string_buffer(buf_len) rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) if rc != 0: - warnings.warn("Can't get device UUID") + warnings.warn("Can't get device UUID", stacklevel=2) return None uuids.append(buf.raw.decode("ascii").strip("\0")) del nvml_h diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index dc4c3827c8a..b39c6a63f92 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -492,6 +492,7 @@ def reset_max_memory_allocated(device: "Device" = None) -> None: "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " "which resets /all/ peak memory stats.", FutureWarning, + stacklevel=2, ) return reset_peak_memory_stats(device=device) @@ -518,6 +519,7 @@ def reset_max_memory_cached(device: "Device" = None) -> None: "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " "which resets /all/ peak memory stats.", FutureWarning, + stacklevel=2, ) return reset_peak_memory_stats(device=device) diff --git a/torch/cuda/nccl.py b/torch/cuda/nccl.py index 7fa06bd7c12..bef781c19a0 100644 --- a/torch/cuda/nccl.py +++ b/torch/cuda/nccl.py @@ -14,7 +14,7 @@ SUM = 0 # ncclRedOp_t def is_available(tensors): if not hasattr(torch._C, "_nccl_all_reduce"): - warnings.warn("PyTorch is not compiled with NCCL support") + warnings.warn("PyTorch is not compiled with NCCL support", stacklevel=2) return False devices = set() diff --git a/torch/cuda/tunable.py b/torch/cuda/tunable.py index 262c6870d40..4a5ee73cbdd 100644 --- a/torch/cuda/tunable.py +++ b/torch/cuda/tunable.py @@ -626,7 +626,8 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None: else: warnings.warn( "Offline tuning is not supported for this GEMM. Use online tuning instead. " - + f"Skipped tuning for: {untuned_gemm[1]}" + + f"Skipped tuning for: {untuned_gemm[1]}", + stacklevel=2, ) return @@ -644,7 +645,8 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None: if m == 1 or n == 1 or k == 1: warnings.warn( "Offline tuning is not support for this GEMM. Use online tuning instead. " - + f"Skipped tuning for: {untuned_gemm[1]}" + + f"Skipped tuning for: {untuned_gemm[1]}", + stacklevel=2, ) return @@ -747,7 +749,7 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None: matA = matA.t() torch.nn.functional.linear(X, matA, bias) else: - warnings.warn(f"error: unknown op {op_sig}") + warnings.warn(f"error: unknown op {op_sig}", stacklevel=2) def _check_tuning_assertions() -> None: @@ -756,7 +758,7 @@ def _check_tuning_assertions() -> None: """ if is_enabled() is False: - warnings.warn("TunableOp was disabled. Trying to enable now.") + warnings.warn("TunableOp was disabled. 
Trying to enable now.", stacklevel=2) enable(True) assert is_enabled() is True assert tuning_is_enabled() is True diff --git a/torch/distributed/_functional_collectives.py b/torch/distributed/_functional_collectives.py index e760a1a0744..8574e258335 100644 --- a/torch/distributed/_functional_collectives.py +++ b/torch/distributed/_functional_collectives.py @@ -23,7 +23,8 @@ try: from torch.compiler import is_dynamo_compiling as is_torchdynamo_compiling except Exception: warnings.warn( - "Unable to import torchdynamo util `is_torchdynamo_compiling`, so won't support torchdynamo correctly" + "Unable to import torchdynamo util `is_torchdynamo_compiling`, so won't support torchdynamo correctly", + stacklevel=2, ) def is_torchdynamo_compiling(): # type: ignore[misc] diff --git a/torch/distributed/_shard/sharded_tensor/api.py b/torch/distributed/_shard/sharded_tensor/api.py index 7b709a2965c..87fcc014257 100644 --- a/torch/distributed/_shard/sharded_tensor/api.py +++ b/torch/distributed/_shard/sharded_tensor/api.py @@ -470,7 +470,8 @@ class ShardedTensor(ShardedTensorBase): src = shard.tensor.flatten() if src.nelement() == 0: warnings.warn( - "Gathering a tensor with zero elements on rank " + str(rank) + "Gathering a tensor with zero elements on rank " + str(rank), + stacklevel=2, ) continue shard_offset = shard_placement[shard.metadata][1] @@ -671,7 +672,8 @@ class ShardedTensor(ShardedTensorBase): if device_to.index != current_idx: warnings.warn( "ShardedTensor.to only move tensor to its current device" - "If you want to put to different device, use `reshard` instead." + "If you want to put to different device, use `reshard` instead.", + stacklevel=2, ) device_to = torch.device(current_idx) diff --git a/torch/distributed/_tools/mod_tracker.py b/torch/distributed/_tools/mod_tracker.py index 3d5c1783d8a..ad736a8302f 100644 --- a/torch/distributed/_tools/mod_tracker.py +++ b/torch/distributed/_tools/mod_tracker.py @@ -182,7 +182,8 @@ class ModTracker: warnings.formatwarning = custom_formatwarning warnings.warn( "The module hierarchy tracking maybe be messed up." - " Please file a bug to PyTorch, if it is the case." + " Please file a bug to PyTorch, if it is the case.", + stacklevel=2, ) if name not in self.parents: self._active_module_cnt[name] = 1 diff --git a/torch/distributed/algorithms/join.py b/torch/distributed/algorithms/join.py index ee07c75f7ee..bf7cb117f87 100644 --- a/torch/distributed/algorithms/join.py +++ b/torch/distributed/algorithms/join.py @@ -257,7 +257,8 @@ class Join: f"{self._rank} has at least {WARN_THRESHOLD} " f"fewer inputs than other currently-active ranks. " "This level of skew could lead to performance " - "degradation during training." + "degradation during training.", + stacklevel=2, ) # Shadow the all-reduce in non-joined processes num_nonjoined_procs = self._get_num_nonjoined_procs() diff --git a/torch/distributed/algorithms/model_averaging/averagers.py b/torch/distributed/algorithms/model_averaging/averagers.py index eec08464167..dd97e519180 100644 --- a/torch/distributed/algorithms/model_averaging/averagers.py +++ b/torch/distributed/algorithms/model_averaging/averagers.py @@ -101,7 +101,8 @@ class PeriodicModelAverager(ModelAverager): "When period is 1, no need to use model averaging because the communication cost " "of all-reducing parameters will be no less than the cost of all-reducing gradients " "by DistributedDataParallel in the backward pass. Therefore, only " - "DistributedDataParallel should be used for this case." 
+ "DistributedDataParallel should be used for this case.", + stacklevel=2, ) self.period = period diff --git a/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py b/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py index a52fc2babed..33cde4cb3a7 100644 --- a/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py +++ b/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py @@ -114,7 +114,8 @@ class HierarchicalModelAverager(averagers.ModelAverager): "no need to use model averaging because the communication cost " "of all-reducing parameters will be no less than the cost of all-reducing gradients " "by DistributedDataParallel in the backward pass. Therefore, only " - "DistributedDataParallel should be used for this case." + "DistributedDataParallel should be used for this case.", + stacklevel=2, ) overall_group_size = dist.get_world_size(group=self.process_group) if list(period_group_size_dict.values())[-1] != overall_group_size: diff --git a/torch/distributed/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py index 5def6c13dc1..b21cac12ff9 100644 --- a/torch/distributed/checkpoint/filesystem.py +++ b/torch/distributed/checkpoint/filesystem.py @@ -660,7 +660,8 @@ class _FileSystemWriter(StorageWriter): warnings.warn( f"Detected an existing checkpoint in {self.path}, overwriting since {self.overwrite=}." " Past version 2.5 of PyTorch, `overwrite` will default to False. Set this variable to True to" - " maintain this functionality or False to raise when an existing checkpoint is found." + " maintain this functionality or False to raise when an existing checkpoint is found.", + stacklevel=2, ) else: raise RuntimeError(f"Checkpoint already exists and {self.overwrite=}.") diff --git a/torch/distributed/checkpoint/state_dict.py b/torch/distributed/checkpoint/state_dict.py index f50a0ee8e60..0d898c3ff06 100644 --- a/torch/distributed/checkpoint/state_dict.py +++ b/torch/distributed/checkpoint/state_dict.py @@ -290,6 +290,7 @@ def _verify_options( "will be removed in 2.5. This feature can be achieved by manually " "filtering out the state_dict returned from get_state_dict.", FutureWarning, + stacklevel=2, ) if optim_only and not optims: raise RuntimeError( @@ -1234,6 +1235,7 @@ def _unflatten_model_state_dict( "feature, please preprocessing the model_state_dict to achieve the " "same functionality.", FutureWarning, + stacklevel=2, ) cast_state_dict = cast(dict[nn.Module, dict[str, ValueType]], state_dict) new_state_dict: dict[str, ValueType] = {} diff --git a/torch/distributed/checkpoint/state_dict_loader.py b/torch/distributed/checkpoint/state_dict_loader.py index 389dc0e5e57..178e190e937 100644 --- a/torch/distributed/checkpoint/state_dict_loader.py +++ b/torch/distributed/checkpoint/state_dict_loader.py @@ -158,7 +158,8 @@ def load( no_dist = no_dist or (not dist.is_available()) or (not dist.is_initialized()) if no_dist: warnings.warn( - "torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to load in a single process." + "torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to load in a single process.", + stacklevel=2, ) with _profile(): @@ -365,7 +366,8 @@ def _load_state_dict_from_keys( no_dist = not (dist.is_available() and dist.is_initialized()) if no_dist: warnings.warn( - "torch.distributed is unavailable or uninitialized, assuming the intent is to load in a single process." 
+ "torch.distributed is unavailable or uninitialized, assuming the intent is to load in a single process.", + stacklevel=2, ) storage_reader = cast( diff --git a/torch/distributed/checkpoint/state_dict_saver.py b/torch/distributed/checkpoint/state_dict_saver.py index ef0be9f9309..38ab2dcb510 100644 --- a/torch/distributed/checkpoint/state_dict_saver.py +++ b/torch/distributed/checkpoint/state_dict_saver.py @@ -182,7 +182,8 @@ def save( no_dist = no_dist or (not dist.is_available()) or (not dist.is_initialized()) if no_dist: warnings.warn( - "torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to save in a single process." + "torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to save in a single process.", + stacklevel=2, ) with _profile(): @@ -414,7 +415,8 @@ def _save_state_dict( warnings.warn( "The function definition for SavePlanner.set_up_planner has been updated" " to include the storage_meta argument. Please update your implementation" - " to include this parameter." + " to include this parameter.", + stacklevel=2, ) planner.set_up_planner(state_dict, distW.is_coordinator) # type: ignore[call-arg, arg-type] else: diff --git a/torch/distributed/checkpoint/utils.py b/torch/distributed/checkpoint/utils.py index 94844812b52..073649c5f12 100644 --- a/torch/distributed/checkpoint/utils.py +++ b/torch/distributed/checkpoint/utils.py @@ -461,7 +461,8 @@ def _api_bc_check(func): if len(args) == 2: warnings.warn( f"The argument order of {func.__name__} has been changed. " - "Please check the document to avoid future breakages." + "Please check the document to avoid future breakages.", + stacklevel=2, ) sig = inspect.signature(func) kwonlyargs = [ diff --git a/torch/distributed/device_mesh.py b/torch/distributed/device_mesh.py index 052b74ba479..0a58eab1abf 100644 --- a/torch/distributed/device_mesh.py +++ b/torch/distributed/device_mesh.py @@ -85,7 +85,8 @@ else: # We keep this function for backward compatibility. warnings.warn( "This get_root_mesh API will be deprecated soon." - "Please use `get_root_mesh` inside DeviceMesh instead." + "Please use `get_root_mesh` inside DeviceMesh instead.", + stacklevel=2, ) if not device_mesh: return device_mesh @@ -108,7 +109,8 @@ else: ) -> list["DeviceMesh"]: warnings.warn( "This _get_all_submeshes API will be deprecated soon." - "Please use `_get_all_submeshes` inside DeviceMesh instead." + "Please use `_get_all_submeshes` inside DeviceMesh instead.", + stacklevel=2, ) return device_mesh._get_all_submeshes(mesh_dim_name) @@ -329,7 +331,8 @@ else: "It is recommended to set the current device for the process BEFORE the DeviceMesh initialization so that " "the underlying communicator (i.e. NCCL) can be initialized properly. " "Given that the current process has no default device selected, DeviceMesh will use a heuristic to set the " - "device_id via `global_rank % num_devices_per_host`, assuming homogeneous hardware cluster. " + "device_id via `global_rank % num_devices_per_host`, assuming homogeneous hardware cluster. ", + stacklevel=2, ) # heuristic to set the current cuda/cuda-like device base on num of gpu devices available in each host # NOTE: This device selection would only work for homogeneous hardware. @@ -766,7 +769,8 @@ else: warnings.warn( "You are attempting to slice a submesh from another submesh. While we support this operation, " "it is users' responsibility to ensure that the submesh is consistently sliced across all ranks. 
" - "If not, this may result in some ranks receiving the submesh while others encounter errors." + "If not, this may result in some ranks receiving the submesh while others encounter errors.", + stacklevel=2, ) slice_from_root = False @@ -803,7 +807,8 @@ else: elif name in flatten_name_to_root_layout: warnings.warn( "Slicing a flattened dim from root mesh will be deprecated in PT 2.11. " - "Users need to bookkeep the flattened mesh directly. " + "Users need to bookkeep the flattened mesh directly. ", + stacklevel=2, ) layout_sliced.append(flatten_name_to_root_layout[name]) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 52370a4545f..0cebfaff6d6 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -352,7 +352,8 @@ class Backend(str): # noqa: SLOT000 warnings.warn( f"Device capability of {name} unspecified, assuming `cpu` and " "`cuda` or `xpu`. Please specify it via the `devices` argument of " - "`register_backend`." + "`register_backend`.", + stacklevel=2, ) Backend.backend_capability[name.lower()] = ( ["cpu", "cuda", "xpu"] if torch.xpu.is_available() else ["cpu", "cuda"] @@ -427,7 +428,8 @@ class BackendConfig: warnings.warn( f"Device capability of {backend} unknown, assuming `cpu` and " "`cuda`. You can specify it in `device:backend` format in " - "`init_process_group` call." + "`init_process_group` call.", + stacklevel=2, ) backend_val = Backend(backend) self.device_backend_map = { @@ -751,7 +753,8 @@ def _get_default_timeout(backend: Backend) -> timedelta: # TODO moco benchmark on CPU initializes pgnccl backend today, triggered this assert in CI before it was # changed to be a warning. We should fix the moco model. warnings.warn( - "Attempted to get default timeout for nccl backend, but NCCL support is not compiled" + "Attempted to get default timeout for nccl backend, but NCCL support is not compiled", + stacklevel=2, ) return default_pg_timeout return default_pg_nccl_timeout @@ -802,6 +805,7 @@ def _get_object_coll_device(group: Optional[ProcessGroup] = None) -> str: f"You are using a Backend {type(group)} as a ProcessGroup. " "This usage is deprecated since PyTorch 2.0. Please use a public API " "of PyTorch Distributed instead.", + stacklevel=2, ) # Provide backward compatibility to cases where `group` passed in is # actually a Backend (like `ProcessGroupGloo`) rather than a @@ -868,7 +872,8 @@ def _get_pg_default_device(group: Optional[ProcessGroup] = None) -> torch.device "backward-compatiblity reason. If you need to find a device for object " "collectives, please use `_get_object_coll_device`. If you need to query " "the device types supported by group, please use " - "`_device_capability(group)`. " + "`_device_capability(group)`. ", + stacklevel=2, ) group = group or _get_default_group() @@ -910,7 +915,8 @@ def _get_pg_default_device(group: Optional[ProcessGroup] = None) -> torch.device warnings.warn( "Multiple backends are registered with this ProcessGroup. We cannot " f"determine which one is the default. Returning {rv}. " - "Please consider using other APIs." + "Please consider using other APIs.", + stacklevel=2, ) return rv @@ -1010,7 +1016,8 @@ def _warn_not_in_group(op_name) -> None: global_rank = -1 if GroupMember.WORLD is None else GroupMember.WORLD.rank() warnings.warn( f"Running {op_name} on global rank {global_rank} which does not " - "belong to the given group." 
+ "belong to the given group.", + stacklevel=2, ) @@ -1557,7 +1564,9 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> elif is_gloo_available() and isinstance(backend, ProcessGroupGloo): backends.add(backend) # type: ignore[arg-type] if len(backends) == 0: - warnings.warn("Set timeout is now only supported for either nccl or gloo.") + warnings.warn( + "Set timeout is now only supported for either nccl or gloo.", stacklevel=2 + ) for backend in backends: backend._set_default_timeout(timeout) @@ -1758,7 +1767,8 @@ def init_process_group( warnings.warn( f"For MPI backend, world_size ({world_size}) and rank ({rank}) " "are ignored since they are assigned by the " - "MPI runtime." + "MPI runtime.", + stacklevel=2, ) default_pg, _ = _new_process_group_helper( @@ -2038,7 +2048,8 @@ def _new_process_group_helper( if backend_options._timeout != timeout: warnings.warn( "backend_options._timeout was specified, " - "but timeout kwarg has a default value that will always override it. " + "but timeout kwarg has a default value that will always override it. ", + stacklevel=2, ) else: # default backend_options for NCCL @@ -2259,7 +2270,8 @@ def destroy_process_group(group: Optional[ProcessGroup] = None): if pg in _world.pg_coalesce_state.keys(): warnings.warn( "Some coalesced collectives haven't been launched when " - "ProcessGroup is destroyed. They will be cleaned." + "ProcessGroup is destroyed. They will be cleaned.", + stacklevel=2, ) del _world.pg_coalesce_state[pg] @@ -2349,7 +2361,8 @@ def _abort_process_group(group: Optional[ProcessGroup] = None): if pg in _world.pg_coalesce_state.keys(): warnings.warn( "Some coalesced collectives haven't been launched when " - "ProcessGroup is aborted. They will be cleaned." + "ProcessGroup is aborted. They will be cleaned.", + stacklevel=2, ) del _world.pg_coalesce_state[pg] @@ -4919,7 +4932,8 @@ def barrier( if group.rank() == 0: warnings.warn( # warn only once "barrier(): using the device under current context. " - "You can specify `device_id` in `init_process_group` to mute this warning." + "You can specify `device_id` in `init_process_group` to mute this warning.", + stacklevel=2, ) work = group.barrier(opts=opts) @@ -5001,6 +5015,7 @@ def monitored_barrier( warnings.warn( "Please specify timeout arg as a timedelta. " f"Converting current value of {timeout} assuming it represents seconds", + stacklevel=2, ) timeout = timedelta(seconds=timeout) diff --git a/torch/distributed/elastic/agent/server/api.py b/torch/distributed/elastic/agent/server/api.py index edcac432b66..d56d61e7eaa 100644 --- a/torch/distributed/elastic/agent/server/api.py +++ b/torch/distributed/elastic/agent/server/api.py @@ -106,6 +106,7 @@ class WorkerSpec: warnings.warn( "WorkerSpec.fn will be deprecated," " please use WorkerSpec.entrypoint instead", + stacklevel=2, category=DeprecationWarning, ) self.entrypoint = self.fn diff --git a/torch/distributed/elastic/multiprocessing/errors/error_handler.py b/torch/distributed/elastic/multiprocessing/errors/error_handler.py index f15ce4f241d..437a9c07d2c 100644 --- a/torch/distributed/elastic/multiprocessing/errors/error_handler.py +++ b/torch/distributed/elastic/multiprocessing/errors/error_handler.py @@ -52,7 +52,9 @@ class ErrorHandler: try: faulthandler.enable(all_threads=True) except Exception as e: - warnings.warn(f"Unable to enable fault handler. {type(e).__name__}: {e}") + warnings.warn( + f"Unable to enable fault handler. 
{type(e).__name__}: {e}", stacklevel=2 + ) def _write_error_file(self, file_path: str, error_msg: str) -> None: """Write error message to the file.""" @@ -60,7 +62,9 @@ class ErrorHandler: with open(file_path, "w") as fp: fp.write(error_msg) except Exception as e: - warnings.warn(f"Unable to write error to file. {type(e).__name__}: {e}") + warnings.warn( + f"Unable to write error to file. {type(e).__name__}: {e}", stacklevel=2 + ) def record_exception(self, e: BaseException) -> None: """ diff --git a/torch/distributed/elastic/utils/logging.py b/torch/distributed/elastic/utils/logging.py index 8f0370173b7..c7d56374e7d 100644 --- a/torch/distributed/elastic/utils/logging.py +++ b/torch/distributed/elastic/utils/logging.py @@ -65,5 +65,6 @@ def _derive_module_name(depth: int = 1) -> Optional[str]: warnings.warn( f"Error deriving logger module name, using . Exception: {e}", RuntimeWarning, + stacklevel=2, ) return None diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py index a995e567bba..54d6c974cae 100644 --- a/torch/distributed/fsdp/_common_utils.py +++ b/torch/distributed/fsdp/_common_utils.py @@ -336,7 +336,8 @@ def _get_param_to_fqns( warnings.warn( "FlatParameter is being traversed more than once. " "This case should only happen when using " - "DistributedModelParallel with FullyShardedDataParallel." + "DistributedModelParallel with FullyShardedDataParallel.", + stacklevel=2, ) param_to_fqns[param] = global_fqns elif not dedup_shared_params: diff --git a/torch/distributed/fsdp/_exec_order_utils.py b/torch/distributed/fsdp/_exec_order_utils.py index 778302a957a..db2ea7bfae0 100644 --- a/torch/distributed/fsdp/_exec_order_utils.py +++ b/torch/distributed/fsdp/_exec_order_utils.py @@ -299,7 +299,8 @@ class _ExecOrderData: warnings.warn( "Forward order differs from that of the first iteration " f"on rank {self.rank}. 
Collectives are unchecked and may " - f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}" + f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}", + stacklevel=2, ) self.warn_status = _ExecOrderWarnStatus.WARNING self.current_order_index += 1 diff --git a/torch/distributed/fsdp/_flat_param.py b/torch/distributed/fsdp/_flat_param.py index 2d742c30302..8adde16de6b 100644 --- a/torch/distributed/fsdp/_flat_param.py +++ b/torch/distributed/fsdp/_flat_param.py @@ -1585,7 +1585,8 @@ class FlatParamHandle: warnings.warn( f"[Rank {self.rank}] Only some but not all ranks have a " "`None` `FlatParameter` gradient, so FSDP is using zeros to " - "approximate those ranks' sharded gradients being `None`" + "approximate those ranks' sharded gradients being `None`", + stacklevel=2, ) flat_param._saved_grad_shard = None # type: ignore[assignment] sharded_grad = torch.zeros(flat_param._sharded_size, device=self.device) # type: ignore[attr-defined] @@ -2434,7 +2435,8 @@ class FlatParamHandle: f"[Rank {rank}] {'Parameter' if is_param else 'Gradient'} needs " f"writeback in {self._training_state}\n" f"expected shape={expected_shape} shape={src_shape} " - f"expected device={dst_tensor.device} device={src_device}" + f"expected device={dst_tensor.device} device={src_device}", + stacklevel=2, ) if src_tensor is not None and src_tensor.shape != expected_shape: # NOTE: Gradient shape mismatch is not possible in practice since diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py index 74cc12dc889..36bdc23e741 100644 --- a/torch/distributed/fsdp/_init_utils.py +++ b/torch/distributed/fsdp/_init_utils.py @@ -431,7 +431,8 @@ def _init_core_state( warnings.warn( "FSDP is switching to use `NO_SHARD` instead of " f"{sharding_strategy or ShardingStrategy.FULL_SHARD} since " - "the world size is 1." + "the world size is 1.", + stacklevel=2, ) sharding_strategy = ShardingStrategy.NO_SHARD elif sharding_strategy == ShardingStrategy.NO_SHARD: @@ -704,7 +705,8 @@ def _get_ignored_modules( warnings.warn( "Trying to ignore the top-level module passed into the FSDP " "constructor itself will result in all parameters being " - f"ignored and is not well-supported: {module}" + f"ignored and is not well-supported: {module}", + stacklevel=2, ) # Include nested FSDP modules' ignored modules for submodule in root_module.modules(): @@ -847,7 +849,8 @@ def _get_device_from_device_id( f"FSDP will use the current device {device_handle.current_device()}. " f"If this is incorrect, please explicitly call `torch.{device.type}.set_device()` " "before FSDP initialization or pass in the explicit device " - "index as the `device_id` argument." + "index as the `device_id` argument.", + stacklevel=2, ) device = torch.device(device_handle.current_device()) return device @@ -929,7 +932,8 @@ def _materialize_meta_module( warnings.warn( "Unable to call `reset_parameters()` for module on meta " f"device with error {str(e)}. Please ensure that your module of" - f"type {type(module)} implements a `reset_parameters()` method." # type: ignore[possibly-undefined] + f"type {type(module)} implements a `reset_parameters()` method.", + stacklevel=2, # type: ignore[possibly-undefined] ) raise e @@ -1049,7 +1053,8 @@ def _warn_cpu_init(): "recommend passing in the `device_id` argument for FSDP to move " "`module` to GPU for the sharding initialization. `module` must also " "be on GPU device to work with the `sync_module_states=True` flag " - "since that requires GPU communication." 
+ "since that requires GPU communication.", + stacklevel=2, ) diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py index 3c64bfbf2f6..c8afeb058ba 100644 --- a/torch/distributed/fsdp/_optim_utils.py +++ b/torch/distributed/fsdp/_optim_utils.py @@ -506,7 +506,8 @@ def _flatten_optim_state_dict( flat_osd_state[key] = copy.deepcopy(state) else: warnings.warn( - f"optim_state[{key}] is not on rank{fsdp_state.rank}." + f"optim_state[{key}] is not on rank{fsdp_state.rank}.", + stacklevel=2, ) else: @@ -2051,7 +2052,8 @@ def _optim_state_dict( "most cases, this is a user-defined state that is not " "associated with any particular parameter. Another possible " "case is this state is managed by TorchRec. Otherwise, there may " - " be a mismatched assumption of optim_state_dict of this mode." + " be a mismatched assumption of optim_state_dict of this mode.", + stacklevel=2, ) fsdp_osd_state[key] = value diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py index 496475b5b11..ec648ced837 100644 --- a/torch/distributed/fsdp/_state_dict_utils.py +++ b/torch/distributed/fsdp/_state_dict_utils.py @@ -337,7 +337,8 @@ def _full_post_state_dict_hook( "This may mean that this state_dict entry could point to invalid " "memory regions after returning from state_dict() call if this " "parameter is managed by FSDP. Please check clone " - f"implementation of {fqn}. Error: {str(e)}" + f"implementation of {fqn}. Error: {str(e)}", + stacklevel=2, ) return _common_unshard_post_state_dict_hook( @@ -708,7 +709,8 @@ def _post_state_dict_hook( context = _replace_with_full_state_dict_type(fsdp_state) warnings.warn( "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will " - "be returned." + "be returned.", + stacklevel=2, ) else: context = contextlib.nullcontext() @@ -770,7 +772,8 @@ def _pre_state_dict_hook( context = _replace_with_full_state_dict_type(fsdp_state) warnings.warn( "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will " - "be returned." + "be returned.", + stacklevel=2, ) else: _set_use_dtensor(fsdp_state) @@ -824,7 +827,8 @@ def _pre_load_state_dict_hook( context = _replace_with_full_state_dict_type(fsdp_state) warnings.warn( "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will" - "be returned." + "be returned.", + stacklevel=2, ) else: _set_use_dtensor(fsdp_state) @@ -861,7 +865,8 @@ def _post_load_state_dict_hook( context = _replace_with_full_state_dict_type(fsdp_state) warnings.warn( "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will" - "be returned." + "be returned.", + stacklevel=2, ) else: context = contextlib.nullcontext() diff --git a/torch/distributed/fsdp/_unshard_param_utils.py b/torch/distributed/fsdp/_unshard_param_utils.py index bd24583d919..71dc1a9f4e2 100644 --- a/torch/distributed/fsdp/_unshard_param_utils.py +++ b/torch/distributed/fsdp/_unshard_param_utils.py @@ -153,7 +153,8 @@ def _validate_unshard_params_args( "offload_to_cpu=True and rank0_only=False may result in the" "unsharded parameters being redundantly copied to CPU memory for " "GPUs sharing the same CPU memory, which risks CPU OOM. We " - "recommend using offload_to_cpu=True with rank0_only=True." 
+ "recommend using offload_to_cpu=True with rank0_only=True.", + stacklevel=2, ) diff --git a/torch/distributed/fsdp/_wrap_utils.py b/torch/distributed/fsdp/_wrap_utils.py index 0a83e6307e1..41dc4d85751 100644 --- a/torch/distributed/fsdp/_wrap_utils.py +++ b/torch/distributed/fsdp/_wrap_utils.py @@ -120,7 +120,8 @@ def _warn_on_overridden_mixed_precision( "Both mixed precision and an auto_wrap_policy were specified to FSDP, " f"where the wrapped module has submodules of type:\n{overridden_module_classes}\n" "These modules will be wrapped as separate FSDP instacnes with mixed " - "precision disabled." + "precision disabled.", + stacklevel=2, ) @@ -172,7 +173,7 @@ def _validate_frozen_params( f"The following parameters have requires_grad=False:\n{frozen_param_fqns}" ) if use_orig_params: - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) else: raise ValueError(msg) diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index ce396a84777..cdc5ef424e7 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -680,6 +680,7 @@ class FullyShardedDataParallel(nn.Module, _FSDPState): "#torch.distributed.checkpoint.state_dict.get_state_dict ." "Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .", FutureWarning, + stacklevel=2, ) _state_dict_type_to_config = { StateDictType.FULL_STATE_DICT: FullStateDictConfig, @@ -1208,7 +1209,8 @@ class FullyShardedDataParallel(nn.Module, _FSDPState): warnings.warn( f"Called FSDP.clip_grad_norm_() on rank {self.rank} with no " "gradients -- returning the total norm in the default dtype " - f"{total_norm.dtype}" + f"{total_norm.dtype}", + stacklevel=2, ) # warn since this is generally unexpected return total_norm total_norm_dtype = functools.reduce( diff --git a/torch/distributed/optim/named_optimizer.py b/torch/distributed/optim/named_optimizer.py index 65d3944ae71..b5135ae5411 100644 --- a/torch/distributed/optim/named_optimizer.py +++ b/torch/distributed/optim/named_optimizer.py @@ -87,7 +87,8 @@ class _NamedOptimizer(optim.Optimizer): else: warnings.warn( "Since we pass in param_groups, we will use param_groups to " - "initialize the optimizer, not all parameters of the module." + "initialize the optimizer, not all parameters of the module.", + stacklevel=2, ) param_to_key = {param: key for key, param in self.named_parameters.items()} # type: ignore[misc, has-type] ordered_param_keys = [] diff --git a/torch/distributed/optim/post_localSGD_optimizer.py b/torch/distributed/optim/post_localSGD_optimizer.py index 44d59cab44e..c7b78510ed1 100644 --- a/torch/distributed/optim/post_localSGD_optimizer.py +++ b/torch/distributed/optim/post_localSGD_optimizer.py @@ -92,7 +92,8 @@ class PostLocalSGDOptimizer(torch.optim.Optimizer): else: warnings.warn( "Loaded state dict does not contain a step counter for an averager. " - "Setting step counter to 0." + "Setting step counter to 0.", + stacklevel=2, ) self.averager.step = 0 diff --git a/torch/distributed/tensor/_dispatch.py b/torch/distributed/tensor/_dispatch.py index 4f91e3444b0..ba04eeb30df 100644 --- a/torch/distributed/tensor/_dispatch.py +++ b/torch/distributed/tensor/_dispatch.py @@ -513,7 +513,8 @@ class OpDispatcher: "Found a non-scalar tensor with numel=1 and ndim!=0, " "we are implicitly creating a replicated DTensor for it. 
" "However, please consider changing it to a scalar tensor " - "or explicitly create a DTensor under distributed environment." + "or explicitly create a DTensor under distributed environment.", + stacklevel=2, ) if tensor_arg.numel() == 1 or self._allow_implicit_replication: diff --git a/torch/distributed/tensor/_random.py b/torch/distributed/tensor/_random.py index d81f58520aa..f8325c83d55 100644 --- a/torch/distributed/tensor/_random.py +++ b/torch/distributed/tensor/_random.py @@ -43,7 +43,8 @@ def is_rng_supported_mesh(device_mesh: DeviceMesh) -> bool: else: # TODO: Logs way too much warnings.warn( - f"DTensor random operators may not have complete support on {device_mesh.device_type} device mesh" + f"DTensor random operators may not have complete support on {device_mesh.device_type} device mesh", + stacklevel=2, ) return False @@ -72,7 +73,8 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None: if not is_rng_supported_mesh(device_mesh): warnings.warn( "DTensor manual_seed() may not have complete support " - f"on {device_mesh.device_type} device mesh" + f"on {device_mesh.device_type} device mesh", + stacklevel=2, ) return diff --git a/torch/distributed/tensor/parallel/api.py b/torch/distributed/tensor/parallel/api.py index 2a3369a8edd..51cfd0f144b 100644 --- a/torch/distributed/tensor/parallel/api.py +++ b/torch/distributed/tensor/parallel/api.py @@ -74,7 +74,8 @@ def parallelize_module( # type: ignore[return] if parallelize_plan is None: warnings.warn( "No parallelize_plan is provided and auto-parallel is not supported " - "at the moment, so this parallelize_module call will do nothing." + "at the moment, so this parallelize_module call will do nothing.", + stacklevel=2, ) return module @@ -108,7 +109,8 @@ def parallelize_module( # type: ignore[return] warnings.warn( f"Parallelize plan key '{module_path}' could not be resolved: " f"no submodule matching token '{token}' in module {module}, " - f"skipping this plan entry." + f"skipping this plan entry.", + stacklevel=2, ) continue diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index a72c90789cc..dcdb2762cff 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -62,7 +62,8 @@ class Distribution: warnings.warn( f"{self.__class__} does not define `arg_constraints`. " + "Please set `arg_constraints = {}` or initialize the distribution " - + "with `validate_args=False` to turn off validation." + + "with `validate_args=False` to turn off validation.", + stacklevel=2, ) for param, constraint in arg_constraints.items(): if constraints.is_dependent(constraint): @@ -313,7 +314,8 @@ class Distribution: warnings.warn( f"{self.__class__} does not define `support` to enable " + "sample validation. Please initialize the distribution with " - + "`validate_args=False` to turn off validation." + + "`validate_args=False` to turn off validation.", + stacklevel=2, ) return assert support is not None diff --git a/torch/distributions/kl.py b/torch/distributions/kl.py index ca82802bcc8..85932828d21 100644 --- a/torch/distributions/kl.py +++ b/torch/distributions/kl.py @@ -133,6 +133,7 @@ def _dispatch_kl(type_p, type_q): f"Ambiguous kl_divergence({type_p.__name__}, {type_q.__name__}). 
" f"Please register_kl({left_p.__name__}, {right_q.__name__})", RuntimeWarning, + stacklevel=2, ) return left_fun diff --git a/torch/distributions/wishart.py b/torch/distributions/wishart.py index 5aaa3ddc9d0..96918a68abc 100644 --- a/torch/distributions/wishart.py +++ b/torch/distributions/wishart.py @@ -127,7 +127,8 @@ class Wishart(ExponentialFamily): if self.df.lt(event_shape[-1]).any(): warnings.warn( - "Low df values detected. Singular samples are highly likely to occur for ndim - 1 < df < ndim." + "Low df values detected. Singular samples are highly likely to occur for ndim - 1 < df < ndim.", + stacklevel=2, ) super().__init__(batch_shape, event_shape, validate_args=validate_args) @@ -279,7 +280,7 @@ class Wishart(ExponentialFamily): else: # More optimized version with data-dependent control flow. if is_singular.any(): - warnings.warn("Singular sample detected.") + warnings.warn("Singular sample detected.", stacklevel=2) for _ in range(max_try_correction): sample_new = self._bartlett_sampling(is_singular[is_singular].shape) diff --git a/torch/export/__init__.py b/torch/export/__init__.py index a76cda0682c..c300df11a0c 100644 --- a/torch/export/__init__.py +++ b/torch/export/__init__.py @@ -500,10 +500,10 @@ def load( if file_info.filename == "serialized_exported_program.json": serialized_exported_program = file_content elif file_info.filename == "serialized_state_dict.json": - warnings.warn("This version of file is deprecated") + warnings.warn("This version of file is deprecated", stacklevel=2) serialized_state_dict = file_content elif file_info.filename == "serialized_constants.json": - warnings.warn("This version of file is deprecated") + warnings.warn("This version of file is deprecated", stacklevel=2) serialized_constants = file_content elif file_info.filename == "serialized_state_dict.pt": serialized_state_dict = file_content diff --git a/torch/export/_trace.py b/torch/export/_trace.py index a2c47ca3e5d..d770cad63a8 100644 --- a/torch/export/_trace.py +++ b/torch/export/_trace.py @@ -2113,7 +2113,7 @@ def _export_for_training( if torch._export.config.error_on_lifted_constant_tensors: raise RuntimeError(error_msg) else: - warnings.warn(error_msg) + warnings.warn(error_msg, stacklevel=2) export_graph_signature = export_artifact.aten.sig @@ -2189,7 +2189,8 @@ def _export_for_training( f"This is likely result of torch.export.export not being able to track side effects " f"that is happening outside of model scope.\n\n" f"Leaked tensors:\n {leak_details}\n\n" - f"Alternatively, please file a bug report to PyTorch team for further debugging help." + f"Alternatively, please file a bug report to PyTorch team for further debugging help.", + stacklevel=2, ) del legit_leak diff --git a/torch/export/_unlift.py b/torch/export/_unlift.py index 4ce7c28f4b0..b9e82481322 100644 --- a/torch/export/_unlift.py +++ b/torch/export/_unlift.py @@ -530,7 +530,8 @@ def _create_stateful_graph_module( f"A model attribute `{constant_fqn}` requires gradient. " f"but it's not properly registered as a parameter. " f"torch.export will detach it and treat it as a constant tensor " - f"but please register it as parameter instead." + f"but please register it as parameter instead.", + stacklevel=2, ) detached_buffer = buffer.detach() original_tensor_to_detached_tensor[buffer] = detached_buffer @@ -549,7 +550,8 @@ def _create_stateful_graph_module( f"A model attribute `{const_name}` requires gradient " f"but it's not properly registered as a parameter. 
" f"torch.export will detach it and treat it as a constant tensor " - f"but please register it as parameter instead." + f"but please register it as parameter instead.", + stacklevel=2, ) if value in original_tensor_to_detached_tensor: value = original_tensor_to_detached_tensor[value] diff --git a/torch/export/exported_program.py b/torch/export/exported_program.py index eec86b28c04..6263a5ea44d 100644 --- a/torch/export/exported_program.py +++ b/torch/export/exported_program.py @@ -1684,7 +1684,8 @@ def _create_graph_module_for_export(root, graph): "Unable to execute the generated python source code from " "the graph. The graph module will no longer be directly callable, " "but you can still run the ExportedProgram, and if needed, you can " - "run the graph module eagerly using torch.fx.Interpreter." + "run the graph module eagerly using torch.fx.Interpreter.", + stacklevel=2, ) gm = torch.fx.GraphModule(root, torch.fx.Graph()) gm._graph = graph diff --git a/torch/export/pt2_archive/_package_weights.py b/torch/export/pt2_archive/_package_weights.py index d7f8d4fb2f8..a6fddaaf4cf 100644 --- a/torch/export/pt2_archive/_package_weights.py +++ b/torch/export/pt2_archive/_package_weights.py @@ -108,7 +108,8 @@ def get_complete( warnings.warn( "No complete tensor found in the group! Returning the first one. " - "This may cause issues when your weights are not on CPU." + "This may cause issues when your weights are not on CPU.", + stacklevel=2, ) assert len(group) > 0 return next(iter(group)) diff --git a/torch/hub.py b/torch/hub.py index 84740905ecc..3d6183ee7b2 100644 --- a/torch/hub.py +++ b/torch/hub.py @@ -279,7 +279,8 @@ def _get_cache_or_reload( f"The ref {ref} is ambiguous. Perhaps it is both a tag and a branch in the repo? " "Torchhub will now assume that it's a branch. " "You can disambiguate tags and branches by explicitly passing refs/heads/branch_name or " - "refs/tags/tag_name as the ref. That might require using skip_validation=True." + "refs/tags/tag_name as the ref. That might require using skip_validation=True.", + stacklevel=2, ) disambiguated_branch_ref = f"refs/heads/{ref}" url = _git_archive_link( @@ -338,7 +339,8 @@ def _check_repo_is_trusted( "trust_repo=False) and a command prompt will appear asking for an explicit confirmation of trust, " f"or {calling_fn}(..., trust_repo=True), which will assume that the prompt is to be answered with " f"'yes'. You can also use {calling_fn}(..., trust_repo='check') which will only prompt for " - f"confirmation if the repo is not already trusted. This will eventually be the default behaviour" + f"confirmation if the repo is not already trusted. 
This will eventually be the default behaviour", + stacklevel=2, ) return @@ -406,7 +408,9 @@ def get_dir() -> str: """ # Issue warning to move data if old env is set if os.getenv("TORCH_HUB"): - warnings.warn("TORCH_HUB is deprecated, please use env TORCH_HOME instead") + warnings.warn( + "TORCH_HUB is deprecated, please use env TORCH_HOME instead", stacklevel=2 + ) if _hub_dir is not None: return _hub_dir @@ -853,7 +857,8 @@ def load_state_dict_from_url( # Issue warning to move data if old env is set if os.getenv("TORCH_MODEL_ZOO"): warnings.warn( - "TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead" + "TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead", + stacklevel=2, ) if model_dir is None: diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f62ec3afe12..9decaeecc86 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -257,7 +257,7 @@ class strict_fusion: def __init__(self) -> None: if not torch._jit_internal.is_scripting(): - warnings.warn("Only works in script mode") + warnings.warn("Only works in script mode", stacklevel=2) def __enter__(self): pass diff --git a/torch/jit/_check.py b/torch/jit/_check.py index f708ee87f30..042d0241897 100644 --- a/torch/jit/_check.py +++ b/torch/jit/_check.py @@ -180,7 +180,8 @@ class AttributeTypeIsSupportedChecker(ast.NodeVisitor): "instance-level annotations on empty non-base " "types in `__init__`. Instead, either 1) use a " "type annotation in the class body, or 2) wrap " - "the type in `torch.jit.Attribute`." + "the type in `torch.jit.Attribute`.", + stacklevel=2, ) def visit_Call(self, node): @@ -245,5 +246,6 @@ class AttributeTypeIsSupportedChecker(ast.NodeVisitor): "instance-level annotations on empty non-base " "types in `__init__`. Instead, either 1) use a " "type annotation in the class body, or 2) wrap " - "the type in `torch.jit.Attribute`." + "the type in `torch.jit.Attribute`.", + stacklevel=2, ) diff --git a/torch/jit/_decompositions.py b/torch/jit/_decompositions.py index b4d2d624669..426d4a9de3c 100644 --- a/torch/jit/_decompositions.py +++ b/torch/jit/_decompositions.py @@ -48,7 +48,9 @@ def signatures_match(decomposition_sig, torch_op_sig): inspect_empty = inspect._empty # type: ignore[attr-defined] for field in ["name", "annotation"]: if field == "name" and decomp_param.name == "self": - warnings.warn("PyTorch uses 'input' instead of 'self' on public api") + warnings.warn( + "PyTorch uses 'input' instead of 'self' on public api", stacklevel=2 + ) if getattr(decomp_param, field) != getattr(op_param, field): return False diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py index 530266fa9dc..52c7ac88d3f 100644 --- a/torch/jit/_recursive.py +++ b/torch/jit/_recursive.py @@ -309,7 +309,8 @@ def infer_concrete_type_builder(nn_module, share_types=True): warnings.warn( f"'{name}' was found in ScriptModule constants, " - f" but it is a non-constant {hint}. Consider removing it." + f" but it is a non-constant {hint}. Consider removing it.", + stacklevel=2, ) continue if not hasattr(nn_module, name): @@ -318,7 +319,8 @@ def infer_concrete_type_builder(nn_module, share_types=True): warnings.warn( f"'{name}' was found in ScriptModule constants, " "but was not actually set in __init__. " - "Consider removing it." 
+ "Consider removing it.", + stacklevel=2, ) continue value = getattr(nn_module, name) diff --git a/torch/jit/_script.py b/torch/jit/_script.py index 86b72d1d465..804f44f80e3 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -775,6 +775,7 @@ if _enabled: "Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \ https://docs.pytorch.org/executorch/stable/getting-started.html", DeprecationWarning, + stacklevel=2, ) return self._c._save_for_mobile(*args, **kwargs) @@ -787,6 +788,7 @@ if _enabled: "Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \ https://docs.pytorch.org/executorch/stable/getting-started.html", DeprecationWarning, + stacklevel=2, ) return self._c._save_to_buffer_for_mobile(*args, **kwargs) @@ -1165,7 +1167,8 @@ def _script_impl( warnings.warn( "Warning: monkeytype is not installed. Please install https://github.com/Instagram/MonkeyType " "to enable Profile-Directed Typing in TorchScript. Refer to " - "https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. " + "https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. ", + stacklevel=2, ) if isinstance(obj, torch.nn.Module): diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py index 5b1713e77d3..e17700b8cac 100644 --- a/torch/jit/_trace.py +++ b/torch/jit/_trace.py @@ -686,7 +686,8 @@ def _trace_impl( # it is hard to trace it because the forward method on ScriptModule is already defined, so it # would result in an error. warnings.warn( - "The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is." + "The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is.", + stacklevel=2, ) return func diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index 9bfa6832944..50b131aeab4 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -389,7 +389,8 @@ def is_tensor(ann): warnings.warn( "TorchScript will treat type annotations of Tensor " "dtype-specific subtypes as if they are normal Tensors. " - "dtype constraints are not enforced in compilation either." + "dtype constraints are not enforced in compilation either.", + stacklevel=2, ) return True diff --git a/torch/masked/_ops.py b/torch/masked/_ops.py index fd7d19b0284..e330b59f47a 100644 --- a/torch/masked/_ops.py +++ b/torch/masked/_ops.py @@ -44,7 +44,8 @@ def _apply_docstring_templates(func: Callable[_P, _T]) -> Callable[_P, _T]: warnings.warn( f"No documentation string available for {func.__name__}." " PyTorch team should run `python tools/update_masked_docs.py`" - " to generate the missing docstrings." + " to generate the missing docstrings.", + stacklevel=2, ) else: func.__doc__ = doc_string diff --git a/torch/masked/maskedtensor/core.py b/torch/masked/maskedtensor/core.py index 0b3fa9b858f..75a41e705b1 100644 --- a/torch/masked/maskedtensor/core.py +++ b/torch/masked/maskedtensor/core.py @@ -322,7 +322,7 @@ class MaskedTensor(torch.Tensor): "In the case that the semantics for the operator are not trivial, it would be appreciated " "to also include a proposal for the semantics." 
) - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) return NotImplemented def __lt__(self, other): diff --git a/torch/masked/maskedtensor/reductions.py b/torch/masked/maskedtensor/reductions.py index fedab1c12a6..6acc8415267 100644 --- a/torch/masked/maskedtensor/reductions.py +++ b/torch/masked/maskedtensor/reductions.py @@ -90,7 +90,7 @@ def _torch_reduce_dim(fn): "In the case that the semantics for the operator are not trivial, it would be appreciated " "to also include a proposal for the semantics." ) - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) return NotImplemented if not is_masked_tensor(self): raise TypeError("Input to reduce_dim must be a MaskedTensor") diff --git a/torch/multiprocessing/spawn.py b/torch/multiprocessing/spawn.py index d4652ab32ff..272335a538b 100644 --- a/torch/multiprocessing/spawn.py +++ b/torch/multiprocessing/spawn.py @@ -223,7 +223,9 @@ class ProcessContext: class SpawnContext(ProcessContext): def __init__(self, processes, error_files): - warnings.warn("SpawnContext is renamed to ProcessContext since 1.4 release.") + warnings.warn( + "SpawnContext is renamed to ProcessContext since 1.4 release.", stacklevel=2 + ) super().__init__(processes, error_files) diff --git a/torch/nn/_reduction.py b/torch/nn/_reduction.py index 93b00dc6feb..9764f935b7c 100644 --- a/torch/nn/_reduction.py +++ b/torch/nn/_reduction.py @@ -13,7 +13,8 @@ def get_enum(reduction: str) -> int: elif reduction == "elementwise_mean": warnings.warn( "reduction='elementwise_mean' is deprecated. " - "Please use reduction='mean' instead." + "Please use reduction='mean' instead.", + stacklevel=2, ) ret = 1 elif reduction == "sum": @@ -48,7 +49,7 @@ def legacy_get_string( else: ret = "none" if emit_warning: - warnings.warn(warning.format(ret)) + warnings.warn(warning.format(ret), stacklevel=2) return ret diff --git a/torch/nn/attention/__init__.py b/torch/nn/attention/__init__.py index 4aa6c14f811..9113fd7e379 100644 --- a/torch/nn/attention/__init__.py +++ b/torch/nn/attention/__init__.py @@ -60,10 +60,10 @@ def _raise_kernel_warnings(params: SDPAParams) -> None: """ if WARN_FOR_UNFUSED_KERNELS: if not can_use_efficient_attention(params): - warn("Efficient attention can't be used because:") + warn("Efficient attention can't be used because:", stacklevel=2) can_use_efficient_attention(params, True) if not can_use_flash_attention(params): - warn("Flash attention can't be used because:") + warn("Flash attention can't be used because:", stacklevel=2) can_use_flash_attention(params, True) diff --git a/torch/nn/attention/bias.py b/torch/nn/attention/bias.py index 2a1a97fc756..551a57e6963 100644 --- a/torch/nn/attention/bias.py +++ b/torch/nn/attention/bias.py @@ -134,7 +134,8 @@ class CausalBias(torch.Tensor): self.seq_len_kv = seq_len_kv if seq_len_q > seq_len_kv and variant == CausalVariant.LOWER_RIGHT: warn( - "Lower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!" + "Lower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!", + stacklevel=2, ) def _upper_left(self, device: torch.device) -> torch.Tensor: diff --git a/torch/nn/attention/flex_attention.py b/torch/nn/attention/flex_attention.py index 01f5fe84356..fae220f7545 100644 --- a/torch/nn/attention/flex_attention.py +++ b/torch/nn/attention/flex_attention.py @@ -1152,6 +1152,7 @@ def create_block_mask( warnings.warn( "_compile flag on create_block_mask was originally added to work around a torch.compile limitation. That limitation has since been addressed. 
So, to compile create_block_mask, we suggest doing torch.compile(create_block_mask). This still works for now, but will be removed in the future.", DeprecationWarning, + stacklevel=2, ) return torch.compile(create_block_mask)( mask_mod, B, H, Q_LEN, KV_LEN, device, BLOCK_SIZE diff --git a/torch/nn/functional.py b/torch/nn/functional.py index c562bc63dc4..360d687094d 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1551,7 +1551,7 @@ def dropout2d( "exists to provide channel-wise dropout on inputs with 2 spatial dimensions, " "a channel dimension, and an optional batch dimension (i.e. 3D or 4D inputs)." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) # TODO: Properly support no-batch-dim inputs. For now, these are NOT supported; passing # a 3D input will perform dropout1d behavior instead. This was done historically and the @@ -1563,7 +1563,8 @@ def dropout2d( "1D dropout behavior is desired - input is interpreted as shape (N, C, L), where C " "is the channel dim. This behavior will change in a future release to interpret the " "input as one without a batch dimension, i.e. shape (C, H, W). To maintain the 1D " - "channel-wise dropout behavior, please switch to using dropout1d instead." + "channel-wise dropout behavior, please switch to using dropout1d instead.", + stacklevel=2, ) result = ( @@ -1610,7 +1611,7 @@ def dropout3d( "exists to provide channel-wise dropout on inputs with 3 spatial dimensions, " "a channel dimension, and an optional batch dimension (i.e. 4D or 5D inputs)." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) is_batched = inp_dim == 5 if not is_batched: @@ -2210,7 +2211,7 @@ def gumbel_softmax( gumbel_softmax, (logits,), logits, tau=tau, hard=hard, eps=eps, dim=dim ) if eps != 1e-10: - warnings.warn("`eps` parameter is deprecated and has no effect.") + warnings.warn("`eps` parameter is deprecated and has no effect.", stacklevel=2) gumbels = ( -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format) @@ -2681,7 +2682,8 @@ def embedding_bag( warnings.warn( "Argument order of nn.functional.embedding_bag was changed. " "Usage `embedding_bag(weight, input, ...)` is deprecated, " - "and should now be `embedding_bag(input, weight, ...)`." + "and should now be `embedding_bag(input, weight, ...)`.", + stacklevel=2, ) weight, input = input, weight @@ -3392,7 +3394,8 @@ def kl_div( warnings.warn( "reduction: 'mean' divides the total loss by both the batch size and the support size." "'batchmean' divides only by the batch size, and aligns with the KL div math definition." - "'mean' will be changed to behave the same as 'batchmean' in the next major release." + "'mean' will be changed to behave the same as 'batchmean' in the next major release.", + stacklevel=2, ) # special case for batchmean @@ -5213,7 +5216,8 @@ def grid_sample( "Default grid_sample and affine_grid behavior has changed " "to align_corners=False since 1.3.0. Please specify " "align_corners=True if the old behavior is desired. " - "See the documentation of grid_sample for details." + "See the documentation of grid_sample for details.", + stacklevel=2, ) align_corners = False @@ -5280,7 +5284,8 @@ def affine_grid( "Default grid_sample and affine_grid behavior has changed " "to align_corners=False since 1.3.0. Please specify " "align_corners=True if the old behavior is desired. " - "See the documentation of grid_sample for details." 
+ "See the documentation of grid_sample for details.", + stacklevel=2, ) align_corners = False @@ -5314,7 +5319,8 @@ def affine_grid( "Since version 1.3.0, affine_grid behavior has changed " "for unit-size grids when align_corners=True. " "This is not an intended use case of affine_grid. " - "See the documentation of affine_grid for details." + "See the documentation of affine_grid for details.", + stacklevel=2, ) elif min(size) <= 0: raise ValueError(f"Expected non-zero, positive output size. Got {size}") @@ -6158,7 +6164,8 @@ def _canonical_mask( if _mask_dtype != other_type: warnings.warn( f"Support for mismatched {mask_name} and {other_name} " - "is deprecated. Use same type for both instead." + "is deprecated. Use same type for both instead.", + stacklevel=2, ) if not _mask_is_float: mask = torch.zeros_like(mask, dtype=target_type).masked_fill_( diff --git a/torch/nn/init.py b/torch/nn/init.py index 18358dbabbb..78fe7cd7ff8 100644 --- a/torch/nn/init.py +++ b/torch/nn/init.py @@ -565,7 +565,7 @@ def kaiming_uniform_( ) if 0 in tensor.shape: - warnings.warn("Initializing zero-element tensors is a no-op") + warnings.warn("Initializing zero-element tensors is a no-op", stacklevel=2) return tensor fan = _calculate_correct_fan(tensor, mode) gain = calculate_gain(nonlinearity, a) @@ -619,7 +619,7 @@ def kaiming_normal_( pass in a transposed weight matrix, i.e. ``nn.init.kaiming_normal_(w.T, ...)``. """ if 0 in tensor.shape: - warnings.warn("Initializing zero-element tensors is a no-op") + warnings.warn("Initializing zero-element tensors is a no-op", stacklevel=2) return tensor fan = _calculate_correct_fan(tensor, mode) gain = calculate_gain(nonlinearity, a) diff --git a/torch/nn/modules/instancenorm.py b/torch/nn/modules/instancenorm.py index 25f0c45d5c1..da3d3658553 100644 --- a/torch/nn/modules/instancenorm.py +++ b/torch/nn/modules/instancenorm.py @@ -115,7 +115,8 @@ class _InstanceNorm(_NormBase): warnings.warn( f"input's size at dim={feature_dim} does not match num_features. " "You can silence this warning by not passing in num_features, " - "which is not used because affine=False" + "which is not used because affine=False", + stacklevel=2, ) if input.dim() == self._get_no_batch_dim(): diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 194e68046e8..f7e3d2f262d 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -1353,7 +1353,8 @@ class Module: "Complex modules are a new feature under active development whose design may change, " "and some modules might not work as expected when using complex tensors as parameters or buffers. " "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml " - "if a complex module does not work as expected." 
+ "if a complex module does not work as expected.", + stacklevel=2, ) def convert(t): @@ -1855,7 +1856,7 @@ class Module: if not isinstance(result, (torch.Tensor, tuple)): warnings.warn("For backward hooks to be called," " module output should be a Tensor or a tuple of Tensors" - f" but received {type(result)}") + f" but received {type(result)}", stacklevel=2) result = bw_hook.setup_output_hook(result) # Handle the non-full backward hooks @@ -1898,7 +1899,7 @@ class Module: result = hook_result except Exception as e: warnings.warn("global module forward hook with ``always_call=True`` raised an exception " - f"that was silenced as another error was raised in forward: {str(e)}") + f"that was silenced as another error was raised in forward: {str(e)}", stacklevel=2) continue for hook_id, hook in self._forward_hooks.items(): @@ -1912,7 +1913,7 @@ class Module: result = hook_result except Exception as e: warnings.warn("module forward hook with ``always_call=True`` raised an exception " - f"that was silenced as another error was raised in forward: {str(e)}") + f"that was silenced as another error was raised in forward: {str(e)}", stacklevel=2) continue # raise exception raised in try block raise @@ -2457,7 +2458,8 @@ class Module: f"for {key}: copying from a non-meta parameter in the checkpoint to a meta " "parameter in the current model, which is a no-op. (Did you mean to " "pass `assign=True` to assign items in the state dictionary to their " - "corresponding key in the module instead of copying them in place?)" + "corresponding key in the module instead of copying them in place?)", + stacklevel=2, ) try: @@ -2956,7 +2958,8 @@ class Module: "Calling .zero_grad() from a module created with nn.DataParallel() has no effect. " "The parameters are copied (in a differentiable manner) from the original module. " "This means they are not leaf nodes in autograd and so don't accumulate gradients. " - "If you need gradients in your forward method, consider using autograd.grad instead." 
+ "If you need gradients in your forward method, consider using autograd.grad instead.", + stacklevel=2, ) for p in self.parameters(): diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py index bff265bd92a..c7b44b61354 100644 --- a/torch/nn/modules/rnn.py +++ b/torch/nn/modules/rnn.py @@ -124,7 +124,8 @@ class RNNBase(Module): "dropout option adds dropout after all but last " "recurrent layer, so non-zero dropout expects " f"num_layers greater than 1, but got dropout={dropout} and " - f"num_layers={num_layers}" + f"num_layers={num_layers}", + stacklevel=2, ) if not isinstance(hidden_size, int): diff --git a/torch/nn/modules/transformer.py b/torch/nn/modules/transformer.py index 2f69d89b19e..5f445bf26c7 100644 --- a/torch/nn/modules/transformer.py +++ b/torch/nn/modules/transformer.py @@ -399,7 +399,8 @@ class TransformerEncoder(Module): if enable_nested_tensor and why_not_sparsity_fast_path: warnings.warn( - f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}" + f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}", + stacklevel=2, ) self.use_nested_tensor = False diff --git a/torch/nn/parallel/_functions.py b/torch/nn/parallel/_functions.py index 5170b172fbb..e88a8e1795f 100644 --- a/torch/nn/parallel/_functions.py +++ b/torch/nn/parallel/_functions.py @@ -71,7 +71,8 @@ class Gather(Function): warnings.warn( "Was asked to gather along dimension 0, but all " "input tensors were scalars; will instead unsqueeze " - "and return a vector." + "and return a vector.", + stacklevel=2, ) ctx.unsqueezed_scalar = True else: diff --git a/torch/nn/parallel/data_parallel.py b/torch/nn/parallel/data_parallel.py index 56ad3b8b201..9a0f4973d31 100644 --- a/torch/nn/parallel/data_parallel.py +++ b/torch/nn/parallel/data_parallel.py @@ -36,7 +36,8 @@ def _check_balance(device_ids: Sequence[Union[int, torch.device]]) -> None: max_pos, max_val = max(enumerate(values), key=operator.itemgetter(1)) if min_val / max_val < 0.75: warnings.warn( - imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]) + imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]), + stacklevel=2, ) return True return False diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 4444f557f4a..1072b68ea11 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -2365,7 +2365,8 @@ class DistributedDataParallel(Module, Joinable): # If self.static_graph has been set, no need to set it again if self.static_graph: warnings.warn( - "You've set static_graph to be True, no need to set it again." + "You've set static_graph to be True, no need to set it again.", + stacklevel=2, ) return self.static_graph = True @@ -2379,7 +2380,8 @@ class DistributedDataParallel(Module, Joinable): "`_set_static_graph` will detect unused parameters automatically, so " "you do not need to set find_unused_parameters=true, just be sure these " "unused parameters will not change during training loop while calling " - "`_set_static_graph`." + "`_set_static_graph`.", + stacklevel=2, ) def _remove_autograd_hooks(self): diff --git a/torch/nn/utils/_deprecation_utils.py b/torch/nn/utils/_deprecation_utils.py index 995da89c70b..a25b6473079 100644 --- a/torch/nn/utils/_deprecation_utils.py +++ b/torch/nn/utils/_deprecation_utils.py @@ -45,7 +45,7 @@ def lazy_deprecated_import( if name in all: # We are using the "RuntimeWarning" to make sure it is not # ignored by default. 
- warnings.warn(warning_message, RuntimeWarning) + warnings.warn(warning_message, RuntimeWarning, stacklevel=2) package = importlib.import_module(new_module) return getattr(package, name) raise AttributeError(f"Module {new_module!r} has no attribute {name!r}.") diff --git a/torch/onnx/_internal/exporter/_dynamic_shapes.py b/torch/onnx/_internal/exporter/_dynamic_shapes.py index 20651017f3e..e128ecf74e9 100644 --- a/torch/onnx/_internal/exporter/_dynamic_shapes.py +++ b/torch/onnx/_internal/exporter/_dynamic_shapes.py @@ -271,7 +271,8 @@ def create_rename_mapping( if input.shape[dim].value in rename_mapping: warnings.warn( f"# The axis name: {custom_name} will not be used, since it shares " - f"the same shape constraints with another axis: {rename_mapping[input.shape[dim].value]}." + f"the same shape constraints with another axis: {rename_mapping[input.shape[dim].value]}.", + stacklevel=2, ) continue rename_mapping[input.shape[dim].value] = custom_name diff --git a/torch/onnx/_internal/torchscript_exporter/registration.py b/torch/onnx/_internal/torchscript_exporter/registration.py index f073227f87b..e35903e6823 100644 --- a/torch/onnx/_internal/torchscript_exporter/registration.py +++ b/torch/onnx/_internal/torchscript_exporter/registration.py @@ -164,6 +164,7 @@ class _SymbolicFunctionGroup: f"Replacing the existing function with new function. This is unexpected. " f"Please report it on {_constants.PYTORCH_GITHUB_ISSUES_URL}.", errors.OnnxExporterWarning, + stacklevel=2, ) self._functions.set_base(opset, func) @@ -184,7 +185,8 @@ class _SymbolicFunctionGroup: """ if not self._functions.overridden(opset): warnings.warn( - f"No custom function registered for '{self._name}' opset {opset}" + f"No custom function registered for '{self._name}' opset {opset}", + stacklevel=2, ) return self._functions.remove_override(opset) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py index 59cd0eb0f89..6b0da4a6ae0 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py @@ -909,7 +909,8 @@ def _interpolate_warning(interpolate_mode): "ONNX's Upsample/Resize operator did not match Pytorch's Interpolation until opset 11. " "Attributes to determine how to transform the input were added in onnx:Resize in opset 11 " "to support Pytorch's behavior (like coordinate_transformation_mode and nearest_mode).\n" - "We recommend using opset 11 and above for models using this operator." + "We recommend using opset 11 and above for models using this operator.", + stacklevel=2, ) @@ -1236,7 +1237,8 @@ def __interpolate_helper( if not is_scalar: warnings.warn( "Cannot verify if the output_size is a scalar " - "while exporting interpolate. Assuming that it is not a scalar." + "while exporting interpolate. Assuming that it is not a scalar.", + stacklevel=2, ) if is_scalar: @@ -1577,7 +1579,8 @@ def check_training_mode(op_train_mode: int, op_name: str) -> None: # in training. warnings.warn( f"ONNX export mode is set to {GLOBALS.training_mode}, but operator '{op_name}' " - f"is set to {op_mode_text}. Exporting with {op_mode_text}." + f"is set to {op_mode_text}. 
Exporting with {op_mode_text}.", + stacklevel=2, ) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset10.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset10.py index 6bb09ef3ec2..a757409bf10 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset10.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset10.py @@ -613,7 +613,8 @@ def embedding_bag( warnings.warn( "Export of embedding_bag with dynamic input/offsets shape is not supported in opset 10. " - "Please use opset 11 or higher to export model for dynamic input shape.'" + "Please use opset 11 or higher to export model for dynamic input shape.'", + stacklevel=2, ) offsets_dim_0 = symbolic_helper._get_tensor_dim_size(offsets, 0) if offsets_dim_0 is not None: diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset11.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset11.py index cbba5d2e61c..c46af044a3e 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset11.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset11.py @@ -914,7 +914,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): + str(dim_size) + ". The model will " + "be exported without the squeeze node. If the model is intended to be used with dynamic " - + "input shapes, please export with dynamic_axes argument." + + "input shapes, please export with dynamic_axes argument.", + stacklevel=2, ) return self return symbolic_helper._squeeze_helper(g, self, [dim]) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset7.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset7.py index d11750b1ee8..ae9a5039d39 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset7.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset7.py @@ -48,7 +48,8 @@ def max(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): warnings.warn( "Multidirectional broadcasting is not supported in opset 7. " "This might cause the onnx model to be incorrect, if inputs to max operators " - "have different shapes" + "have different shapes", + stacklevel=2, ) return opset9.max(g, self, dim_or_y, keepdim) @@ -60,7 +61,8 @@ def min(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): warnings.warn( "Multidirectional broadcasting is not supported in opset 7. " "This might cause the onnx model to be incorrect, if inputs to min operators " - "have different shapes" + "have different shapes", + stacklevel=2, ) return opset9.min(g, self, dim_or_y, keepdim) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py index 8ba8e6ee662..3e05e82842f 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py @@ -183,7 +183,8 @@ def _try_cast_integer_to_float(g: jit_utils.GraphContext, *args): warnings.warn( "Only floating datatype is supported for these operators: " "{Greater, Less, MatMul, PRelu, Gemm, Flatten}. This might cause " - "the onnx model to be incorrect, if inputs have integer datatypes." 
+ "the onnx model to be incorrect, if inputs have integer datatypes.", + stacklevel=2, ) return (old_type,) + args diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py index 16e94b91f89..53e6e592da0 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py @@ -925,7 +925,8 @@ def embedding( warnings.warn( "Warning: ONNX export of embedding with padding_idx >= 0 " "for training mode. " - "ONNX does not support not updating the embedding vector at padding_idx during training." + "ONNX does not support not updating the embedding vector at padding_idx during training.", + stacklevel=2, ) return g.op("Gather", weight, indices) @@ -1142,7 +1143,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): + "Axis is converted to " + str(squeeze_dim + rank) + " based on input shape at export time. " - + "Passing an tensor of different rank in execution will be incorrect." + + "Passing an tensor of different rank in execution will be incorrect.", + stacklevel=2, ) squeeze_dim += rank else: @@ -1161,7 +1163,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): + " of the input " + "is not 1, the ONNX model will return an error. Opset version 11 supports squeezing on " + "non-singleton dimensions, it is recommended to export this model using opset " - + "version 11 or higher." + + "version 11 or higher.", + stacklevel=2, ) return symbolic_helper._squeeze_helper(g, self, axes_i=[squeeze_dim]) if dim_size > 1: @@ -1174,7 +1177,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): + ". The model will " + "be exported without the squeeze node. If the model is intended to be used with dynamic " + "input shapes, please use opset version 11 to " - + "export the model." + + "export the model.", + stacklevel=2, ) return self @@ -1182,7 +1186,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): "This model contains a squeeze operation on dimension " + str(squeeze_dim) + ". If the model is " - + "intended to be used with dynamic input shapes, please use opset version 11 to export the model." + + "intended to be used with dynamic input shapes, please use opset version 11 to export the model.", + stacklevel=2, ) return symbolic_helper._squeeze_helper(g, self, axes_i=[squeeze_dim]) @@ -3859,7 +3864,8 @@ def unsqueeze(g: jit_utils.GraphContext, self, dim): + "Axis is converted to " + str(dim + rank + 1) + " based on input shape at export time. " - + "Passing an tensor of different rank in execution will be incorrect." + + "Passing an tensor of different rank in execution will be incorrect.", + stacklevel=2, ) dim = dim + rank + 1 else: @@ -4266,7 +4272,8 @@ def _generic_rnn( + " can cause an error " + "when running the ONNX model with a different batch size. " + "Make sure to save the model with a batch size of 1, " - + "or define the initial states (h0/c0) as inputs of the model. " + + "or define the initial states (h0/c0) as inputs of the model. ", + stacklevel=2, ) onnxActivations = [ @@ -5316,7 +5323,8 @@ def index(g: jit_utils.GraphContext, self, index): warnings.warn( "Exporting aten::index operator with indices of type Byte. " "Only 1-D indices are supported. In any other case, " - "this will produce an incorrect ONNX graph." 
+ "this will produce an incorrect ONNX graph.", + stacklevel=2, ) index = symbolic_helper._squeeze_helper(g, nonzero(g, index), [1]) return index @@ -5370,7 +5378,8 @@ def index(g: jit_utils.GraphContext, self, index): f"{GLOBALS.export_onnx_opset_version}" " is achieved by combination of multiple ONNX operators, " "including Reshape, Transpose, Concat, and Gather. " - "If indices include negative values, the exported graph will produce incorrect results." + "If indices include negative values, the exported graph will produce incorrect results.", + stacklevel=2, ) adv_idx_count = len(adv_idx_indices) shape_tensor = _shape_as_tensor(g, self) @@ -6061,7 +6070,8 @@ def fill(g: jit_utils.GraphContext, self, value): def index_add(g: jit_utils.GraphContext, self, dim, index, other, alpha=None): warnings.warn( "Warning: ONNX export does not support duplicated values in 'index' field, " - + "this will cause the ONNX model to be incorrect." + + "this will cause the ONNX model to be incorrect.", + stacklevel=2, ) # ONNX does not support "alpha" argument, unlike aten index_add diff --git a/torch/onnx/_internal/torchscript_exporter/utils.py b/torch/onnx/_internal/torchscript_exporter/utils.py index f2004ac0232..d66962f690e 100644 --- a/torch/onnx/_internal/torchscript_exporter/utils.py +++ b/torch/onnx/_internal/torchscript_exporter/utils.py @@ -121,7 +121,8 @@ def select_model_mode_for_export(model, mode: _C_onnx.TrainingMode): "You are exporting the model in training mode with onnx opset " f"version {GLOBALS.export_onnx_opset_version}. " "Opset versions lower than opset 12 will not be able to export " - "nodes such as Dropout and BatchNorm correctly." + "nodes such as Dropout and BatchNorm correctly.", + stacklevel=2, ) else: GLOBALS.export_training = False @@ -532,6 +533,7 @@ def export( warnings.warn( "Setting `operator_export_type` to something other than default is deprecated. " "The option will be removed in a future release.", + stacklevel=2, category=DeprecationWarning, ) if training == _C_onnx.TrainingMode.TRAINING: @@ -539,6 +541,7 @@ def export( "Setting `training` to something other than default is deprecated. " "The option will be removed in a future release. Please set the training mode " "before exporting the model.", + stacklevel=2, category=DeprecationWarning, ) @@ -738,14 +741,14 @@ def warn_on_static_input_change(input_states): "for configuration use. " "Also note that the order and values of the keys must remain the same. " ) - warnings.warn(warning) + warnings.warn(warning, stacklevel=2) elif isinstance(input, str): if input != traced_input: warning = ( "The model seems to have string inputs/outputs. " "Note that strings will not appear as inputs/outputs of the ONNX graph. " ) - warnings.warn(warning) + warnings.warn(warning, stacklevel=2) def _resolve_args_by_export_type(arg_name, arg_value, operator_export_type): @@ -782,7 +785,8 @@ def _decide_keep_init_as_input( "8 or lower would lead to an invalid ONNX graph. Therefore, " "'keep_initializers_as_inputs=False' is ignored during export." "Exported model will have initializers as graph inputs (compliant " - " to ONNX IR v3)." + " to ONNX IR v3).", + stacklevel=2, ) return True # i.e. True == initializers are part of graph input (ONNX IR v3) val_keep_init_as_ip = ( @@ -815,7 +819,8 @@ def _decide_constant_folding(do_constant_folding, operator_export_type, training "or 'training=TrainingMode.PRESERVE' (when model is in training mode). 
Otherwise, some " "learnable model parameters may not translate correctly in the exported ONNX model " "because constant folding mutates model parameters. Please consider " - "turning off constant folding or setting the training=TrainingMode.EVAL." + "turning off constant folding or setting the training=TrainingMode.EVAL.", + stacklevel=2, ) return do_constant_folding @@ -831,7 +836,7 @@ def _decide_input_format(model, args): try: sig = _signature(model) except ValueError as e: - warnings.warn(f"{e}, skipping _decide_input_format") + warnings.warn(f"{e}, skipping _decide_input_format", stacklevel=2) return args try: ordered_list_keys = list(sig.parameters.keys()) @@ -859,9 +864,9 @@ def _decide_input_format(model, args): args = args_list if isinstance(args, list) else tuple(args_list) # Cases of models with no input args except IndexError: - warnings.warn("No input args, skipping _decide_input_format") + warnings.warn("No input args, skipping _decide_input_format", stacklevel=2) except Exception as e: - warnings.warn(f"Skipping _decide_input_format\n {e.args[0]}") + warnings.warn(f"Skipping _decide_input_format\n {e.args[0]}", stacklevel=2) return args @@ -1449,6 +1454,7 @@ def _export( f"by 'torch.onnx.export()'. " f"The highest opset version supported is {_constants.ONNX_TORCHSCRIPT_EXPORTER_MAX_OPSET}. " f"To use a newer opset version, consider 'torch.onnx.export(..., dynamo=True)'. ", + stacklevel=2, category=errors.OnnxExporterWarning, ) @@ -1901,12 +1907,14 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): for key, value in dynamic_axes.items(): if key not in valid_names: warnings.warn( - f"Provided key {key} for dynamic axes is not a valid input/output name" + f"Provided key {key} for dynamic axes is not a valid input/output name", + stacklevel=2, ) if isinstance(value, list): warnings.warn( "No names were found for specified dynamic axes of provided input." - f"Automatically generated names will be applied to each dynamic axes of input {key}" + f"Automatically generated names will be applied to each dynamic axes of input {key}", + stacklevel=2, ) value_dict = {} @@ -1917,7 +1925,8 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): ) if x in value_dict: warnings.warn( - f"Duplicate dynamic axis index {x} was provided for input {key}." + f"Duplicate dynamic axis index {x} was provided for input {key}.", + stacklevel=2, ) else: value_dict[x] = str(key) + "_dynamic_axes_" + str(i + 1) diff --git a/torch/onnx/_internal/torchscript_exporter/verification.py b/torch/onnx/_internal/torchscript_exporter/verification.py index c3cb967c14c..4ef9742ad6f 100644 --- a/torch/onnx/_internal/torchscript_exporter/verification.py +++ b/torch/onnx/_internal/torchscript_exporter/verification.py @@ -244,15 +244,16 @@ def _compare_onnx_pytorch_outputs_in_np( warnings.warn( f"Suppressed AssertionError:\n{e}.\n" f"Error percentage {error_percentage} " - f"within acceptable range {acceptable_error_percentage}." 
+ f"within acceptable range {acceptable_error_percentage}.", + stacklevel=2, ) continue # pyrefly: ignore # missing-attribute if ort_out.dtype == np.uint8 or ort_out.dtype == np.int8: - warnings.warn("ONNX output is quantized") + warnings.warn("ONNX output is quantized", stacklevel=2) # pyrefly: ignore # missing-attribute if pt_out.dtype == np.uint8 or pt_out.dtype == np.int8: - warnings.warn("PyTorch output is quantized") + warnings.warn("PyTorch output is quantized", stacklevel=2) raise @@ -369,7 +370,8 @@ def _try_clone_model(model): return copy.deepcopy(model) except Exception: warnings.warn( - "Failed to clone model. Model state might be mutated during verification." + "Failed to clone model. Model state might be mutated during verification.", + stacklevel=2, ) return model diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 3a6bc296d70..3cc6649e0d8 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -267,6 +267,7 @@ class LRScheduler: "`lr_scheduler.step()`. See more details at " "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning, + stacklevel=2, ) # Just check if there were two first lr_scheduler.step() calls before optimizer.step() @@ -279,11 +280,12 @@ class LRScheduler: "See more details at " "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning, + stacklevel=2, ) self._step_count += 1 if epoch is not None: - warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning) + warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning, stacklevel=2) self._update_lr(epoch) def _update_lr(self, epoch: Optional[int] = None): @@ -1696,7 +1698,7 @@ class ReduceLROnPlateau(LRScheduler): if epoch is None: epoch = self.last_epoch + 1 else: - warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning) + warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning, stacklevel=2) self.last_epoch = epoch if self._is_better(current, self.best): diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index c542dbfd84f..5475b2755d4 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -483,7 +483,8 @@ class Optimizer: warnings.warn( "This instance was constructed with capturable=True or some of all the param_groups came with capturable=True, " "but step() is running without CUDA graph capture. If you never intend to graph-capture this " - "instance, capturable=True can impair performance, and you should set capturable=False." 
+ "instance, capturable=True can impair performance, and you should set capturable=False.", + stacklevel=2, ) self._warned_capturable_if_run_uncaptured = True diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py index 347a7976a58..08cd0b504dc 100644 --- a/torch/optim/swa_utils.py +++ b/torch/optim/swa_utils.py @@ -491,6 +491,7 @@ class SWALR(LRScheduler): "To get the last learning rate computed by the scheduler, " "please use `get_last_lr()`.", UserWarning, + stacklevel=2, ) # Set in `LRScheduler._initial_step()` step = self._step_count - 1 diff --git a/torch/overrides.py b/torch/overrides.py index 264edf07b91..db4a7535a36 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -1747,6 +1747,7 @@ def handle_torch_function( "Defining your `__torch_function__ as a plain method is deprecated and " "will be an error in future, please define it as a classmethod.", DeprecationWarning, + stacklevel=2, ) # Use `public_api` instead of `implementation` so __torch_function__ @@ -2057,7 +2058,8 @@ class TorchFunctionMode: @classmethod def push(cls, *args, **kwargs): warnings.warn( - "`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`" + "`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`", + stacklevel=2, ) instance = cls(*args, **kwargs) return instance diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index e92aa3fafb7..aa046db4454 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -512,7 +512,10 @@ def schedule( wait >= 0 and warmup >= 0 and active > 0 and repeat >= 0 and skip_first >= 0 ), "Invalid profiler schedule arguments" if warmup == 0: - warn("Profiler won't be using warmup, this can skew profiler results") + warn( + "Profiler won't be using warmup, this can skew profiler results", + stacklevel=2, + ) return schedule_fn @@ -930,7 +933,8 @@ class ExecutionTraceObserver(_ITraceObserver): fp = tempfile.NamedTemporaryFile("w+t", suffix=".et.json", delete=False) except Exception as e: warn( - f"Execution trace will not be recorded. Exception on creating default temporary file: {e}" + f"Execution trace will not be recorded. Exception on creating default temporary file: {e}", + stacklevel=2, ) return None fp.close() @@ -1015,7 +1019,10 @@ class ExecutionTraceObserver(_ITraceObserver): try: os.mkdir(resource_dir) except Exception: - warn(f"Execution trace exception when creating {resource_dir}") + warn( + f"Execution trace exception when creating {resource_dir}", + stacklevel=2, + ) return None else: return None @@ -1031,7 +1038,8 @@ class ExecutionTraceObserver(_ITraceObserver): resource_dir = self.get_resources_dir() except Exception as e: warn( - f"Execution trace exception when generating resource directory: {e}" + f"Execution trace exception when generating resource directory: {e}", + stacklevel=2, ) return if not resource_dir: @@ -1066,7 +1074,7 @@ class ExecutionTraceObserver(_ITraceObserver): try: _save_triton_kernels() except Exception as e: - warn(f"Execution trace failed to save kernels: {e}") + warn(f"Execution trace failed to save kernels: {e}", stacklevel=2) _remove_execution_trace_observer() if self.output_file_path.endswith("gz"): diff --git a/torch/random.py b/torch/random.py index 18a1cf9a5d5..cf23e52db32 100644 --- a/torch/random.py +++ b/torch/random.py @@ -184,7 +184,7 @@ def fork_rng( f"and suppress this warning, set the '{_devices_kw}' keyword argument to " f"`range(torch.{device_type}.device_count())`." 
) - warnings.warn(message) + warnings.warn(message, stacklevel=2) _fork_rng_warned_already = True devices = list(range(num_devices)) else: diff --git a/torch/serialization.py b/torch/serialization.py index 1ef46d63ca2..ed0f76a4412 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -524,7 +524,10 @@ def check_module_version_greater_or_equal( if error_if_malformed: raise RuntimeError(message) from e else: - warnings.warn(message + ", but continuing assuming that requirement is met") + warnings.warn( + message + ", but continuing assuming that requirement is met", + stacklevel=2, + ) requirement_is_met = True return requirement_is_met @@ -1021,7 +1024,8 @@ def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None: warnings.warn( "Couldn't retrieve source code for container of " "type " + obj.__name__ + ". It won't be checked " - "for correctness upon loading." + "for correctness upon loading.", + stacklevel=2, ) return ("module", obj, source_file, source) @@ -1502,6 +1506,7 @@ def load( " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to" " silence this warning)", UserWarning, + stacklevel=2, ) if weights_only: raise RuntimeError( @@ -1603,7 +1608,8 @@ def _legacy_load(f, map_location, pickle_module, **pickle_load_args): warnings.warn( "Couldn't retrieve source code for container of " "type " + container_type.__name__ + ". It won't be checked " - "for correctness upon loading." + "for correctness upon loading.", + stacklevel=2, ) return if original_source != current_source: @@ -1645,7 +1651,7 @@ def _legacy_load(f, map_location, pickle_module, **pickle_load_args): "patch tool to revert the changes." ) msg = f"source code of class '{torch.typename(container_type)}' has changed. {msg}" - warnings.warn(msg, SourceChangeWarning) + warnings.warn(msg, SourceChangeWarning, stacklevel=2) def legacy_load(f): deserialized_objects: dict[int, Any] = {} @@ -1949,6 +1955,7 @@ def _load( "torch.serialization.set_default_load_endianness to set " "the desired default load endianness", UserWarning, + stacklevel=2, ) from torch.utils.serialization import config diff --git a/torch/sparse/_triton_ops_meta.py b/torch/sparse/_triton_ops_meta.py index 903c0a5a9d6..78bdbf07b2b 100644 --- a/torch/sparse/_triton_ops_meta.py +++ b/torch/sparse/_triton_ops_meta.py @@ -194,7 +194,8 @@ def update(op, device_name, version, key, value): # skip storing possible optimization failures: if not value: warnings.warn( - f"skipping empty value for {op}: {device_name=} {version=} {key=}" + f"skipping empty value for {op}: {device_name=} {version=} {key=}", + stacklevel=2, ) return if (op, device_name, version) in _operation_device_version_data: @@ -217,7 +218,8 @@ def dump(): if begin_data_index == -1 or end_data_index == -1: warnings.warn( f"{current_file} cannot be updated:" - " BEGIN/END GENERATED DATA comment blocks appear to be corrupted" + " BEGIN/END GENERATED DATA comment blocks appear to be corrupted", + stacklevel=2, ) return diff --git a/torch/sparse/semi_structured.py b/torch/sparse/semi_structured.py index 7fcdd868793..da5b8b4798a 100644 --- a/torch/sparse/semi_structured.py +++ b/torch/sparse/semi_structured.py @@ -121,6 +121,7 @@ class SparseSemiStructuredTensor(torch.Tensor): "module for further information about the project." 
), UserWarning, + stacklevel=2, ) cls._PROTOTYPE_WARNING_SHOWN = True diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index dd211599cf1..36c72f1d5c3 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -1551,7 +1551,9 @@ def compiled_fsdp_test(compile_compute_on_module: Optional[type] = None): original_fully_shard: Any = torch.distributed.fsdp.fully_shard for mode in FullyShardMode: if mode != FullyShardMode.EAGER and not has_triton(): - warnings.warn("Inductor on GPU needs Triton and recent GPU arch") + warnings.warn( + "Inductor on GPU needs Triton and recent GPU arch", stacklevel=2 + ) continue # barrier to ensure thread reading the same value original_skip_fsdp_hooks = torch._dynamo.config.skip_fsdp_hooks diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 552140f8046..0c26738c2f5 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1258,7 +1258,7 @@ def run_tests(argv=None): # use env vars so pytest-xdist subprocesses can still access them os.environ['SLOW_TESTS_FILE'] = SLOW_TESTS_FILE else: - warnings.warn(f'slow test file provided but not found: {SLOW_TESTS_FILE}') + warnings.warn(f'slow test file provided but not found: {SLOW_TESTS_FILE}', stacklevel=2) if DISABLED_TESTS_FILE: if os.path.exists(DISABLED_TESTS_FILE): with open(DISABLED_TESTS_FILE) as fp: @@ -1266,7 +1266,7 @@ def run_tests(argv=None): disabled_tests_dict = json.load(fp) os.environ['DISABLED_TESTS_FILE'] = DISABLED_TESTS_FILE else: - warnings.warn(f'disabled test file provided but not found: {DISABLED_TESTS_FILE}') + warnings.warn(f'disabled test file provided but not found: {DISABLED_TESTS_FILE}', stacklevel=2) # Determine the test launch mechanism if TEST_DISCOVER: _print_test_names() @@ -2663,7 +2663,7 @@ class CudaMemoryLeakCheck: f"and is now reported as {caching_allocator_mem_allocated} " # type: ignore[possibly-undefined] f"on device {i}. " f"CUDA driver allocated memory was {self.driver_befores[i]} and is now {driver_mem_allocated}.") # type: ignore[possibly-undefined] - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) elif caching_allocator_discrepancy and driver_discrepancy: # type: ignore[possibly-undefined] # A caching allocator discrepancy validated by the driver API is a # failure (except on ROCm, see below) @@ -2757,7 +2757,7 @@ try: "pytorch_ci" if IS_CI else os.getenv('PYTORCH_HYPOTHESIS_PROFILE', 'dev') ) except ImportError: - warnings.warn('Fail to import hypothesis in common_utils, tests are not derandomized', ImportWarning) + warnings.warn('Fail to import hypothesis in common_utils, tests are not derandomized', ImportWarning, stacklevel=2) # Used in check_if_enable to see if a test method should be disabled by an issue, # sanitizes a test method name from appended suffixes by @dtypes parametrization. @@ -4648,7 +4648,7 @@ def download_file(url, binary=True): return path except error.URLError as e: msg = f"could not download test file '{url}'" - warnings.warn(msg, RuntimeWarning) + warnings.warn(msg, RuntimeWarning, stacklevel=2) raise unittest.SkipTest(msg) from e def find_free_port(): @@ -5744,17 +5744,17 @@ def check_leaked_tensors(limit=1, matched_type=torch.Tensor): num_garbage_objs = len(garbage_objs) if num_garbage_objs > 0: warnings.warn( - f"{num_garbage_objs} tensors were found in the garbage. Did you introduce a reference cycle?" + f"{num_garbage_objs} tensors were found in the garbage. 
Did you introduce a reference cycle?", stacklevel=2 ) try: import objgraph # type: ignore[import-not-found,import-untyped] warnings.warn( - f"Dumping first {limit} objgraphs of leaked {matched_type}s rendered to png" + f"Dumping first {limit} objgraphs of leaked {matched_type}s rendered to png", stacklevel=2 ) for g in garbage_objs[:limit]: objgraph.show_backrefs([g], max_depth=10) except ImportError: - warnings.warn("`pip install objgraph` to enable memory leak debugging") + warnings.warn("`pip install objgraph` to enable memory leak debugging", stacklevel=2) finally: gc.set_debug(0) diff --git a/torch/testing/_internal/opinfo/utils.py b/torch/testing/_internal/opinfo/utils.py index 4000ec6ca13..d9e2127e956 100644 --- a/torch/testing/_internal/opinfo/utils.py +++ b/torch/testing/_internal/opinfo/utils.py @@ -62,7 +62,8 @@ def get_supported_dtypes(op, sample_inputs_fn, device_type): assert device_type in ["cpu", "cuda"] if not TEST_CUDA and device_type == "cuda": warnings.warn( - "WARNING: CUDA is not available, empty_dtypes dispatch will be returned!" + "WARNING: CUDA is not available, empty_dtypes dispatch will be returned!", + stacklevel=2, ) return _dynamic_dispatch_dtypes(()) @@ -76,7 +77,8 @@ def get_supported_dtypes(op, sample_inputs_fn, device_type): # We raise a warning, so that user knows that this was the case # and can investigate if there was an issue with the `sample_inputs_fn`. warnings.warn( - f"WARNING: Unable to generate sample for device:{device_type} and dtype:{dtype}" + f"WARNING: Unable to generate sample for device:{device_type} and dtype:{dtype}", + stacklevel=2, ) continue diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py index fa756892c34..5c8df5d9ead 100644 --- a/torch/utils/_python_dispatch.py +++ b/torch/utils/_python_dispatch.py @@ -158,7 +158,8 @@ class TorchDispatchMode: @classmethod def push(cls, *args, **kwargs): warnings.warn( - "`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`" + "`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`", + stacklevel=2, ) instance = cls(*args, **kwargs) return instance diff --git a/torch/utils/_pytree.py b/torch/utils/_pytree.py index 2ed1ba60a59..703aea93a56 100644 --- a/torch/utils/_pytree.py +++ b/torch/utils/_pytree.py @@ -602,6 +602,7 @@ def _private_register_pytree_node( warnings.warn( f"{cls} is already registered as pytree node. " "Overwriting the previous registration.", + stacklevel=2, ) node_def = NodeDef(cls, flatten_fn, unflatten_fn, flatten_with_keys_fn) diff --git a/torch/utils/checkpoint.py b/torch/utils/checkpoint.py index d3c41b8fb9e..5707f4e0fd4 100644 --- a/torch/utils/checkpoint.py +++ b/torch/utils/checkpoint.py @@ -83,7 +83,7 @@ def detach_variable(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]: def check_backward_validity(inputs: Iterable[Any]) -> None: if not any(inp.requires_grad for inp in inputs if isinstance(inp, torch.Tensor)): warnings.warn( - "None of the inputs have requires_grad=True. Gradients will be None" + "None of the inputs have requires_grad=True. Gradients will be None", stacklevel=2 ) @@ -144,7 +144,7 @@ def _infer_device_type(*args): "devices will be ignored. Consequently, if any checkpointed functions involve randomness, " "this may result in incorrect gradients. 
(Note that if CUDA devices are among the devices " "detected, it will be prioritized; otherwise, the first device encountered will be selected.)" - f"\nDevice types: {sorted(device_types_set)} first device type: {device_types[0]}" + f"\nDevice types: {sorted(device_types_set)} first device type: {device_types[0]}", stacklevel=2 ) if len(device_types) == 0: return DefaultDeviceType.get_device_type() @@ -565,7 +565,7 @@ def checkpoint_sequential(functions, segments, input, use_reentrant=None, **kwar "is not passed. use_reentrant=False is " "recommended, but if you need to preserve the current default " "behavior, you can pass use_reentrant=True. Refer to docs for more " - "details on the differences between the two variants." + "details on the differences between the two variants.", stacklevel=2 ) use_reentrant = True diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 5e5307555e5..19400eb4a21 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -624,7 +624,8 @@ class DataLoader(Generic[_T_co]): warnings.warn( _create_warning_msg( max_num_worker_suggest, self.num_workers, cpuset_checked - ) + ), + stacklevel=2, ) return @@ -632,7 +633,8 @@ class DataLoader(Generic[_T_co]): warnings.warn( _create_warning_msg( max_num_worker_suggest, self.num_workers, cpuset_checked - ) + ), + stacklevel=2, ) @@ -663,14 +665,15 @@ class _BaseDataLoaderIter: if loader.pin_memory and loader.pin_memory_device: warnings.warn( "pin_memory_device is deprecated, the current accelerator will be used as the device," - f"ignore pin_memory_device='{loader.pin_memory_device}'." + f"ignore pin_memory_device='{loader.pin_memory_device}'.", + stacklevel=2, ) if loader.pin_memory and not torch.accelerator.is_available(): warn_msg = ( "'pin_memory' argument is set as true but no accelerator is found, " "then device pinned memory won't be used." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) # Enabling pin_memory in _BaseDataLoaderIter to support identical # behavior in forked implementations using _BaseDataLoaderIter. @@ -694,7 +697,7 @@ class _BaseDataLoaderIter: "'pin_memory' argument is set as true but not supported on MPS now, " "device pinned memory won't be used." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) self._timeout = loader.timeout self._collate_fn = loader.collate_fn @@ -751,7 +754,7 @@ class _BaseDataLoaderIter: "IterableDataset replica at each worker. Please see " "https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset for examples." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) return data def __len__(self) -> int: diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index b6dda4552c2..2e3d3712442 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -159,6 +159,7 @@ class _ForkerIterDataPipe(IterDataPipe, _ContainerTemplate): "Unlimited buffer size is set for `fork`, " "please be aware of OOM at random places", UserWarning, + stacklevel=2, ) if copy is None: self.copy_fn = _no_op @@ -359,6 +360,7 @@ class _ChildDataPipe(IterDataPipe): "Some child DataPipes are not exhausted when __iter__ is called. We are resetting " "the buffer and each child DataPipe will read from the start again.", UserWarning, + stacklevel=2, ) self.main_datapipe.reset() # 3. 
Otherwise, the iterator is behind the others, so it will just need to catch up by setting @@ -464,6 +466,7 @@ class _DemultiplexerIterDataPipe(IterDataPipe, _ContainerTemplate): "Unlimited buffer size is set for `demux`, " "please be aware of OOM at random places", UserWarning, + stacklevel=2, ) self.current_buffer_usage = 0 # pyrefly: ignore [invalid-type-var] diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py index f90b426be12..e45ddab282f 100644 --- a/torch/utils/data/datapipes/iter/utils.py +++ b/torch/utils/data/datapipes/iter/utils.py @@ -49,7 +49,8 @@ class IterableWrapperIterDataPipe(IterDataPipe[_T]): except TypeError: warnings.warn( "The input iterable can not be deepcopied, " - "please be aware of in-place modification would affect source data." + "please be aware that in-place modification would affect the source data.", + stacklevel=2, ) yield from source_data diff --git a/torch/utils/data/datapipes/map/utils.py b/torch/utils/data/datapipes/map/utils.py index e1290df3237..360f66b3137 100644 --- a/torch/utils/data/datapipes/map/utils.py +++ b/torch/utils/data/datapipes/map/utils.py @@ -47,7 +47,8 @@ class SequenceWrapperMapDataPipe(MapDataPipe[_T]): except TypeError: warnings.warn( "The input sequence can not be deepcopied, " - "please be aware of in-place modification would affect source data" + "please be aware that in-place modification would affect the source data", + stacklevel=2, ) self.sequence = sequence else: diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py index 2390434c3ef..003ca568fca 100644 --- a/torch/utils/data/datapipes/utils/common.py +++ b/torch/utils/data/datapipes/utils/common.py @@ -149,7 +149,8 @@ def _check_unpickable_fn(fn: Callable): if _is_local_fn(fn) and not dill_available(): warnings.warn( "Local function is not supported by pickle, please use " - "regular python function or functools.partial instead." + "regular python function or functools.partial instead.", + stacklevel=2, ) return @@ -157,7 +158,8 @@ def _check_unpickable_fn(fn: Callable): if hasattr(fn, "__name__") and fn.__name__ == "<lambda>" and not dill_available(): warnings.warn( "Lambda function is not supported by pickle, please use " - "regular python function or functools.partial instead." + "regular python function or functools.partial instead.", + stacklevel=2, ) return @@ -185,7 +187,7 @@ def get_file_pathnames_from_root( ) -> Iterable[str]: # print out an error message and raise the error out def onerror(err: OSError): - warnings.warn(err.filename + " : " + err.strerror) + warnings.warn(err.filename + " : " + err.strerror, stacklevel=2) raise err if os.path.isfile(root): @@ -311,7 +313,7 @@ def _deprecation_warning( if new_argument_name: msg = f"{msg}\nPlease use `{old_class_name}({new_argument_name}=)` instead." - warnings.warn(msg, FutureWarning) + warnings.warn(msg, FutureWarning, stacklevel=2) class StreamWrapper: diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py index 221b3116017..f4e61963cd0 100644 --- a/torch/utils/data/dataset.py +++ b/torch/utils/data/dataset.py @@ -463,7 +463,8 @@ def random_split( if length == 0: warnings.warn( f"Length of split at index {i} is 0. " - f"This might result in an empty dataset."
+ f"This might result in an empty dataset.", + stacklevel=2, ) # Cannot verify that dataset is Sized diff --git a/torch/utils/data/graph.py b/torch/utils/data/graph.py index a08421f9b68..8867109c1e0 100644 --- a/torch/utils/data/graph.py +++ b/torch/utils/data/graph.py @@ -132,7 +132,7 @@ def traverse(datapipe: DataPipe, only_datapipe: Optional[bool] = None) -> DataPi ) if not only_datapipe: msg += " And, the behavior will be changed to the equivalent of `only_datapipe=True`." - warnings.warn(msg, FutureWarning) + warnings.warn(msg, FutureWarning, stacklevel=2) if only_datapipe is None: only_datapipe = False cache: set[int] = set() diff --git a/torch/utils/data/graph_settings.py b/torch/utils/data/graph_settings.py index 8cc16c86b0f..bb97558256b 100644 --- a/torch/utils/data/graph_settings.py +++ b/torch/utils/data/graph_settings.py @@ -116,7 +116,8 @@ def apply_shuffle_settings( if not shufflers and shuffle: warnings.warn( "`shuffle=True` was set, but the datapipe does not contain a `Shuffler`. Adding one at the end. " - "Be aware that the default buffer size might not be sufficient for your task." + "Be aware that the default buffer size might not be sufficient for your task.", + stacklevel=2, ) datapipe = datapipe.shuffle() shufflers = [ diff --git a/torch/utils/file_baton.py b/torch/utils/file_baton.py index c7ce437ab9b..3d51d9efb33 100644 --- a/torch/utils/file_baton.py +++ b/torch/utils/file_baton.py @@ -53,7 +53,7 @@ class FileBaton: if self.warn_after_seconds is not None: if time.time() - start_time > self.warn_after_seconds and not has_warned: warnings.warn(f'Waited on lock file "{self.lock_file_path}" for ' - f'{self.warn_after_seconds} seconds.') + f'{self.warn_after_seconds} seconds.', stacklevel=2) has_warned = True def release(self): diff --git a/torch/utils/hooks.py b/torch/utils/hooks.py index 9ee3dbe18e9..157a5f4fb22 100644 --- a/torch/utils/hooks.py +++ b/torch/utils/hooks.py @@ -88,7 +88,7 @@ def warn_if_has_hooks(tensor): warnings.warn(f"backward hook {repr(hook)} on tensor will not be " "serialized. If this is expected, you can " "decorate the function with @torch.utils.hooks.unserializable_hook " - "to suppress this warning") + "to suppress this warning", stacklevel=2) class BackwardHook: """ diff --git a/torch/utils/model_dump/__init__.py b/torch/utils/model_dump/__init__.py index 9b39c303ac3..2ba3ea36088 100644 --- a/torch/utils/model_dump/__init__.py +++ b/torch/utils/model_dump/__init__.py @@ -429,7 +429,7 @@ def get_info_and_burn_skeleton(path_or_bytesio, **kwargs): def main(argv, *, stdout=None): - warnings.warn("torch.utils.model_dump is deprecated and will be removed in a future PyTorch release.") + warnings.warn("torch.utils.model_dump is deprecated and will be removed in a future PyTorch release.", stacklevel=2) parser = argparse.ArgumentParser() parser.add_argument("--style", choices=["json", "html"]) parser.add_argument("--title")