diff --git a/torch/__init__.py b/torch/__init__.py index 95f55ae5878..78a4c220a05 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -2653,7 +2653,8 @@ def compile( if torch.compiler.is_exporting(): warnings.warn( "You are calling torch.compile inside torch.export region. " - "To capture an useful graph, we will implicitly switch to torch.compile(backend=eager)" + "To capture a useful graph, we will implicitly switch to torch.compile(backend=eager)", + stacklevel=2, ) from torch._higher_order_ops.utils import setup_compilation_env diff --git a/torch/_custom_op/impl.py b/torch/_custom_op/impl.py index bcc0193fb88..1398f808da2 100644 --- a/torch/_custom_op/impl.py +++ b/torch/_custom_op/impl.py @@ -55,6 +55,7 @@ def warn_deprecated(): "torch._custom_op is deprecated and will be removed in PyTorch 2.6, please " "use the equivalent torch.library API instead.", DeprecationWarning, + stacklevel=2, ) diff --git a/torch/_export/converter.py b/torch/_export/converter.py index e2a3be17118..1a928f011bb 100644 --- a/torch/_export/converter.py +++ b/torch/_export/converter.py @@ -704,7 +704,8 @@ class TS2FXGraphConverter: # In a sense, the converter now becomes an stateful interpreter warnings.warn( "Converting aten::append.t, which is a inplace mutation of the list. " - "This makes the converter non-functional: the result depends on the order of the append nodes being converter!" + "This makes the converter non-functional: the result depends on the order of the append nodes being converted!", + stacklevel=2, ) args = tuple(self.get_fx_value_by_ir_value(inp) for inp in node.inputs()) @@ -1471,7 +1472,8 @@ DEBUG: (TORCH_LOGS="+export" ), additionally for k, tensor in self.ts_model.state_dict().items(): # type: ignore[union-attr] if k not in ep.state_dict: warnings.warn( - f"Manually populate {k} into state_dict ExportedProgram, but it is never used by the ExportedProgram." + f"Manually populate {k} into state_dict ExportedProgram, but it is never used by the ExportedProgram.", + stacklevel=2, ) ep.state_dict[k] = tensor diff --git a/torch/_export/tools.py b/torch/_export/tools.py index 0007de25d3e..b254fd62e3b 100644 --- a/torch/_export/tools.py +++ b/torch/_export/tools.py @@ -51,7 +51,8 @@ def _generate_inputs_for_submodules( model(*args, **kwargs) except Exception as e: warnings.warn( - f"Failed to generate submodule inputs because of the following error:\n{e}" + f"Failed to generate submodule inputs because of the following error:\n{e}", + stacklevel=2, ) finally: for h in handles: diff --git a/torch/_functorch/_aot_autograd/frontend_utils.py b/torch/_functorch/_aot_autograd/frontend_utils.py index a75863cd739..01ae9509cb6 100644 --- a/torch/_functorch/_aot_autograd/frontend_utils.py +++ b/torch/_functorch/_aot_autograd/frontend_utils.py @@ -321,5 +321,6 @@ def _detect_attribute_assignment(mod: torch.nn.Module): warnings.warn( f"The tensor {noun} {', '.join(assigned_tensor_attributes)} {verb} assigned during export. " "Such attributes must be registered as buffers using the `register_buffer` API " "(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer)."
+ "(https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer).", + stacklevel=2, ) diff --git a/torch/_functorch/_aot_autograd/utils.py b/torch/_functorch/_aot_autograd/utils.py index eae75e06a42..2676042198d 100644 --- a/torch/_functorch/_aot_autograd/utils.py +++ b/torch/_functorch/_aot_autograd/utils.py @@ -137,7 +137,8 @@ def call_func_at_runtime_with_args( warnings.warn( "Your compiler for AOTAutograd is returning a function that doesn't take boxed arguments. " "Please wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. " - "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale." + "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale.", + stacklevel=2, ) out = normalize_as_list(f(*args)) return out diff --git a/torch/_higher_order_ops/auto_functionalize.py b/torch/_higher_order_ops/auto_functionalize.py index cca12066bc3..9639a8b68d4 100644 --- a/torch/_higher_order_ops/auto_functionalize.py +++ b/torch/_higher_order_ops/auto_functionalize.py @@ -518,7 +518,8 @@ def do_auto_functionalize( if "self" in unwrapped_kwargs or "self_" in unwrapped_kwargs: warnings.warn( "Using `self` or `self_` as an argument in the definition of custom ops may lead to ambiguous parsing. " - "Please consider using a different name for this argument to avoid potential issues." + "Please consider using a different name for this argument to avoid potential issues.", + stacklevel=2, ) with ctx.redispatch_to_next(): unwrapped_outs = auto_functionalized( @@ -691,7 +692,8 @@ def do_auto_functionalize_v2( if "self" in unwrapped_kwargs or "self_" in unwrapped_kwargs: warnings.warn( "Using `self` or `self_` as an argument in the definition of custom ops may lead to ambiguous parsing. " - "Please consider using a different name for this argument to avoid potential issues." + "Please consider using a different name for this argument to avoid potential issues.", + stacklevel=2, ) all_basis_unwrapped = ctx.unwrap_tensors(all_bases) diff --git a/torch/_higher_order_ops/base_hop.py b/torch/_higher_order_ops/base_hop.py index a7647c70984..37c5d320928 100644 --- a/torch/_higher_order_ops/base_hop.py +++ b/torch/_higher_order_ops/base_hop.py @@ -196,7 +196,8 @@ class BaseHOP(HigherOrderOperator, abc.ABC): "Aliasing is not supported for HOP subgraph.\n" f"{subgraph.print_readable(print_output=False)}\n" f"Alias info: inp-inp alias: {inp_inp_alias}, inp-out alias: {inp_out_alias}, out-out alias{out_out_alias}" - f"This may lead to silent incorrectness." + f"This may lead to silent incorrectness.", + stacklevel=2, ) schema_gen = HopSchemaGenerator(self) diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index 9379111d689..c06ee8fcad4 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -177,6 +177,7 @@ def cond( "Pred is a Python constant. When used with torch.cond, it specializes on one of the branches." " If you want torch.cond to preserve two branches, please make the predicate a boolean tensor or a SymBool.", UserWarning, + stacklevel=2, ) # This is the eager case. We can just run the true or false branch. if pred: diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 192f969e5c6..31f4e181b65 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -859,6 +859,7 @@ def ignore(drop=False, **kwargs): warnings.warn( "ignore(drop_on_export=True) has been deprecated. 
TorchScript will now drop the function " "call on compilation. Use torch.jit.unused now. {}", + stacklevel=2, category=FutureWarning, ) @@ -867,6 +868,7 @@ def ignore(drop=False, **kwargs): warnings.warn( "ignore(True) has been deprecated. TorchScript will now drop the function " "call on compilation. Use torch.jit.unused now. {}", + stacklevel=2, category=FutureWarning, ) @@ -992,7 +994,8 @@ def _check_overload_body(func): # Parsing the function definition can raise an OSError if source is unavailable. # Since this is just an initial check, just raise a warning if this is the case. warnings.warn( - f"Unable to retrieve source for @torch.jit._overload function: {func}." + f"Unable to retrieve source for @torch.jit._overload function: {func}.", + stacklevel=2, ) return @@ -1385,7 +1388,8 @@ def check_empty_containers(obj) -> None: "calling torch.jit.isinstance in eager mode. For " "example, List[int] would become list and " "therefore falsely return True for List[float] or" - " List[str]." + " List[str].", + stacklevel=2, ) diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py index 2afb2331048..c53c4d0d608 100644 --- a/torch/_prims_common/__init__.py +++ b/torch/_prims_common/__init__.py @@ -2137,7 +2137,8 @@ def alert_not_deterministic(caller: str): f"{caller} does not have a deterministic implementation, but you set " f"'torch.use_deterministic_algorithms(True, warn_only=True)'. " f"You can file an issue at https://github.com/pytorch/pytorch/issues " - f"to help us prioritize adding deterministic support for this operation." + f"to help us prioritize adding deterministic support for this operation.", + stacklevel=2, ) else: torch._check( diff --git a/torch/_prims_common/wrappers.py b/torch/_prims_common/wrappers.py index 23e242290d9..8f6b7e5f1a5 100644 --- a/torch/_prims_common/wrappers.py +++ b/torch/_prims_common/wrappers.py @@ -180,7 +180,7 @@ def _resize_output_check(out: TensorLikeType, shape: ShapeType): "be resized unless they have zero elements. " "You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0)." ) - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) return True diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 9e70cdf9a9e..6dcee880deb 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -3729,7 +3729,8 @@ def istft( if end > expected_output_signal_len: warnings.warn( "The length of signal is shorter than the length parameter. Result is being " - + "padded with zeros in the tail. Please check your center and hop_length settings" + + "padded with zeros in the tail. Please check your center and hop_length settings", + stacklevel=2, ) y = aten.constant_pad_nd(y, (0, end - expected_output_signal_len), 0) return y diff --git a/torch/_subclasses/functional_tensor.py b/torch/_subclasses/functional_tensor.py index 83d0afb837b..208f48da361 100644 --- a/torch/_subclasses/functional_tensor.py +++ b/torch/_subclasses/functional_tensor.py @@ -405,7 +405,8 @@ class FunctionalTensorMode(TorchDispatchMode): warnings.warn( f"At pre-dispatch tracing, we assume that any custom op marked with " f"CompositeImplicitAutograd and have functional schema are safe to not decompose. " - f"Found {func} to be one such op." 
+ f"Found {func} to be one such op.", + stacklevel=2, ) return False return True diff --git a/torch/_tensor.py b/torch/_tensor.py index 165fd6ba7e1..f020b733094 100644 --- a/torch/_tensor.py +++ b/torch/_tensor.py @@ -350,7 +350,8 @@ class Tensor(torch._C.TensorBase): # hypothesis is that no one cares for meta tensors. if skip_data: warnings.warn( - "Serializing tensors on the meta device under skip_data context manager is a no-op" + "Serializing tensors on the meta device under skip_data context manager is a no-op", + stacklevel=2, ) arg_meta = ( self.dtype, @@ -1033,7 +1034,7 @@ class Tensor(torch._C.TensorBase): def resize(self, *sizes): if has_torch_function_unary(self): return handle_torch_function(Tensor.resize, (self,), self, *sizes) - warnings.warn("non-inplace resize is deprecated") + warnings.warn("non-inplace resize is deprecated", stacklevel=2) from torch.autograd._functions import Resize return Resize.apply(self, sizes) @@ -1041,7 +1042,7 @@ class Tensor(torch._C.TensorBase): def resize_as(self, tensor): if has_torch_function_variadic(self, tensor): return handle_torch_function(Tensor.resize_as, (self, tensor), self, tensor) - warnings.warn("non-inplace resize_as is deprecated") + warnings.warn("non-inplace resize_as is deprecated", stacklevel=2) from torch.autograd._functions import Resize return Resize.apply(self, tensor.size()) diff --git a/torch/_utils.py b/torch/_utils.py index 991e543e7a5..01cf9d39318 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -118,7 +118,7 @@ def _get_async_or_non_blocking(function_name, non_blocking, kwargs): message = "{}() got an unexpected keyword argument '{}'" argument = list(kwargs.keys()).pop() raise TypeError(message.format(function_name, argument)) - warnings.warn("'async' is deprecated; use 'non_blocking'") + warnings.warn("'async' is deprecated; use 'non_blocking'", stacklevel=2) return kwargs["async"] diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py index d33c10ed384..1ac9d2046f2 100644 --- a/torch/_weights_only_unpickler.py +++ b/torch/_weights_only_unpickler.py @@ -555,7 +555,8 @@ class Unpickler: f"Detected pickle protocol {self.proto} in the checkpoint, which was " "not the default pickle protocol used by `torch.load` (2). The weights_only " "Unpickler might not support all instructions implemented by this protocol, " - "please file an issue for adding support if you encounter this." + "please file an issue for adding support if you encounter this.", + stacklevel=2, ) elif key[0] == STOP[0]: rc = self.stack.pop() diff --git a/torch/amp/autocast_mode.py b/torch/amp/autocast_mode.py index c23058dc336..5b4666fcb28 100644 --- a/torch/amp/autocast_mode.py +++ b/torch/amp/autocast_mode.py @@ -267,7 +267,8 @@ class autocast: and torch.cuda.amp.common.amp_definitely_not_available() ): warnings.warn( - "User provided device_type of 'cuda', but CUDA is not available. Disabling" + "User provided device_type of 'cuda', but CUDA is not available. Disabling", + stacklevel=2, ) enabled = False if cache_enabled is not None: @@ -281,42 +282,42 @@ class autocast: error_message += ( ", ".join(str(dtype) for dtype in supported_dtype) + " currently." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "mtia": supported_dtype = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtype: error_message = "In MTIA autocast, but the target dtype is not supported. 
Disabling autocast.\n" error_message += "MTIA Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "maia": supported_dtype = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtype: error_message = "In MAIA autocast, but the target dtype is not supported. Disabling autocast.\n" error_message += "MAIA Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "xpu": supported_dtype = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtype: error_message = "In XPU autocast, but the target dtype is not supported. Disabling autocast.\n" error_message += "XPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "ipu": supported_dtypes = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtypes: error_message = "In IPU autocast, but the target dtype is not supported. Disabling autocast.\n" error_message += "IPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "hpu": supported_dtype = [torch.bfloat16, torch.float16] if self.fast_dtype not in supported_dtype: error_message = "In HPU autocast, but the target dtype is not supported. Disabling autocast.\n" error_message += "HPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently." - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == self.custom_backend_name: supported_dtype = self.custom_device_mod.get_amp_supported_dtype() @@ -326,7 +327,7 @@ class autocast: error_message += ( ", ".join(str(dtype) for dtype in supported_dtype) + " currently." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "cuda": if ( @@ -344,7 +345,7 @@ class autocast: "In MPS autocast, but the target dtype is not supported. Disabling autocast.\n" "MPS Autocast only supports dtype of torch.bfloat16 and torch.float16 currently." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.fast_dtype == torch.bfloat16: if not torch.backends.mps.is_macos_or_newer(14, 0): @@ -352,7 +353,7 @@ class autocast: "In MPS autocast, but the target dtype torch.bfloat16 is not supported " "on macOS versions below 14. Disabling autocast." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False elif self.device == "xla": supported_dtype = [torch.float16, torch.bfloat16] @@ -361,7 +362,7 @@ class autocast: error_message += ( "XLA Autocast only supports dtype of torch.bfloat16 currently." ) - warnings.warn(error_message) + warnings.warn(error_message, stacklevel=2) enabled = False self._enabled = enabled diff --git a/torch/amp/grad_scaler.py b/torch/amp/grad_scaler.py index 54314b034d1..506c2cf9016 100644 --- a/torch/amp/grad_scaler.py +++ b/torch/amp/grad_scaler.py @@ -422,6 +422,7 @@ class GradScaler: "optimizer. 
In the near future GradScaler registers `grad_scale: Tensor` and " "`found_inf: Tensor` to the passed optimizer and let the optimizer use them directly.", FutureWarning, + stacklevel=2, ) kwargs_.update({"grad_scaler": self}) else: diff --git a/torch/ao/nn/quantizable/modules/rnn.py b/torch/ao/nn/quantizable/modules/rnn.py index f13bb9b1a16..9b807e82ddb 100644 --- a/torch/ao/nn/quantizable/modules/rnn.py +++ b/torch/ao/nn/quantizable/modules/rnn.py @@ -469,14 +469,16 @@ class LSTM(torch.nn.Module): warnings.warn( "dropout option for quantizable LSTM is ignored. " "If you are training, please, use nn.LSTM version " - "followed by `prepare` step." + "followed by `prepare` step.", + stacklevel=2, ) if num_layers == 1: warnings.warn( "dropout option adds dropout after all but last " "recurrent layer, so non-zero dropout expects " f"num_layers greater than 1, but got dropout={dropout} " - f"and num_layers={num_layers}" + f"and num_layers={num_layers}", + stacklevel=2, ) layers = [ diff --git a/torch/ao/nn/quantized/dynamic/modules/conv.py b/torch/ao/nn/quantized/dynamic/modules/conv.py index 1f8a65fe9d6..0c8785da899 100644 --- a/torch/ao/nn/quantized/dynamic/modules/conv.py +++ b/torch/ao/nn/quantized/dynamic/modules/conv.py @@ -68,7 +68,8 @@ class Conv1d(nnq.Conv1d): reduce_range=True, ): warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} kernel_size = _single(kernel_size) @@ -155,7 +156,8 @@ class Conv2d(nnq.Conv2d): ): warnings.warn( f"The current implementation of the {self._get_name()} module " - "has poor numerical accuracy and its use is not recommended" + "has poor numerical accuracy and its use is not recommended", + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} kernel_size = _pair(kernel_size) @@ -239,7 +241,8 @@ class Conv3d(nnq.Conv3d): dtype=None, ): warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) assert padding_mode != "reflect", "Conv3d does not support reflection padding" factory_kwargs = {"device": device, "dtype": dtype} @@ -330,7 +333,8 @@ class ConvTranspose1d(nnq.ConvTranspose1d): dtype=None, ): warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} super().__init__( @@ -412,7 +416,8 @@ class ConvTranspose2d(nnq.ConvTranspose2d): dtype=None, ): warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} super().__init__( @@ -494,7 +499,8 @@ class ConvTranspose3d(nnq.ConvTranspose3d): dtype=None, ): 
warnings.warn( - f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended" # noqa: B950 + f"The current implementation of the {self._get_name()} module has poor numerical accuracy and its use is not recommended", # noqa: B950 + stacklevel=2, ) factory_kwargs = {"device": device, "dtype": dtype} super().__init__( diff --git a/torch/ao/nn/quantized/dynamic/modules/rnn.py b/torch/ao/nn/quantized/dynamic/modules/rnn.py index fb5371ea4a4..4c2b43189c3 100644 --- a/torch/ao/nn/quantized/dynamic/modules/rnn.py +++ b/torch/ao/nn/quantized/dynamic/modules/rnn.py @@ -136,7 +136,8 @@ class RNNBase(torch.nn.Module): "dropout option adds dropout after all but last " "recurrent layer, so non-zero dropout expects " f"num_layers greater than 1, but got dropout={dropout} and " - f"num_layers={num_layers}" + f"num_layers={num_layers}", + stacklevel=2, ) if mode == "LSTM": diff --git a/torch/ao/nn/quantized/functional.py b/torch/ao/nn/quantized/functional.py index 51a2f4905c2..30994b2921b 100644 --- a/torch/ao/nn/quantized/functional.py +++ b/torch/ao/nn/quantized/functional.py @@ -724,7 +724,8 @@ def upsample(input, size=None, scale_factor=None, mode="nearest", align_corners= affects the outputs. """ warnings.warn( - "nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead." + "nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead.", + stacklevel=2, ) return interpolate(input, size, scale_factor, mode, align_corners) @@ -749,7 +750,8 @@ def upsample_bilinear(input, size=None, scale_factor=None): """ # DeprecationWarning is ignored by default warnings.warn( - "nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead." + "nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead.", + stacklevel=2, ) return interpolate(input, size, scale_factor, mode="bilinear", align_corners=True) @@ -774,6 +776,7 @@ def upsample_nearest(input, size=None, scale_factor=None): """ # DeprecationWarning is ignored by default warnings.warn( - "nn.quantized.functional.upsample_nearest is deprecated. Use nn.quantized.functional.interpolate instead." + "nn.quantized.functional.upsample_nearest is deprecated. 
Use nn.quantized.functional.interpolate instead.", + stacklevel=2, ) return interpolate(input, size, scale_factor, mode="nearest") diff --git a/torch/ao/nn/quantized/modules/activation.py b/torch/ao/nn/quantized/modules/activation.py index 67b69eb7390..c8c1571bda3 100644 --- a/torch/ao/nn/quantized/modules/activation.py +++ b/torch/ao/nn/quantized/modules/activation.py @@ -322,7 +322,8 @@ class PReLU(torch.nn.Module): observer(float_wt) if observer.dtype != torch.quint8: warn( - f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}" + f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}", + stacklevel=2, ) wt_scale, wt_zp = observer.calculate_qparams() qweight = torch.quantize_per_tensor( @@ -339,7 +340,8 @@ class PReLU(torch.nn.Module): observer(float_wt) if observer.dtype != torch.quint8: warn( - f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}" + f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}", + stacklevel=2, ) wt_scale, wt_zp = observer.calculate_qparams() qweight = torch.quantize_per_tensor( diff --git a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py index 4330b0e2425..d536245b0e9 100644 --- a/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py +++ b/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py @@ -213,7 +213,8 @@ class ActivationSparsifier: if name in self.data_groups: # unregister layer if already present warnings.warn( - "layer already attached to the sparsifier, deregistering the layer and registering with new config" + "layer already attached to the sparsifier, deregistering the layer and registering with new config", + stacklevel=2, ) self.unregister_layer(name=name) diff --git a/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py b/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py index 672903e8f05..c2f48abfc9d 100644 --- a/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py +++ b/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py @@ -158,6 +158,7 @@ class BaseDataScheduler: "initialization. Please, make sure to call `data_sparsifier.step()` before " "`scheduler.step()`.", UserWarning, + stacklevel=2, ) # Just check if there were two first scheduler.step() calls before sparsifier.step() @@ -167,6 +168,7 @@ class BaseDataScheduler: "You have to make sure you run the data_sparsifier.step() BEFORE any " "calls to the scheduler.step().", UserWarning, + stacklevel=2, ) self._step_count += 1 diff --git a/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py b/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py index 3dea01586a2..0db7becdda5 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py @@ -105,7 +105,8 @@ class BaseDataSparsifier(base_sparsifier.BaseSparsifier): if name in self.state: # If the named data already exists - replace warnings.warn( - "Replacing existing data of the same name. - Did you mean a different name?" + "Replacing existing data of the same name. 
- Did you mean a different name?", + stacklevel=2, ) # reuse old config diff --git a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py index 5a36e13c7b4..9447e3331c2 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py +++ b/torch/ao/pruning/_experimental/data_sparsifier/lightning/tests/test_callbacks.py @@ -74,6 +74,7 @@ class StepSLScheduler(BaseDataScheduler): "To get the last learning rate computed by the scheduler, " "please use `get_last_lr()`.", UserWarning, + stacklevel=2, ) data_groups = self.data_sparsifier.data_groups if (self.last_epoch == 0) or (self.last_epoch % self.step_size != 0): diff --git a/torch/ao/pruning/scheduler/base_scheduler.py b/torch/ao/pruning/scheduler/base_scheduler.py index f602028d475..ac8916713da 100644 --- a/torch/ao/pruning/scheduler/base_scheduler.py +++ b/torch/ao/pruning/scheduler/base_scheduler.py @@ -92,7 +92,8 @@ class BaseScheduler: if not self._get_sl_called_within_step: warnings.warn( "To get the last sparsity level computed by the scheduler, " - "please use `get_last_sl()`." + "please use `get_last_sl()`.", + stacklevel=2, ) raise NotImplementedError @@ -124,6 +125,7 @@ class BaseScheduler: "initialization. Please, make sure to call `sparsifier.step()` before " "`scheduler.step()`.", UserWarning, + stacklevel=2, ) # Just check if there were two first scheduler.step() calls before sparsifier.step() @@ -133,6 +135,7 @@ class BaseScheduler: "You have to make sure you run the sparsifier.step() BEFORE any " "calls to the scheduler.step().", UserWarning, + stacklevel=2, ) self._step_count += 1 diff --git a/torch/ao/pruning/scheduler/cubic_scheduler.py b/torch/ao/pruning/scheduler/cubic_scheduler.py index 45985a8bbc5..d4706900762 100644 --- a/torch/ao/pruning/scheduler/cubic_scheduler.py +++ b/torch/ao/pruning/scheduler/cubic_scheduler.py @@ -90,7 +90,8 @@ class CubicSL(BaseScheduler): if not self._get_sl_called_within_step: warnings.warn( "To get the last sparsity level computed by the scheduler, " - "please use `get_last_sl()`." + "please use `get_last_sl()`.", + stacklevel=2, ) return [ self.sparsity_compute_fn( diff --git a/torch/ao/pruning/scheduler/lambda_scheduler.py b/torch/ao/pruning/scheduler/lambda_scheduler.py index 7c0e8088890..d9b6cb0a4d9 100644 --- a/torch/ao/pruning/scheduler/lambda_scheduler.py +++ b/torch/ao/pruning/scheduler/lambda_scheduler.py @@ -56,7 +56,8 @@ class LambdaSL(BaseScheduler): if not self._get_sl_called_within_step: warnings.warn( "To get the last sparsity level computed by the scheduler, " - "please use `get_last_sl()`." + "please use `get_last_sl()`.", + stacklevel=2, ) return [ base_sl * lmbda(self.last_epoch) diff --git a/torch/ao/quantization/fx/_equalize.py b/torch/ao/quantization/fx/_equalize.py index 71563c236aa..37b72c372e2 100644 --- a/torch/ao/quantization/fx/_equalize.py +++ b/torch/ao/quantization/fx/_equalize.py @@ -121,7 +121,8 @@ class _InputEqualizationObserver(nn.Module): ): warnings.warn( "Must call calculate_equalization_scale before calling calculate_scaled_minmax. " - + "Will not scale the next quantization observer." + + "Will not scale the next quantization observer.", + stacklevel=2, ) return None, None @@ -226,7 +227,8 @@ def calculate_equalization_scale( ): warnings.warn( "Must run observer before calling calculate_equalization_scale. " - + "Returning default equalization scale torch.tensor(1)." 
+ + "Returning default equalization scale torch.tensor(1).", + stacklevel=2, ) return torch.tensor(1) diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py index cde3a92987c..6ad8433230f 100644 --- a/torch/ao/quantization/fx/convert.py +++ b/torch/ao/quantization/fx/convert.py @@ -597,7 +597,8 @@ def _maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph) -> No _maybe_recursive_remove_dequantize(arg_element, node, graph) else: warnings.warn( - f"Unsupported node type in recursive remove dequantize: {type(arg)}" + f"Unsupported node type in recursive remove dequantize: {type(arg)}", + stacklevel=2, ) @@ -1197,7 +1198,8 @@ def convert( _maybe_recursive_remove_dequantize(output, return_node, model.graph) else: warnings.warn( - f"Unsupported node type for output_quantized_idxs: {type(output)}" + f"Unsupported node type for output_quantized_idxs: {type(output)}", + stacklevel=2, ) elif node.op == "call_module": mod = _get_module(node, modules) diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py index 4ea44181e96..e3561965daf 100644 --- a/torch/ao/quantization/fx/prepare.py +++ b/torch/ao/quantization/fx/prepare.py @@ -1055,7 +1055,9 @@ def _maybe_insert_input_equalization_observers_for_node( return if is_branch: - warnings.warn(f"Cannot equalize {node} because it is part of a branch.") + warnings.warn( + f"Cannot equalize {node} because it is part of a branch.", stacklevel=2 + ) return new_args = [] diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py index 287b30c0bb8..232d4456843 100644 --- a/torch/ao/quantization/fx/utils.py +++ b/torch/ao/quantization/fx/utils.py @@ -890,7 +890,8 @@ def _qconfig_satisfies_dtype_config_constraints( if backend_quant_min is not None and backend_quant_max is not None: if app_quant_min is None or app_quant_max is None: warnings.warn( - f"QConfig {debug_string} must specify 'quant_min' and 'quant_max', ignoring {qconfig}" + f"QConfig {debug_string} must specify 'quant_min' and 'quant_max', ignoring {qconfig}", + stacklevel=2, ) return False elif app_quant_min < backend_quant_min or app_quant_max > backend_quant_max: @@ -898,20 +899,23 @@ def _qconfig_satisfies_dtype_config_constraints( f"QConfig {debug_string} quantization range must fall within the backend's:\n" f"QConfig range = ({app_quant_min}, {app_quant_max}), " f"BackendConfig range = ({backend_quant_min}, {backend_quant_max}), " - f"ignoring {qconfig}" + f"ignoring {qconfig}", + stacklevel=2, ) return False # check scale min if backend_scale_min is not None: if app_scale_min is None: warnings.warn( - f"QConfig {debug_string} must specify 'eps', ignoring {qconfig}" + f"QConfig {debug_string} must specify 'eps', ignoring {qconfig}", + stacklevel=2, ) return False if app_scale_min < backend_scale_min: warnings.warn( f"QConfig {debug_string} eps ({app_scale_min}) must be greater than or equal to " - f"the backend's min scale value ({backend_scale_min}), ignoring {qconfig}" + f"the backend's min scale value ({backend_scale_min}), ignoring {qconfig}", + stacklevel=2, ) return False # check fixed scale and zero point @@ -935,7 +939,8 @@ def _qconfig_satisfies_dtype_config_constraints( ) and not isinstance(activation_post_process, FixedQParamsFakeQuantize): warnings.warn( f"QConfig must specify a FixedQParamsObserver or a FixedQParamsFakeQuantize " - f"for fixed qparams ops, ignoring {qconfig}.\n{suggestion_str}" + f"for fixed qparams ops, ignoring {qconfig}.\n{suggestion_str}", + stacklevel=2, ) 
return False if ( @@ -945,7 +950,8 @@ def _qconfig_satisfies_dtype_config_constraints( warnings.warn( f"QConfig fixed scale ({observer.scale}) and zero point ({observer.zero_point}) " f"do not match the backend's ({backend_scale_exact_match} and {backend_zero_point_exact_match}), " - f"ignoring {qconfig}.\n{suggestion_str}" + f"ignoring {qconfig}.\n{suggestion_str}", + stacklevel=2, ) return False return True diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py index 160738c93ee..06d1a3fd717 100644 --- a/torch/ao/quantization/observer.py +++ b/torch/ao/quantization/observer.py @@ -245,7 +245,8 @@ class UniformQuantizationObserverBase(ObserverBase): if reduce_range: warnings.warn( "Please use quant_min and quant_max to specify the range for observers. \ - reduce_range will be deprecated in a future release of PyTorch." + reduce_range will be deprecated in a future release of PyTorch.", + stacklevel=2, ) self.reduce_range = reduce_range self.register_buffer("eps", torch.tensor([eps], **factory_kwargs)) @@ -829,7 +830,8 @@ class PerChannelMinMaxObserver(UniformQuantizationObserverBase): self.max_val.resize_(val.shape) else: warnings.warn( - f"Observer load_from_state_dict got unexpected name {name}" + f"Observer load_from_state_dict got unexpected name {name}", + stacklevel=2, ) # For torchscript module we need to update the attributes here since we do not # call the `_load_from_state_dict` function defined module.py @@ -840,7 +842,8 @@ class PerChannelMinMaxObserver(UniformQuantizationObserverBase): self.max_val.copy_(val) else: warnings.warn( - f"Observer load_from_state_dict got unexpected name {name}" + f"Observer load_from_state_dict got unexpected name {name}", + stacklevel=2, ) elif strict: missing_keys.append(key) @@ -1289,7 +1292,9 @@ class HistogramObserver(UniformQuantizationObserverBase): # want to make our quantization range infinite # and in practice those values will be clamped if x_min == -torch.inf or x_max == torch.inf: - warnings.warn("torch.inf detected in input tensor, ignoring input") + warnings.warn( + "torch.inf detected in input tensor, ignoring input", stacklevel=2 + ) x = x[x.abs() != torch.inf] if x.numel() == 0: return x_orig @@ -1345,7 +1350,8 @@ class HistogramObserver(UniformQuantizationObserverBase): if is_uninitialized: warnings.warn( "must run observer before calling calculate_qparams.\ - Returning default scale and zero point " + Returning default scale and zero point ", + stacklevel=2, ) return torch.tensor([1.0], device=self.min_val.device.type), torch.tensor( [0], device=self.min_val.device.type @@ -1509,7 +1515,8 @@ class PlaceholderObserver(ObserverBase): warnings.warn( "Please use `is_dynamic` instead of `compute_dtype`. \ `compute_dtype` will be deprecated in a future release \ - of PyTorch." + of PyTorch.", + stacklevel=2, ) def forward(self, x): diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py index 623fd12434a..c3d9f773390 100644 --- a/torch/ao/quantization/qconfig.py +++ b/torch/ao/quantization/qconfig.py @@ -292,7 +292,8 @@ def get_default_qconfig(backend="x86", version=0): if not torch.cpu._is_vnni_supported(): warnings.warn( "Default qconfig of oneDNN backend with reduce_range of false may have accuracy issues " - "on CPU without Vector Neural Network Instruction support." 
+ "on CPU without Vector Neural Network Instruction support.", + stacklevel=2, ) qconfig = QConfig( activation=HistogramObserver.with_args(reduce_range=False), diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py index 5a0037b6620..3c53876081e 100644 --- a/torch/ao/quantization/quantize.py +++ b/torch/ao/quantization/quantize.py @@ -392,7 +392,8 @@ def prepare( warnings.warn( "None of the submodule got qconfig applied. Make sure you " "passed correct configuration through `qconfig_dict` or " - "by assigning the `.qconfig` attribute directly on submodules" + "by assigning the `.qconfig` attribute directly on submodules", + stacklevel=2, ) _add_observer_( diff --git a/torch/ao/quantization/quantizer/x86_inductor_quantizer.py b/torch/ao/quantization/quantizer/x86_inductor_quantizer.py index db47aa04790..c6fed271a3a 100644 --- a/torch/ao/quantization/quantizer/x86_inductor_quantizer.py +++ b/torch/ao/quantization/quantizer/x86_inductor_quantizer.py @@ -372,6 +372,7 @@ def _config_checker(method: Callable) -> Callable: if quantizer._need_skip_config(quantization_config): warnings.warn( f"Skip the quantization config for {name}.", + stacklevel=2, ) return quantizer return method(quantizer, name, quantization_config) @@ -464,7 +465,10 @@ class X86InductorQuantizer(Quantizer): current_mode.qat_state is not None and current_mode.qat_state != quantization_config.is_qat ): - warnings.warn("Mixed QAT and Non-QAT quantization config is not supported.") + warnings.warn( + "Mixed QAT and Non-QAT quantization config is not supported.", + stacklevel=2, + ) need_skip = True if current_mode.dynamic_state is not None: input_activation_spec = quantization_config.input_activation @@ -473,14 +477,15 @@ class X86InductorQuantizer(Quantizer): and current_mode.dynamic_state != input_activation_spec.is_dynamic ): warnings.warn( - "Mixed dynamic and static quantization config is not supported." + "Mixed dynamic and static quantization config is not supported.", + stacklevel=2, ) need_skip = True return need_skip def set_global(self, quantization_config: QuantizationConfig): if self._need_skip_config(quantization_config): - warnings.warn("Skip the global quantization config.") + warnings.warn("Skip the global quantization config.", stacklevel=2) return self self.global_config = quantization_config return self @@ -489,7 +494,8 @@ class X86InductorQuantizer(Quantizer): if not isinstance(self.global_config, QuantizationConfig): warnings.warn( "The global_config for X86InductorQuantizer is currently invalid. \ - Please ensure that you use set_global to establish the global quantization configuration." + Please ensure that you use set_global to establish the global quantization configuration.", + stacklevel=2, ) return self.global_config @@ -508,7 +514,8 @@ class X86InductorQuantizer(Quantizer): ) else: warnings.warn( - f"function: Unable to customize quantization config for {function_type} by X86InductorQuantizer." + f"function: Unable to customize quantization config for {function_type} by X86InductorQuantizer.", + stacklevel=2, ) return self @@ -525,7 +532,8 @@ class X86InductorQuantizer(Quantizer): ) else: warnings.warn( - f"Module: Unable to customize quantization config for {module_type} by X86InductorQuantizer." 
+ f"Module: Unable to customize quantization config for {module_type} by X86InductorQuantizer.", + stacklevel=2, ) return self @@ -551,7 +559,8 @@ class X86InductorQuantizer(Quantizer): self.operator_type_qconfig[operator_type] = quantization_config else: warnings.warn( - f"operator: Unable to quantize {operator} by X86InductorQuantizer." + f"operator: Unable to quantize {operator} by X86InductorQuantizer.", + stacklevel=2, ) return self @@ -1317,7 +1326,8 @@ class X86InductorQuantizer(Quantizer): if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check): if quantization_config is not None: warnings.warn( - f"The input of maxpool2d is not quantized, skip annotate maxpool2d with config {quantization_config}." + f"The input of maxpool2d is not quantized, skip annotate maxpool2d with config {quantization_config}.", + stacklevel=2, ) return diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py index 1874dc6e20b..63c635565c4 100644 --- a/torch/ao/quantization/utils.py +++ b/torch/ao/quantization/utils.py @@ -427,7 +427,8 @@ def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool: if min_val.numel() == 0 or max_val.numel() == 0: warnings.warn( "must run observer before calling calculate_qparams. " - + "Returning default values." + + "Returning default values.", + stacklevel=2, ) return False @@ -435,7 +436,8 @@ def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool: if min_val == float("inf") and max_val == float("-inf"): warnings.warn( "must run observer before calling calculate_qparams. " - + "Returning default values." + + "Returning default values.", + stacklevel=2, ) return False @@ -806,7 +808,8 @@ def _assert_and_get_unique_device(module: torch.nn.Module) -> Any: """ if {torch.device("cpu"), torch.device("meta")} == devices: warnings.warn( - "Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We Select 'cpu'." + "Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We Select 'cpu'.", + stacklevel=2, ) devices = {torch.device("cpu")} "" diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 674e42b34ad..956075590bb 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -944,7 +944,8 @@ def _check_inputs(tupled_inputs) -> bool: f"Input #{idx} requires gradient and " "is not a double precision floating point or complex. " "This check will likely fail if all the inputs are " - "not of double precision floating point or complex. " + "not of double precision floating point or complex. ", + stacklevel=2, ) if inp.is_sparse: content = inp._values() @@ -1325,7 +1326,8 @@ def _test_undefined_backward_mode(func, outputs, inputs) -> bool: "Backwards compatibility: New undefined gradient support checking " "feature is enabled by default, but it may break existing callers " "of this function. 
If this is true for you, you can call this " - 'function with "check_undefined_grad=False" to disable the feature' + 'function with "check_undefined_grad=False" to disable the feature', + stacklevel=2, ) def check_undefined_grad_support(output_to_check): diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 5c478e514d0..de821b7513d 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -265,22 +265,24 @@ class profile: if _get_privateuse1_backend_name() != "privateuseone": VALID_DEVICE_OPTIONS.append(_get_privateuse1_backend_name()) if self.use_device not in VALID_DEVICE_OPTIONS: - warn(f"The {self.use_device} is not a valid device option.") + warn( + f"The {self.use_device} is not a valid device option.", stacklevel=2 + ) self.use_device = None if self.use_device == "cuda" and not torch.cuda.is_available(): - warn("CUDA is not available, disabling CUDA profiling") + warn("CUDA is not available, disabling CUDA profiling", stacklevel=2) self.use_cuda = False self.use_device = None if self.use_device == "xpu" and not torch.xpu.is_available(): - warn("XPU is not available, disabling XPU profiling") + warn("XPU is not available, disabling XPU profiling", stacklevel=2) self.use_device = None if self.use_device == "hpu" and not ( hasattr(torch, "hpu") and torch.hpu.is_available() ): - warn("HPU is not available, disabling HPU profiling") + warn("HPU is not available, disabling HPU profiling", stacklevel=2) self.use_device = None self.kineto_activities = set() @@ -1224,7 +1226,8 @@ class KinetoStepTracker: if delta > 1: warn( "Profiler step count has increased more than 1 - " - f"current_step = {cls._current_step} step dict = {cls._step_dict}" + f"current_step = {cls._current_step} step dict = {cls._step_dict}", + stacklevel=2, ) for _ in range(delta): _kineto_step() diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py index 907b6d0b862..3423490d514 100644 --- a/torch/backends/cudnn/__init__.py +++ b/torch/backends/cudnn/__init__.py @@ -118,7 +118,8 @@ def is_acceptable(tensor): if not is_available(): warnings.warn( "PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild " - "PyTorch making sure the library is visible to the build system." 
+ "PyTorch making sure the library is visible to the build system.", + stacklevel=2, ) return False if not _init(): @@ -127,7 +128,8 @@ def is_acceptable(tensor): libpath={"darwin": "DYLD_LIBRARY_PATH", "win32": "PATH"}.get( sys.platform, "LD_LIBRARY_PATH" ) - ) + ), + stacklevel=2, ) return False return True diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index bb4a1e29dae..1d7155a1a61 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -293,7 +293,8 @@ def _check_capability(): min_arch % 10, max_arch // 10, max_arch % 10, - ) + ), + stacklevel=2, ) matched_arches = "" for arch, arch_info in CUDA_ARCHES_SUPPORTED.items(): @@ -303,7 +304,9 @@ def _check_capability(): ): matched_arches += f" {arch}" if matched_arches != "": - warnings.warn(matched_cuda_warn.format(matched_arches)) + warnings.warn( + matched_cuda_warn.format(matched_arches), stacklevel=2 + ) def _check_cubins(): @@ -328,7 +331,8 @@ If you want to use the {} GPU with PyTorch, please check the instructions at htt warnings.warn( incompatible_device_warn.format( device_name, capability, " ".join(arch_list), device_name - ) + ), + stacklevel=2, ) @@ -818,7 +822,9 @@ def _raw_device_count_amdsmi() -> int: try: amdsmi.amdsmi_init() except amdsmi.AmdSmiException as e: - warnings.warn(f"Can't initialize amdsmi - Error code: {e.err_code}") + warnings.warn( + f"Can't initialize amdsmi - Error code: {e.err_code}", stacklevel=2 + ) return -1 socket_handles = amdsmi.amdsmi_get_processor_handles() return len(socket_handles) @@ -831,12 +837,12 @@ def _raw_device_count_nvml() -> int: nvml_h = CDLL("libnvidia-ml.so.1") rc = nvml_h.nvmlInit() if rc != 0: - warnings.warn("Can't initialize NVML") + warnings.warn("Can't initialize NVML", stacklevel=2) return -1 dev_count = c_int(-1) rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) if rc != 0: - warnings.warn("Can't get nvml device count") + warnings.warn("Can't get nvml device count", stacklevel=2) return -1 del nvml_h return dev_count.value @@ -850,27 +856,27 @@ def _raw_device_uuid_amdsmi() -> Optional[list[str]]: try: amdsmi.amdsmi_init() except amdsmi.AmdSmiException: - warnings.warn("Can't initialize amdsmi") + warnings.warn("Can't initialize amdsmi", stacklevel=2) return None try: socket_handles = amdsmi.amdsmi_get_processor_handles() dev_count = len(socket_handles) except amdsmi.AmdSmiException: - warnings.warn("Can't get amdsmi device count") + warnings.warn("Can't get amdsmi device count", stacklevel=2) return None uuids: list[str] = [] for idx in range(dev_count): try: handler = amdsmi.amdsmi_get_processor_handles()[idx] except amdsmi.AmdSmiException: - warnings.warn("Cannot get amd device handler") + warnings.warn("Cannot get amd device handler", stacklevel=2) return None try: uuid = amdsmi.amdsmi_get_gpu_asic_info(handler)["asic_serial"][ 2: ] # Removes 0x prefix from serial except amdsmi.AmdSmiException: - warnings.warn("Cannot get uuid for amd device") + warnings.warn("Cannot get uuid for amd device", stacklevel=2) return None uuids.append( str(uuid).lower() @@ -885,25 +891,25 @@ def _raw_device_uuid_nvml() -> Optional[list[str]]: nvml_h = CDLL("libnvidia-ml.so.1") rc = nvml_h.nvmlInit() if rc != 0: - warnings.warn("Can't initialize NVML") + warnings.warn("Can't initialize NVML", stacklevel=2) return None dev_count = c_int(-1) rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) if rc != 0: - warnings.warn("Can't get nvml device count") + warnings.warn("Can't get nvml device count", stacklevel=2) return None uuids: list[str] = [] for idx in 
range(dev_count.value): dev_id = c_void_p() rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) if rc != 0: - warnings.warn("Can't get device handle") + warnings.warn("Can't get device handle", stacklevel=2) return None buf_len = 96 buf = create_string_buffer(buf_len) rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) if rc != 0: - warnings.warn("Can't get device UUID") + warnings.warn("Can't get device UUID", stacklevel=2) return None uuids.append(buf.raw.decode("ascii").strip("\0")) del nvml_h diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index dc4c3827c8a..b39c6a63f92 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -492,6 +492,7 @@ def reset_max_memory_allocated(device: "Device" = None) -> None: "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " "which resets /all/ peak memory stats.", FutureWarning, + stacklevel=2, ) return reset_peak_memory_stats(device=device) @@ -518,6 +519,7 @@ def reset_max_memory_cached(device: "Device" = None) -> None: "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " "which resets /all/ peak memory stats.", FutureWarning, + stacklevel=2, ) return reset_peak_memory_stats(device=device) diff --git a/torch/cuda/nccl.py b/torch/cuda/nccl.py index 7fa06bd7c12..bef781c19a0 100644 --- a/torch/cuda/nccl.py +++ b/torch/cuda/nccl.py @@ -14,7 +14,7 @@ SUM = 0 # ncclRedOp_t def is_available(tensors): if not hasattr(torch._C, "_nccl_all_reduce"): - warnings.warn("PyTorch is not compiled with NCCL support") + warnings.warn("PyTorch is not compiled with NCCL support", stacklevel=2) return False devices = set() diff --git a/torch/cuda/tunable.py b/torch/cuda/tunable.py index 262c6870d40..4a5ee73cbdd 100644 --- a/torch/cuda/tunable.py +++ b/torch/cuda/tunable.py @@ -626,7 +626,8 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None: else: warnings.warn( "Offline tuning is not supported for this GEMM. Use online tuning instead. " - + f"Skipped tuning for: {untuned_gemm[1]}" + + f"Skipped tuning for: {untuned_gemm[1]}", + stacklevel=2, ) return @@ -644,7 +645,8 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None: if m == 1 or n == 1 or k == 1: warnings.warn( "Offline tuning is not support for this GEMM. Use online tuning instead. " - + f"Skipped tuning for: {untuned_gemm[1]}" + + f"Skipped tuning for: {untuned_gemm[1]}", + stacklevel=2, ) return @@ -747,7 +749,7 @@ def _process_single_offline_gemm(untuned_gemm_line: str, gpu_id: int) -> None: matA = matA.t() torch.nn.functional.linear(X, matA, bias) else: - warnings.warn(f"error: unknown op {op_sig}") + warnings.warn(f"error: unknown op {op_sig}", stacklevel=2) def _check_tuning_assertions() -> None: @@ -756,7 +758,7 @@ def _check_tuning_assertions() -> None: """ if is_enabled() is False: - warnings.warn("TunableOp was disabled. Trying to enable now.") + warnings.warn("TunableOp was disabled. 
Trying to enable now.", stacklevel=2) enable(True) assert is_enabled() is True assert tuning_is_enabled() is True diff --git a/torch/distributed/_functional_collectives.py b/torch/distributed/_functional_collectives.py index e760a1a0744..8574e258335 100644 --- a/torch/distributed/_functional_collectives.py +++ b/torch/distributed/_functional_collectives.py @@ -23,7 +23,8 @@ try: from torch.compiler import is_dynamo_compiling as is_torchdynamo_compiling except Exception: warnings.warn( - "Unable to import torchdynamo util `is_torchdynamo_compiling`, so won't support torchdynamo correctly" + "Unable to import torchdynamo util `is_torchdynamo_compiling`, so won't support torchdynamo correctly", + stacklevel=2, ) def is_torchdynamo_compiling(): # type: ignore[misc] diff --git a/torch/distributed/_shard/sharded_tensor/api.py b/torch/distributed/_shard/sharded_tensor/api.py index 7b709a2965c..87fcc014257 100644 --- a/torch/distributed/_shard/sharded_tensor/api.py +++ b/torch/distributed/_shard/sharded_tensor/api.py @@ -470,7 +470,8 @@ class ShardedTensor(ShardedTensorBase): src = shard.tensor.flatten() if src.nelement() == 0: warnings.warn( - "Gathering a tensor with zero elements on rank " + str(rank) + "Gathering a tensor with zero elements on rank " + str(rank), + stacklevel=2, ) continue shard_offset = shard_placement[shard.metadata][1] @@ -671,7 +672,8 @@ class ShardedTensor(ShardedTensorBase): if device_to.index != current_idx: warnings.warn( "ShardedTensor.to only move tensor to its current device" - "If you want to put to different device, use `reshard` instead." + "If you want to put to different device, use `reshard` instead.", + stacklevel=2, ) device_to = torch.device(current_idx) diff --git a/torch/distributed/_tools/mod_tracker.py b/torch/distributed/_tools/mod_tracker.py index 3d5c1783d8a..ad736a8302f 100644 --- a/torch/distributed/_tools/mod_tracker.py +++ b/torch/distributed/_tools/mod_tracker.py @@ -182,7 +182,8 @@ class ModTracker: warnings.formatwarning = custom_formatwarning warnings.warn( "The module hierarchy tracking maybe be messed up." - " Please file a bug to PyTorch, if it is the case." + " Please file a bug to PyTorch, if it is the case.", + stacklevel=2, ) if name not in self.parents: self._active_module_cnt[name] = 1 diff --git a/torch/distributed/algorithms/join.py b/torch/distributed/algorithms/join.py index ee07c75f7ee..bf7cb117f87 100644 --- a/torch/distributed/algorithms/join.py +++ b/torch/distributed/algorithms/join.py @@ -257,7 +257,8 @@ class Join: f"{self._rank} has at least {WARN_THRESHOLD} " f"fewer inputs than other currently-active ranks. " "This level of skew could lead to performance " - "degradation during training." + "degradation during training.", + stacklevel=2, ) # Shadow the all-reduce in non-joined processes num_nonjoined_procs = self._get_num_nonjoined_procs() diff --git a/torch/distributed/algorithms/model_averaging/averagers.py b/torch/distributed/algorithms/model_averaging/averagers.py index eec08464167..dd97e519180 100644 --- a/torch/distributed/algorithms/model_averaging/averagers.py +++ b/torch/distributed/algorithms/model_averaging/averagers.py @@ -101,7 +101,8 @@ class PeriodicModelAverager(ModelAverager): "When period is 1, no need to use model averaging because the communication cost " "of all-reducing parameters will be no less than the cost of all-reducing gradients " "by DistributedDataParallel in the backward pass. Therefore, only " - "DistributedDataParallel should be used for this case." 
+ "DistributedDataParallel should be used for this case.", + stacklevel=2, ) self.period = period diff --git a/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py b/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py index a52fc2babed..33cde4cb3a7 100644 --- a/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py +++ b/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py @@ -114,7 +114,8 @@ class HierarchicalModelAverager(averagers.ModelAverager): "no need to use model averaging because the communication cost " "of all-reducing parameters will be no less than the cost of all-reducing gradients " "by DistributedDataParallel in the backward pass. Therefore, only " - "DistributedDataParallel should be used for this case." + "DistributedDataParallel should be used for this case.", + stacklevel=2, ) overall_group_size = dist.get_world_size(group=self.process_group) if list(period_group_size_dict.values())[-1] != overall_group_size: diff --git a/torch/distributed/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py index 5def6c13dc1..b21cac12ff9 100644 --- a/torch/distributed/checkpoint/filesystem.py +++ b/torch/distributed/checkpoint/filesystem.py @@ -660,7 +660,8 @@ class _FileSystemWriter(StorageWriter): warnings.warn( f"Detected an existing checkpoint in {self.path}, overwriting since {self.overwrite=}." " Past version 2.5 of PyTorch, `overwrite` will default to False. Set this variable to True to" - " maintain this functionality or False to raise when an existing checkpoint is found." + " maintain this functionality or False to raise when an existing checkpoint is found.", + stacklevel=2, ) else: raise RuntimeError(f"Checkpoint already exists and {self.overwrite=}.") diff --git a/torch/distributed/checkpoint/state_dict.py b/torch/distributed/checkpoint/state_dict.py index f50a0ee8e60..0d898c3ff06 100644 --- a/torch/distributed/checkpoint/state_dict.py +++ b/torch/distributed/checkpoint/state_dict.py @@ -290,6 +290,7 @@ def _verify_options( "will be removed in 2.5. This feature can be achieved by manually " "filtering out the state_dict returned from get_state_dict.", FutureWarning, + stacklevel=2, ) if optim_only and not optims: raise RuntimeError( @@ -1234,6 +1235,7 @@ def _unflatten_model_state_dict( "feature, please preprocessing the model_state_dict to achieve the " "same functionality.", FutureWarning, + stacklevel=2, ) cast_state_dict = cast(dict[nn.Module, dict[str, ValueType]], state_dict) new_state_dict: dict[str, ValueType] = {} diff --git a/torch/distributed/checkpoint/state_dict_loader.py b/torch/distributed/checkpoint/state_dict_loader.py index 389dc0e5e57..178e190e937 100644 --- a/torch/distributed/checkpoint/state_dict_loader.py +++ b/torch/distributed/checkpoint/state_dict_loader.py @@ -158,7 +158,8 @@ def load( no_dist = no_dist or (not dist.is_available()) or (not dist.is_initialized()) if no_dist: warnings.warn( - "torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to load in a single process." + "torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to load in a single process.", + stacklevel=2, ) with _profile(): @@ -365,7 +366,8 @@ def _load_state_dict_from_keys( no_dist = not (dist.is_available() and dist.is_initialized()) if no_dist: warnings.warn( - "torch.distributed is unavailable or uninitialized, assuming the intent is to load in a single process." 
+ "torch.distributed is unavailable or uninitialized, assuming the intent is to load in a single process.", + stacklevel=2, ) storage_reader = cast( diff --git a/torch/distributed/checkpoint/state_dict_saver.py b/torch/distributed/checkpoint/state_dict_saver.py index ef0be9f9309..38ab2dcb510 100644 --- a/torch/distributed/checkpoint/state_dict_saver.py +++ b/torch/distributed/checkpoint/state_dict_saver.py @@ -182,7 +182,8 @@ def save( no_dist = no_dist or (not dist.is_available()) or (not dist.is_initialized()) if no_dist: warnings.warn( - "torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to save in a single process." + "torch.distributed is disabled, unavailable or uninitialized, assuming the intent is to save in a single process.", + stacklevel=2, ) with _profile(): @@ -414,7 +415,8 @@ def _save_state_dict( warnings.warn( "The function definition for SavePlanner.set_up_planner has been updated" " to include the storage_meta argument. Please update your implementation" - " to include this parameter." + " to include this parameter.", + stacklevel=2, ) planner.set_up_planner(state_dict, distW.is_coordinator) # type: ignore[call-arg, arg-type] else: diff --git a/torch/distributed/checkpoint/utils.py b/torch/distributed/checkpoint/utils.py index 94844812b52..073649c5f12 100644 --- a/torch/distributed/checkpoint/utils.py +++ b/torch/distributed/checkpoint/utils.py @@ -461,7 +461,8 @@ def _api_bc_check(func): if len(args) == 2: warnings.warn( f"The argument order of {func.__name__} has been changed. " - "Please check the document to avoid future breakages." + "Please check the document to avoid future breakages.", + stacklevel=2, ) sig = inspect.signature(func) kwonlyargs = [ diff --git a/torch/distributed/device_mesh.py b/torch/distributed/device_mesh.py index 052b74ba479..0a58eab1abf 100644 --- a/torch/distributed/device_mesh.py +++ b/torch/distributed/device_mesh.py @@ -85,7 +85,8 @@ else: # We keep this function for backward compatibility. warnings.warn( "This get_root_mesh API will be deprecated soon." - "Please use `get_root_mesh` inside DeviceMesh instead." + "Please use `get_root_mesh` inside DeviceMesh instead.", + stacklevel=2, ) if not device_mesh: return device_mesh @@ -108,7 +109,8 @@ else: ) -> list["DeviceMesh"]: warnings.warn( "This _get_all_submeshes API will be deprecated soon." - "Please use `_get_all_submeshes` inside DeviceMesh instead." + "Please use `_get_all_submeshes` inside DeviceMesh instead.", + stacklevel=2, ) return device_mesh._get_all_submeshes(mesh_dim_name) @@ -329,7 +331,8 @@ else: "It is recommended to set the current device for the process BEFORE the DeviceMesh initialization so that " "the underlying communicator (i.e. NCCL) can be initialized properly. " "Given that the current process has no default device selected, DeviceMesh will use a heuristic to set the " - "device_id via `global_rank % num_devices_per_host`, assuming homogeneous hardware cluster. " + "device_id via `global_rank % num_devices_per_host`, assuming homogeneous hardware cluster. ", + stacklevel=2, ) # heuristic to set the current cuda/cuda-like device base on num of gpu devices available in each host # NOTE: This device selection would only work for homogeneous hardware. @@ -766,7 +769,8 @@ else: warnings.warn( "You are attempting to slice a submesh from another submesh. While we support this operation, " "it is users' responsibility to ensure that the submesh is consistently sliced across all ranks. 
" - "If not, this may result in some ranks receiving the submesh while others encounter errors." + "If not, this may result in some ranks receiving the submesh while others encounter errors.", + stacklevel=2, ) slice_from_root = False @@ -803,7 +807,8 @@ else: elif name in flatten_name_to_root_layout: warnings.warn( "Slicing a flattened dim from root mesh will be deprecated in PT 2.11. " - "Users need to bookkeep the flattened mesh directly. " + "Users need to bookkeep the flattened mesh directly. ", + stacklevel=2, ) layout_sliced.append(flatten_name_to_root_layout[name]) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 52370a4545f..0cebfaff6d6 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -352,7 +352,8 @@ class Backend(str): # noqa: SLOT000 warnings.warn( f"Device capability of {name} unspecified, assuming `cpu` and " "`cuda` or `xpu`. Please specify it via the `devices` argument of " - "`register_backend`." + "`register_backend`.", + stacklevel=2, ) Backend.backend_capability[name.lower()] = ( ["cpu", "cuda", "xpu"] if torch.xpu.is_available() else ["cpu", "cuda"] @@ -427,7 +428,8 @@ class BackendConfig: warnings.warn( f"Device capability of {backend} unknown, assuming `cpu` and " "`cuda`. You can specify it in `device:backend` format in " - "`init_process_group` call." + "`init_process_group` call.", + stacklevel=2, ) backend_val = Backend(backend) self.device_backend_map = { @@ -751,7 +753,8 @@ def _get_default_timeout(backend: Backend) -> timedelta: # TODO moco benchmark on CPU initializes pgnccl backend today, triggered this assert in CI before it was # changed to be a warning. We should fix the moco model. warnings.warn( - "Attempted to get default timeout for nccl backend, but NCCL support is not compiled" + "Attempted to get default timeout for nccl backend, but NCCL support is not compiled", + stacklevel=2, ) return default_pg_timeout return default_pg_nccl_timeout @@ -802,6 +805,7 @@ def _get_object_coll_device(group: Optional[ProcessGroup] = None) -> str: f"You are using a Backend {type(group)} as a ProcessGroup. " "This usage is deprecated since PyTorch 2.0. Please use a public API " "of PyTorch Distributed instead.", + stacklevel=2, ) # Provide backward compatibility to cases where `group` passed in is # actually a Backend (like `ProcessGroupGloo`) rather than a @@ -868,7 +872,8 @@ def _get_pg_default_device(group: Optional[ProcessGroup] = None) -> torch.device "backward-compatiblity reason. If you need to find a device for object " "collectives, please use `_get_object_coll_device`. If you need to query " "the device types supported by group, please use " - "`_device_capability(group)`. " + "`_device_capability(group)`. ", + stacklevel=2, ) group = group or _get_default_group() @@ -910,7 +915,8 @@ def _get_pg_default_device(group: Optional[ProcessGroup] = None) -> torch.device warnings.warn( "Multiple backends are registered with this ProcessGroup. We cannot " f"determine which one is the default. Returning {rv}. " - "Please consider using other APIs." + "Please consider using other APIs.", + stacklevel=2, ) return rv @@ -1010,7 +1016,8 @@ def _warn_not_in_group(op_name) -> None: global_rank = -1 if GroupMember.WORLD is None else GroupMember.WORLD.rank() warnings.warn( f"Running {op_name} on global rank {global_rank} which does not " - "belong to the given group." 
+ "belong to the given group.", + stacklevel=2, ) @@ -1557,7 +1564,9 @@ def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> elif is_gloo_available() and isinstance(backend, ProcessGroupGloo): backends.add(backend) # type: ignore[arg-type] if len(backends) == 0: - warnings.warn("Set timeout is now only supported for either nccl or gloo.") + warnings.warn( + "Set timeout is now only supported for either nccl or gloo.", stacklevel=2 + ) for backend in backends: backend._set_default_timeout(timeout) @@ -1758,7 +1767,8 @@ def init_process_group( warnings.warn( f"For MPI backend, world_size ({world_size}) and rank ({rank}) " "are ignored since they are assigned by the " - "MPI runtime." + "MPI runtime.", + stacklevel=2, ) default_pg, _ = _new_process_group_helper( @@ -2038,7 +2048,8 @@ def _new_process_group_helper( if backend_options._timeout != timeout: warnings.warn( "backend_options._timeout was specified, " - "but timeout kwarg has a default value that will always override it. " + "but timeout kwarg has a default value that will always override it. ", + stacklevel=2, ) else: # default backend_options for NCCL @@ -2259,7 +2270,8 @@ def destroy_process_group(group: Optional[ProcessGroup] = None): if pg in _world.pg_coalesce_state.keys(): warnings.warn( "Some coalesced collectives haven't been launched when " - "ProcessGroup is destroyed. They will be cleaned." + "ProcessGroup is destroyed. They will be cleaned.", + stacklevel=2, ) del _world.pg_coalesce_state[pg] @@ -2349,7 +2361,8 @@ def _abort_process_group(group: Optional[ProcessGroup] = None): if pg in _world.pg_coalesce_state.keys(): warnings.warn( "Some coalesced collectives haven't been launched when " - "ProcessGroup is aborted. They will be cleaned." + "ProcessGroup is aborted. They will be cleaned.", + stacklevel=2, ) del _world.pg_coalesce_state[pg] @@ -4919,7 +4932,8 @@ def barrier( if group.rank() == 0: warnings.warn( # warn only once "barrier(): using the device under current context. " - "You can specify `device_id` in `init_process_group` to mute this warning." + "You can specify `device_id` in `init_process_group` to mute this warning.", + stacklevel=2, ) work = group.barrier(opts=opts) @@ -5001,6 +5015,7 @@ def monitored_barrier( warnings.warn( "Please specify timeout arg as a timedelta. " f"Converting current value of {timeout} assuming it represents seconds", + stacklevel=2, ) timeout = timedelta(seconds=timeout) diff --git a/torch/distributed/elastic/agent/server/api.py b/torch/distributed/elastic/agent/server/api.py index edcac432b66..d56d61e7eaa 100644 --- a/torch/distributed/elastic/agent/server/api.py +++ b/torch/distributed/elastic/agent/server/api.py @@ -106,6 +106,7 @@ class WorkerSpec: warnings.warn( "WorkerSpec.fn will be deprecated," " please use WorkerSpec.entrypoint instead", + stacklevel=2, category=DeprecationWarning, ) self.entrypoint = self.fn diff --git a/torch/distributed/elastic/multiprocessing/errors/error_handler.py b/torch/distributed/elastic/multiprocessing/errors/error_handler.py index f15ce4f241d..437a9c07d2c 100644 --- a/torch/distributed/elastic/multiprocessing/errors/error_handler.py +++ b/torch/distributed/elastic/multiprocessing/errors/error_handler.py @@ -52,7 +52,9 @@ class ErrorHandler: try: faulthandler.enable(all_threads=True) except Exception as e: - warnings.warn(f"Unable to enable fault handler. {type(e).__name__}: {e}") + warnings.warn( + f"Unable to enable fault handler. 
{type(e).__name__}: {e}", stacklevel=2 + ) def _write_error_file(self, file_path: str, error_msg: str) -> None: """Write error message to the file.""" @@ -60,7 +62,9 @@ class ErrorHandler: with open(file_path, "w") as fp: fp.write(error_msg) except Exception as e: - warnings.warn(f"Unable to write error to file. {type(e).__name__}: {e}") + warnings.warn( + f"Unable to write error to file. {type(e).__name__}: {e}", stacklevel=2 + ) def record_exception(self, e: BaseException) -> None: """ diff --git a/torch/distributed/elastic/utils/logging.py b/torch/distributed/elastic/utils/logging.py index 8f0370173b7..c7d56374e7d 100644 --- a/torch/distributed/elastic/utils/logging.py +++ b/torch/distributed/elastic/utils/logging.py @@ -65,5 +65,6 @@ def _derive_module_name(depth: int = 1) -> Optional[str]: warnings.warn( f"Error deriving logger module name, using . Exception: {e}", RuntimeWarning, + stacklevel=2, ) return None diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py index a995e567bba..54d6c974cae 100644 --- a/torch/distributed/fsdp/_common_utils.py +++ b/torch/distributed/fsdp/_common_utils.py @@ -336,7 +336,8 @@ def _get_param_to_fqns( warnings.warn( "FlatParameter is being traversed more than once. " "This case should only happen when using " - "DistributedModelParallel with FullyShardedDataParallel." + "DistributedModelParallel with FullyShardedDataParallel.", + stacklevel=2, ) param_to_fqns[param] = global_fqns elif not dedup_shared_params: diff --git a/torch/distributed/fsdp/_exec_order_utils.py b/torch/distributed/fsdp/_exec_order_utils.py index 778302a957a..db2ea7bfae0 100644 --- a/torch/distributed/fsdp/_exec_order_utils.py +++ b/torch/distributed/fsdp/_exec_order_utils.py @@ -299,7 +299,8 @@ class _ExecOrderData: warnings.warn( "Forward order differs from that of the first iteration " f"on rank {self.rank}. 
Collectives are unchecked and may " - f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}" + f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}", + stacklevel=2, ) self.warn_status = _ExecOrderWarnStatus.WARNING self.current_order_index += 1 diff --git a/torch/distributed/fsdp/_flat_param.py b/torch/distributed/fsdp/_flat_param.py index 2d742c30302..8adde16de6b 100644 --- a/torch/distributed/fsdp/_flat_param.py +++ b/torch/distributed/fsdp/_flat_param.py @@ -1585,7 +1585,8 @@ class FlatParamHandle: warnings.warn( f"[Rank {self.rank}] Only some but not all ranks have a " "`None` `FlatParameter` gradient, so FSDP is using zeros to " - "approximate those ranks' sharded gradients being `None`" + "approximate those ranks' sharded gradients being `None`", + stacklevel=2, ) flat_param._saved_grad_shard = None # type: ignore[assignment] sharded_grad = torch.zeros(flat_param._sharded_size, device=self.device) # type: ignore[attr-defined] @@ -2434,7 +2435,8 @@ class FlatParamHandle: f"[Rank {rank}] {'Parameter' if is_param else 'Gradient'} needs " f"writeback in {self._training_state}\n" f"expected shape={expected_shape} shape={src_shape} " - f"expected device={dst_tensor.device} device={src_device}" + f"expected device={dst_tensor.device} device={src_device}", + stacklevel=2, ) if src_tensor is not None and src_tensor.shape != expected_shape: # NOTE: Gradient shape mismatch is not possible in practice since diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py index 74cc12dc889..36bdc23e741 100644 --- a/torch/distributed/fsdp/_init_utils.py +++ b/torch/distributed/fsdp/_init_utils.py @@ -431,7 +431,8 @@ def _init_core_state( warnings.warn( "FSDP is switching to use `NO_SHARD` instead of " f"{sharding_strategy or ShardingStrategy.FULL_SHARD} since " - "the world size is 1." + "the world size is 1.", + stacklevel=2, ) sharding_strategy = ShardingStrategy.NO_SHARD elif sharding_strategy == ShardingStrategy.NO_SHARD: @@ -704,7 +705,8 @@ def _get_ignored_modules( warnings.warn( "Trying to ignore the top-level module passed into the FSDP " "constructor itself will result in all parameters being " - f"ignored and is not well-supported: {module}" + f"ignored and is not well-supported: {module}", + stacklevel=2, ) # Include nested FSDP modules' ignored modules for submodule in root_module.modules(): @@ -847,7 +849,8 @@ def _get_device_from_device_id( f"FSDP will use the current device {device_handle.current_device()}. " f"If this is incorrect, please explicitly call `torch.{device.type}.set_device()` " "before FSDP initialization or pass in the explicit device " - "index as the `device_id` argument." + "index as the `device_id` argument.", + stacklevel=2, ) device = torch.device(device_handle.current_device()) return device @@ -929,7 +932,8 @@ def _materialize_meta_module( warnings.warn( "Unable to call `reset_parameters()` for module on meta " f"device with error {str(e)}. Please ensure that your module of" - f"type {type(module)} implements a `reset_parameters()` method." # type: ignore[possibly-undefined] + f"type {type(module)} implements a `reset_parameters()` method.", + stacklevel=2, # type: ignore[possibly-undefined] ) raise e @@ -1049,7 +1053,8 @@ def _warn_cpu_init(): "recommend passing in the `device_id` argument for FSDP to move " "`module` to GPU for the sharding initialization. `module` must also " "be on GPU device to work with the `sync_module_states=True` flag " - "since that requires GPU communication." 
+ "since that requires GPU communication.", + stacklevel=2, ) diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py index 3c64bfbf2f6..c8afeb058ba 100644 --- a/torch/distributed/fsdp/_optim_utils.py +++ b/torch/distributed/fsdp/_optim_utils.py @@ -506,7 +506,8 @@ def _flatten_optim_state_dict( flat_osd_state[key] = copy.deepcopy(state) else: warnings.warn( - f"optim_state[{key}] is not on rank{fsdp_state.rank}." + f"optim_state[{key}] is not on rank{fsdp_state.rank}.", + stacklevel=2, ) else: @@ -2051,7 +2052,8 @@ def _optim_state_dict( "most cases, this is a user-defined state that is not " "associated with any particular parameter. Another possible " "case is this state is managed by TorchRec. Otherwise, there may " - " be a mismatched assumption of optim_state_dict of this mode." + " be a mismatched assumption of optim_state_dict of this mode.", + stacklevel=2, ) fsdp_osd_state[key] = value diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py index 496475b5b11..ec648ced837 100644 --- a/torch/distributed/fsdp/_state_dict_utils.py +++ b/torch/distributed/fsdp/_state_dict_utils.py @@ -337,7 +337,8 @@ def _full_post_state_dict_hook( "This may mean that this state_dict entry could point to invalid " "memory regions after returning from state_dict() call if this " "parameter is managed by FSDP. Please check clone " - f"implementation of {fqn}. Error: {str(e)}" + f"implementation of {fqn}. Error: {str(e)}", + stacklevel=2, ) return _common_unshard_post_state_dict_hook( @@ -708,7 +709,8 @@ def _post_state_dict_hook( context = _replace_with_full_state_dict_type(fsdp_state) warnings.warn( "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will " - "be returned." + "be returned.", + stacklevel=2, ) else: context = contextlib.nullcontext() @@ -770,7 +772,8 @@ def _pre_state_dict_hook( context = _replace_with_full_state_dict_type(fsdp_state) warnings.warn( "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will " - "be returned." + "be returned.", + stacklevel=2, ) else: _set_use_dtensor(fsdp_state) @@ -824,7 +827,8 @@ def _pre_load_state_dict_hook( context = _replace_with_full_state_dict_type(fsdp_state) warnings.warn( "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will" - "be returned." + "be returned.", + stacklevel=2, ) else: _set_use_dtensor(fsdp_state) @@ -861,7 +865,8 @@ def _post_load_state_dict_hook( context = _replace_with_full_state_dict_type(fsdp_state) warnings.warn( "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will" - "be returned." + "be returned.", + stacklevel=2, ) else: context = contextlib.nullcontext() diff --git a/torch/distributed/fsdp/_unshard_param_utils.py b/torch/distributed/fsdp/_unshard_param_utils.py index bd24583d919..71dc1a9f4e2 100644 --- a/torch/distributed/fsdp/_unshard_param_utils.py +++ b/torch/distributed/fsdp/_unshard_param_utils.py @@ -153,7 +153,8 @@ def _validate_unshard_params_args( "offload_to_cpu=True and rank0_only=False may result in the" "unsharded parameters being redundantly copied to CPU memory for " "GPUs sharing the same CPU memory, which risks CPU OOM. We " - "recommend using offload_to_cpu=True with rank0_only=True." 
+ "recommend using offload_to_cpu=True with rank0_only=True.", + stacklevel=2, ) diff --git a/torch/distributed/fsdp/_wrap_utils.py b/torch/distributed/fsdp/_wrap_utils.py index 0a83e6307e1..41dc4d85751 100644 --- a/torch/distributed/fsdp/_wrap_utils.py +++ b/torch/distributed/fsdp/_wrap_utils.py @@ -120,7 +120,8 @@ def _warn_on_overridden_mixed_precision( "Both mixed precision and an auto_wrap_policy were specified to FSDP, " f"where the wrapped module has submodules of type:\n{overridden_module_classes}\n" "These modules will be wrapped as separate FSDP instacnes with mixed " - "precision disabled." + "precision disabled.", + stacklevel=2, ) @@ -172,7 +173,7 @@ def _validate_frozen_params( f"The following parameters have requires_grad=False:\n{frozen_param_fqns}" ) if use_orig_params: - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) else: raise ValueError(msg) diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index ce396a84777..cdc5ef424e7 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -680,6 +680,7 @@ class FullyShardedDataParallel(nn.Module, _FSDPState): "#torch.distributed.checkpoint.state_dict.get_state_dict ." "Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .", FutureWarning, + stacklevel=2, ) _state_dict_type_to_config = { StateDictType.FULL_STATE_DICT: FullStateDictConfig, @@ -1208,7 +1209,8 @@ class FullyShardedDataParallel(nn.Module, _FSDPState): warnings.warn( f"Called FSDP.clip_grad_norm_() on rank {self.rank} with no " "gradients -- returning the total norm in the default dtype " - f"{total_norm.dtype}" + f"{total_norm.dtype}", + stacklevel=2, ) # warn since this is generally unexpected return total_norm total_norm_dtype = functools.reduce( diff --git a/torch/distributed/optim/named_optimizer.py b/torch/distributed/optim/named_optimizer.py index 65d3944ae71..b5135ae5411 100644 --- a/torch/distributed/optim/named_optimizer.py +++ b/torch/distributed/optim/named_optimizer.py @@ -87,7 +87,8 @@ class _NamedOptimizer(optim.Optimizer): else: warnings.warn( "Since we pass in param_groups, we will use param_groups to " - "initialize the optimizer, not all parameters of the module." + "initialize the optimizer, not all parameters of the module.", + stacklevel=2, ) param_to_key = {param: key for key, param in self.named_parameters.items()} # type: ignore[misc, has-type] ordered_param_keys = [] diff --git a/torch/distributed/optim/post_localSGD_optimizer.py b/torch/distributed/optim/post_localSGD_optimizer.py index 44d59cab44e..c7b78510ed1 100644 --- a/torch/distributed/optim/post_localSGD_optimizer.py +++ b/torch/distributed/optim/post_localSGD_optimizer.py @@ -92,7 +92,8 @@ class PostLocalSGDOptimizer(torch.optim.Optimizer): else: warnings.warn( "Loaded state dict does not contain a step counter for an averager. " - "Setting step counter to 0." + "Setting step counter to 0.", + stacklevel=2, ) self.averager.step = 0 diff --git a/torch/distributed/tensor/_dispatch.py b/torch/distributed/tensor/_dispatch.py index 4f91e3444b0..ba04eeb30df 100644 --- a/torch/distributed/tensor/_dispatch.py +++ b/torch/distributed/tensor/_dispatch.py @@ -513,7 +513,8 @@ class OpDispatcher: "Found a non-scalar tensor with numel=1 and ndim!=0, " "we are implicitly creating a replicated DTensor for it. 
" "However, please consider changing it to a scalar tensor " - "or explicitly create a DTensor under distributed environment." + "or explicitly create a DTensor under distributed environment.", + stacklevel=2, ) if tensor_arg.numel() == 1 or self._allow_implicit_replication: diff --git a/torch/distributed/tensor/_random.py b/torch/distributed/tensor/_random.py index d81f58520aa..f8325c83d55 100644 --- a/torch/distributed/tensor/_random.py +++ b/torch/distributed/tensor/_random.py @@ -43,7 +43,8 @@ def is_rng_supported_mesh(device_mesh: DeviceMesh) -> bool: else: # TODO: Logs way too much warnings.warn( - f"DTensor random operators may not have complete support on {device_mesh.device_type} device mesh" + f"DTensor random operators may not have complete support on {device_mesh.device_type} device mesh", + stacklevel=2, ) return False @@ -72,7 +73,8 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None: if not is_rng_supported_mesh(device_mesh): warnings.warn( "DTensor manual_seed() may not have complete support " - f"on {device_mesh.device_type} device mesh" + f"on {device_mesh.device_type} device mesh", + stacklevel=2, ) return diff --git a/torch/distributed/tensor/parallel/api.py b/torch/distributed/tensor/parallel/api.py index 2a3369a8edd..51cfd0f144b 100644 --- a/torch/distributed/tensor/parallel/api.py +++ b/torch/distributed/tensor/parallel/api.py @@ -74,7 +74,8 @@ def parallelize_module( # type: ignore[return] if parallelize_plan is None: warnings.warn( "No parallelize_plan is provided and auto-parallel is not supported " - "at the moment, so this parallelize_module call will do nothing." + "at the moment, so this parallelize_module call will do nothing.", + stacklevel=2, ) return module @@ -108,7 +109,8 @@ def parallelize_module( # type: ignore[return] warnings.warn( f"Parallelize plan key '{module_path}' could not be resolved: " f"no submodule matching token '{token}' in module {module}, " - f"skipping this plan entry." + f"skipping this plan entry.", + stacklevel=2, ) continue diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index a72c90789cc..dcdb2762cff 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -62,7 +62,8 @@ class Distribution: warnings.warn( f"{self.__class__} does not define `arg_constraints`. " + "Please set `arg_constraints = {}` or initialize the distribution " - + "with `validate_args=False` to turn off validation." + + "with `validate_args=False` to turn off validation.", + stacklevel=2, ) for param, constraint in arg_constraints.items(): if constraints.is_dependent(constraint): @@ -313,7 +314,8 @@ class Distribution: warnings.warn( f"{self.__class__} does not define `support` to enable " + "sample validation. Please initialize the distribution with " - + "`validate_args=False` to turn off validation." + + "`validate_args=False` to turn off validation.", + stacklevel=2, ) return assert support is not None diff --git a/torch/distributions/kl.py b/torch/distributions/kl.py index ca82802bcc8..85932828d21 100644 --- a/torch/distributions/kl.py +++ b/torch/distributions/kl.py @@ -133,6 +133,7 @@ def _dispatch_kl(type_p, type_q): f"Ambiguous kl_divergence({type_p.__name__}, {type_q.__name__}). 
" f"Please register_kl({left_p.__name__}, {right_q.__name__})", RuntimeWarning, + stacklevel=2, ) return left_fun diff --git a/torch/distributions/wishart.py b/torch/distributions/wishart.py index 5aaa3ddc9d0..96918a68abc 100644 --- a/torch/distributions/wishart.py +++ b/torch/distributions/wishart.py @@ -127,7 +127,8 @@ class Wishart(ExponentialFamily): if self.df.lt(event_shape[-1]).any(): warnings.warn( - "Low df values detected. Singular samples are highly likely to occur for ndim - 1 < df < ndim." + "Low df values detected. Singular samples are highly likely to occur for ndim - 1 < df < ndim.", + stacklevel=2, ) super().__init__(batch_shape, event_shape, validate_args=validate_args) @@ -279,7 +280,7 @@ class Wishart(ExponentialFamily): else: # More optimized version with data-dependent control flow. if is_singular.any(): - warnings.warn("Singular sample detected.") + warnings.warn("Singular sample detected.", stacklevel=2) for _ in range(max_try_correction): sample_new = self._bartlett_sampling(is_singular[is_singular].shape) diff --git a/torch/export/__init__.py b/torch/export/__init__.py index a76cda0682c..c300df11a0c 100644 --- a/torch/export/__init__.py +++ b/torch/export/__init__.py @@ -500,10 +500,10 @@ def load( if file_info.filename == "serialized_exported_program.json": serialized_exported_program = file_content elif file_info.filename == "serialized_state_dict.json": - warnings.warn("This version of file is deprecated") + warnings.warn("This version of file is deprecated", stacklevel=2) serialized_state_dict = file_content elif file_info.filename == "serialized_constants.json": - warnings.warn("This version of file is deprecated") + warnings.warn("This version of file is deprecated", stacklevel=2) serialized_constants = file_content elif file_info.filename == "serialized_state_dict.pt": serialized_state_dict = file_content diff --git a/torch/export/_trace.py b/torch/export/_trace.py index a2c47ca3e5d..d770cad63a8 100644 --- a/torch/export/_trace.py +++ b/torch/export/_trace.py @@ -2113,7 +2113,7 @@ def _export_for_training( if torch._export.config.error_on_lifted_constant_tensors: raise RuntimeError(error_msg) else: - warnings.warn(error_msg) + warnings.warn(error_msg, stacklevel=2) export_graph_signature = export_artifact.aten.sig @@ -2189,7 +2189,8 @@ def _export_for_training( f"This is likely result of torch.export.export not being able to track side effects " f"that is happening outside of model scope.\n\n" f"Leaked tensors:\n {leak_details}\n\n" - f"Alternatively, please file a bug report to PyTorch team for further debugging help." + f"Alternatively, please file a bug report to PyTorch team for further debugging help.", + stacklevel=2, ) del legit_leak diff --git a/torch/export/_unlift.py b/torch/export/_unlift.py index 4ce7c28f4b0..b9e82481322 100644 --- a/torch/export/_unlift.py +++ b/torch/export/_unlift.py @@ -530,7 +530,8 @@ def _create_stateful_graph_module( f"A model attribute `{constant_fqn}` requires gradient. " f"but it's not properly registered as a parameter. " f"torch.export will detach it and treat it as a constant tensor " - f"but please register it as parameter instead." + f"but please register it as parameter instead.", + stacklevel=2, ) detached_buffer = buffer.detach() original_tensor_to_detached_tensor[buffer] = detached_buffer @@ -549,7 +550,8 @@ def _create_stateful_graph_module( f"A model attribute `{const_name}` requires gradient " f"but it's not properly registered as a parameter. 
" f"torch.export will detach it and treat it as a constant tensor " - f"but please register it as parameter instead." + f"but please register it as parameter instead.", + stacklevel=2, ) if value in original_tensor_to_detached_tensor: value = original_tensor_to_detached_tensor[value] diff --git a/torch/export/exported_program.py b/torch/export/exported_program.py index eec86b28c04..6263a5ea44d 100644 --- a/torch/export/exported_program.py +++ b/torch/export/exported_program.py @@ -1684,7 +1684,8 @@ def _create_graph_module_for_export(root, graph): "Unable to execute the generated python source code from " "the graph. The graph module will no longer be directly callable, " "but you can still run the ExportedProgram, and if needed, you can " - "run the graph module eagerly using torch.fx.Interpreter." + "run the graph module eagerly using torch.fx.Interpreter.", + stacklevel=2, ) gm = torch.fx.GraphModule(root, torch.fx.Graph()) gm._graph = graph diff --git a/torch/export/pt2_archive/_package_weights.py b/torch/export/pt2_archive/_package_weights.py index d7f8d4fb2f8..a6fddaaf4cf 100644 --- a/torch/export/pt2_archive/_package_weights.py +++ b/torch/export/pt2_archive/_package_weights.py @@ -108,7 +108,8 @@ def get_complete( warnings.warn( "No complete tensor found in the group! Returning the first one. " - "This may cause issues when your weights are not on CPU." + "This may cause issues when your weights are not on CPU.", + stacklevel=2, ) assert len(group) > 0 return next(iter(group)) diff --git a/torch/hub.py b/torch/hub.py index 84740905ecc..3d6183ee7b2 100644 --- a/torch/hub.py +++ b/torch/hub.py @@ -279,7 +279,8 @@ def _get_cache_or_reload( f"The ref {ref} is ambiguous. Perhaps it is both a tag and a branch in the repo? " "Torchhub will now assume that it's a branch. " "You can disambiguate tags and branches by explicitly passing refs/heads/branch_name or " - "refs/tags/tag_name as the ref. That might require using skip_validation=True." + "refs/tags/tag_name as the ref. That might require using skip_validation=True.", + stacklevel=2, ) disambiguated_branch_ref = f"refs/heads/{ref}" url = _git_archive_link( @@ -338,7 +339,8 @@ def _check_repo_is_trusted( "trust_repo=False) and a command prompt will appear asking for an explicit confirmation of trust, " f"or {calling_fn}(..., trust_repo=True), which will assume that the prompt is to be answered with " f"'yes'. You can also use {calling_fn}(..., trust_repo='check') which will only prompt for " - f"confirmation if the repo is not already trusted. This will eventually be the default behaviour" + f"confirmation if the repo is not already trusted. 
This will eventually be the default behaviour", + stacklevel=2, ) return @@ -406,7 +408,9 @@ def get_dir() -> str: """ # Issue warning to move data if old env is set if os.getenv("TORCH_HUB"): - warnings.warn("TORCH_HUB is deprecated, please use env TORCH_HOME instead") + warnings.warn( + "TORCH_HUB is deprecated, please use env TORCH_HOME instead", stacklevel=2 + ) if _hub_dir is not None: return _hub_dir @@ -853,7 +857,8 @@ def load_state_dict_from_url( # Issue warning to move data if old env is set if os.getenv("TORCH_MODEL_ZOO"): warnings.warn( - "TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead" + "TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead", + stacklevel=2, ) if model_dir is None: diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f62ec3afe12..9decaeecc86 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -257,7 +257,7 @@ class strict_fusion: def __init__(self) -> None: if not torch._jit_internal.is_scripting(): - warnings.warn("Only works in script mode") + warnings.warn("Only works in script mode", stacklevel=2) def __enter__(self): pass diff --git a/torch/jit/_check.py b/torch/jit/_check.py index f708ee87f30..042d0241897 100644 --- a/torch/jit/_check.py +++ b/torch/jit/_check.py @@ -180,7 +180,8 @@ class AttributeTypeIsSupportedChecker(ast.NodeVisitor): "instance-level annotations on empty non-base " "types in `__init__`. Instead, either 1) use a " "type annotation in the class body, or 2) wrap " - "the type in `torch.jit.Attribute`." + "the type in `torch.jit.Attribute`.", + stacklevel=2, ) def visit_Call(self, node): @@ -245,5 +246,6 @@ class AttributeTypeIsSupportedChecker(ast.NodeVisitor): "instance-level annotations on empty non-base " "types in `__init__`. Instead, either 1) use a " "type annotation in the class body, or 2) wrap " - "the type in `torch.jit.Attribute`." + "the type in `torch.jit.Attribute`.", + stacklevel=2, ) diff --git a/torch/jit/_decompositions.py b/torch/jit/_decompositions.py index b4d2d624669..426d4a9de3c 100644 --- a/torch/jit/_decompositions.py +++ b/torch/jit/_decompositions.py @@ -48,7 +48,9 @@ def signatures_match(decomposition_sig, torch_op_sig): inspect_empty = inspect._empty # type: ignore[attr-defined] for field in ["name", "annotation"]: if field == "name" and decomp_param.name == "self": - warnings.warn("PyTorch uses 'input' instead of 'self' on public api") + warnings.warn( + "PyTorch uses 'input' instead of 'self' on public api", stacklevel=2 + ) if getattr(decomp_param, field) != getattr(op_param, field): return False diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py index 530266fa9dc..52c7ac88d3f 100644 --- a/torch/jit/_recursive.py +++ b/torch/jit/_recursive.py @@ -309,7 +309,8 @@ def infer_concrete_type_builder(nn_module, share_types=True): warnings.warn( f"'{name}' was found in ScriptModule constants, " - f" but it is a non-constant {hint}. Consider removing it." + f" but it is a non-constant {hint}. Consider removing it.", + stacklevel=2, ) continue if not hasattr(nn_module, name): @@ -318,7 +319,8 @@ def infer_concrete_type_builder(nn_module, share_types=True): warnings.warn( f"'{name}' was found in ScriptModule constants, " "but was not actually set in __init__. " - "Consider removing it." 
+ "Consider removing it.", + stacklevel=2, ) continue value = getattr(nn_module, name) diff --git a/torch/jit/_script.py b/torch/jit/_script.py index 86b72d1d465..804f44f80e3 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -775,6 +775,7 @@ if _enabled: "Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \ https://docs.pytorch.org/executorch/stable/getting-started.html", DeprecationWarning, + stacklevel=2, ) return self._c._save_for_mobile(*args, **kwargs) @@ -787,6 +788,7 @@ if _enabled: "Lite Interpreter is deprecated. Please consider switching to ExecuTorch. \ https://docs.pytorch.org/executorch/stable/getting-started.html", DeprecationWarning, + stacklevel=2, ) return self._c._save_to_buffer_for_mobile(*args, **kwargs) @@ -1165,7 +1167,8 @@ def _script_impl( warnings.warn( "Warning: monkeytype is not installed. Please install https://github.com/Instagram/MonkeyType " "to enable Profile-Directed Typing in TorchScript. Refer to " - "https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. " + "https://github.com/Instagram/MonkeyType/blob/master/README.rst to install MonkeyType. ", + stacklevel=2, ) if isinstance(obj, torch.nn.Module): diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py index 5b1713e77d3..e17700b8cac 100644 --- a/torch/jit/_trace.py +++ b/torch/jit/_trace.py @@ -686,7 +686,8 @@ def _trace_impl( # it is hard to trace it because the forward method on ScriptModule is already defined, so it # would result in an error. warnings.warn( - "The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is." + "The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is.", + stacklevel=2, ) return func diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index 9bfa6832944..50b131aeab4 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -389,7 +389,8 @@ def is_tensor(ann): warnings.warn( "TorchScript will treat type annotations of Tensor " "dtype-specific subtypes as if they are normal Tensors. " - "dtype constraints are not enforced in compilation either." + "dtype constraints are not enforced in compilation either.", + stacklevel=2, ) return True diff --git a/torch/masked/_ops.py b/torch/masked/_ops.py index fd7d19b0284..e330b59f47a 100644 --- a/torch/masked/_ops.py +++ b/torch/masked/_ops.py @@ -44,7 +44,8 @@ def _apply_docstring_templates(func: Callable[_P, _T]) -> Callable[_P, _T]: warnings.warn( f"No documentation string available for {func.__name__}." " PyTorch team should run `python tools/update_masked_docs.py`" - " to generate the missing docstrings." + " to generate the missing docstrings.", + stacklevel=2, ) else: func.__doc__ = doc_string diff --git a/torch/masked/maskedtensor/core.py b/torch/masked/maskedtensor/core.py index 0b3fa9b858f..75a41e705b1 100644 --- a/torch/masked/maskedtensor/core.py +++ b/torch/masked/maskedtensor/core.py @@ -322,7 +322,7 @@ class MaskedTensor(torch.Tensor): "In the case that the semantics for the operator are not trivial, it would be appreciated " "to also include a proposal for the semantics." 
) - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) return NotImplemented def __lt__(self, other): diff --git a/torch/masked/maskedtensor/reductions.py b/torch/masked/maskedtensor/reductions.py index fedab1c12a6..6acc8415267 100644 --- a/torch/masked/maskedtensor/reductions.py +++ b/torch/masked/maskedtensor/reductions.py @@ -90,7 +90,7 @@ def _torch_reduce_dim(fn): "In the case that the semantics for the operator are not trivial, it would be appreciated " "to also include a proposal for the semantics." ) - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) return NotImplemented if not is_masked_tensor(self): raise TypeError("Input to reduce_dim must be a MaskedTensor") diff --git a/torch/multiprocessing/spawn.py b/torch/multiprocessing/spawn.py index d4652ab32ff..272335a538b 100644 --- a/torch/multiprocessing/spawn.py +++ b/torch/multiprocessing/spawn.py @@ -223,7 +223,9 @@ class ProcessContext: class SpawnContext(ProcessContext): def __init__(self, processes, error_files): - warnings.warn("SpawnContext is renamed to ProcessContext since 1.4 release.") + warnings.warn( + "SpawnContext is renamed to ProcessContext since 1.4 release.", stacklevel=2 + ) super().__init__(processes, error_files) diff --git a/torch/nn/_reduction.py b/torch/nn/_reduction.py index 93b00dc6feb..9764f935b7c 100644 --- a/torch/nn/_reduction.py +++ b/torch/nn/_reduction.py @@ -13,7 +13,8 @@ def get_enum(reduction: str) -> int: elif reduction == "elementwise_mean": warnings.warn( "reduction='elementwise_mean' is deprecated. " - "Please use reduction='mean' instead." + "Please use reduction='mean' instead.", + stacklevel=2, ) ret = 1 elif reduction == "sum": @@ -48,7 +49,7 @@ def legacy_get_string( else: ret = "none" if emit_warning: - warnings.warn(warning.format(ret)) + warnings.warn(warning.format(ret), stacklevel=2) return ret diff --git a/torch/nn/attention/__init__.py b/torch/nn/attention/__init__.py index 4aa6c14f811..9113fd7e379 100644 --- a/torch/nn/attention/__init__.py +++ b/torch/nn/attention/__init__.py @@ -60,10 +60,10 @@ def _raise_kernel_warnings(params: SDPAParams) -> None: """ if WARN_FOR_UNFUSED_KERNELS: if not can_use_efficient_attention(params): - warn("Efficient attention can't be used because:") + warn("Efficient attention can't be used because:", stacklevel=2) can_use_efficient_attention(params, True) if not can_use_flash_attention(params): - warn("Flash attention can't be used because:") + warn("Flash attention can't be used because:", stacklevel=2) can_use_flash_attention(params, True) diff --git a/torch/nn/attention/bias.py b/torch/nn/attention/bias.py index 2a1a97fc756..551a57e6963 100644 --- a/torch/nn/attention/bias.py +++ b/torch/nn/attention/bias.py @@ -134,7 +134,8 @@ class CausalBias(torch.Tensor): self.seq_len_kv = seq_len_kv if seq_len_q > seq_len_kv and variant == CausalVariant.LOWER_RIGHT: warn( - "Lower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!" + "Lower right causal bias will produce NaNs in the output when seq_len_q > seq_len_kv!", + stacklevel=2, ) def _upper_left(self, device: torch.device) -> torch.Tensor: diff --git a/torch/nn/attention/flex_attention.py b/torch/nn/attention/flex_attention.py index 01f5fe84356..fae220f7545 100644 --- a/torch/nn/attention/flex_attention.py +++ b/torch/nn/attention/flex_attention.py @@ -1152,6 +1152,7 @@ def create_block_mask( warnings.warn( "_compile flag on create_block_mask was originally added to work around a torch.compile limitation. That limitation has since been addressed. 
So, to compile create_block_mask, we suggest doing torch.compile(create_block_mask). This still works for now, but will be removed in the future.", DeprecationWarning, + stacklevel=2, ) return torch.compile(create_block_mask)( mask_mod, B, H, Q_LEN, KV_LEN, device, BLOCK_SIZE diff --git a/torch/nn/functional.py b/torch/nn/functional.py index c562bc63dc4..360d687094d 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1551,7 +1551,7 @@ def dropout2d( "exists to provide channel-wise dropout on inputs with 2 spatial dimensions, " "a channel dimension, and an optional batch dimension (i.e. 3D or 4D inputs)." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) # TODO: Properly support no-batch-dim inputs. For now, these are NOT supported; passing # a 3D input will perform dropout1d behavior instead. This was done historically and the @@ -1563,7 +1563,8 @@ def dropout2d( "1D dropout behavior is desired - input is interpreted as shape (N, C, L), where C " "is the channel dim. This behavior will change in a future release to interpret the " "input as one without a batch dimension, i.e. shape (C, H, W). To maintain the 1D " - "channel-wise dropout behavior, please switch to using dropout1d instead." + "channel-wise dropout behavior, please switch to using dropout1d instead.", + stacklevel=2, ) result = ( @@ -1610,7 +1611,7 @@ def dropout3d( "exists to provide channel-wise dropout on inputs with 3 spatial dimensions, " "a channel dimension, and an optional batch dimension (i.e. 4D or 5D inputs)." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) is_batched = inp_dim == 5 if not is_batched: @@ -2210,7 +2211,7 @@ def gumbel_softmax( gumbel_softmax, (logits,), logits, tau=tau, hard=hard, eps=eps, dim=dim ) if eps != 1e-10: - warnings.warn("`eps` parameter is deprecated and has no effect.") + warnings.warn("`eps` parameter is deprecated and has no effect.", stacklevel=2) gumbels = ( -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format) @@ -2681,7 +2682,8 @@ def embedding_bag( warnings.warn( "Argument order of nn.functional.embedding_bag was changed. " "Usage `embedding_bag(weight, input, ...)` is deprecated, " - "and should now be `embedding_bag(input, weight, ...)`." + "and should now be `embedding_bag(input, weight, ...)`.", + stacklevel=2, ) weight, input = input, weight @@ -3392,7 +3394,8 @@ def kl_div( warnings.warn( "reduction: 'mean' divides the total loss by both the batch size and the support size." "'batchmean' divides only by the batch size, and aligns with the KL div math definition." - "'mean' will be changed to behave the same as 'batchmean' in the next major release." + "'mean' will be changed to behave the same as 'batchmean' in the next major release.", + stacklevel=2, ) # special case for batchmean @@ -5213,7 +5216,8 @@ def grid_sample( "Default grid_sample and affine_grid behavior has changed " "to align_corners=False since 1.3.0. Please specify " "align_corners=True if the old behavior is desired. " - "See the documentation of grid_sample for details." + "See the documentation of grid_sample for details.", + stacklevel=2, ) align_corners = False @@ -5280,7 +5284,8 @@ def affine_grid( "Default grid_sample and affine_grid behavior has changed " "to align_corners=False since 1.3.0. Please specify " "align_corners=True if the old behavior is desired. " - "See the documentation of grid_sample for details." 
+ "See the documentation of grid_sample for details.", + stacklevel=2, ) align_corners = False @@ -5314,7 +5319,8 @@ def affine_grid( "Since version 1.3.0, affine_grid behavior has changed " "for unit-size grids when align_corners=True. " "This is not an intended use case of affine_grid. " - "See the documentation of affine_grid for details." + "See the documentation of affine_grid for details.", + stacklevel=2, ) elif min(size) <= 0: raise ValueError(f"Expected non-zero, positive output size. Got {size}") @@ -6158,7 +6164,8 @@ def _canonical_mask( if _mask_dtype != other_type: warnings.warn( f"Support for mismatched {mask_name} and {other_name} " - "is deprecated. Use same type for both instead." + "is deprecated. Use same type for both instead.", + stacklevel=2, ) if not _mask_is_float: mask = torch.zeros_like(mask, dtype=target_type).masked_fill_( diff --git a/torch/nn/init.py b/torch/nn/init.py index 18358dbabbb..78fe7cd7ff8 100644 --- a/torch/nn/init.py +++ b/torch/nn/init.py @@ -565,7 +565,7 @@ def kaiming_uniform_( ) if 0 in tensor.shape: - warnings.warn("Initializing zero-element tensors is a no-op") + warnings.warn("Initializing zero-element tensors is a no-op", stacklevel=2) return tensor fan = _calculate_correct_fan(tensor, mode) gain = calculate_gain(nonlinearity, a) @@ -619,7 +619,7 @@ def kaiming_normal_( pass in a transposed weight matrix, i.e. ``nn.init.kaiming_normal_(w.T, ...)``. """ if 0 in tensor.shape: - warnings.warn("Initializing zero-element tensors is a no-op") + warnings.warn("Initializing zero-element tensors is a no-op", stacklevel=2) return tensor fan = _calculate_correct_fan(tensor, mode) gain = calculate_gain(nonlinearity, a) diff --git a/torch/nn/modules/instancenorm.py b/torch/nn/modules/instancenorm.py index 25f0c45d5c1..da3d3658553 100644 --- a/torch/nn/modules/instancenorm.py +++ b/torch/nn/modules/instancenorm.py @@ -115,7 +115,8 @@ class _InstanceNorm(_NormBase): warnings.warn( f"input's size at dim={feature_dim} does not match num_features. " "You can silence this warning by not passing in num_features, " - "which is not used because affine=False" + "which is not used because affine=False", + stacklevel=2, ) if input.dim() == self._get_no_batch_dim(): diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 194e68046e8..f7e3d2f262d 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -1353,7 +1353,8 @@ class Module: "Complex modules are a new feature under active development whose design may change, " "and some modules might not work as expected when using complex tensors as parameters or buffers. " "Please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml " - "if a complex module does not work as expected." 
+ "if a complex module does not work as expected.", + stacklevel=2, ) def convert(t): @@ -1855,7 +1856,7 @@ class Module: if not isinstance(result, (torch.Tensor, tuple)): warnings.warn("For backward hooks to be called," " module output should be a Tensor or a tuple of Tensors" - f" but received {type(result)}") + f" but received {type(result)}", stacklevel=2) result = bw_hook.setup_output_hook(result) # Handle the non-full backward hooks @@ -1898,7 +1899,7 @@ class Module: result = hook_result except Exception as e: warnings.warn("global module forward hook with ``always_call=True`` raised an exception " - f"that was silenced as another error was raised in forward: {str(e)}") + f"that was silenced as another error was raised in forward: {str(e)}", stacklevel=2) continue for hook_id, hook in self._forward_hooks.items(): @@ -1912,7 +1913,7 @@ class Module: result = hook_result except Exception as e: warnings.warn("module forward hook with ``always_call=True`` raised an exception " - f"that was silenced as another error was raised in forward: {str(e)}") + f"that was silenced as another error was raised in forward: {str(e)}", stacklevel=2) continue # raise exception raised in try block raise @@ -2457,7 +2458,8 @@ class Module: f"for {key}: copying from a non-meta parameter in the checkpoint to a meta " "parameter in the current model, which is a no-op. (Did you mean to " "pass `assign=True` to assign items in the state dictionary to their " - "corresponding key in the module instead of copying them in place?)" + "corresponding key in the module instead of copying them in place?)", + stacklevel=2, ) try: @@ -2956,7 +2958,8 @@ class Module: "Calling .zero_grad() from a module created with nn.DataParallel() has no effect. " "The parameters are copied (in a differentiable manner) from the original module. " "This means they are not leaf nodes in autograd and so don't accumulate gradients. " - "If you need gradients in your forward method, consider using autograd.grad instead." 
+ "If you need gradients in your forward method, consider using autograd.grad instead.", + stacklevel=2, ) for p in self.parameters(): diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py index bff265bd92a..c7b44b61354 100644 --- a/torch/nn/modules/rnn.py +++ b/torch/nn/modules/rnn.py @@ -124,7 +124,8 @@ class RNNBase(Module): "dropout option adds dropout after all but last " "recurrent layer, so non-zero dropout expects " f"num_layers greater than 1, but got dropout={dropout} and " - f"num_layers={num_layers}" + f"num_layers={num_layers}", + stacklevel=2, ) if not isinstance(hidden_size, int): diff --git a/torch/nn/modules/transformer.py b/torch/nn/modules/transformer.py index 2f69d89b19e..5f445bf26c7 100644 --- a/torch/nn/modules/transformer.py +++ b/torch/nn/modules/transformer.py @@ -399,7 +399,8 @@ class TransformerEncoder(Module): if enable_nested_tensor and why_not_sparsity_fast_path: warnings.warn( - f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}" + f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}", + stacklevel=2, ) self.use_nested_tensor = False diff --git a/torch/nn/parallel/_functions.py b/torch/nn/parallel/_functions.py index 5170b172fbb..e88a8e1795f 100644 --- a/torch/nn/parallel/_functions.py +++ b/torch/nn/parallel/_functions.py @@ -71,7 +71,8 @@ class Gather(Function): warnings.warn( "Was asked to gather along dimension 0, but all " "input tensors were scalars; will instead unsqueeze " - "and return a vector." + "and return a vector.", + stacklevel=2, ) ctx.unsqueezed_scalar = True else: diff --git a/torch/nn/parallel/data_parallel.py b/torch/nn/parallel/data_parallel.py index 56ad3b8b201..9a0f4973d31 100644 --- a/torch/nn/parallel/data_parallel.py +++ b/torch/nn/parallel/data_parallel.py @@ -36,7 +36,8 @@ def _check_balance(device_ids: Sequence[Union[int, torch.device]]) -> None: max_pos, max_val = max(enumerate(values), key=operator.itemgetter(1)) if min_val / max_val < 0.75: warnings.warn( - imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]) + imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]), + stacklevel=2, ) return True return False diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 4444f557f4a..1072b68ea11 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -2365,7 +2365,8 @@ class DistributedDataParallel(Module, Joinable): # If self.static_graph has been set, no need to set it again if self.static_graph: warnings.warn( - "You've set static_graph to be True, no need to set it again." + "You've set static_graph to be True, no need to set it again.", + stacklevel=2, ) return self.static_graph = True @@ -2379,7 +2380,8 @@ class DistributedDataParallel(Module, Joinable): "`_set_static_graph` will detect unused parameters automatically, so " "you do not need to set find_unused_parameters=true, just be sure these " "unused parameters will not change during training loop while calling " - "`_set_static_graph`." + "`_set_static_graph`.", + stacklevel=2, ) def _remove_autograd_hooks(self): diff --git a/torch/nn/utils/_deprecation_utils.py b/torch/nn/utils/_deprecation_utils.py index 995da89c70b..a25b6473079 100644 --- a/torch/nn/utils/_deprecation_utils.py +++ b/torch/nn/utils/_deprecation_utils.py @@ -45,7 +45,7 @@ def lazy_deprecated_import( if name in all: # We are using the "RuntimeWarning" to make sure it is not # ignored by default. 
- warnings.warn(warning_message, RuntimeWarning) + warnings.warn(warning_message, RuntimeWarning, stacklevel=2) package = importlib.import_module(new_module) return getattr(package, name) raise AttributeError(f"Module {new_module!r} has no attribute {name!r}.") diff --git a/torch/onnx/_internal/exporter/_dynamic_shapes.py b/torch/onnx/_internal/exporter/_dynamic_shapes.py index 20651017f3e..e128ecf74e9 100644 --- a/torch/onnx/_internal/exporter/_dynamic_shapes.py +++ b/torch/onnx/_internal/exporter/_dynamic_shapes.py @@ -271,7 +271,8 @@ def create_rename_mapping( if input.shape[dim].value in rename_mapping: warnings.warn( f"# The axis name: {custom_name} will not be used, since it shares " - f"the same shape constraints with another axis: {rename_mapping[input.shape[dim].value]}." + f"the same shape constraints with another axis: {rename_mapping[input.shape[dim].value]}.", + stacklevel=2, ) continue rename_mapping[input.shape[dim].value] = custom_name diff --git a/torch/onnx/_internal/torchscript_exporter/registration.py b/torch/onnx/_internal/torchscript_exporter/registration.py index f073227f87b..e35903e6823 100644 --- a/torch/onnx/_internal/torchscript_exporter/registration.py +++ b/torch/onnx/_internal/torchscript_exporter/registration.py @@ -164,6 +164,7 @@ class _SymbolicFunctionGroup: f"Replacing the existing function with new function. This is unexpected. " f"Please report it on {_constants.PYTORCH_GITHUB_ISSUES_URL}.", errors.OnnxExporterWarning, + stacklevel=2, ) self._functions.set_base(opset, func) @@ -184,7 +185,8 @@ class _SymbolicFunctionGroup: """ if not self._functions.overridden(opset): warnings.warn( - f"No custom function registered for '{self._name}' opset {opset}" + f"No custom function registered for '{self._name}' opset {opset}", + stacklevel=2, ) return self._functions.remove_override(opset) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py index 59cd0eb0f89..6b0da4a6ae0 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_helper.py @@ -909,7 +909,8 @@ def _interpolate_warning(interpolate_mode): "ONNX's Upsample/Resize operator did not match Pytorch's Interpolation until opset 11. " "Attributes to determine how to transform the input were added in onnx:Resize in opset 11 " "to support Pytorch's behavior (like coordinate_transformation_mode and nearest_mode).\n" - "We recommend using opset 11 and above for models using this operator." + "We recommend using opset 11 and above for models using this operator.", + stacklevel=2, ) @@ -1236,7 +1237,8 @@ def __interpolate_helper( if not is_scalar: warnings.warn( "Cannot verify if the output_size is a scalar " - "while exporting interpolate. Assuming that it is not a scalar." + "while exporting interpolate. Assuming that it is not a scalar.", + stacklevel=2, ) if is_scalar: @@ -1577,7 +1579,8 @@ def check_training_mode(op_train_mode: int, op_name: str) -> None: # in training. warnings.warn( f"ONNX export mode is set to {GLOBALS.training_mode}, but operator '{op_name}' " - f"is set to {op_mode_text}. Exporting with {op_mode_text}." + f"is set to {op_mode_text}. 
Exporting with {op_mode_text}.", + stacklevel=2, ) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset10.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset10.py index 6bb09ef3ec2..a757409bf10 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset10.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset10.py @@ -613,7 +613,8 @@ def embedding_bag( warnings.warn( "Export of embedding_bag with dynamic input/offsets shape is not supported in opset 10. " - "Please use opset 11 or higher to export model for dynamic input shape.'" + "Please use opset 11 or higher to export model for dynamic input shape.'", + stacklevel=2, ) offsets_dim_0 = symbolic_helper._get_tensor_dim_size(offsets, 0) if offsets_dim_0 is not None: diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset11.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset11.py index cbba5d2e61c..c46af044a3e 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset11.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset11.py @@ -914,7 +914,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): + str(dim_size) + ". The model will " + "be exported without the squeeze node. If the model is intended to be used with dynamic " - + "input shapes, please export with dynamic_axes argument." + + "input shapes, please export with dynamic_axes argument.", + stacklevel=2, ) return self return symbolic_helper._squeeze_helper(g, self, [dim]) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset7.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset7.py index d11750b1ee8..ae9a5039d39 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset7.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset7.py @@ -48,7 +48,8 @@ def max(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): warnings.warn( "Multidirectional broadcasting is not supported in opset 7. " "This might cause the onnx model to be incorrect, if inputs to max operators " - "have different shapes" + "have different shapes", + stacklevel=2, ) return opset9.max(g, self, dim_or_y, keepdim) @@ -60,7 +61,8 @@ def min(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): warnings.warn( "Multidirectional broadcasting is not supported in opset 7. " "This might cause the onnx model to be incorrect, if inputs to min operators " - "have different shapes" + "have different shapes", + stacklevel=2, ) return opset9.min(g, self, dim_or_y, keepdim) diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py index 8ba8e6ee662..3e05e82842f 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset8.py @@ -183,7 +183,8 @@ def _try_cast_integer_to_float(g: jit_utils.GraphContext, *args): warnings.warn( "Only floating datatype is supported for these operators: " "{Greater, Less, MatMul, PRelu, Gemm, Flatten}. This might cause " - "the onnx model to be incorrect, if inputs have integer datatypes." 
+ "the onnx model to be incorrect, if inputs have integer datatypes.", + stacklevel=2, ) return (old_type,) + args diff --git a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py index 16e94b91f89..53e6e592da0 100644 --- a/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py +++ b/torch/onnx/_internal/torchscript_exporter/symbolic_opset9.py @@ -925,7 +925,8 @@ def embedding( warnings.warn( "Warning: ONNX export of embedding with padding_idx >= 0 " "for training mode. " - "ONNX does not support not updating the embedding vector at padding_idx during training." + "ONNX does not support not updating the embedding vector at padding_idx during training.", + stacklevel=2, ) return g.op("Gather", weight, indices) @@ -1142,7 +1143,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): + "Axis is converted to " + str(squeeze_dim + rank) + " based on input shape at export time. " - + "Passing an tensor of different rank in execution will be incorrect." + + "Passing an tensor of different rank in execution will be incorrect.", + stacklevel=2, ) squeeze_dim += rank else: @@ -1161,7 +1163,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): + " of the input " + "is not 1, the ONNX model will return an error. Opset version 11 supports squeezing on " + "non-singleton dimensions, it is recommended to export this model using opset " - + "version 11 or higher." + + "version 11 or higher.", + stacklevel=2, ) return symbolic_helper._squeeze_helper(g, self, axes_i=[squeeze_dim]) if dim_size > 1: @@ -1174,7 +1177,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): + ". The model will " + "be exported without the squeeze node. If the model is intended to be used with dynamic " + "input shapes, please use opset version 11 to " - + "export the model." + + "export the model.", + stacklevel=2, ) return self @@ -1182,7 +1186,8 @@ def squeeze(g: jit_utils.GraphContext, self, dim=None): "This model contains a squeeze operation on dimension " + str(squeeze_dim) + ". If the model is " - + "intended to be used with dynamic input shapes, please use opset version 11 to export the model." + + "intended to be used with dynamic input shapes, please use opset version 11 to export the model.", + stacklevel=2, ) return symbolic_helper._squeeze_helper(g, self, axes_i=[squeeze_dim]) @@ -3859,7 +3864,8 @@ def unsqueeze(g: jit_utils.GraphContext, self, dim): + "Axis is converted to " + str(dim + rank + 1) + " based on input shape at export time. " - + "Passing an tensor of different rank in execution will be incorrect." + + "Passing an tensor of different rank in execution will be incorrect.", + stacklevel=2, ) dim = dim + rank + 1 else: @@ -4266,7 +4272,8 @@ def _generic_rnn( + " can cause an error " + "when running the ONNX model with a different batch size. " + "Make sure to save the model with a batch size of 1, " - + "or define the initial states (h0/c0) as inputs of the model. " + + "or define the initial states (h0/c0) as inputs of the model. ", + stacklevel=2, ) onnxActivations = [ @@ -5316,7 +5323,8 @@ def index(g: jit_utils.GraphContext, self, index): warnings.warn( "Exporting aten::index operator with indices of type Byte. " "Only 1-D indices are supported. In any other case, " - "this will produce an incorrect ONNX graph." 
+ "this will produce an incorrect ONNX graph.", + stacklevel=2, ) index = symbolic_helper._squeeze_helper(g, nonzero(g, index), [1]) return index @@ -5370,7 +5378,8 @@ def index(g: jit_utils.GraphContext, self, index): f"{GLOBALS.export_onnx_opset_version}" " is achieved by combination of multiple ONNX operators, " "including Reshape, Transpose, Concat, and Gather. " - "If indices include negative values, the exported graph will produce incorrect results." + "If indices include negative values, the exported graph will produce incorrect results.", + stacklevel=2, ) adv_idx_count = len(adv_idx_indices) shape_tensor = _shape_as_tensor(g, self) @@ -6061,7 +6070,8 @@ def fill(g: jit_utils.GraphContext, self, value): def index_add(g: jit_utils.GraphContext, self, dim, index, other, alpha=None): warnings.warn( "Warning: ONNX export does not support duplicated values in 'index' field, " - + "this will cause the ONNX model to be incorrect." + + "this will cause the ONNX model to be incorrect.", + stacklevel=2, ) # ONNX does not support "alpha" argument, unlike aten index_add diff --git a/torch/onnx/_internal/torchscript_exporter/utils.py b/torch/onnx/_internal/torchscript_exporter/utils.py index f2004ac0232..d66962f690e 100644 --- a/torch/onnx/_internal/torchscript_exporter/utils.py +++ b/torch/onnx/_internal/torchscript_exporter/utils.py @@ -121,7 +121,8 @@ def select_model_mode_for_export(model, mode: _C_onnx.TrainingMode): "You are exporting the model in training mode with onnx opset " f"version {GLOBALS.export_onnx_opset_version}. " "Opset versions lower than opset 12 will not be able to export " - "nodes such as Dropout and BatchNorm correctly." + "nodes such as Dropout and BatchNorm correctly.", + stacklevel=2, ) else: GLOBALS.export_training = False @@ -532,6 +533,7 @@ def export( warnings.warn( "Setting `operator_export_type` to something other than default is deprecated. " "The option will be removed in a future release.", + stacklevel=2, category=DeprecationWarning, ) if training == _C_onnx.TrainingMode.TRAINING: @@ -539,6 +541,7 @@ def export( "Setting `training` to something other than default is deprecated. " "The option will be removed in a future release. Please set the training mode " "before exporting the model.", + stacklevel=2, category=DeprecationWarning, ) @@ -738,14 +741,14 @@ def warn_on_static_input_change(input_states): "for configuration use. " "Also note that the order and values of the keys must remain the same. " ) - warnings.warn(warning) + warnings.warn(warning, stacklevel=2) elif isinstance(input, str): if input != traced_input: warning = ( "The model seems to have string inputs/outputs. " "Note that strings will not appear as inputs/outputs of the ONNX graph. " ) - warnings.warn(warning) + warnings.warn(warning, stacklevel=2) def _resolve_args_by_export_type(arg_name, arg_value, operator_export_type): @@ -782,7 +785,8 @@ def _decide_keep_init_as_input( "8 or lower would lead to an invalid ONNX graph. Therefore, " "'keep_initializers_as_inputs=False' is ignored during export." "Exported model will have initializers as graph inputs (compliant " - " to ONNX IR v3)." + " to ONNX IR v3).", + stacklevel=2, ) return True # i.e. True == initializers are part of graph input (ONNX IR v3) val_keep_init_as_ip = ( @@ -815,7 +819,8 @@ def _decide_constant_folding(do_constant_folding, operator_export_type, training "or 'training=TrainingMode.PRESERVE' (when model is in training mode). 
Otherwise, some " "learnable model parameters may not translate correctly in the exported ONNX model " "because constant folding mutates model parameters. Please consider " - "turning off constant folding or setting the training=TrainingMode.EVAL." + "turning off constant folding or setting the training=TrainingMode.EVAL.", + stacklevel=2, ) return do_constant_folding @@ -831,7 +836,7 @@ def _decide_input_format(model, args): try: sig = _signature(model) except ValueError as e: - warnings.warn(f"{e}, skipping _decide_input_format") + warnings.warn(f"{e}, skipping _decide_input_format", stacklevel=2) return args try: ordered_list_keys = list(sig.parameters.keys()) @@ -859,9 +864,9 @@ def _decide_input_format(model, args): args = args_list if isinstance(args, list) else tuple(args_list) # Cases of models with no input args except IndexError: - warnings.warn("No input args, skipping _decide_input_format") + warnings.warn("No input args, skipping _decide_input_format", stacklevel=2) except Exception as e: - warnings.warn(f"Skipping _decide_input_format\n {e.args[0]}") + warnings.warn(f"Skipping _decide_input_format\n {e.args[0]}", stacklevel=2) return args @@ -1449,6 +1454,7 @@ def _export( f"by 'torch.onnx.export()'. " f"The highest opset version supported is {_constants.ONNX_TORCHSCRIPT_EXPORTER_MAX_OPSET}. " f"To use a newer opset version, consider 'torch.onnx.export(..., dynamo=True)'. ", + stacklevel=2, category=errors.OnnxExporterWarning, ) @@ -1901,12 +1907,14 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): for key, value in dynamic_axes.items(): if key not in valid_names: warnings.warn( - f"Provided key {key} for dynamic axes is not a valid input/output name" + f"Provided key {key} for dynamic axes is not a valid input/output name", + stacklevel=2, ) if isinstance(value, list): warnings.warn( "No names were found for specified dynamic axes of provided input." - f"Automatically generated names will be applied to each dynamic axes of input {key}" + f"Automatically generated names will be applied to each dynamic axes of input {key}", + stacklevel=2, ) value_dict = {} @@ -1917,7 +1925,8 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): ) if x in value_dict: warnings.warn( - f"Duplicate dynamic axis index {x} was provided for input {key}." + f"Duplicate dynamic axis index {x} was provided for input {key}.", + stacklevel=2, ) else: value_dict[x] = str(key) + "_dynamic_axes_" + str(i + 1) diff --git a/torch/onnx/_internal/torchscript_exporter/verification.py b/torch/onnx/_internal/torchscript_exporter/verification.py index c3cb967c14c..4ef9742ad6f 100644 --- a/torch/onnx/_internal/torchscript_exporter/verification.py +++ b/torch/onnx/_internal/torchscript_exporter/verification.py @@ -244,15 +244,16 @@ def _compare_onnx_pytorch_outputs_in_np( warnings.warn( f"Suppressed AssertionError:\n{e}.\n" f"Error percentage {error_percentage} " - f"within acceptable range {acceptable_error_percentage}." 
+ f"within acceptable range {acceptable_error_percentage}.", + stacklevel=2, ) continue # pyrefly: ignore # missing-attribute if ort_out.dtype == np.uint8 or ort_out.dtype == np.int8: - warnings.warn("ONNX output is quantized") + warnings.warn("ONNX output is quantized", stacklevel=2) # pyrefly: ignore # missing-attribute if pt_out.dtype == np.uint8 or pt_out.dtype == np.int8: - warnings.warn("PyTorch output is quantized") + warnings.warn("PyTorch output is quantized", stacklevel=2) raise @@ -369,7 +370,8 @@ def _try_clone_model(model): return copy.deepcopy(model) except Exception: warnings.warn( - "Failed to clone model. Model state might be mutated during verification." + "Failed to clone model. Model state might be mutated during verification.", + stacklevel=2, ) return model diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 3a6bc296d70..3cc6649e0d8 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -267,6 +267,7 @@ class LRScheduler: "`lr_scheduler.step()`. See more details at " "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning, + stacklevel=2, ) # Just check if there were two first lr_scheduler.step() calls before optimizer.step() @@ -279,11 +280,12 @@ class LRScheduler: "See more details at " "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning, + stacklevel=2, ) self._step_count += 1 if epoch is not None: - warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning) + warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning, stacklevel=2) self._update_lr(epoch) def _update_lr(self, epoch: Optional[int] = None): @@ -1696,7 +1698,7 @@ class ReduceLROnPlateau(LRScheduler): if epoch is None: epoch = self.last_epoch + 1 else: - warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning) + warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning, stacklevel=2) self.last_epoch = epoch if self._is_better(current, self.best): diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index c542dbfd84f..5475b2755d4 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -483,7 +483,8 @@ class Optimizer: warnings.warn( "This instance was constructed with capturable=True or some of all the param_groups came with capturable=True, " "but step() is running without CUDA graph capture. If you never intend to graph-capture this " - "instance, capturable=True can impair performance, and you should set capturable=False." 
+ "instance, capturable=True can impair performance, and you should set capturable=False.", + stacklevel=2, ) self._warned_capturable_if_run_uncaptured = True diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py index 347a7976a58..08cd0b504dc 100644 --- a/torch/optim/swa_utils.py +++ b/torch/optim/swa_utils.py @@ -491,6 +491,7 @@ class SWALR(LRScheduler): "To get the last learning rate computed by the scheduler, " "please use `get_last_lr()`.", UserWarning, + stacklevel=2, ) # Set in `LRScheduler._initial_step()` step = self._step_count - 1 diff --git a/torch/overrides.py b/torch/overrides.py index 264edf07b91..db4a7535a36 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -1747,6 +1747,7 @@ def handle_torch_function( "Defining your `__torch_function__ as a plain method is deprecated and " "will be an error in future, please define it as a classmethod.", DeprecationWarning, + stacklevel=2, ) # Use `public_api` instead of `implementation` so __torch_function__ @@ -2057,7 +2058,8 @@ class TorchFunctionMode: @classmethod def push(cls, *args, **kwargs): warnings.warn( - "`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`" + "`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`", + stacklevel=2, ) instance = cls(*args, **kwargs) return instance diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index e92aa3fafb7..aa046db4454 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -512,7 +512,10 @@ def schedule( wait >= 0 and warmup >= 0 and active > 0 and repeat >= 0 and skip_first >= 0 ), "Invalid profiler schedule arguments" if warmup == 0: - warn("Profiler won't be using warmup, this can skew profiler results") + warn( + "Profiler won't be using warmup, this can skew profiler results", + stacklevel=2, + ) return schedule_fn @@ -930,7 +933,8 @@ class ExecutionTraceObserver(_ITraceObserver): fp = tempfile.NamedTemporaryFile("w+t", suffix=".et.json", delete=False) except Exception as e: warn( - f"Execution trace will not be recorded. Exception on creating default temporary file: {e}" + f"Execution trace will not be recorded. Exception on creating default temporary file: {e}", + stacklevel=2, ) return None fp.close() @@ -1015,7 +1019,10 @@ class ExecutionTraceObserver(_ITraceObserver): try: os.mkdir(resource_dir) except Exception: - warn(f"Execution trace exception when creating {resource_dir}") + warn( + f"Execution trace exception when creating {resource_dir}", + stacklevel=2, + ) return None else: return None @@ -1031,7 +1038,8 @@ class ExecutionTraceObserver(_ITraceObserver): resource_dir = self.get_resources_dir() except Exception as e: warn( - f"Execution trace exception when generating resource directory: {e}" + f"Execution trace exception when generating resource directory: {e}", + stacklevel=2, ) return if not resource_dir: @@ -1066,7 +1074,7 @@ class ExecutionTraceObserver(_ITraceObserver): try: _save_triton_kernels() except Exception as e: - warn(f"Execution trace failed to save kernels: {e}") + warn(f"Execution trace failed to save kernels: {e}", stacklevel=2) _remove_execution_trace_observer() if self.output_file_path.endswith("gz"): diff --git a/torch/random.py b/torch/random.py index 18a1cf9a5d5..cf23e52db32 100644 --- a/torch/random.py +++ b/torch/random.py @@ -184,7 +184,7 @@ def fork_rng( f"and suppress this warning, set the '{_devices_kw}' keyword argument to " f"`range(torch.{device_type}.device_count())`." 
) - warnings.warn(message) + warnings.warn(message, stacklevel=2) _fork_rng_warned_already = True devices = list(range(num_devices)) else: diff --git a/torch/serialization.py b/torch/serialization.py index 1ef46d63ca2..ed0f76a4412 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -524,7 +524,10 @@ def check_module_version_greater_or_equal( if error_if_malformed: raise RuntimeError(message) from e else: - warnings.warn(message + ", but continuing assuming that requirement is met") + warnings.warn( + message + ", but continuing assuming that requirement is met", + stacklevel=2, + ) requirement_is_met = True return requirement_is_met @@ -1021,7 +1024,8 @@ def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None: warnings.warn( "Couldn't retrieve source code for container of " "type " + obj.__name__ + ". It won't be checked " - "for correctness upon loading." + "for correctness upon loading.", + stacklevel=2, ) return ("module", obj, source_file, source) @@ -1502,6 +1506,7 @@ def load( " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to" " silence this warning)", UserWarning, + stacklevel=2, ) if weights_only: raise RuntimeError( @@ -1603,7 +1608,8 @@ def _legacy_load(f, map_location, pickle_module, **pickle_load_args): warnings.warn( "Couldn't retrieve source code for container of " "type " + container_type.__name__ + ". It won't be checked " - "for correctness upon loading." + "for correctness upon loading.", + stacklevel=2, ) return if original_source != current_source: @@ -1645,7 +1651,7 @@ def _legacy_load(f, map_location, pickle_module, **pickle_load_args): "patch tool to revert the changes." ) msg = f"source code of class '{torch.typename(container_type)}' has changed. {msg}" - warnings.warn(msg, SourceChangeWarning) + warnings.warn(msg, SourceChangeWarning, stacklevel=2) def legacy_load(f): deserialized_objects: dict[int, Any] = {} @@ -1949,6 +1955,7 @@ def _load( "torch.serialization.set_default_load_endianness to set " "the desired default load endianness", UserWarning, + stacklevel=2, ) from torch.utils.serialization import config diff --git a/torch/sparse/_triton_ops_meta.py b/torch/sparse/_triton_ops_meta.py index 903c0a5a9d6..78bdbf07b2b 100644 --- a/torch/sparse/_triton_ops_meta.py +++ b/torch/sparse/_triton_ops_meta.py @@ -194,7 +194,8 @@ def update(op, device_name, version, key, value): # skip storing possible optimization failures: if not value: warnings.warn( - f"skipping empty value for {op}: {device_name=} {version=} {key=}" + f"skipping empty value for {op}: {device_name=} {version=} {key=}", + stacklevel=2, ) return if (op, device_name, version) in _operation_device_version_data: @@ -217,7 +218,8 @@ def dump(): if begin_data_index == -1 or end_data_index == -1: warnings.warn( f"{current_file} cannot be updated:" - " BEGIN/END GENERATED DATA comment blocks appear to be corrupted" + " BEGIN/END GENERATED DATA comment blocks appear to be corrupted", + stacklevel=2, ) return diff --git a/torch/sparse/semi_structured.py b/torch/sparse/semi_structured.py index 7fcdd868793..da5b8b4798a 100644 --- a/torch/sparse/semi_structured.py +++ b/torch/sparse/semi_structured.py @@ -121,6 +121,7 @@ class SparseSemiStructuredTensor(torch.Tensor): "module for further information about the project." 
), UserWarning, + stacklevel=2, ) cls._PROTOTYPE_WARNING_SHOWN = True diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index dd211599cf1..36c72f1d5c3 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -1551,7 +1551,9 @@ def compiled_fsdp_test(compile_compute_on_module: Optional[type] = None): original_fully_shard: Any = torch.distributed.fsdp.fully_shard for mode in FullyShardMode: if mode != FullyShardMode.EAGER and not has_triton(): - warnings.warn("Inductor on GPU needs Triton and recent GPU arch") + warnings.warn( + "Inductor on GPU needs Triton and recent GPU arch", stacklevel=2 + ) continue # barrier to ensure thread reading the same value original_skip_fsdp_hooks = torch._dynamo.config.skip_fsdp_hooks diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 552140f8046..0c26738c2f5 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1258,7 +1258,7 @@ def run_tests(argv=None): # use env vars so pytest-xdist subprocesses can still access them os.environ['SLOW_TESTS_FILE'] = SLOW_TESTS_FILE else: - warnings.warn(f'slow test file provided but not found: {SLOW_TESTS_FILE}') + warnings.warn(f'slow test file provided but not found: {SLOW_TESTS_FILE}', stacklevel=2) if DISABLED_TESTS_FILE: if os.path.exists(DISABLED_TESTS_FILE): with open(DISABLED_TESTS_FILE) as fp: @@ -1266,7 +1266,7 @@ def run_tests(argv=None): disabled_tests_dict = json.load(fp) os.environ['DISABLED_TESTS_FILE'] = DISABLED_TESTS_FILE else: - warnings.warn(f'disabled test file provided but not found: {DISABLED_TESTS_FILE}') + warnings.warn(f'disabled test file provided but not found: {DISABLED_TESTS_FILE}', stacklevel=2) # Determine the test launch mechanism if TEST_DISCOVER: _print_test_names() @@ -2663,7 +2663,7 @@ class CudaMemoryLeakCheck: f"and is now reported as {caching_allocator_mem_allocated} " # type: ignore[possibly-undefined] f"on device {i}. " f"CUDA driver allocated memory was {self.driver_befores[i]} and is now {driver_mem_allocated}.") # type: ignore[possibly-undefined] - warnings.warn(msg) + warnings.warn(msg, stacklevel=2) elif caching_allocator_discrepancy and driver_discrepancy: # type: ignore[possibly-undefined] # A caching allocator discrepancy validated by the driver API is a # failure (except on ROCm, see below) @@ -2757,7 +2757,7 @@ try: "pytorch_ci" if IS_CI else os.getenv('PYTORCH_HYPOTHESIS_PROFILE', 'dev') ) except ImportError: - warnings.warn('Fail to import hypothesis in common_utils, tests are not derandomized', ImportWarning) + warnings.warn('Fail to import hypothesis in common_utils, tests are not derandomized', ImportWarning, stacklevel=2) # Used in check_if_enable to see if a test method should be disabled by an issue, # sanitizes a test method name from appended suffixes by @dtypes parametrization. @@ -4648,7 +4648,7 @@ def download_file(url, binary=True): return path except error.URLError as e: msg = f"could not download test file '{url}'" - warnings.warn(msg, RuntimeWarning) + warnings.warn(msg, RuntimeWarning, stacklevel=2) raise unittest.SkipTest(msg) from e def find_free_port(): @@ -5744,17 +5744,17 @@ def check_leaked_tensors(limit=1, matched_type=torch.Tensor): num_garbage_objs = len(garbage_objs) if num_garbage_objs > 0: warnings.warn( - f"{num_garbage_objs} tensors were found in the garbage. Did you introduce a reference cycle?" + f"{num_garbage_objs} tensors were found in the garbage. 
Did you introduce a reference cycle?", stacklevel=2 ) try: import objgraph # type: ignore[import-not-found,import-untyped] warnings.warn( - f"Dumping first {limit} objgraphs of leaked {matched_type}s rendered to png" + f"Dumping first {limit} objgraphs of leaked {matched_type}s rendered to png", stacklevel=2 ) for g in garbage_objs[:limit]: objgraph.show_backrefs([g], max_depth=10) except ImportError: - warnings.warn("`pip install objgraph` to enable memory leak debugging") + warnings.warn("`pip install objgraph` to enable memory leak debugging", stacklevel=2) finally: gc.set_debug(0) diff --git a/torch/testing/_internal/opinfo/utils.py b/torch/testing/_internal/opinfo/utils.py index 4000ec6ca13..d9e2127e956 100644 --- a/torch/testing/_internal/opinfo/utils.py +++ b/torch/testing/_internal/opinfo/utils.py @@ -62,7 +62,8 @@ def get_supported_dtypes(op, sample_inputs_fn, device_type): assert device_type in ["cpu", "cuda"] if not TEST_CUDA and device_type == "cuda": warnings.warn( - "WARNING: CUDA is not available, empty_dtypes dispatch will be returned!" + "WARNING: CUDA is not available, empty_dtypes dispatch will be returned!", + stacklevel=2, ) return _dynamic_dispatch_dtypes(()) @@ -76,7 +77,8 @@ def get_supported_dtypes(op, sample_inputs_fn, device_type): # We raise a warning, so that user knows that this was the case # and can investigate if there was an issue with the `sample_inputs_fn`. warnings.warn( - f"WARNING: Unable to generate sample for device:{device_type} and dtype:{dtype}" + f"WARNING: Unable to generate sample for device:{device_type} and dtype:{dtype}", + stacklevel=2, ) continue diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py index fa756892c34..5c8df5d9ead 100644 --- a/torch/utils/_python_dispatch.py +++ b/torch/utils/_python_dispatch.py @@ -158,7 +158,8 @@ class TorchDispatchMode: @classmethod def push(cls, *args, **kwargs): warnings.warn( - "`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`" + "`Mode.push()` is no longer necessary and can be replaced with just `with Mode()`", + stacklevel=2, ) instance = cls(*args, **kwargs) return instance diff --git a/torch/utils/_pytree.py b/torch/utils/_pytree.py index 2ed1ba60a59..703aea93a56 100644 --- a/torch/utils/_pytree.py +++ b/torch/utils/_pytree.py @@ -602,6 +602,7 @@ def _private_register_pytree_node( warnings.warn( f"{cls} is already registered as pytree node. " "Overwriting the previous registration.", + stacklevel=2, ) node_def = NodeDef(cls, flatten_fn, unflatten_fn, flatten_with_keys_fn) diff --git a/torch/utils/checkpoint.py b/torch/utils/checkpoint.py index d3c41b8fb9e..5707f4e0fd4 100644 --- a/torch/utils/checkpoint.py +++ b/torch/utils/checkpoint.py @@ -83,7 +83,7 @@ def detach_variable(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]: def check_backward_validity(inputs: Iterable[Any]) -> None: if not any(inp.requires_grad for inp in inputs if isinstance(inp, torch.Tensor)): warnings.warn( - "None of the inputs have requires_grad=True. Gradients will be None" + "None of the inputs have requires_grad=True. Gradients will be None", stacklevel=2 ) @@ -144,7 +144,7 @@ def _infer_device_type(*args): "devices will be ignored. Consequently, if any checkpointed functions involve randomness, " "this may result in incorrect gradients. 
(Note that if CUDA devices are among the devices " "detected, it will be prioritized; otherwise, the first device encountered will be selected.)" - f"\nDevice types: {sorted(device_types_set)} first device type: {device_types[0]}" + f"\nDevice types: {sorted(device_types_set)} first device type: {device_types[0]}", stacklevel=2 ) if len(device_types) == 0: return DefaultDeviceType.get_device_type() @@ -565,7 +565,7 @@ def checkpoint_sequential(functions, segments, input, use_reentrant=None, **kwar "is not passed. use_reentrant=False is " "recommended, but if you need to preserve the current default " "behavior, you can pass use_reentrant=True. Refer to docs for more " - "details on the differences between the two variants." + "details on the differences between the two variants.", stacklevel=2 ) use_reentrant = True diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 5e5307555e5..19400eb4a21 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -624,7 +624,8 @@ class DataLoader(Generic[_T_co]): warnings.warn( _create_warning_msg( max_num_worker_suggest, self.num_workers, cpuset_checked - ) + ), + stacklevel=2, ) return @@ -632,7 +633,8 @@ class DataLoader(Generic[_T_co]): warnings.warn( _create_warning_msg( max_num_worker_suggest, self.num_workers, cpuset_checked - ) + ), + stacklevel=2, ) @@ -663,14 +665,15 @@ class _BaseDataLoaderIter: if loader.pin_memory and loader.pin_memory_device: warnings.warn( "pin_memory_device is deprecated, the current accelerator will be used as the device," - f"ignore pin_memory_device='{loader.pin_memory_device}'." + f"ignore pin_memory_device='{loader.pin_memory_device}'.", + stacklevel=2, ) if loader.pin_memory and not torch.accelerator.is_available(): warn_msg = ( "'pin_memory' argument is set as true but no accelerator is found, " "then device pinned memory won't be used." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) # Enabling pin_memory in _BaseDataLoaderIter to support identical # behavior in forked implementations using _BaseDataLoaderIter. @@ -694,7 +697,7 @@ class _BaseDataLoaderIter: "'pin_memory' argument is set as true but not supported on MPS now, " "device pinned memory won't be used." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) self._timeout = loader.timeout self._collate_fn = loader.collate_fn @@ -751,7 +754,7 @@ class _BaseDataLoaderIter: "IterableDataset replica at each worker. Please see " "https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset for examples." ) - warnings.warn(warn_msg) + warnings.warn(warn_msg, stacklevel=2) return data def __len__(self) -> int: diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index b6dda4552c2..2e3d3712442 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -159,6 +159,7 @@ class _ForkerIterDataPipe(IterDataPipe, _ContainerTemplate): "Unlimited buffer size is set for `fork`, " "please be aware of OOM at random places", UserWarning, + stacklevel=2, ) if copy is None: self.copy_fn = _no_op @@ -359,6 +360,7 @@ class _ChildDataPipe(IterDataPipe): "Some child DataPipes are not exhausted when __iter__ is called. We are resetting " "the buffer and each child DataPipe will read from the start again.", UserWarning, + stacklevel=2, ) self.main_datapipe.reset() # 3. 
Otherwise, the iterator is behind the others, so it will just need to catch up by setting @@ -464,6 +466,7 @@ class _DemultiplexerIterDataPipe(IterDataPipe, _ContainerTemplate): "Unlimited buffer size is set for `demux`, " "please be aware of OOM at random places", UserWarning, + stacklevel=2, ) self.current_buffer_usage = 0 # pyrefly: ignore [invalid-type-var] diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py index f90b426be12..e45ddab282f 100644 --- a/torch/utils/data/datapipes/iter/utils.py +++ b/torch/utils/data/datapipes/iter/utils.py @@ -49,7 +49,8 @@ class IterableWrapperIterDataPipe(IterDataPipe[_T]): except TypeError: warnings.warn( "The input iterable can not be deepcopied, " - "please be aware of in-place modification would affect source data." + "please be aware that in-place modification would affect the source data.", + stacklevel=2, ) yield from source_data diff --git a/torch/utils/data/datapipes/map/utils.py b/torch/utils/data/datapipes/map/utils.py index e1290df3237..360f66b3137 100644 --- a/torch/utils/data/datapipes/map/utils.py +++ b/torch/utils/data/datapipes/map/utils.py @@ -47,7 +47,8 @@ class SequenceWrapperMapDataPipe(MapDataPipe[_T]): except TypeError: warnings.warn( "The input sequence can not be deepcopied, " - "please be aware of in-place modification would affect source data" + "please be aware that in-place modification would affect the source data", + stacklevel=2, ) self.sequence = sequence else: diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py index 2390434c3ef..003ca568fca 100644 --- a/torch/utils/data/datapipes/utils/common.py +++ b/torch/utils/data/datapipes/utils/common.py @@ -149,7 +149,8 @@ def _check_unpickable_fn(fn: Callable): if _is_local_fn(fn) and not dill_available(): warnings.warn( "Local function is not supported by pickle, please use " - "regular python function or functools.partial instead." + "regular python function or functools.partial instead.", + stacklevel=2, ) return @@ -157,7 +158,8 @@ def _check_unpickable_fn(fn: Callable): if hasattr(fn, "__name__") and fn.__name__ == "<lambda>" and not dill_available(): warnings.warn( "Lambda function is not supported by pickle, please use " - "regular python function or functools.partial instead." + "regular python function or functools.partial instead.", + stacklevel=2, ) return @@ -185,7 +187,7 @@ def get_file_pathnames_from_root( ) -> Iterable[str]: # print out an error message and raise the error out def onerror(err: OSError): - warnings.warn(err.filename + " : " + err.strerror) + warnings.warn(err.filename + " : " + err.strerror, stacklevel=2) raise err if os.path.isfile(root): @@ -311,7 +313,7 @@ def _deprecation_warning( if new_argument_name: msg = f"{msg}\nPlease use `{old_class_name}({new_argument_name}=)` instead." - warnings.warn(msg, FutureWarning) + warnings.warn(msg, FutureWarning, stacklevel=2) class StreamWrapper: diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py index 221b3116017..f4e61963cd0 100644 --- a/torch/utils/data/dataset.py +++ b/torch/utils/data/dataset.py @@ -463,7 +463,8 @@ def random_split( if length == 0: warnings.warn( f"Length of split at index {i} is 0. " - f"This might result in an empty dataset."
+ f"This might result in an empty dataset.", + stacklevel=2, ) # Cannot verify that dataset is Sized diff --git a/torch/utils/data/graph.py b/torch/utils/data/graph.py index a08421f9b68..8867109c1e0 100644 --- a/torch/utils/data/graph.py +++ b/torch/utils/data/graph.py @@ -132,7 +132,7 @@ def traverse(datapipe: DataPipe, only_datapipe: Optional[bool] = None) -> DataPi ) if not only_datapipe: msg += " And, the behavior will be changed to the equivalent of `only_datapipe=True`." - warnings.warn(msg, FutureWarning) + warnings.warn(msg, FutureWarning, stacklevel=2) if only_datapipe is None: only_datapipe = False cache: set[int] = set() diff --git a/torch/utils/data/graph_settings.py b/torch/utils/data/graph_settings.py index 8cc16c86b0f..bb97558256b 100644 --- a/torch/utils/data/graph_settings.py +++ b/torch/utils/data/graph_settings.py @@ -116,7 +116,8 @@ def apply_shuffle_settings( if not shufflers and shuffle: warnings.warn( "`shuffle=True` was set, but the datapipe does not contain a `Shuffler`. Adding one at the end. " - "Be aware that the default buffer size might not be sufficient for your task." + "Be aware that the default buffer size might not be sufficient for your task.", + stacklevel=2, ) datapipe = datapipe.shuffle() shufflers = [ diff --git a/torch/utils/file_baton.py b/torch/utils/file_baton.py index c7ce437ab9b..3d51d9efb33 100644 --- a/torch/utils/file_baton.py +++ b/torch/utils/file_baton.py @@ -53,7 +53,7 @@ class FileBaton: if self.warn_after_seconds is not None: if time.time() - start_time > self.warn_after_seconds and not has_warned: warnings.warn(f'Waited on lock file "{self.lock_file_path}" for ' - f'{self.warn_after_seconds} seconds.') + f'{self.warn_after_seconds} seconds.', stacklevel=2) has_warned = True def release(self): diff --git a/torch/utils/hooks.py b/torch/utils/hooks.py index 9ee3dbe18e9..157a5f4fb22 100644 --- a/torch/utils/hooks.py +++ b/torch/utils/hooks.py @@ -88,7 +88,7 @@ def warn_if_has_hooks(tensor): warnings.warn(f"backward hook {repr(hook)} on tensor will not be " "serialized. If this is expected, you can " "decorate the function with @torch.utils.hooks.unserializable_hook " - "to suppress this warning") + "to suppress this warning", stacklevel=2) class BackwardHook: """ diff --git a/torch/utils/model_dump/__init__.py b/torch/utils/model_dump/__init__.py index 9b39c303ac3..2ba3ea36088 100644 --- a/torch/utils/model_dump/__init__.py +++ b/torch/utils/model_dump/__init__.py @@ -429,7 +429,7 @@ def get_info_and_burn_skeleton(path_or_bytesio, **kwargs): def main(argv, *, stdout=None): - warnings.warn("torch.utils.model_dump is deprecated and will be removed in a future PyTorch release.") + warnings.warn("torch.utils.model_dump is deprecated and will be removed in a future PyTorch release.", stacklevel=2) parser = argparse.ArgumentParser() parser.add_argument("--style", choices=["json", "html"]) parser.add_argument("--title")