Revert "Add label_smoothing param in nn.BCELoss and nn.BCEWithLogitsLoss (#150282)"

This reverts commit f990490a23.

Reverted https://github.com/pytorch/pytorch/pull/150282 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/150282#issuecomment-3182844949))
PyTorch MergeBot committed 2025-08-13 09:01:52 +00:00
parent 6e8865fbc1
commit 641ee74781
5 changed files with 11 additions and 62 deletions
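For context, the reverted feature smoothed binary targets before the loss was computed. A minimal standalone sketch of that transform, using the exact formula from the removed code below (the helper name is ours, not part of this diff):

```python
import torch

def smooth_binary_targets(target: torch.Tensor, label_smoothing: float) -> torch.Tensor:
    # Mix 0/1 ground truth toward 0.5: 1 -> 1 - label_smoothing, 0 -> label_smoothing.
    assert 0 <= label_smoothing <= 1, "label_smoothing must be in [0.0, 1.0]"
    return target * (1 - label_smoothing) + (1 - target) * label_smoothing

target = torch.tensor([0.0, 1.0, 1.0])
print(smooth_binary_targets(target, 0.15))  # tensor([0.1500, 0.8500, 0.8500])
```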

View File

@@ -3472,7 +3472,6 @@ def binary_cross_entropy(
     size_average: Optional[bool] = None,
     reduce: Optional[bool] = None,
     reduction: str = "mean",
-    label_smoothing: float = 0.0,
 ) -> Tensor:
     r"""Compute Binary Cross Entropy between the target and input probabilities.
@@ -3491,11 +3490,9 @@ def binary_cross_entropy(
             elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
             and :attr:`reduce` are in the process of being deprecated, and in the meantime,
             specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
-        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
-            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
-            become a mixture of the original ground truth and a uniform distribution as described in
-            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
 
     Examples::
 
         >>> input = torch.randn(3, 2, requires_grad=True)
         >>> target = torch.rand(3, 2, requires_grad=False)
         >>> loss = F.binary_cross_entropy(torch.sigmoid(input), target)
@@ -3511,7 +3508,6 @@ def binary_cross_entropy(
             size_average=size_average,
             reduce=reduce,
             reduction=reduction,
-            label_smoothing=label_smoothing,
         )
     if size_average is not None or reduce is not None:
         reduction_enum = _Reduction.legacy_get_enum(size_average, reduce)
@@ -3527,13 +3523,6 @@ def binary_cross_entropy(
         new_size = _infer_size(target.size(), weight.size())
         weight = weight.expand(new_size)
 
-    assert 0 <= label_smoothing <= 1, (
-        f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
-    )
-    if label_smoothing > 0:
-        target = target * (1 - label_smoothing) + (1 - target) * label_smoothing
-
     return torch._C._nn.binary_cross_entropy(input, target, weight, reduction_enum)
@@ -3545,7 +3534,6 @@ def binary_cross_entropy_with_logits(
     reduce: Optional[bool] = None,
     reduction: str = "mean",
     pos_weight: Optional[Tensor] = None,
-    label_smoothing: float = 0.0,
 ) -> Tensor:
     r"""Compute Binary Cross Entropy between target and input logits.
@@ -3572,11 +3560,9 @@ def binary_cross_entropy_with_logits(
             [C, H, W] the same pos_weights across the batch. To apply the same positive weight
             along all spatial dimensions for a 2D multi-class target [C, H, W] use: [C, 1, 1].
             Default: ``None``
-        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
-            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
-            become a mixture of the original ground truth and a uniform distribution as described in
-            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
 
     Examples::
 
         >>> input = torch.randn(3, requires_grad=True)
         >>> target = torch.empty(3).random_(2)
         >>> loss = F.binary_cross_entropy_with_logits(input, target)
@@ -3593,7 +3579,6 @@ def binary_cross_entropy_with_logits(
             reduce=reduce,
             reduction=reduction,
             pos_weight=pos_weight,
-            label_smoothing=label_smoothing,
         )
     if size_average is not None or reduce is not None:
         reduction_enum = _Reduction.legacy_get_enum(size_average, reduce)
@@ -3605,13 +3590,6 @@ def binary_cross_entropy_with_logits(
             f"Target size ({target.size()}) must be the same as input size ({input.size()})"
         )
 
-    assert 0 <= label_smoothing <= 1, (
-        f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
-    )
-    if label_smoothing > 0:
-        target = target * (1 - label_smoothing) + (1 - target) * label_smoothing
-
     return torch.binary_cross_entropy_with_logits(
         input, target, weight, pos_weight, reduction_enum
     )
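With the revert applied, neither functional accepts `label_smoothing` any more; callers who relied on it can pre-mix the targets themselves with the same formula. A usage sketch (the variable names are illustrative):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(3, requires_grad=True)
target = torch.empty(3).random_(2)

# Post-revert equivalent of the removed label_smoothing=0.15 argument.
ls = 0.15
smoothed = target * (1 - ls) + (1 - target) * ls

loss = F.binary_cross_entropy_with_logits(logits, smoothed)
loss.backward()
```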

View File

@@ -134,7 +134,6 @@ def binary_cross_entropy_with_logits(
     reduce: bool | None = ...,
     reduction: str = ...,
     pos_weight: Tensor | None = ...,
-    label_smoothing: float = ...,
 ) -> Tensor: ...
 
 __all__ += ["binary_cross_entropy_with_logits"]
@@ -146,7 +145,6 @@ def binary_cross_entropy(
     size_average: bool | None = ...,
     reduce: bool | None = ...,
     reduction: str = ...,
-    label_smoothing: float = ...,
 ) -> Tensor: ...
 
 __all__ += ["binary_cross_entropy"]

View File

@@ -692,10 +692,6 @@ class BCELoss(_WeightedLoss):
             elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
             and :attr:`reduce` are in the process of being deprecated, and in the meantime,
             specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
-        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
-            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
-            become a mixture of the original ground truth and a uniform distribution as described in
-            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
 
     Shape:
         - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
@@ -721,21 +717,15 @@ class BCELoss(_WeightedLoss):
         size_average=None,
         reduce=None,
         reduction: str = "mean",
-        label_smoothing: float = 0.0,
     ) -> None:
         super().__init__(weight, size_average, reduce, reduction)
-        self.label_smoothing = label_smoothing
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         """
         Runs the forward pass.
         """
         return F.binary_cross_entropy(
-            input,
-            target,
-            weight=self.weight,
-            reduction=self.reduction,
-            label_smoothing=self.label_smoothing,
+            input, target, weight=self.weight, reduction=self.reduction
         )
@@ -825,10 +815,6 @@ class BCEWithLogitsLoss(_Loss):
             [C, H, W] the same pos_weights across the batch. To apply the same positive weight
             along all spatial dimensions for a 2D multi-class target [C, H, W] use: [C, 1, 1].
             Default: ``None``
-        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
-            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
-            become a mixture of the original ground truth and a uniform distribution as described in
-            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
 
     Shape:
         - Input: :math:`(*)`, where :math:`*` means any number of dimensions.
@@ -852,14 +838,12 @@ class BCEWithLogitsLoss(_Loss):
         reduce=None,
         reduction: str = "mean",
         pos_weight: Optional[Tensor] = None,
-        label_smoothing: float = 0.0,
     ) -> None:
         super().__init__(size_average, reduce, reduction)
         self.register_buffer("weight", weight)
         self.register_buffer("pos_weight", pos_weight)
         self.weight: Optional[Tensor]
         self.pos_weight: Optional[Tensor]
-        self.label_smoothing = label_smoothing
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         """Runs the forward pass."""
@@ -869,7 +853,6 @@ class BCEWithLogitsLoss(_Loss):
             self.weight,
             pos_weight=self.pos_weight,
             reduction=self.reduction,
-            label_smoothing=self.label_smoothing,
         )
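After this revert, both modules are back to their pre-#150282 constructor signatures. A minimal usage sketch of the restored API:

```python
import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss(pos_weight=torch.ones(3))
logits = torch.randn(2, 3, requires_grad=True)
target = torch.empty(2, 3).random_(2)

criterion(logits, target).backward()  # no label_smoothing kwarg any more
```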

View File

@@ -488,7 +488,7 @@ def get_testing_overrides() -> dict[Callable, Callable]:
         torch.bernoulli: lambda input, generator=None, out=None: -1,
         torch.bilinear: lambda input1, input2, weight, bias: -1,
         torch.binary_cross_entropy_with_logits: (
-            lambda input, target, weight=None, size_average=None, reduce=None, reduction="mean", pos_weight=None, label_smoothing=0.0: -1  # noqa: B950
+            lambda input, target, weight=None, size_average=None, reduce=None, reduction="mean", pos_weight=None: -1
         ),
         torch.bincount: lambda input, weights=None, minlength=0: -1,
         torch.binomial: lambda count, prob, generator=None: -1,
@@ -851,10 +851,10 @@ def get_testing_overrides() -> dict[Callable, Callable]:
         ),
         torch.nn.functional.bilinear: lambda input1, input2, weight, bias=None: -1,
         torch.nn.functional.binary_cross_entropy: (
-            lambda input, target, weight=None, size_average=None, reduce=None, reduction="mean", label_smoothing=0.0: -1
+            lambda input, target, weight=None, size_average=None, reduce=None, reduction="mean": -1
         ),
         torch.nn.functional.binary_cross_entropy_with_logits: (
-            lambda input, target, weight=None, size_average=None, reduce=None, reduction="mean", pos_weight=None, label_smoothing=0.0: -1  # noqa: B950
+            lambda input, target, weight=None, size_average=None, reduce=None, reduction="mean", pos_weight=None: -1
         ),
         torch.nn.functional.celu: lambda input, alpha=1.0, inplace=False: -1,
         torch.nn.functional.cosine_embedding_loss: (
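The lambdas in `get_testing_overrides` stand in for the real functions with matching parameter lists, which is why they must shed `label_smoothing` together with the implementations. A hedged sketch of that kind of consistency check (the expected name list is read off the lambda above):

```python
import inspect
import torch.nn.functional as F
from torch.overrides import get_testing_overrides

dummy = get_testing_overrides()[F.binary_cross_entropy]
# After the revert, the dummy's parameters again omit label_smoothing.
assert list(inspect.signature(dummy).parameters) == [
    "input", "target", "weight", "size_average", "reduce", "reduction",
]
```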

View File

@@ -1463,14 +1463,9 @@ def module_inputs_torch_nn_BCELoss(module_info, device, dtype, requires_grad, tr
         ('reduction_mean', {'reduction': 'mean'}),
         ('reduction_none', {'reduction': 'none'}),
         ('weights', {'weight': make_weight((10,))}),
-        ('label_smoothing', {'label_smoothing': 0.15}),
     ]
 
-    def bce_loss_reference_fn(m, p, i, t, reduction='mean', weight=None, label_smoothing=0.0):
-        assert 0 <= label_smoothing <= 1
-        if label_smoothing > 0:
-            t = t * (1 - label_smoothing) + (1 - t) * label_smoothing
-
+    def bce_loss_reference_fn(m, p, i, t, reduction='mean', weight=None):
         result = -(t * i.log() + (1 - t) * (1 - i).log())
         if weight is not None:
@@ -1516,15 +1511,10 @@ def module_inputs_torch_nn_BCEWithLogitsLoss(module_info, device, dtype, require
         ('reduction_mean', {'reduction': 'mean'}),
         ('reduction_none', {'reduction': 'none'}),
         ('weights', {'weight': make_weight((10,))}),
-        ('scalar_weights', {'weight': make_weight(())}),
-        ('label_smoothing', {'label_smoothing': 0.15}),
+        ('scalar_weights', {'weight': make_weight(())})
     ]
 
-    def bce_withlogitsloss_reference_fn(m, p, i, t, reduction='mean', weight=None, label_smoothing=0.0):
-        assert 0 <= label_smoothing <= 1
-        if label_smoothing > 0:
-            t = t * (1 - label_smoothing) + (1 - t) * label_smoothing
-
+    def bce_withlogitsloss_reference_fn(m, p, i, t, reduction='mean', weight=None):
         # TODO: add pos_weight to the definition here and corresponding SampleInputs
         max_val = (-i).clamp(min=0)
         result = (1 - t).mul_(i).add_(max_val).add_((-max_val).exp_().add_((-i - max_val).exp_()).log_())
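The reference function above computes BCE-with-logits in its numerically stable form. A small sanity check that this rewrite matches the textbook definition -(t*log(sigmoid(i)) + (1-t)*log(1-sigmoid(i))) — a sketch, independent of this diff:

```python
import torch

i = torch.randn(5)              # logits
t = torch.empty(5).random_(2)   # 0/1 targets

# Stable form used by the reference fn: (1-t)*i + max_val + log(e^-max_val + e^(-i-max_val)).
max_val = (-i).clamp(min=0)
stable = (1 - t) * i + max_val + ((-max_val).exp() + (-i - max_val).exp()).log()

# Naive form via sigmoid; it can overflow for large |i|, which is why the rewrite exists.
naive = -(t * torch.sigmoid(i).log() + (1 - t) * (1 - torch.sigmoid(i)).log())

assert torch.allclose(stable, naive, atol=1e-6)
```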