Initial implementation of AdaRound (#126153)
Summary: This is an implementation of AdaRound from the paper https://arxiv.org/abs/2004.10568. The algorithm will be used by multiple teams, so we are making it an official implementation. Differential Revision: D57227565 Pull Request resolved: https://github.com/pytorch/pytorch/pull/126153 Approved by: https://github.com/jerryzh168, https://github.com/huydhn
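
For context, a minimal usage sketch pieced together from the test added in this PR. AdaptiveRoundingOptimizer and its arguments come from the diff below; the toy Sequential model, calibration data, and lambda callback are illustrative, not part of the diff.

import torch
import torch.nn as nn
from torch.ao.quantization.experimental.adaround_optimization import (
    AdaptiveRoundingOptimizer,
)

def forward_wrapper(fetcher):
    # Forward hook that records a layer's input and output tensors into `fetcher`.
    def forward(module, input, output):
        fetcher.append(input[0].detach())
        fetcher.append(output.detach())
    return forward

model = nn.Sequential(nn.Linear(3, 4), nn.Linear(4, 5))  # toy float model
calib_data = [torch.rand(10, 3) for _ in range(50)]      # calibration batches

optimizer = AdaptiveRoundingOptimizer(
    model,
    lambda m, batch: m(batch),  # callback that runs one forward pass
    forward_wrapper,
    calib_data,
    max_iter=100,
    batch_size=10,
)
adarounded_model = optimizer.run_adaround()  # deep copy with adarounded weights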
This commit is contained in:
parent 875221dedf
commit eb0b16db92

test/quantization/core/experimental/test_adaround_eager.py (new file, 118 lines)
@@ -0,0 +1,118 @@
# Owner(s): ["oncall: speech_infra"]

import copy

import torch
import torch.nn as nn
from torch.ao.quantization.experimental.adaround_optimization import (
    AdaptiveRoundingOptimizer,
)

from torch.nn import functional as F
from torch.quantization.observer import MinMaxObserver
from torch.testing._internal.common_quantization import QuantizationTestCase


def forward_wrapper(fetcher):
    def forward(module, input, output):
        fetcher.append(input[0].detach())
        fetcher.append(output.detach())

    return forward


class TestAdaround(QuantizationTestCase):
    def feedforward_callback(
        self,
        model,
        data,
    ) -> None:
        model(data)

    def run_adaround(self, model, img_data):
        adaround_optimizer = AdaptiveRoundingOptimizer(
            model,
            self.feedforward_callback,
            forward_wrapper,
            img_data,
            max_iter=100,
            batch_size=10,
        )
        adarounded_model = adaround_optimizer.run_adaround()
        return adarounded_model

    def get_fake_quant(self, model):
        hard_fake_quant_model = copy.deepcopy(model)
        for _, module in hard_fake_quant_model.named_modules():
            if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
                weight_observer = MinMaxObserver(
                    quant_min=-128,
                    quant_max=127,
                    dtype=torch.qint8,
                    qscheme=torch.per_tensor_symmetric,
                )
                weight_observer(module.weight)
                scale, zero_point = weight_observer.calculate_qparams()
                fake_quant_module = torch.fake_quantize_per_tensor_affine(
                    module.weight,
                    scale=scale,
                    zero_point=zero_point,
                    quant_min=-128,
                    quant_max=127,
                )
                module.weight.data.copy_(fake_quant_module)
        return hard_fake_quant_model

    def test_linear_chain(self):
        class LinearChain(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear1 = nn.Linear(3, 4)
                self.linear2 = nn.Linear(4, 5)
                self.linear3 = nn.Linear(5, 6)

            def forward(self, x):
                x = self.linear1(x)
                x = self.linear2(x)
                x = self.linear3(x)
                return x

        float_model = LinearChain()
        img_data = [torch.rand(10, 3, dtype=torch.float) for _ in range(50)]
        adarounded_model = self.run_adaround(float_model, img_data)
        fq_model = self.get_fake_quant(float_model)
        rand_input = torch.rand(10, 3)
        with torch.no_grad():
            ada_out = adarounded_model(rand_input)
            fq_out = fq_model(rand_input)
            float_out = float_model(rand_input)
            ada_loss = F.mse_loss(ada_out, float_out)
            fq_loss = F.mse_loss(fq_out, float_out)
            self.assertTrue(ada_loss.item() < fq_loss.item())

    def test_conv_chain(self):
        class ConvChain(nn.Module):
            def __init__(self):
                super().__init__()
                self.conv2d1 = nn.Conv2d(3, 4, 5, 5)
                self.conv2d2 = nn.Conv2d(4, 5, 5, 5)
                self.conv2d3 = nn.Conv2d(5, 6, 5, 5)

            def forward(self, x):
                x = self.conv2d1(x)
                x = self.conv2d2(x)
                x = self.conv2d3(x)
                return x

        float_model = ConvChain()
        img_data = [torch.rand(10, 3, 125, 125, dtype=torch.float) for _ in range(50)]
        adarounded_model = self.run_adaround(float_model, img_data)
        fq_model = self.get_fake_quant(float_model)
        rand_input = torch.rand(10, 3, 256, 256)
        with torch.no_grad():
            ada_out = adarounded_model(rand_input)
            fq_out = fq_model(rand_input)
            float_out = float_model(rand_input)
            ada_loss = F.mse_loss(ada_out, float_out)
            fq_loss = F.mse_loss(fq_out, float_out)
            self.assertTrue(ada_loss.item() < fq_loss.item())

torch/ao/quantization/experimental/adaround_fake_quantize.py (new file, 148 lines)
@@ -0,0 +1,148 @@
from typing import Tuple

import torch
from torch.ao.quantization.fake_quantize import _is_symmetric_quant
from torch.ao.quantization.utils import is_per_tensor
from torch.quantization import FakeQuantize
from torch.quantization.observer import MinMaxObserver


class AdaroundFakeQuantizer(FakeQuantize):
    """
    A FakeQuantizer that performs adaptive rounding of weights.
    AdaRound is a technique to adaptively round weights, derived from the paper https://arxiv.org/pdf/2004.10568.pdf
    For HTP compatibility, we target symmetric quantization.
    """

    scale: torch.Tensor
    zero_point: torch.Tensor
    V: torch.nn.Parameter

    # pyre-fixme[3]: Return type must be annotated.
    def __init__(
        self,
        observer=MinMaxObserver,
        qscheme=torch.per_tensor_symmetric,  # not used, but needed for fakequant
        quant_min: int = -128,
        quant_max: int = 127,
        ch_axis: int = 0,
        # pyre-fixme[2]: Parameter must be annotated.
        **observer_kwargs,
    ):
        super().__init__(
            observer=observer,
            qscheme=qscheme,
            quant_min=quant_min,
            quant_max=quant_max,
            is_dynamic=False,
            **observer_kwargs,
        )
        # Populate quant_min/quant_max to observer_kwargs if valid
        if quant_min is not None and quant_max is not None:
            assert (
                quant_min <= quant_max
            ), "quant_min must be less than or equal to quant_max"
        # pyre-fixme[4]: Attribute must be annotated.
        self.qscheme = qscheme
        self.is_per_tensor: bool = is_per_tensor(qscheme)
        self.is_symmetric: bool = _is_symmetric_quant(qscheme)
        assert self.is_symmetric, "Only symmetric quantization is supported"
        self.ch_axis: int = ch_axis

        self.scale = torch.tensor([], requires_grad=False)
        self.zero_point = torch.tensor([], requires_grad=False)
        self.V = torch.nn.Parameter(torch.tensor([]), requires_grad=True)
        # Fixed stretch parameters
        self.zeta: torch.Tensor = torch.tensor(1.1, requires_grad=False)
        self.gamma: torch.Tensor = torch.tensor(-0.1, requires_grad=False)
        self.sigmoid = torch.nn.Sigmoid()
        self.use_soft_rounding = True

    @torch.jit.export
    def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.scale, self.zero_point

    @torch.jit.export
    def extra_repr(self) -> str:
        return (
            f"fake_quant_enabled={self.fake_quant_enabled}, observer_enabled={self.observer_enabled}, "
            f"quant_min={self.activation_post_process.quant_min}, quant_max={self.activation_post_process.quant_max}, "
            f"dtype={self.dtype}, qscheme={self.qscheme}, ch_axis={self.ch_axis}, "
            f"scale={self.scale}, zero_point={self.zero_point}, (self.V >= 0).int().sum()={(self.V >= 0).int().sum()}"
        )

    def enable_weight_fake_quant(self) -> None:
        self.fake_quant_enabled[0] = 1

    def get_rectified_sigmoid_func(self) -> torch.Tensor:
        if self.use_soft_rounding:
            return torch.clamp(
                self.sigmoid(self.V) * (self.zeta - self.gamma) + self.gamma,
                min=0,
                max=1,
            )
        else:
            # This will dump a binary solution
            return (self.V >= 0).int()

    @torch.jit.ignore
    def update_scale(
        self, X: torch.Tensor, _scale: torch.Tensor, _zero_point: torch.Tensor
    ) -> None:
        if self.scale.numel() == 0:
            self.scale.data = _scale.to(X.device)
            self.zero_point = _zero_point.to(X.device)
        else:
            self.scale.data = _scale
            if not self.is_symmetric:
                self.zero_point = _zero_point
            else:
                self.zero_point = torch.zeros_like(_zero_point)
                for i in range(X.dim()):
                    if i == self.ch_axis:
                        continue
                    self.zero_point = self.zero_point.unsqueeze(i)
        X_q = X / self.scale
        X_q_floor = torch.floor(X_q)
        residual = X_q - X_q_floor  # [0, 1)
        assert torch.all(
            torch.ge(residual, 0)
        ), "residual should be non-negative [0, 1)"
        V_init = -torch.log((self.zeta - self.gamma) / (residual - self.gamma) - 1)
        self.V.data = V_init

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        if self.observer_enabled[0] == 1:
            X_detached = X.detach()
            self.activation_post_process(X_detached)
            _scale, _zero_point = self.activation_post_process.calculate_qparams()
            _scale, _zero_point = _scale.to(self.scale.device), _zero_point.to(
                self.zero_point.device
            )
            dims = list(range(X.dim()))
            if not self.is_per_tensor:
                dims.remove(self.ch_axis)
            if not self.is_per_tensor:
                for i in range(X.dim()):
                    if i == self.ch_axis:
                        continue
                    _scale = _scale.unsqueeze(i)
                    _zero_point = _zero_point.unsqueeze(i)
            self.update_scale(X_detached, _scale, _zero_point)

        if self.fake_quant_enabled[0] == 1:
            # Perform soft quantization
            # See equation (23) in the AdaRound paper
            h_v = self.get_rectified_sigmoid_func()
            X_q = X / self.scale
            # Straight-Through Estimator for the floor function
            X_q_floor = torch.floor(X_q) + self.zero_point
            # Regardless of rounding, gradient should be able to flow back to self.V from X_q_dq.
            # With adaround, we don't train weight, but train V only.
            X_q_dq = (
                torch.clamp(X_q_floor + h_v, min=self.quant_min, max=self.quant_max)
                - self.zero_point
            ) * self.scale
            return X_q_dq
        else:
            return X
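
A short aside on the fake quantizer above: get_rectified_sigmoid_func computes the rectified sigmoid h(V) = clamp(sigmoid(V) * (zeta - gamma) + gamma, 0, 1) from eq. (23) of the paper. The sketch below (not part of the diff) just evaluates it numerically to show that the stretch parameters zeta=1.1 and gamma=-0.1 make h saturate at exactly 0 and 1 for large |V|.

# Not part of the diff: numerical check of the rectified sigmoid h(V)
# implemented by get_rectified_sigmoid_func(), using the fixed stretch
# parameters zeta=1.1 and gamma=-0.1 from above.
import torch

zeta, gamma = 1.1, -0.1
V = torch.linspace(-6.0, 6.0, steps=5)
h_v = torch.clamp(torch.sigmoid(V) * (zeta - gamma) + gamma, min=0, max=1)
print(h_v)  # hits exactly 0 and 1 at the ends, smooth in between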

torch/ao/quantization/experimental/adaround_loss.py (new file, 96 lines)
@@ -0,0 +1,96 @@
from typing import Tuple

import numpy as np
import torch
from torch.nn import functional as F

ADAROUND_ZETA: float = 1.1
ADAROUND_GAMMA: float = -0.1


class AdaptiveRoundingLoss(torch.nn.Module):
    """
    Adaptive rounding loss functions described in https://arxiv.org/pdf/2004.10568.pdf
    The rounding regularization is eq. [24]; the reconstruction loss is eq. [25] without the regularization term.
    """

    def __init__(
        self,
        max_iter: int,
        warm_start: float = 0.2,
        beta_range: Tuple[int, int] = (20, 2),
        reg_param: float = 0.001,
    ) -> None:
        super().__init__()
        self.max_iter = max_iter
        self.warm_start = warm_start
        self.beta_range = beta_range
        self.reg_param = reg_param

    def rounding_regularization(
        self,
        V: torch.Tensor,
        curr_iter: int,
    ) -> torch.Tensor:
        """
        Main logic adapted from the official AdaRound implementation.
        Apply rounding regularization to the input tensor V.
        """
        assert (
            curr_iter < self.max_iter
        ), "Current iteration must be strictly less than max_iter"
        if curr_iter < self.warm_start * self.max_iter:
            return torch.tensor(0.0)
        else:
            start_beta, end_beta = self.beta_range
            warm_start_end_iter = self.warm_start * self.max_iter

            # Compute the relative progress of the current iteration
            rel_iter = (curr_iter - warm_start_end_iter) / (
                self.max_iter - warm_start_end_iter
            )
            beta = end_beta + 0.5 * (start_beta - end_beta) * (
                1 + np.cos(rel_iter * np.pi)
            )

            # A rectified sigmoid for soft quantization, as formulated in eq. [23] of https://arxiv.org/pdf/2004.10568.pdf
            h_alpha = torch.clamp(
                torch.sigmoid(V) * (ADAROUND_ZETA - ADAROUND_GAMMA) + ADAROUND_GAMMA,
                min=0,
                max=1,
            )

            # Apply rounding regularization.
            # This term pushes the soft rounding variable toward a binary solution (0 or 1) by the end of optimization.
            inner_term = torch.add(2 * h_alpha, -1).abs().pow(beta)
            regularization_term = torch.add(1, -inner_term).sum()
            return regularization_term * self.reg_param

    def reconstruction_loss(
        self,
        soft_quantized_output: torch.Tensor,
        original_output: torch.Tensor,
    ) -> torch.Tensor:
        """
        Compute the reconstruction loss between the soft quantized output and the original output.
        """
        return F.mse_loss(
            soft_quantized_output, original_output, reduction="none"
        ).mean()

    def forward(
        self,
        soft_quantized_output: torch.Tensor,
        original_output: torch.Tensor,
        V: torch.Tensor,
        curr_iter: int,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Compute the asymmetric reconstruction formulation as in eq. [25].
        """
        regularization_term = self.rounding_regularization(V, curr_iter)
        reconstruction_term = self.reconstruction_loss(
            soft_quantized_output, original_output
        )
        return regularization_term, reconstruction_term
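
A short aside on the loss above: during the warm start (the first 20% of iterations by default) the regularization term is zero; afterwards beta is annealed with a cosine schedule from beta_range[0] down to beta_range[1]. The sketch below (not part of the diff) traces that schedule for an illustrative max_iter=100 with the default warm_start=0.2 and beta_range=(20, 2).

# Not part of the diff: trace the beta annealing schedule used by
# AdaptiveRoundingLoss.rounding_regularization, assuming max_iter=100 and the
# default warm_start=0.2 and beta_range=(20, 2).
import numpy as np

max_iter, warm_start = 100, 0.2
start_beta, end_beta = 20, 2
warm_end = warm_start * max_iter
for it in (20, 50, 99):  # iterations at and after the end of the warm start
    rel_iter = (it - warm_end) / (max_iter - warm_end)
    beta = end_beta + 0.5 * (start_beta - end_beta) * (1 + np.cos(rel_iter * np.pi))
    print(it, round(float(beta), 2))  # 20 -> 20.0, 50 -> ~14.4, 99 -> ~2.0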

torch/ao/quantization/experimental/adaround_optimization.py (new file, 238 lines)
@@ -0,0 +1,238 @@
import copy
import logging
from typing import Any, Callable, List, Optional, Tuple, Type, Union

import torch
from torch.ao.quantization.experimental.adaround_fake_quantize import (
    AdaroundFakeQuantizer,
)
from torch.ao.quantization.experimental.adaround_loss import AdaptiveRoundingLoss
from torch.ao.quantization.observer import MinMaxObserver
from torch.nn import functional as F
from torch.nn.parallel import DataParallel
from torch.utils.data import DataLoader, TensorDataset

logger: logging.Logger = logging.getLogger(__name__)


class AdaptiveRoundingOptimizer:
    def __init__(
        self,
        model: Union[torch.nn.Module, torch.nn.DataParallel],
        callback: Callable[[torch.nn.Module, List[Any]], None],
        forward_hook_wrapper: Callable[[List[torch.Tensor]], Callable],
        data: List[Any],
        observer: Type[torch.ao.quantization.observer.ObserverBase] = MinMaxObserver,
        max_iter=10000,
        dtype: torch.dtype = torch.qint8,
        quant_min=-128,
        quant_max=127,
        qscheme: torch.qscheme = torch.per_tensor_symmetric,
        batch_size: int = 256,
    ):
        self.model = model
        self.q_model = copy.deepcopy(self.model)
        self.device = torch.device("cuda") if torch.cuda.is_available() else None
        self.callback = callback
        self.forward_hook_wrapper = forward_hook_wrapper
        # TODO: rather than taking data as a list, accept an *iterator* instead
        self.data = data
        self.batch_size = min(batch_size, len(data))
        self.max_iter = max_iter
        self.adaptive_round_loss_fn = AdaptiveRoundingLoss(
            max_iter=self.max_iter, warm_start=0.2
        )
        self.dtype = dtype
        self.observer = observer
        self.quant_min = quant_min
        self.quant_max = quant_max
        self.qscheme = qscheme

    def run_adaround(self) -> torch.nn.Module:
        layer_list: List[Tuple[str, torch.nn.Module, torch.nn.Module]] = []
        for (name, module), q_module in zip(
            self.model.named_modules(), self.q_model.modules()
        ):
            if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)):
                # Knowing the activation ahead of time would be helpful for the asymmetric
                # formulation, but this is challenging in eager mode, unlike with graph modules.
                layer_list.append((name, module, q_module))
        logger.info(f"Total number of layers : {len(layer_list)}")  # noqa: G004

        for name, module, q_module in layer_list:
            logger.info(
                f"Kick start adaptive rounding on {name} module {module}"  # noqa: G004
            )
            self.optimize_adaptive_rounding(
                module,
                q_module,
                None,
            )

        return (
            self.q_model.module
            if isinstance(self.q_model, DataParallel)
            else self.q_model
        )

    def get_data_inp_out(
        self, module: torch.nn.Module, q_module: torch.nn.Module, data: List[Any]
    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]:
        fp_out: List[torch.Tensor] = []
        q_input: List[torch.Tensor] = []
        fp_input: List[torch.Tensor] = []
        fp32_fetcher: List[torch.Tensor] = []
        quant_fetcher: List[torch.Tensor] = []
        handler1 = module.register_forward_hook(self.forward_hook_wrapper(fp32_fetcher))
        handler2 = q_module.register_forward_hook(
            self.forward_hook_wrapper(quant_fetcher)
        )
        for data_ in data:
            with torch.no_grad():
                self.callback(self.model, data_)
                self.callback(self.q_model, data_)
                fp32_output = fp32_fetcher[1]
                quant_input = quant_fetcher[0]
                fp_out.append(fp32_output)
                q_input.append(quant_input)
                fp_input.append(fp32_fetcher[0])
        handler1.remove()
        handler2.remove()
        return q_input, fp_out, fp_input

    @torch.no_grad()
    def feed_forward(self, x, weight, module):
        if isinstance(module, torch.nn.Conv1d):
            out = torch.nn.functional.conv1d(
                x,
                weight,
                stride=module.stride,
                padding=module.padding,
                dilation=module.dilation,
                groups=module.groups,
            )
        elif isinstance(module, torch.nn.Linear):
            out = torch.nn.functional.linear(
                x,
                weight,
                bias=module.bias,
            )
        else:
            raise NotImplementedError
        return out

    def _compute_and_display_local_losses(
        self,
        ada_quantizer: AdaroundFakeQuantizer,
        q_module: torch.nn.Module,
        q_inp: torch.Tensor,
        fp_out: torch.Tensor,
    ):
        with torch.no_grad():
            ada_quantizer.use_soft_rounding = False
            q_w_hard_round = ada_quantizer(q_module.weight)
            out_hard_quant = self.feed_forward(q_inp, q_w_hard_round, q_module)
            ada_quantizer.use_soft_rounding = True
            q_w_soft_round = ada_quantizer(q_module.weight)
            out_soft_quant = self.feed_forward(q_inp, q_w_soft_round, q_module)
            soft_quant_loss = F.mse_loss(out_soft_quant, fp_out)
            hard_quant_loss = F.mse_loss(out_hard_quant, fp_out)
            logger.info(
                f"soft quant loss: {soft_quant_loss.item()} hard quant loss: {hard_quant_loss.item()}"  # noqa: G004
            )

    def optimize_adaptive_rounding(
        self,
        module: torch.nn.Module,
        q_module: torch.nn.Module,
        activation: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
    ) -> None:
        ada_quantizer = AdaroundFakeQuantizer(
            dtype=self.dtype,
            observer=self.observer,
            qscheme=self.qscheme,
            quant_min=self.quant_min,
            quant_max=self.quant_max,
            reduce_range=False,
        )
        ada_quantizer.enable_observer()
        ada_quantizer(q_module.weight)
        ada_quantizer.disable_observer()
        ada_quantizer.enable_fake_quant()
        optimizer = torch.optim.Adam([ada_quantizer.V])
        inp, out, fp_in = self.get_data_inp_out(module, q_module, self.data)

        logger.info("==================== Before adaround ====================")
        test_in, test_out, fp_test_in = self.get_data_inp_out(
            module, q_module, self.data[0]
        )

        assert (
            torch.abs(test_out[0] - module(fp_test_in[0])).sum().item() == 0
        ), "In-place activation detected; please do not use in-place activations"
        # Stack the tensors in the inp and out lists into single tensors
        inp_tensor = torch.vstack(inp)
        out_tensor = torch.vstack(out)
        dataset = TensorDataset(inp_tensor, out_tensor)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self._compute_and_display_local_losses(
            ada_quantizer, q_module, test_in[0], test_out[0]
        )
        global_idx = 0
        one_iter = len(out) // self.batch_size
        for iteration in range(self.max_iter // one_iter):
            reconstruction_loss = regularization_loss = torch.tensor(0)
            for q_inp, fp_out in dataloader:
                optimizer.zero_grad()
                q_weight = ada_quantizer(q_module.weight)
                if isinstance(module, torch.nn.Conv1d):
                    q_out = torch.nn.functional.conv1d(
                        q_inp,
                        q_weight,
                        stride=q_module.stride,
                        padding=q_module.padding,
                        dilation=q_module.dilation,
                        groups=q_module.groups,
                    )
                elif isinstance(q_module, torch.nn.Linear):
                    q_out = torch.nn.functional.linear(
                        q_inp,
                        q_weight,
                        bias=q_module.bias,
                    )
                else:
                    raise NotImplementedError
                regularization_loss, reconstruction_loss = self.adaptive_round_loss_fn(
                    fp_out,
                    q_out,
                    ada_quantizer.V,
                    curr_iter=global_idx,
                )
                loss = regularization_loss + reconstruction_loss
                loss.backward()
                optimizer.step()
                global_idx += 1
                if global_idx >= self.max_iter:
                    break
            if global_idx >= self.max_iter:
                break
            if iteration % 30 == 0:
                logger.info(
                    f"glob iter {global_idx} regularization_loss {regularization_loss.item()} "  # noqa: G004
                    f"reconstruction_loss {reconstruction_loss.item()}"  # noqa: G004
                )
        logger.info("==================== After adaround ====================")
        self._compute_and_display_local_losses(
            ada_quantizer, q_module, test_in[0], test_out[0]
        )

        ada_quantizer.use_soft_rounding = True
        ada_quantizer.V.requires_grad = False
        ada_quantizer = ada_quantizer.eval()
        q_weight = ada_quantizer(q_module.weight)
        # At the end of optimization, we need to copy the adarounded weight back to the original module
        q_module.weight.data.copy_(q_weight)
        # Eager mode requires observer to be set as "weight_fake_quant" to be parsed
        q_module.weight_fake_quant = ada_quantizer.activation_post_process