torch.quantization conversion utilities, observers for eager mode quantization (#22010)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/22010

torch.quantization module with observers and conversion routines

Reviewed By: zafartahirov
Differential Revision: D15554183
fbshipit-source-id: 05a3fabe28dd701978b8ecebf5bfc3a4c044ba5c
This commit is contained in:
parent 073fa6f411, commit 5040d52a5a
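The utilities and tests below implement a three-step eager-mode flow: `prepare` attaches observers (and quant/dequant stubs when a qconfig_dict is given), a calibration function runs representative data through the model, and `convert` swaps each observed module for its quantized counterpart. A minimal sketch of that flow, where `MyModel` and `calib_batches` are hypothetical stand-ins for a float model and its calibration inputs:

import torch
from torch.quantization import default_qconfig, default_eval_fn, prepare, convert

model = MyModel()                              # hypothetical float nn.Module
model = prepare(model, {'': default_qconfig})  # attach observers + quant/dequant stubs
default_eval_fn(model, calib_batches)          # calibration: run sample inputs through the model
convert(model)                                 # swap observed modules to nnq counterparts in place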
test/test_quantization.py (new file, 381 lines)

@@ -0,0 +1,381 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import torch
import torch.nn.quantized as nnq
import torch.quantization as tq
from torch.quantization import QuantWrapper, QuantStub, DeQuantStub, \
    default_eval_fn, QConfig, default_qconfig, default_observer, quantize, \
    prepare, convert

from common_utils import TestCase, run_tests

class SingleLayerLinearModel(torch.nn.Module):
    def __init__(self):
        super(SingleLayerLinearModel, self).__init__()
        self.fc1 = torch.nn.Linear(5, 5).to(dtype=torch.float)

    def forward(self, x):
        x = self.fc1(x)
        return x

class TwoLayerLinearModel(torch.nn.Module):
    def __init__(self):
        super(TwoLayerLinearModel, self).__init__()
        self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
        self.fc2 = torch.nn.Linear(8, 5).to(dtype=torch.float)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        return x

class LinearReluModel(torch.nn.Module):
    def __init__(self):
        super(LinearReluModel, self).__init__()
        self.fc = torch.nn.Linear(5, 5).to(dtype=torch.float)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc(x))
        return x

class NestedModel(torch.nn.Module):
    def __init__(self):
        super(NestedModel, self).__init__()
        self.sub1 = LinearReluModel()
        self.sub2 = TwoLayerLinearModel()
        self.fc3 = torch.nn.Linear(5, 5).to(dtype=torch.float)

    def forward(self, x):
        x = self.sub1(x)
        x = self.sub2(x)
        x = self.fc3(x)
        return x

class InnerModule(torch.nn.Module):
    def __init__(self):
        super(InnerModule, self).__init__()
        self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(8, 5).to(dtype=torch.float)

    def forward(self, x):
        return self.relu(self.fc2(self.relu(self.fc1(x))))

class WrappedModel(torch.nn.Module):
    def __init__(self):
        super(WrappedModel, self).__init__()
        self.qconfig = default_qconfig
        self.sub = QuantWrapper(InnerModule())
        self.fc = torch.nn.Linear(5, 5).to(dtype=torch.float)
        # don't quantize this fc
        self.fc.qconfig = None

    def forward(self, x):
        return self.fc(self.sub(x))

class ManualQuantModel(torch.nn.Module):
    r"""A Module with manually inserted `QuantStub` and `DeQuantStub`
    """
    def __init__(self):
        super(ManualQuantModel, self).__init__()
        self.qconfig = default_qconfig
        self.quant = QuantStub()
        self.dequant = DeQuantStub()
        self.fc = torch.nn.Linear(5, 5).to(dtype=torch.float)

    def forward(self, x):
        x = self.quant(x)
        x = self.fc(x)
        return self.dequant(x)

calib_data = [torch.rand(20, 5, dtype=torch.float) for _ in range(20)]

class ModelQuantizeAPITest(TestCase):

    def checkNoPrepModules(self, module):
        r"""Checks that the module does not contain child modules for
        quantization preparation, e.g. quant, dequant and observer
        """
        self.assertFalse(hasattr(module, 'quant'))
        self.assertFalse(hasattr(module, 'dequant'))

    def checkHasPrepModules(self, module):
        r"""Checks that the module contains child modules for quantization
        preparation, e.g. quant, dequant and observer
        """
        self.assertTrue(hasattr(module, 'module'))
        self.assertTrue(hasattr(module, 'quant'))
        self.assertTrue(hasattr(module, 'dequant'))

    def checkObservers(self, module):
        if hasattr(module, 'qconfig') and module.qconfig is not None and len(module._modules) == 0:
            self.assertTrue(hasattr(module, 'observer'))
        for child in module.children():
            self.checkObservers(child)

    def checkQuantDequant(self, mod):
        self.assertEqual(type(mod.quant), nnq.Quantize)
        self.assertEqual(type(mod.dequant), nnq.DeQuantize)

    def checkQuantizedLinear(self, mod):
        self.assertEqual(type(mod.module), nnq.Linear)
        self.assertEqual(mod.module.bias.dtype, torch.qint32)
        self.checkQuantDequant(mod)

    def checkLinear(self, mod):
        self.assertEqual(type(mod), torch.nn.Linear)

    def test_single_layer(self):
        r"""Quantize SingleLayerLinearModel, which has one Linear module, and
        make sure it is swapped to nnq.Linear, the quantized version of the module
        """
        model = SingleLayerLinearModel()
        qconfig_dict = {
            '': default_qconfig
        }
        model = prepare(model, qconfig_dict)
        # Check that observers and quant/dequant nodes are inserted
        self.checkNoPrepModules(model)
        self.checkHasPrepModules(model.fc1)
        self.checkObservers(model)

        default_eval_fn(model, calib_data)
        convert(model)

        def checkQuantized(model):
            self.checkNoPrepModules(model)
            self.checkHasPrepModules(model.fc1)
            self.checkQuantizedLinear(model.fc1)
            default_eval_fn(model, calib_data)

        checkQuantized(model)

        # test one line API
        model = quantize(SingleLayerLinearModel(), default_eval_fn, calib_data, qconfig_dict)
        checkQuantized(model)

    def test_two_layers(self):
        r"""TwoLayerLinearModel has two Linear modules, but we only quantize
        the second one, `fc2`; `fc1` is not quantized
        """
        model = TwoLayerLinearModel()
        qconfig_dict = {
            'fc2': default_qconfig
        }
        model = prepare(model, qconfig_dict)

        self.checkNoPrepModules(model)
        self.checkObservers(model)
        self.checkNoPrepModules(model.fc1)
        self.checkHasPrepModules(model.fc2)

        default_eval_fn(model, calib_data)
        convert(model)

        def checkQuantized(model):
            self.checkNoPrepModules(model)
            self.checkNoPrepModules(model.fc1)
            self.checkHasPrepModules(model.fc2)
            self.assertEqual(type(model.fc1), torch.nn.Linear)
            self.checkQuantizedLinear(model.fc2)
            default_eval_fn(model, calib_data)

        checkQuantized(model)

        # test one line API
        model = quantize(TwoLayerLinearModel(), default_eval_fn, calib_data, qconfig_dict)
        checkQuantized(model)

    def test_nested1(self):
        r"""Test quantization for a nested model: top-level `fc3` and `fc1` of
        submodule `sub2` are quantized; `sub2.fc2` is not quantized
        """
        model = NestedModel()
        qconfig_dict = {
            'fc3': default_qconfig,
            'sub2.fc1': default_qconfig
        }

        def checkPrepModules(model, before_calib=False):
            if before_calib:
                self.checkObservers(model)
            self.checkNoPrepModules(model)
            self.checkNoPrepModules(model.sub1)
            self.checkNoPrepModules(model.sub1.fc)
            self.checkNoPrepModules(model.sub1.relu)
            self.checkNoPrepModules(model.sub2)
            self.checkHasPrepModules(model.sub2.fc1)
            self.checkNoPrepModules(model.sub2.fc2)
            self.checkHasPrepModules(model.fc3)

        model = prepare(model, qconfig_dict)
        checkPrepModules(model, True)
        default_eval_fn(model, calib_data)
        convert(model)

        def checkQuantized(model):
            checkPrepModules(model)
            self.checkLinear(model.sub1.fc)
            self.checkQuantizedLinear(model.fc3)
            self.checkQuantizedLinear(model.sub2.fc1)
            self.checkLinear(model.sub2.fc2)
            default_eval_fn(model, calib_data)

        checkQuantized(model)

        # test one line API
        model = quantize(NestedModel(), default_eval_fn, calib_data, qconfig_dict)
        checkQuantized(model)

    def test_nested2(self):
        r"""Another test case for quantize: we quantize all submodules of
        submodule sub2. This introduces redundant quant/dequant; to remove
        them we need to manually use QuantWrapper or insert
        QuantStub/DeQuantStub, see `test_quant_wrapper` and `test_manual`
        """
        model = NestedModel()
        qconfig_dict = {
            'fc3': default_qconfig,
            'sub2': default_qconfig
        }
        model = prepare(model, qconfig_dict)

        def checkPrepModules(model, before_calib=False):
            if before_calib:
                self.checkObservers(model)
            self.checkNoPrepModules(model)
            self.checkNoPrepModules(model.sub1)
            self.checkNoPrepModules(model.sub1.fc)
            self.checkNoPrepModules(model.sub1.relu)
            self.checkNoPrepModules(model.sub2)
            self.checkHasPrepModules(model.sub2.fc1)
            self.checkHasPrepModules(model.sub2.fc2)
            self.checkHasPrepModules(model.fc3)

        checkPrepModules(model, True)

        default_eval_fn(model, calib_data)
        convert(model)

        def checkQuantized(model):
            checkPrepModules(model)
            self.checkLinear(model.sub1.fc)
            self.assertEqual(type(model.sub1.relu), torch.nn.ReLU)
            self.checkQuantizedLinear(model.sub2.fc1)
            self.checkQuantizedLinear(model.sub2.fc2)
            self.checkQuantizedLinear(model.fc3)
            default_eval_fn(model, calib_data)

        checkQuantized(model)

        # test one line API
        model = quantize(NestedModel(), default_eval_fn, calib_data, qconfig_dict)
        checkQuantized(model)

    def test_nested3(self):
        r"""More complicated nested test case where a child qconfig overrides
        the parent qconfig
        """
        model = NestedModel()
        custom_options = {
            'dtype': torch.quint8,
            'qscheme': torch.per_tensor_affine
        }
        custom_qconfig = QConfig(weight=default_observer(),
                                 activation=default_observer(**custom_options))
        qconfig_dict = {
            'fc3': default_qconfig,
            'sub2': default_qconfig,
            'sub2.fc1': custom_qconfig
        }
        model = prepare(model, qconfig_dict)

        def checkPrepModules(model, before_calib=False):
            if before_calib:
                self.checkObservers(model)
            self.checkNoPrepModules(model)
            self.checkNoPrepModules(model.sub1)
            self.checkNoPrepModules(model.sub1.fc)
            self.checkNoPrepModules(model.sub1.relu)
            self.checkNoPrepModules(model.sub2)
            self.checkHasPrepModules(model.sub2.fc1)
            self.checkHasPrepModules(model.sub2.fc2)
            self.checkHasPrepModules(model.fc3)

        checkPrepModules(model, True)

        default_eval_fn(model, calib_data)
        convert(model)

        def checkQuantized(model):
            checkPrepModules(model)
            self.checkQuantizedLinear(model.sub2.fc1)
            self.checkQuantizedLinear(model.sub2.fc2)
            self.checkQuantizedLinear(model.fc3)
            default_eval_fn(model, calib_data)

        checkQuantized(model)

        # test one line API
        model = quantize(NestedModel(), default_eval_fn, calib_data, qconfig_dict)
        checkQuantized(model)

    def test_quant_wrapper(self):
        r"""The user needs to modify the original code with QuantWrapper and
        call the quantization utility functions.
        """
        model = WrappedModel()

        # since we didn't provide a qconfig_dict, the model is modified in place,
        # but we can do `model = prepare(model)` as well
        prepare(model)
        self.checkObservers(model)

        default_eval_fn(model, calib_data)
        convert(model)

        def checkQuantized(model):
            self.checkLinear(model.fc)
            self.checkQuantDequant(model.sub)
            self.assertEqual(type(model.sub.module.fc1), nnq.Linear)
            self.assertEqual(type(model.sub.module.fc2), nnq.Linear)
            self.assertEqual(type(model.sub.module.relu), nnq.ReLU)
            default_eval_fn(model, calib_data)

        checkQuantized(model)

        # test one line API
        model = quantize(WrappedModel(), default_eval_fn, calib_data, {})
        checkQuantized(model)

    def test_manual(self):
        r"""The user inserts QuantStub and DeQuantStub in the model code and
        calls the quantization utility functions.
        """
        model = ManualQuantModel()
        # propagate the qconfig of parents to children, model is changed
        # in place
        prepare(model)
        self.checkObservers(model)

        default_eval_fn(model, calib_data)
        convert(model)

        def checkQuantized(model):
            self.assertEqual(type(model.fc), nnq.Linear)
            default_eval_fn(model, calib_data)

        checkQuantized(model)

        # test one line API
        model = quantize(ManualQuantModel(), default_eval_fn, calib_data)
        checkQuantized(model)

if __name__ == '__main__':
    run_tests()
@@ -4,9 +4,9 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 from .. import functional as F
-from ...modules.module import Module
+from ...modules.activation import ReLU as NNReLU
 
-class ReLU(Module):
+class ReLU(NNReLU):
     r"""Applies quantized rectified linear unit function element-wise:
 
     :math:`\text{ReLU}(x) = \max(x_0, x)`, where :math:`x_0` is the zero point.
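The `max(x_0, x)` form follows from affine quantization: a real value r is stored as q = round(r / scale) + zero_point, so real 0 corresponds to the integer zero point x_0, and clamping at x_0 in the quantized domain is exactly ReLU in the real domain. A small self-contained check with illustrative numbers (not taken from the PR):

scale, zero_point = 0.1, 128                 # illustrative quint8 qparams

def quantize_value(r):
    return round(r / scale) + zero_point

assert max(zero_point, quantize_value(-3.0)) == quantize_value(0.0)  # negatives clamp to the zero point
assert max(zero_point, quantize_value(2.5)) == quantize_value(2.5)   # positives pass through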
@@ -35,7 +35,9 @@ class Quantize(Module):
 
     @staticmethod
     def from_float(mod):
-        return Quantize(mod.qparams[0].item(), mod.qparams[1].item(), torch.quint8)
+        assert hasattr(mod, 'observer')
+        qparams = mod.observer.calculate_qparams()
+        return Quantize(qparams[0].item(), qparams[1].item(), mod.observer.dtype)
 
 class DeQuantize(Module):
     r"""Dequantizes an incoming tensor
@@ -136,3 +138,30 @@ class Linear(NNLinear):
         super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
                                       missing_keys, unexpected_keys, error_msgs)
         return
+
+    # TODO: support initializing from quantization parameters when Quantizer is
+    # exposed in python
+    @staticmethod
+    def from_float(mod):
+        r"""Create a quantized module from a float module or qparams_dict
+
+        Args: `mod` a float module, either produced by torch.quantization
+        utilities or provided directly by the user
+        """
+        assert type(mod) == NNLinear, 'nnq.Linear.from_float only works for nn.Linear'
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        assert hasattr(mod, 'observer'), 'Input float module must have observer attached'
+        activation_observer = mod.observer
+        act_qparams = activation_observer.calculate_qparams()
+        weight_observer = mod.qconfig.weight()
+        weight_observer(mod.weight)
+        wt_qparams = weight_observer.calculate_qparams()
+        bias_scale = (wt_qparams[0] * act_qparams[0]).float()
+        qweight = torch.quantize_linear(mod.weight.float(), wt_qparams[0], wt_qparams[1].long().item(), torch.qint8)
+        qbias = torch.quantize_linear(mod.bias.float(), bias_scale, 0, torch.qint32)
+        qlinear = Linear(mod.in_features, mod.out_features)
+        qlinear._packed_weight = torch.ops.quantized.fbgemm_linear_prepack(qweight)
+        qlinear.bias = qbias
+        qlinear.out_scale = torch.tensor([act_qparams[0]])
+        qlinear.out_zero_point = torch.tensor([act_qparams[1]])
+        return qlinear
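The `bias_scale = wt_qparams[0] * act_qparams[0]` line reflects how quantized GEMM accumulates: each int32 product q_w * q_x represents (q_w * s_w) * (q_x * s_x), so the accumulator, and hence the qint32 bias added into it, has effective scale s_w * s_x with zero point 0 (as the call above shows). A quick numeric check with illustrative scales (not from the PR):

s_w, s_x = 0.02, 0.1                      # illustrative weight and input scales
bias_real = 0.5
q_bias = round(bias_real / (s_w * s_x))   # quantize bias with scale s_w * s_x, zero_point 0
assert q_bias == 250 and abs(q_bias * s_w * s_x - bias_real) < s_w * s_x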
torch/quantization/QConfig.py (new file, 9 lines)

@@ -0,0 +1,9 @@
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import namedtuple
from .observer import *

QConfig = namedtuple('QConfig',
                     ['weight', 'activation'])

default_qconfig = QConfig(default_weight_observer(),
                          default_observer())
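Note that the two QConfig fields hold observer factories rather than observer instances: `default_observer(**kwargs)` returns a `functools.partial` (see observer.py below), and call sites such as `mod.qconfig.activation()` invoke it to get a fresh Observer per module. A hedged sketch of building a custom config on top of this:

import torch
from torch.quantization import QConfig, default_observer, default_weight_observer

# Each field is a callable; invoking it yields a new, independent Observer.
my_qconfig = QConfig(weight=default_weight_observer(),
                     activation=default_observer(dtype=torch.quint8))
act_observer = my_qconfig.activation()  # fresh activation Observer for one module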
torch/quantization/__init__.py (new file, 28 lines)

@@ -0,0 +1,28 @@
from __future__ import absolute_import, division, print_function, unicode_literals
from .quantize import *  # noqa: F401
from .observer import *  # noqa: F401
from .QConfig import *  # noqa: F401

def default_eval_fn(model, calib_data):
    r"""
    Default evaluation function: takes a torch.utils.data.Dataset or a list of
    input Tensors and runs the model on the dataset
    """
    for data in calib_data:
        model(data)

__all__ = [
    'QuantWrapper', 'QuantStub', 'DeQuantStub', 'DEFAULT_MODULE_MAPPING',
    # Top level API for quantizing a float model
    'quantize',
    # Sub functions called by quantize
    'prepare', 'convert',
    # Sub functions for `prepare` and `swap_module`
    'propagate_qconfig', 'add_quant_dequant', 'add_observer', 'swap_module',
    'default_eval_fn',
    # Observers
    'Observer', 'WeightObserver', 'observer', 'default_observer',
    'default_weight_observer',
    # QConfig
    'QConfig', 'default_qconfig'
]
torch/quantization/observer.py (new file, 73 lines)

@@ -0,0 +1,73 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import torch.nn as nn
import torch
from functools import partial

class Observer(nn.Module):
    r"""Default Observer Module
    A default implementation of the observer module that only works for the
    `per_tensor_affine` quantization scheme.
    The module records the running min and max values of the observed Tensor,
    and `calculate_qparams` computes the scale and zero_point.

    Other types of Observers should follow the same API: they can take an
    arbitrary number of keyword arguments. In forward, they update the
    statistics of the observed Tensor, and they should provide a
    `calculate_qparams` function that computes the quantization parameters
    given the collected statistics.
    TODO: Maybe add an abstract Observer class that enforces these rules?
    """
    def __init__(self, dtype=torch.quint8, qscheme=torch.per_tensor_affine):
        super(Observer, self).__init__()
        self.dtype = dtype
        self.qscheme = qscheme
        assert self.qscheme in (torch.per_tensor_affine, torch.per_tensor_symmetric), \
            'Default Observer only works for per_tensor_affine and ' \
            'per_tensor_symmetric quantization schemes'
        assert self.dtype in (torch.qint8, torch.quint8), \
            'Default Observer only works for qint8 and quint8 data types'
        self.min_val = None
        self.max_val = None

    def forward(self, x):
        if self.min_val is None or self.max_val is None:
            self.min_val = torch.min(x)
            self.max_val = torch.max(x)
        else:
            self.min_val = torch.min(torch.min(x), self.min_val)
            self.max_val = torch.max(torch.max(x), self.max_val)

    def calculate_qparams(self):
        if self.dtype == torch.qint8:
            qmin, qmax = -128, 127
        else:
            qmin, qmax = 0, 255
        n_levels = 255.0
        if self.max_val is None or self.min_val is None:
            raise Exception('must run observer before calling calculate_qparams!')
        max_val, min_val = self.max_val.item(), self.min_val.item()
        if max_val == min_val:
            scale = 1.0
            zero_point = 0
        else:
            if self.qscheme == torch.per_tensor_symmetric:
                max_val = max(-min_val, max_val)
                scale = max_val / 127.0
                zero_point = 0 if self.dtype == torch.qint8 else 128
            else:
                scale = (max_val - min_val) / n_levels
                zero_point = qmin - round(min_val / scale)
                zero_point = max(qmin, zero_point)
                zero_point = min(qmax, zero_point)

        return torch.tensor([scale, zero_point])

def observer(observer_cls, **kwargs):
    return partial(observer_cls, **kwargs)

def default_observer(**kwargs):
    return observer(Observer, **kwargs)

def default_weight_observer(**kwargs):
    kwargs.setdefault('dtype', torch.qint8)
    kwargs.setdefault('qscheme', torch.per_tensor_symmetric)
    return observer(Observer, **kwargs)
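To make the affine branch of `calculate_qparams` concrete, here is a standalone re-derivation in plain Python (no torch dependency), mirroring the formulas above; the observed range [-1.0, 3.0] is just an illustrative example:

def affine_qparams(min_val, max_val, dtype='quint8'):
    # Mirrors Observer.calculate_qparams for torch.per_tensor_affine.
    qmin, qmax = (-128, 127) if dtype == 'qint8' else (0, 255)
    if max_val == min_val:
        return 1.0, 0
    scale = (max_val - min_val) / 255.0
    zero_point = qmin - round(min_val / scale)
    return scale, min(qmax, max(qmin, zero_point))

scale, zp = affine_qparams(-1.0, 3.0)
print(scale, zp)  # ~0.01569, 64: real 0.0 maps to the integer 64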
torch/quantization/quantize.py (new file, 255 lines)

@@ -0,0 +1,255 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import torch.nn as nn
import torch.nn.quantized as nnq
import torch

def propagate_qconfig_helper(module, qconfig_dict, qconfig_parent=None, prefix=''):
    r"""This is a helper function for `propagate_qconfig`

    Args:
        module: input module
        qconfig_dict: dictionary that maps from the name of a submodule to its
            quantization configuration
        qconfig_parent: quantization config of the parent module; we fall back
            to this config when there is no config specified for the current
            module
        prefix: corresponding prefix of the current module, used as a key into
            qconfig_dict

    Return:
        None, module is modified in place with qconfig attached
    """
    if not hasattr(module, 'qconfig'):
        module.qconfig = None
        if qconfig_dict and prefix in qconfig_dict:
            module.qconfig = qconfig_dict[prefix]
        else:
            module.qconfig = qconfig_parent
    print('prefix:', prefix, 'qconfig: ', module.qconfig)

    for name, child in module.named_children():
        module_prefix = prefix + '.' + name if prefix else name
        propagate_qconfig_helper(child, qconfig_dict, module.qconfig, module_prefix)

def propagate_qconfig(module, qconfig_dict=None):
    r"""Propagate qconfig through the module hierarchy and assign a `qconfig`
    attribute to each leaf module

    Args:
        module: input module
        qconfig_dict: dictionary that maps from the name of a submodule to its
            quantization configuration; a qconfig applies to all submodules of
            a given module unless a qconfig is specified for the submodule
            (i.e. the submodule already has a qconfig attribute)

    Return:
        None, module is modified in place with qconfig attached
    """
    if qconfig_dict is None:
        qconfig_dict = {}
    propagate_qconfig_helper(module, qconfig_dict)

def _observer_forward_hook(self, input, output):
    r"""Forward hook that calls observer on the output
    """
    self.observer(output)

# TODO(jerryzh): remove_observer?
def add_observer(module):
    r"""Add an observer for each leaf child of the module.

    This function inserts an observer module into every leaf child module
    that has a valid qconfig attribute.

    Args:
        module: input module with qconfig attributes for all the leaf modules
            that we want to quantize

    Return:
        None, module is modified in place with observer modules and
        forward_hooks added
    """
    for child in module.children():
        add_observer(child)

    # Insert observers only for leaf nodes. Note that this observer is for
    # the output of the module; inputs are observed by QuantStub
    if hasattr(module, 'qconfig') and module.qconfig is not None and len(module._modules) == 0:
        # observer and hook will be gone after we swap the module
        module.add_module('observer', module.qconfig.activation())
        module.register_forward_hook(_observer_forward_hook)

class QuantWrapper(nn.Module):
    r"""A wrapper class that wraps the input module, adds QuantStub and
    DeQuantStub, and surrounds the call to the module with calls to the
    quant and dequant modules.

    This is used by the `quantization` utility functions to add the quant
    and dequant modules. Before `convert`, `QuantStub` acts only as an
    observer: it observes the input tensor. After `convert`, `QuantStub` is
    swapped to `nnq.Quantize`, which does the actual quantization. Similarly
    for `DeQuantStub`.
    """
    def __init__(self, module):
        super(QuantWrapper, self).__init__()
        qconfig = module.qconfig if hasattr(module, 'qconfig') else None
        self.quant = QuantStub(qconfig)
        self.dequant = DeQuantStub()
        self.module = module

    def forward(self, X):
        X = self.quant(X)
        X = self.module(X)
        return self.dequant(X)

def add_quant_dequant(module):
    r"""Wrap each leaf child module in QuantWrapper if it has a valid qconfig.
    Note that this function modifies the children of the module in place, and
    it can also return a new module that wraps the input module.

    Args:
        module: input module with qconfig attributes for all the leaf modules
            that we want to quantize

    Return:
        Either the in-place modified module with submodules wrapped in
        `QuantWrapper` based on qconfig, or a new `QuantWrapper` module that
        wraps the input module; the latter case only happens when the input
        module is a leaf module and we want to quantize it.
    """
    if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig:
        return QuantWrapper(module)

    for name, child in module.named_children():
        module._modules[name] = add_quant_dequant(child)
    return module

def prepare(module, qconfig_dict=None):
    r"""Prepares the module for calibration or training given a qconfig_dict.
    Note that the module is modified in place, but if the input module is a
    leaf module, a wrapped module is returned.

    Args:
        module: input module
        qconfig_dict: dictionary that maps from the name of a submodule to its
            quantization configuration
    Return:
        A module with qconfig propagated and observer and quant/dequant or
        fake-quant modules attached; a module that is ready for calibration
        or training
    """
    propagate_qconfig(module, qconfig_dict)
    if qconfig_dict:
        module = add_quant_dequant(module)
    add_observer(module)
    return module

class QuantStub(nn.Module):
    r"""Quantize stub module. Before calibration this is the same as an
    observer; it will be swapped to `nnq.Quantize` in `convert`.

    Args:
        qconfig: quantization configuration for the tensor;
            if qconfig is not provided, we get the qconfig from parent modules
    """
    def __init__(self, qconfig=None):
        super(QuantStub, self).__init__()
        if qconfig:
            self.qconfig = qconfig

    def forward(self, x):
        return x

class DeQuantStub(nn.Module):
    r"""Dequantize stub module. Before calibration this is the same as
    identity; it will be swapped to `nnq.DeQuantize` in `convert`.
    """
    def __init__(self):
        super(DeQuantStub, self).__init__()

    def forward(self, x):
        return x

def quantize(module, eval_fn, eval_args, qconfig_dict=None):
    r"""Converts a float module to a quantized module.

    First it prepares the module for calibration or training, then it calls
    `eval_fn`, which runs the calibration step or training step; after that
    we call `convert`, which converts the module to a quantized module.

    When `qconfig_dict` is None or an empty dictionary, we assume the user
    will insert quant/dequant stubs and add qconfig in the appropriate
    places. When `qconfig_dict` is neither None nor an empty dictionary, we
    add quant/dequant stubs using QuantWrapper for all the leaf modules.

    Args:
        module: input module
        eval_fn: a function for evaluating the prepared module; can be a
            function that simply runs the prepared module or a training loop
        eval_args: positional arguments for `eval_fn`
        qconfig_dict: dictionary that maps from the name of a submodule to its
            quantization configuration; a qconfig applies to all submodules of
            a given module unless a qconfig is specified for the submodule
            (i.e. the submodule already has a qconfig attribute)

    Return:
        A quantized module
    """
    module = prepare(module, qconfig_dict)
    eval_fn(module, eval_args)
    convert(module)
    return module

# Map for swapping float modules to quantized ones
DEFAULT_MODULE_MAPPING = {
    torch.nn.Linear: nnq.Linear,
    torch.nn.ReLU: nnq.ReLU,
    QuantStub: nnq.Quantize,
}

def convert(module, mapping=DEFAULT_MODULE_MAPPING):
    r"""Converts the float module with observers (where we can get the
    quantization parameters) to a quantized module.
    Args:
        module: calibrated module with observers
        mapping: a dictionary that maps from float module type to quantized
            module type; can be overwritten to allow swapping of user-defined
            Modules
    Return:
        A quantized module
    """
    module_swapped = swap_module(module, mapping)

    reassign = {}
    for name, mod in module.named_children():
        new_mod = convert(mod, mapping)
        if new_mod is not mod:
            reassign[name] = new_mod

    for name, mod in reassign.items():
        setattr(module_swapped, name, mod)

    return module_swapped

def swap_module(mod, mapping):
    r"""Swaps the module if it has a quantized counterpart and an `observer`
    attached.

    Args:
        mod: input module
        mapping: a dictionary that maps from an nn module to an nnq module

    Return:
        The corresponding quantized module of `mod`
    """
    new_mod = mod
    print('swapping:', mod)
    if hasattr(mod, 'observer'):
        if type(mod) in mapping:
            new_mod = mapping[type(mod)].from_float(mod)

    if type(mod) == DeQuantStub:
        new_mod = nnq.DeQuantize.from_float(mod)

    return new_mod
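The `mapping` parameter of `convert` is the extension point for user-defined modules: any float type mapped to a class with a `from_float` static method will be swapped once an observer is attached. A hedged sketch, where `MyFloatOp`, `MyQuantizedOp`, and `calibrated_model` are hypothetical names:

from torch.quantization import DEFAULT_MODULE_MAPPING, convert

# MyQuantizedOp.from_float(mod) must build the quantized module from the
# calibrated float module's observer statistics (hypothetical classes).
custom_mapping = dict(DEFAULT_MODULE_MAPPING)
custom_mapping[MyFloatOp] = MyQuantizedOp

convert(calibrated_model, mapping=custom_mapping)  # calibrated_model: a prepared and calibrated model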