From 8fab682e47242ecaebda99fc78800096962e65cf Mon Sep 17 00:00:00 2001
From: Andrew Or
Date: Thu, 7 Jul 2022 15:05:35 -0700
Subject: [PATCH] [Quant][fx][bc-breaking] Do not move models to CPU in convert (#80555)

Summary:
Previously, we automatically moved the model to CPU in
torch.ao.quantization.fx.convert to work around the issue where certain
functions called by convert expect CPU arguments. This commit pushes that
responsibility to the caller, since the user should decide which device
to use.

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps

BC-breaking Notes:

Before:
```
model = resnet18(...)
model = prepare_fx(model, qconfig_mapping, example_inputs)
...  # calibrate
model = convert_fx(model)
```

After:
```
model = resnet18(...)
model.cpu()
model = prepare_fx(model, qconfig_mapping, example_inputs)
...  # calibrate
model = convert_fx(model)
```

Reviewers: jerryzh168

Subscribers: jerryzh168

Differential Revision: [D37528830](https://our.internmc.facebook.com/intern/diff/D37528830)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/80555
Approved by: https://github.com/jerryzh168
---
 test/quantization/fx/test_quantize_fx.py | 4 ++++
 torch/ao/quantization/fx/convert.py      | 8 --------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 9498f34ab47..cd9aec0a6e5 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -7611,6 +7611,10 @@ class TestQuantizeFxModels(QuantizationTestCase):
                 torch.testing.assert_allclose(grad[0], grad_ref[0])
 
             if 'fbgemm' in torch.backends.quantized.supported_engines:
+                # During the lowering step in convert, fold_weight calls quantized::linear_prepack
+                # which doesn't support QuantizedCuda backend
+                prepared.cpu()
+                prepared_ref.cpu()
                 converted = convert_fx(prepared)
                 converted_ref = convert_fx(prepared_ref)
                 inp = torch.rand(5, 5)
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index 2cdbd3686b4..31bce8157bd 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -580,14 +580,6 @@ def convert(
     node_name_to_scope, prepare_custom_config, observed_node_names = restore_state(model)
     qconfig_map: Dict[str, QConfigAny] = model._qconfig_map  # type: ignore[assignment]
 
-    # TODO this should be removed now that gpu support for quantization is being supported.
-    # however in practice, as of 7/22/2021, certain functions that get called by convert expect
-    # only cpu arguments.
-    # As an example, in TestQuantizeFxModels.test_qat_functional_linear when device='cuda',
-    # fold_weight will call quantized::linear_prepack which doesn't support QuantizedCuda backend.
-    if not is_reference:
-        model.cpu()
-
     # mapping from fully qualified module name to module instance
     # for example,
     # {
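
For a model that was prepared and calibrated on GPU, the new calling convention might look like the minimal sketch below. This is an illustration only, not part of the patch: the qconfig_mapping helper, backend choice, and input shape are assumptions, and the point is simply that the caller, not convert_fx, now moves the model to CPU before conversion.

```
import torch
from torchvision.models import resnet18
from torch.ao.quantization import get_default_qconfig_mapping
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx

# Illustrative values; the patch itself does not prescribe these.
model = resnet18().cuda().eval()
qconfig_mapping = get_default_qconfig_mapping("fbgemm")
example_inputs = (torch.randn(1, 3, 224, 224, device="cuda"),)

prepared = prepare_fx(model, qconfig_mapping, example_inputs)
with torch.no_grad():
    prepared(*example_inputs)  # calibrate

# Now the caller's responsibility: convert_fx no longer moves the model to
# CPU, and lowering ops such as quantized::linear_prepack expect CPU tensors.
prepared.cpu()
quantized = convert_fx(prepared)
```

Keeping the device transfer in user code avoids surprising callers who expect convert_fx to leave the model where they put it, at the cost of one explicit `.cpu()` call before conversion.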