Summary: Added support for quantized linear on CPU with fbgemm. Specifically, for torch.ops.quantized.linear_unpacked_dynamic_fp16, we decompose it into two steps: pack the weight, then run fbgemm's qlinear with the packed weight.

Test Plan: Included in commit: test_aot_inductor::test_quantized_linear

Differential Revision: [D55577959](https://our.internmc.facebook.com/intern/diff/D55577959)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123069
Approved by: https://github.com/hl475
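For context, the sketch below shows the same two-step shape in eager mode: pack the fp16 weight once, then run fbgemm's dynamic linear against the packed weight. It is illustrative only and uses the public eager ops quantized.linear_prepack_fp16 and quantized.linear_dynamic_fp16 as stand-ins; the AOTInductor lowering itself targets the _quantized.wrapped_fbgemm_* ops registered in the file below, whose exact signatures are not reproduced here. Running it requires a CPU build of PyTorch with fbgemm available.

import torch

# Illustrative only: eager two-step dynamic-fp16 linear (pack, then linear).
weight = torch.randn(8, 16)   # [out_features, in_features], fp32
bias = torch.randn(8)
x = torch.randn(4, 16)

# Step 1: pack the weight; fbgemm stores it internally in fp16.
packed = torch.ops.quantized.linear_prepack_fp16(weight, bias)

# Step 2: dynamic linear against the packed weight.
y = torch.ops.quantized.linear_dynamic_fp16(x, packed)
print(y.shape)  # torch.Size([4, 8])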
import torch

from . import lowering

quantized = torch.ops.quantized
_quantized = torch.ops._quantized
aten = torch.ops.aten


def register_quantized_ops():
    # These ops need their inputs realized (materialized as buffers)
    # before they are lowered.
    lowering.add_needs_realized_inputs(
        [
            quantized.max_pool2d,
            _quantized.wrapped_fbgemm_pack_gemm_matrix_fp16,
            _quantized.wrapped_fbgemm_linear_fp16_weight,
        ]
    )

    # Lower each op as a fallback to its existing kernel instead of
    # generating an Inductor kernel for it.
    lowering.make_fallback(quantized.max_pool2d)
    lowering.make_fallback(_quantized.wrapped_fbgemm_pack_gemm_matrix_fp16)
    lowering.make_fallback(_quantized.wrapped_fbgemm_linear_fp16_weight)


def register_woq_mm_ops():
    # Weight-only-quantized int8 matmul: realize inputs and fall back.
    lowering.add_needs_realized_inputs(
        [
            aten._weight_int8pack_mm,
        ]
    )

    lowering.make_fallback(aten._weight_int8pack_mm)
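As a usage note, these registration helpers are meant to run once while Inductor's lowering tables are being built. The call-site sketch below assumes the module is importable as torch._inductor.quantized_lowerings (the file path is not shown on this page), so treat the import path as an assumption; the function names come from the file above.

from torch._inductor import quantized_lowerings  # assumed module path

# Register the fbgemm-backed quantized ops and the weight-only-quantized
# int8 matmul as fallback lowerings.
quantized_lowerings.register_quantized_ops()
quantized_lowerings.register_woq_mm_ops()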