Summary: make inclusion of immintrin.h only for x64
Test Plan: CI
Differential Revision: D38886597
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83793
Approved by: https://github.com/ajtulloch

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

#ifdef __x86_64__
#include <immintrin.h>
#endif

#include <fbgemm/QuantUtils.h>

#include "caffe2/quantization/server/dynamic_histogram.h"
#include "caffe2/utils/cpuid.h"

namespace dnnlowp {

using fbgemm::RequantizationParams;
using fbgemm::TensorQuantizationParams;

// Represents a quantization scheme that provides quantization parameters
// based on the distribution of the data to be quantized.
class QuantizationFactory {
 public:
  enum QuantizationKind {
    // A simple quantization scheme that determines quantization parameters
    // by looking only at the min/max of the data.
    MIN_MAX_QUANTIZATION,
    // Minimizes the L2 norm of the quantization error.
    L2_MIN_QUANTIZATION,
    // A fast search that removes histogram outliers and approximates the
    // L2 minimum.
    L2_MIN_QUANTIZATION_APPROX,
    // Minimizes the Kullback-Leibler divergence.
    KL_MIN_QUANTIZATION,
    // Takes the 99th percentile (only works with sparsity-preserving
    // quantization).
    P99_QUANTIZATION,
    // Minimizes the L1 norm of the quantization error.
    L1_MIN_QUANTIZATION,
  };

  /// Get the default factory whose policy is determined by gflags.
  static QuantizationFactory* GetDefaultInstance();

  /// Choose a quantization scale and zero_point that maps the
  /// floating-point range [min, max] to the integer range of the specified
  /// precision.
  TensorQuantizationParams ChooseQuantizationParams(
      float min,
      float max,
      int precision,
      bool preserve_sparsity,
      bool is_signed = false) const {
    TensorQuantizationParams qparams = fbgemm::ChooseQuantizationParams(
        min,
        max,
        is_signed ? -(1 << (precision - 1)) : 0,
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
        preserve_sparsity,
        force_scale_power_of_two_);
    qparams.precision = precision;
    return qparams;
  }
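
  // For illustration (this follows directly from the ternaries above): with
  // precision = 8, the target integer range is [-128, 127] when is_signed is
  // true and [0, 255] otherwise; with precision = 16 unsigned, it is
  // [0, 65535].
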
  /// Choose a quantization scale and zero_point that maps the
  /// floating-point range [min, max] to the default integer range of
  /// this quantization factory.
  TensorQuantizationParams
  ChooseQuantizationParams(float min, float max, bool is_weight = false) const {
    return ChooseQuantizationParams(
        min,
        max,
        is_weight ? GetWeightPrecision() : GetActivationPrecision(),
        is_weight ? GetPreserveWeightSparsity()
                  : GetPreserveActivationSparsity());
  }

  /// Choose quantization parameters based on the values in an array,
  /// optimizing the quantization error while ignoring a few outliers.
  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      bool is_weight = false) const;

  /// Choose quantization parameters based on a histogram of values,
  /// optimizing the quantization error while ignoring a few outliers.
  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity,
      bool is_weight = false) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      bool is_weight = false) const;

  // Given a real_multiplier, produces a pair (quantized_multiplier,
  // right_shift) where quantized_multiplier is an int32 representing a
  // fixed-point value (in practice we only produce positive values) and
  // right_shift is an amount to shift right by, so that the floating-point
  // multiplication of some int32 input value by real_multiplier,
  //
  //   return static_cast<int32>(int32_value * real_multiplier);
  //
  // is best approximated by the integer-arithmetic-only code
  //
  //   return RoundingRightShift(
  //       Multiplication(int32_value, quantized_multiplier),
  //       right_shift);
  //
  // Note: all of this code only needs to run offline to generate the
  // quantized neural network workload, not at runtime on the device on
  // which quantized neural networks need to run. So it is not
  // performance-critical at all.
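  //
  // Worked example (illustrative only; the pair actually returned also
  // depends on requantization_multiplier_precision_): for
  // real_multiplier = 0.375 = 3 * 2^-3, one valid pair is
  // quantized_multiplier = 3 and right_shift = 3, since
  // (int32_value * 3) >> 3, with rounding, equals int32_value * 0.375.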
  RequantizationParams ChooseRequantizationMultiplier(
      float real_multiplier,
      TensorQuantizationParams target_qparams) const;

  int GetActivationPrecision() const {
    return activation_precision_;
  }

  int GetWeightPrecision() const {
    return weight_precision_;
  }

  int GetEltwiseQuantizePrecision() const {
    return eltwise_quantize_precision_;
  }

  bool GetPreserveActivationSparsity() const {
    return preserve_activation_sparsity_;
  }

  bool GetPreserveWeightSparsity() const {
    return preserve_weight_sparsity_;
  }

  QuantizationKind GetActivationKind() const {
    return activation_kind_;
  }

  QuantizationKind GetWeightKind() const {
    return weight_kind_;
  }

  void SetWeightP99Threshold(float threshold) {
    weight_p99_threshold_ = threshold;
  }

  void SetActivationP99Threshold(float threshold) {
    activation_p99_threshold_ = threshold;
  }

  explicit QuantizationFactory(
      int activation_precision = 8,
      // precision used for activations in main operations like matmul
      int weight_precision = 8, // precision used for weights
      int requantization_multiplier_precision = 32,
      // precision used for the requantization multiplier
      int eltwise_quantize_precision = 16,
      // precision used for element-wise addition
      bool preserve_activation_sparsity = false,
      // preserve zeros in quantization
      bool preserve_weight_sparsity = false,
      // preserve zeros in quantization
      bool force_scale_power_of_two = false,
      // restrict scaling to a power of two
      QuantizationKind activation_kind = MIN_MAX_QUANTIZATION,
      QuantizationKind weight_kind = MIN_MAX_QUANTIZATION,
      float weight_p99_threshold = 0.99,
      // P99 percentage to select out from the full histogram for weights
      float activation_p99_threshold = 0.99
      // P99 percentage to select out from the full histogram for activations
      );

 private:
  int activation_precision_;
  int weight_precision_;
  int requantization_multiplier_precision_;
  int eltwise_quantize_precision_;
  bool preserve_activation_sparsity_;
  bool preserve_weight_sparsity_;
  bool force_scale_power_of_two_;
  QuantizationKind activation_kind_, weight_kind_;
  float weight_p99_threshold_;
  float activation_p99_threshold_;
}; // class QuantizationFactory

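// A minimal usage sketch (hypothetical caller; only names declared in this
// header are used, and the flag-driven policy comes from
// GetDefaultInstance()):
//
//   dnnlowp::QuantizationFactory* qfactory =
//       dnnlowp::QuantizationFactory::GetDefaultInstance();
//   std::vector<float> acts = /* observed activation values */;
//   dnnlowp::TensorQuantizationParams qparams =
//       qfactory->ChooseQuantizationParams(
//           acts.data(), static_cast<int>(acts.size()), /*is_weight=*/false);
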
/**
 * Parse a string to a QuantizationKind.
 */
QuantizationFactory::QuantizationKind StringToKind(const std::string& s);

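// Presumably extends the histogram range so that [*min, *max] includes zero
// (an assumption from the name; the definition lives in the corresponding
// .cc file), returning the adjusted bin counts and updating *min / *max in
// place.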
std::vector<float>
adjust_hist_to_include_zero(const Histogram& hist, float* min, float* max);

} // namespace dnnlowp