Summary: make inclusion of immintrin.h only for x64
Test Plan: CI
Differential Revision: D38886597
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83793
Approved by: https://github.com/ajtulloch

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

#ifdef __x86_64__
#include <immintrin.h>
#endif

#include <fbgemm/QuantUtils.h>

#include "caffe2/quantization/server/dynamic_histogram.h"
#include "caffe2/utils/cpuid.h"

namespace dnnlowp {

using fbgemm::RequantizationParams;
using fbgemm::TensorQuantizationParams;

// Represents a quantization scheme that provides quantization parameters
// based on the distribution of the data to be quantized.
class QuantizationFactory {
 public:
  enum QuantizationKind {
    // A simple quantization scheme that determines quantization parameters
    // by looking only at the min/max of the data.
    MIN_MAX_QUANTIZATION,
    // Minimizes the L2 norm of the quantization error.
    L2_MIN_QUANTIZATION,
    // A fast search that removes histogram outliers and approximates the
    // L2 minimum.
    L2_MIN_QUANTIZATION_APPROX,
    // Minimizes the Kullback-Leibler divergence.
    KL_MIN_QUANTIZATION,
    // Takes the 99th percentile (only works with sparsity-preserving
    // quantization).
    P99_QUANTIZATION,
    // Minimizes the L1 norm of the quantization error.
    L1_MIN_QUANTIZATION,
  };

  /// Get the default factory whose policy is determined by gflags.
  static QuantizationFactory* GetDefaultInstance();

  /// Choose a quantization scale and zero_point that maps the
  /// floating-point range [min, max] to the integer range of the specified
  /// precision.
  TensorQuantizationParams ChooseQuantizationParams(
      float min,
      float max,
      int precision,
      bool preserve_sparsity,
      bool is_signed = false) const {
    TensorQuantizationParams qparams = fbgemm::ChooseQuantizationParams(
        min,
        max,
        is_signed ? -(1 << (precision - 1)) : 0,
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
        preserve_sparsity,
        force_scale_power_of_two_);
    qparams.precision = precision;
    return qparams;
  }
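
  // For illustration (this follows directly from the ternaries above): with
  // precision = 8, the target integer range is [-128, 127] when is_signed is
  // true and [0, 255] otherwise; with precision = 16 unsigned, it is
  // [0, 65535].
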
  /// Choose a quantization scale and zero_point that maps the
  /// floating-point range [min, max] to the default integer range of
  /// this quantization factory.
  TensorQuantizationParams
  ChooseQuantizationParams(float min, float max, bool is_weight = false) const {
    return ChooseQuantizationParams(
        min,
        max,
        is_weight ? GetWeightPrecision() : GetActivationPrecision(),
        is_weight ? GetPreserveWeightSparsity()
                  : GetPreserveActivationSparsity());
  }

  /// Choose quantization parameters based on the values in an array,
  /// optimizing the quantization error while ignoring a few outliers.
  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      bool is_weight = false) const;

  /// Choose quantization parameters based on a histogram of values,
  /// optimizing the quantization error while ignoring a few outliers.
  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity,
      bool is_weight = false) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      bool is_weight = false) const;

  // Given a real_multiplier, produces a pair (quantized_multiplier,
  // right_shift) where quantized_multiplier is an int32 representing a
  // fixed-point value (in practice we only produce positive values) and
  // right_shift is an amount to shift right by, so that the floating-point
  // multiplication of some int32 input value by real_multiplier,
  //
  //   return static_cast<int32>(int32_value * real_multiplier);
  //
  // is best approximated by the integer-arithmetic-only code
  //
  //   return RoundingRightShift(
  //       Multiplication(int32_value, quantized_multiplier),
  //       right_shift);
  //
  // Note: all of this code only needs to run offline to generate the
  // quantized neural network workload, not at runtime on the device on
  // which quantized neural networks need to run. So it is not
  // performance-critical at all.
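  //
  // Worked example (illustrative only; the pair actually returned also
  // depends on requantization_multiplier_precision_): for
  // real_multiplier = 0.375 = 3 * 2^-3, one valid pair is
  // quantized_multiplier = 3 and right_shift = 3, since
  // (int32_value * 3) >> 3, with rounding, equals int32_value * 0.375.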
  RequantizationParams ChooseRequantizationMultiplier(
      float real_multiplier,
      TensorQuantizationParams target_qparams) const;

  int GetActivationPrecision() const {
    return activation_precision_;
  }

  int GetWeightPrecision() const {
    return weight_precision_;
  }

  int GetEltwiseQuantizePrecision() const {
    return eltwise_quantize_precision_;
  }

  bool GetPreserveActivationSparsity() const {
    return preserve_activation_sparsity_;
  }

  bool GetPreserveWeightSparsity() const {
    return preserve_weight_sparsity_;
  }

  QuantizationKind GetActivationKind() const {
    return activation_kind_;
  }

  QuantizationKind GetWeightKind() const {
    return weight_kind_;
  }

  void SetWeightP99Threshold(float threshold) {
    weight_p99_threshold_ = threshold;
  }

  void SetActivationP99Threshold(float threshold) {
    activation_p99_threshold_ = threshold;
  }

  explicit QuantizationFactory(
      int activation_precision = 8,
      // precision used for activations in main operations like matmul
      int weight_precision = 8, // precision used for weights
      int requantization_multiplier_precision = 32,
      // precision used for the requantization multiplier
      int eltwise_quantize_precision = 16,
      // precision used for element-wise addition
      bool preserve_activation_sparsity = false,
      // preserve zeros in quantization
      bool preserve_weight_sparsity = false,
      // preserve zeros in quantization
      bool force_scale_power_of_two = false,
      // restrict scaling to a power of two
      QuantizationKind activation_kind = MIN_MAX_QUANTIZATION,
      QuantizationKind weight_kind = MIN_MAX_QUANTIZATION,
      float weight_p99_threshold = 0.99,
      // P99 percentage to select out from the full histogram for weights
      float activation_p99_threshold = 0.99
      // P99 percentage to select out from the full histogram for activations
      );

 private:
  int activation_precision_;
  int weight_precision_;
  int requantization_multiplier_precision_;
  int eltwise_quantize_precision_;
  bool preserve_activation_sparsity_;
  bool preserve_weight_sparsity_;
  bool force_scale_power_of_two_;
  QuantizationKind activation_kind_, weight_kind_;
  float weight_p99_threshold_;
  float activation_p99_threshold_;
}; // class QuantizationFactory

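// A minimal usage sketch (hypothetical caller; only names declared in this
// header are used, and the flag-driven policy comes from
// GetDefaultInstance()):
//
//   dnnlowp::QuantizationFactory* qfactory =
//       dnnlowp::QuantizationFactory::GetDefaultInstance();
//   std::vector<float> acts = /* observed activation values */;
//   dnnlowp::TensorQuantizationParams qparams =
//       qfactory->ChooseQuantizationParams(
//           acts.data(), static_cast<int>(acts.size()), /*is_weight=*/false);
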
/**
 * Parse a string to a QuantizationKind.
 */
QuantizationFactory::QuantizationKind StringToKind(const std::string& s);

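// Presumably extends the histogram range so that [*min, *max] includes zero
// (an assumption from the name; the definition lives in the corresponding
// .cc file), returning the adjusted bin counts and updating *min / *max in
// place.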
std::vector<float>
adjust_hist_to_include_zero(const Histogram& hist, float* min, float* max);

} // namespace dnnlowp