pytorch/caffe2/quantization/server/dynamic_histogram.cc

#include "dynamic_histogram.h"
#include "dnnlowp_op.h"

#include <cassert>
#include <cmath>
#include <limits>

#ifdef _OPENMP
#include <omp.h>
#endif

namespace dnnlowp {

using namespace std;

void Histogram::Add(float f, uint64_t cnt) {
  int nbins = histogram_.size();
  float bin_width = (max_ - min_) / nbins;
  int bin = bin_width == 0
      ? 0
      : std::min(static_cast<int>((f - min_) / bin_width), nbins - 1);
  bin = std::max(0, bin);
  assert(bin >= 0);
  histogram_[bin] += cnt;
}

void Histogram::Add(const float* f, int len) {
  int nbins = histogram_.size();
  float bin_width = (max_ - min_) / nbins;

  if (bin_width > 0.0) {
    assert(per_thread_histogram_.size() % nbins == 0);

    // Check if dnnlowp_get_max_threads has been reduced, and if so reduce
    // per-thread histogram and clear them.
    int old_nthreads = per_thread_histogram_.size() / nbins + 1;
    if (caffe2::dnnlowp_get_max_threads() < old_nthreads) {
      Finalize();
    }

    per_thread_histogram_.resize(
        (caffe2::dnnlowp_get_max_threads() - 1) * nbins);

#ifdef _OPENMP
#pragma omp parallel
#endif
    {
      int tid = caffe2::dnnlowp_get_thread_num();

      uint64_t* my_histogram = nullptr;
      if (tid == 0) {
        my_histogram = histogram_.data();
      } else {
        my_histogram = per_thread_histogram_.data() + (tid - 1) * nbins;
      }

#ifdef _OPENMP
#pragma omp for
#endif
      for (auto i = 0; i < len; ++i) {
        int bin =
            std::min(static_cast<int>((f[i] - min_) / bin_width), nbins - 1);
        bin = std::max(0, bin);
        ++my_histogram[bin];
      }
    } // omp parallel
  } else {
    histogram_[0] += len;
  }
}

void Histogram::Finalize() {
  int nbins = histogram_.size();
  assert(per_thread_histogram_.size() % nbins == 0);
  int nthreads = per_thread_histogram_.size() / nbins + 1;

  if (nthreads > 1) {
    for (int bin = 0; bin < nbins; ++bin) {
      for (int i = 1; i < nthreads; ++i) {
        histogram_[bin] += per_thread_histogram_[(i - 1) * nbins + bin];
      }
    }
  }

  per_thread_histogram_.clear();
}

static const int OVER_BINNING_FACTOR = 4;

DynamicHistogram::DynamicHistogram(int nbins)
    : nbins_(nbins),
      min_(numeric_limits<float>::max()),
      max_(numeric_limits<float>::lowest()) {
  assert(nbins_ > 0);
}

void DynamicHistogram::Add(float f) {
  min_ = std::min(min_, f);
  max_ = std::max(max_, f);

  if (histograms_.empty()) {
    histograms_.emplace_back(nbins_ * OVER_BINNING_FACTOR, f, f);
  } else {
    Histogram& curr_hist = histograms_.back();
    if (f < curr_hist.Min() || f > curr_hist.Max()) {
      float old_spread = curr_hist.Max() - curr_hist.Min();
      if (f < curr_hist.Min()) {
        float new_min;
        if (old_spread == 0) {
          new_min = f;
        } else {
          new_min = curr_hist.Min() -
              ceil((curr_hist.Min() - f) / old_spread) * old_spread;
        }
        histograms_.emplace_back(
            curr_hist.GetHistogram()->size(), new_min, curr_hist.Max());
      } else {
        float new_max;
        if (old_spread == 0) {
          new_max = f;
        } else {
          new_max = curr_hist.Max() +
              ceil((f - curr_hist.Max()) / old_spread) * old_spread;
        }
        histograms_.emplace_back(
            curr_hist.GetHistogram()->size(), curr_hist.Min(), new_max);
      }
    }
  }

  Histogram& new_hist = histograms_.back();
  new_hist.Add(f);
}

void DynamicHistogram::Add(const float* f, int len) {
  float minimum = min_, maximum = max_;
#ifdef _OPENMP
#pragma omp parallel for reduction(min : minimum) reduction(max : maximum)
#endif
  for (int i = 0; i < len; ++i) {
    minimum = std::min(f[i], minimum);
    maximum = std::max(f[i], maximum);
  }
  min_ = minimum;
  max_ = maximum;

  if (histograms_.empty()) {
    histograms_.emplace_back(nbins_ * OVER_BINNING_FACTOR, min_, max_);
  } else {
    Histogram& curr_hist = histograms_.back();
    if (min_ < curr_hist.Min() || max_ > curr_hist.Max()) {
      float old_spread = curr_hist.Max() - curr_hist.Min();
      float new_min = curr_hist.Min(), new_max = curr_hist.Max();
      if (min_ < curr_hist.Min()) {
        if (old_spread == 0.0f) {
          new_min = min_;
        } else {
          new_min = curr_hist.Min() -
              ceil((curr_hist.Min() - min_) / old_spread) * old_spread;
        }
      }
      if (max_ > curr_hist.Max()) {
        old_spread = curr_hist.Max() - new_min;
        if (old_spread == 0.0f) {
          new_max = max_;
        } else {
          new_max = curr_hist.Max() +
              ceil((max_ - curr_hist.Max()) / old_spread) * old_spread;
        }
      }
      histograms_.emplace_back(
          curr_hist.GetHistogram()->size(), new_min, new_max);
    }
  }

  Histogram& new_hist = histograms_.back();
  new_hist.Add(f, len);
}

const Histogram* DynamicHistogram::Finalize() {
  if (final_histogram_.get()) {
    return final_histogram_.get();
  }

  final_histogram_.reset(new Histogram(nbins_, min_, max_));
  float dst_bin_width = (max_ - min_) / nbins_;

  for (Histogram& hist : histograms_) {
    hist.Finalize();

    const std::vector<uint64_t>& bins = *hist.GetHistogram();
    float src_bin_width = (hist.Max() - hist.Min()) / bins.size();

    for (int i = 0; i < bins.size(); ++i) {
      if (bins[i] == 0) {
        continue;
      }
      float src_bin_begin = hist.Min() + src_bin_width * i;
      float src_bin_end = src_bin_begin + src_bin_width;
      // dst_bin corresponds to the beginning of the src_bin
      // dst_bin2 corresponds to the end of the src_bin
      int dst_bin =
          dst_bin_width == 0 ? 0 : (src_bin_begin - min_) / dst_bin_width;
      float dst_bin_begin = min_ + dst_bin_width * dst_bin;
      float dst_bin_end = dst_bin_begin + dst_bin_width;
      int dst_bin2 =
          dst_bin_width == 0 ? 0 : (src_bin_end - min_) / dst_bin_width;
      // 1 src_bin is mapped to at most 2 dst bin
      assert(dst_bin2 <= dst_bin + 2);

      // dst_bin_cnt is the count from src_bin that should go to dst_bin
      // The remainder should go to dst_bin2
      // rint is the fastest way to round
      // (https://stackoverflow.com/questions/485525/round-for-float-in-c/5849630)
      uint64_t dst_bin_cnt = (src_bin_width == 0 || dst_bin_width == 0)
          ? bins[i]
          : std::min(
                static_cast<uint64_t>(rint(
                    (dst_bin_end - src_bin_begin) / src_bin_width * bins[i])),
                bins[i]);

      final_histogram_->Add(dst_bin_begin + dst_bin_width / 2, dst_bin_cnt);
      if (dst_bin_cnt < bins[i]) {
        final_histogram_->Add(
            dst_bin_end + dst_bin_width / 2, bins[i] - dst_bin_cnt);
      }
    }
  } // for each histogram with different scales

  return final_histogram_.get();
}

} // namespace dnnlowp