// pytorch/caffe2/quantization/server/pybind.cc

#include <fbgemm/FbgemmFP16.h>
#include <fbgemm/Utils.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "activation_distribution_observer.h"
#include "caffe2/opt/fakefp16_transform.h"
#include "caffe2/opt/custom/freeze_quantization_params.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/fbgemm_pack_blob.h"
#include "caffe2/quantization/server/int8_gen_quant_params.h"
#include "quantization_error_minimization.h"

namespace caffe2 {
namespace python {
// Forward declaration only; the function is defined in
// caffe2/python/pybind_state.cc. The bindings below call it to obtain the
// workspace they operate on and treat a null result as an error
// (CAFFE_ENFORCE) before using it.
Workspace* GetCurrentWorkspace();
} // namespace python
} // namespace caffe2

PYBIND11_MODULE(dnnlowp_pybind11, m) {
  using namespace std;
  using namespace caffe2;

  // Python: ClearNetObservers()
  // Forwards to ClearGlobalNetObservers(); takes no arguments and returns
  // nothing to Python.
  m.def(
      "ClearNetObservers",
      []() -> void {
        ClearGlobalNetObservers();
      });

  // Python: ObserveMinMaxOfOutput(min_max_file_name, dump_freq=-1,
  //                               delimiter=" ")
  // Registers a global net-observer creator. Each time the creator is invoked
  // with a net it builds an OutputMinMaxNetObserver configured with the
  // arguments given here.
  m.def(
      "ObserveMinMaxOfOutput",
      [](const string& min_max_file_name, int dump_freq, string delimiter) {
        // Everything is captured by copy so the creator does not refer back to
        // the arguments of this call when it is invoked later.
        auto make_observer = [=](NetBase* net) {
          return make_unique<OutputMinMaxNetObserver>(
              net, min_max_file_name, dump_freq, delimiter);
        };
        AddGlobalNetObserverCreator(make_observer);
      },
      pybind11::arg("min_max_file_name"),
      pybind11::arg("dump_freq") = -1,
      pybind11::arg("delimiter") = " ");

  // Python: ObserveHistogramOfOutput(out_file_name, dump_freq=-1,
  //                                  mul_nets=False, op_filter="",
  //                                  delimiter=" ")
  // Registers a global net-observer creator that builds a
  // HistogramNetObserver for each net it is invoked with.
  m.def(
      "ObserveHistogramOfOutput",
      [](const string& out_file_name,
         int dump_freq,
         bool mul_nets,
         string op_filter,
         string delimiter) {
        // Captured by copy so the creator is self-contained when it runs
        // after this call has returned.
        auto make_observer = [=](NetBase* net) {
          // 2048 is the fixed third constructor argument used by this
          // binding (presumably the number of histogram bins -- confirm
          // against HistogramNetObserver).
          return make_unique<HistogramNetObserver>(
              net,
              out_file_name,
              2048,
              dump_freq,
              mul_nets,
              op_filter,
              delimiter);
        };
        AddGlobalNetObserverCreator(make_observer);
      },
      pybind11::arg("out_file_name"),
      pybind11::arg("dump_freq") = -1,
      pybind11::arg("mul_nets") = false,
      pybind11::arg("op_filter") = "",
      pybind11::arg("delimiter") = " ");

  // Python: DumpHistogramFile(ob)
  // Downcasts the given net observer to HistogramNetObserver and asks it to
  // dump its histogram file.
  // Raises (via CAFFE_ENFORCE) when `ob` is null or is not a
  // HistogramNetObserver, instead of dereferencing a null pointer.
  m.def(
      "DumpHistogramFile",
      [](Observable<NetBase>::Observer* ob) {
        HistogramNetObserver* hist_ob =
            dynamic_cast_if_rtti<HistogramNetObserver*>(ob);
        // The downcast yields nullptr for a null input or (with RTTI) for an
        // observer of a different dynamic type.
        CAFFE_ENFORCE(
            hist_ob != nullptr,
            "DumpHistogramFile expects a HistogramNetObserver");
        hist_ob->DumpHistogramFile();
      },
      pybind11::arg("ob"));

  // Python: AddHistogramObserver(net_name, out_file_name, dump_freq=-1,
  //                              mul_nets=False, delimiter=" ")
  // Attaches a new HistogramNetObserver to the net called `net_name` in the
  // current workspace and returns the attached observer to Python.
  // Raises (via CAFFE_ENFORCE) when there is no current workspace, the net
  // cannot be found, or attaching yields a null observer.
  m.def(
      "AddHistogramObserver",
      [](const string& net_name,
         const string& out_file_name,
         int dump_freq,
         bool mul_nets,
         string delimiter) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);
        NetBase* net = gWorkspace->GetNet(net_name);
        CAFFE_ENFORCE(net, "Can't find net ", net_name);

        const Observable<NetBase>::Observer* observer = nullptr;
        {
          // Release the GIL only around the pure C++ work. It must be held
          // again before pybind11::cast below creates a Python object.
          pybind11::gil_scoped_release g;
          // 2048 is the fixed third constructor argument (presumably the
          // histogram bin count -- confirm); "" is passed as the op filter.
          observer = net->AttachObserver(make_unique<HistogramNetObserver>(
              net, out_file_name, 2048, dump_freq, mul_nets, "", delimiter));
        }

        CAFFE_ENFORCE(observer != nullptr);
        return pybind11::cast(observer);
      },
      pybind11::arg("net_name"),
      pybind11::arg("out_file_name"),
      pybind11::arg("dump_freq") = -1,
      pybind11::arg("mul_nets") = false,
      pybind11::arg("delimiter") = " ");

  // Python: DumpOutputColumnMaxHistogramFile(ob)
  // Downcasts the given net observer to OutputColumnMaxHistogramNetObserver
  // and asks it to dump its histogram file.
  // Raises (via CAFFE_ENFORCE) when `ob` is null or has a different observer
  // type, instead of dereferencing a null pointer.
  m.def(
      "DumpOutputColumnMaxHistogramFile",
      [](Observable<NetBase>::Observer* ob) {
        OutputColumnMaxHistogramNetObserver* hist_ob =
            dynamic_cast_if_rtti<OutputColumnMaxHistogramNetObserver*>(ob);
        // The downcast yields nullptr for a null input or (with RTTI) for an
        // observer of a different dynamic type.
        CAFFE_ENFORCE(
            hist_ob != nullptr,
            "DumpOutputColumnMaxHistogramFile expects an "
            "OutputColumnMaxHistogramNetObserver");
        hist_ob->DumpOutputColumnMaxHistogramFile();
      },
      pybind11::arg("ob"));

  // Python: AddOutputColumnMaxHistogramObserver(
  //     net_name, out_file_name, observe_column_max_for_blobs,
  //     dump_freq=-1, bin_nums=16, mul_nets=False, delimiter=" ")
  // Attaches a new OutputColumnMaxHistogramNetObserver to the net called
  // `net_name` in the current workspace and returns the attached observer.
  // Raises (via CAFFE_ENFORCE) when there is no current workspace, the net
  // cannot be found, or attaching yields a null observer.
  m.def(
      "AddOutputColumnMaxHistogramObserver",
      [](const string& net_name,
         const string& out_file_name,
         const std::vector<std::string>& observe_column_max_for_blobs,
         int dump_freq,
         int bin_nums,
         bool mul_nets,
         string delimiter) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);
        NetBase* net = gWorkspace->GetNet(net_name);
        CAFFE_ENFORCE(net, "Can't find net ", net_name);

        const Observable<NetBase>::Observer* observer = nullptr;
        {
          // Release the GIL only around the pure C++ work. It must be held
          // again before pybind11::cast below creates a Python object.
          pybind11::gil_scoped_release g;
          // Note the constructor takes bin_nums before dump_freq, the
          // reverse of this binding's argument order.
          observer = net->AttachObserver(
              make_unique<OutputColumnMaxHistogramNetObserver>(
                  net,
                  out_file_name,
                  observe_column_max_for_blobs,
                  bin_nums,
                  dump_freq,
                  mul_nets,
                  delimiter));
        }

        CAFFE_ENFORCE(observer != nullptr);
        return pybind11::cast(observer);
      },
      pybind11::arg("net_name"),
      pybind11::arg("out_file_name"),
      pybind11::arg("observe_column_max_for_blobs"),
      pybind11::arg("dump_freq") = -1,
      pybind11::arg("bin_nums") = 16,
      pybind11::arg("mul_nets") = false,
      pybind11::arg("delimiter") = " ");

  // Python: ChooseQuantizationParams(blob_name) -> (scale, zero_point)
  // Looks up `blob_name` in the current workspace and, when it holds a CPU
  // float tensor, asks the default QuantizationFactory to choose quantization
  // parameters for its data (treated as a weight). In every other case a
  // warning is logged and the fallback (1.0, 0) is returned.
  m.def(
      "ChooseQuantizationParams",
      [](const std::string& blob_name) {
        using ScaleAndZeroPoint = std::tuple<float, int>;

        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);
        // Only C++ values are produced below, so the GIL can stay released
        // until the lambda returns.
        pybind11::gil_scoped_release g;

        const auto* blob = gWorkspace->GetBlob(blob_name);
        if (blob == nullptr) {
          LOG(WARNING) << "Can't find blob " << blob_name;
          return ScaleAndZeroPoint(1.0, 0);
        }
        if (!BlobIsTensorType(*blob, CPU)) {
          LOG(WARNING) << "Blob " << blob_name << " is not a tensor";
          return ScaleAndZeroPoint(1.0, 0);
        }

        const auto& tensor = blob->Get<Tensor>();
        if (!tensor.IsType<float>()) {
          LOG(WARNING) << "Blob " << blob_name << " is not a float tensor";
          return ScaleAndZeroPoint(1.0, 0);
        }

        dnnlowp::QuantizationFactory* qfactory =
            dnnlowp::QuantizationFactory::GetDefaultInstance();
        const dnnlowp::TensorQuantizationParams chosen =
            qfactory->ChooseQuantizationParams(
                tensor.data<float>(), tensor.size(), true /*weight*/);
        return ScaleAndZeroPoint(chosen.scale, chosen.zero_point);
      },
      pybind11::arg("blob_name"));

  // Python: RegisterQuantizationParams(min_max_file_name, is_weight=False,
  //                                    qparams_output_file_name="")
  // Registers a global net-observer creator that builds a
  // RegisterQuantizationParamsNetObserver for each net it is invoked with.
  m.def(
      "RegisterQuantizationParams",
      [](const string& min_max_file_name,
         bool is_weight,
         const string& qparams_output_file_name) {
        // Captured by copy so the creator does not hold references to the
        // arguments of this call.
        auto make_observer = [=](NetBase* net) {
          return make_unique<RegisterQuantizationParamsNetObserver>(
              net, min_max_file_name, is_weight, qparams_output_file_name);
        };
        AddGlobalNetObserverCreator(make_observer);
      },
      pybind11::arg("min_max_file_name"),
      pybind11::arg("is_weight") = false,
      pybind11::arg("qparams_output_file_name") = "");

  // Python: RegisterQuantizationParamsWithHistogram(
  //     histogram_file_name, is_weight=False, qparams_output_file_name="")
  // Registers a global net-observer creator that builds a
  // RegisterQuantizationParamsWithHistogramNetObserver for each net it is
  // invoked with.
  m.def(
      "RegisterQuantizationParamsWithHistogram",
      [](const string& histogram_file_name,
         bool is_weight,
         const string& qparams_output_file_name) {
        // Captured by copy so the creator does not hold references to the
        // arguments of this call.
        auto make_observer = [=](NetBase* net) {
          return make_unique<
              RegisterQuantizationParamsWithHistogramNetObserver>(
              net, histogram_file_name, is_weight, qparams_output_file_name);
        };
        AddGlobalNetObserverCreator(make_observer);
      },
      pybind11::arg("histogram_file_name"),
      pybind11::arg("is_weight") = false,
      pybind11::arg("qparams_output_file_name") = "");

  // Python: AddRegisterQuantizationParamsWithHistogramObserver(
  //     net_name, histogram_file_name, is_weight=False,
  //     qparams_output_file_name="")
  // Attaches a new RegisterQuantizationParamsWithHistogramNetObserver to the
  // net called `net_name` in the current workspace and returns the attached
  // observer to Python.
  // Raises (via CAFFE_ENFORCE) when there is no current workspace, the net
  // cannot be found, or attaching yields a null observer.
  m.def(
      "AddRegisterQuantizationParamsWithHistogramObserver",
      [](const string& net_name,
         const string& histogram_file_name,
         // NOTE(review): declared int although the default is `false` and the
         // sibling bindings use bool; kept as int to leave the Python-facing
         // signature untouched.
         int is_weight,
         const string& qparams_output_file_name) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);
        NetBase* net = gWorkspace->GetNet(net_name);
        CAFFE_ENFORCE(net, "Can't find net ", net_name);

        const Observable<NetBase>::Observer* observer = nullptr;
        {
          // Release the GIL only around the pure C++ work. It must be held
          // again before pybind11::cast below creates a Python object.
          pybind11::gil_scoped_release g;
          observer = net->AttachObserver(
              make_unique<RegisterQuantizationParamsWithHistogramNetObserver>(
                  net,
                  histogram_file_name,
                  is_weight,
                  qparams_output_file_name));
        }

        CAFFE_ENFORCE(observer != nullptr);
        return pybind11::cast(observer);
      },
      pybind11::arg("net_name"),
      pybind11::arg("histogram_file_name"),
      pybind11::arg("is_weight") = false,
      pybind11::arg("qparams_output_file_name") = "");

  // Python: AddScaleZeroOffsetArgumentsWithHistogram(net_def_bytes,
  //                                                  histogram_file_name)
  // Parses a serialized NetDef, runs
  // dnnlowp::AddScaleZeroOffsetArgumentsWithHistogram on it with the given
  // histogram file, and returns the transformed net serialized as bytes.
  // Raises (via CAFFE_ENFORCE) when parsing or serialization fails.
  m.def(
      "AddScaleZeroOffsetArgumentsWithHistogram",
      [](const pybind11::bytes& net_def_bytes,
         const string& histogram_file_name) {
        NetDef def;
        CAFFE_ENFORCE(
            ParseProtoFromLargeString(net_def_bytes.cast<string>(), &def));

        string protob;
        {
          // Release the GIL only for the C++ transformation. Building the
          // pybind11::bytes result below touches the Python API, so the GIL
          // has to be held again by then.
          pybind11::gil_scoped_release g;
          auto transformed_net =
              dnnlowp::AddScaleZeroOffsetArgumentsWithHistogram(
                  def, histogram_file_name);
          CAFFE_ENFORCE(transformed_net.SerializeToString(&protob));
        }
        return pybind11::bytes(protob);
      });

  // Exposes dnnlowp::TensorQuantizationParams to Python under the name
  // "QueryTensorQparam". It is constructible from (float, int32, int) and
  // offers read-only properties: scale, zero_point, precision (data members)
  // plus min and max (results of Min() / Max()).
  pybind11::class_<dnnlowp::TensorQuantizationParams>(m, "QueryTensorQparam")
      .def(pybind11::init<float, std::int32_t, int>())
      .def_property_readonly(
          "scale",
          [](dnnlowp::TensorQuantizationParams& self) { return self.scale; })
      .def_property_readonly(
          "zero_point",
          [](dnnlowp::TensorQuantizationParams& self) {
            return self.zero_point;
          })
      .def_property_readonly(
          "precision",
          [](dnnlowp::TensorQuantizationParams& self) {
            return self.precision;
          })
      .def_property_readonly(
          "min",
          [](dnnlowp::TensorQuantizationParams& self) { return self.Min(); })
      .def_property_readonly(
          "max",
          [](dnnlowp::TensorQuantizationParams& self) { return self.Max(); });
  // Python: get_fakefp16_mapping(use_fp16_acc, use_nnpi)
  // Returns the result of caffe2::opt::getFakeFp16OpMapping for the two
  // flags, converted to Python by pybind11.
  m.def(
      "get_fakefp16_mapping",
      [](bool use_fp16_acc, bool use_nnpi) {
        auto op_mapping =
            caffe2::opt::getFakeFp16OpMapping(use_fp16_acc, use_nnpi);
        return op_mapping;
      });
  // Python: freeze_quantization_params(net_def_bytes) -> bytes
  // Parses a serialized NetDef, applies freezeQuantizationParams to it using
  // the current workspace, and returns the modified net serialized as bytes.
  // Raises (via CAFFE_ENFORCE) when parsing fails, there is no current
  // workspace, or serialization fails.
  m.def(
      "freeze_quantization_params",
      [](const pybind11::bytes& net_def_bytes) {
        NetDef def;
        CAFFE_ENFORCE(
            ParseProtoFromLargeString(net_def_bytes.cast<string>(), &def));

        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);

        // The net is modified in place, then re-serialized for Python.
        freezeQuantizationParams(&def, gWorkspace);

        string protob;
        CAFFE_ENFORCE(def.SerializeToString(&protob));
        return pybind11::bytes(protob);
      });
  // Python: ChooseStaticQuantizationParams(
  //     min, max, bins, preserve_sparsity=True, precision=8,
  //     quant_scheme="min_max", p99_threshold=0.99, is_weight=False)
  // Builds a dnnlowp::Histogram from (min, max, bins) and asks the default
  // QuantizationFactory to choose quantization parameters for it.
  // `quant_scheme` selects the quantization kind by exact name; any other
  // value (including the default "min_max") falls back to
  // MIN_MAX_QUANTIZATION and logs an INFO message.
  // Side effect: sets the weight or activation P99 threshold on the shared
  // default factory instance, depending on `is_weight`.
  m.def(
      "ChooseStaticQuantizationParams",
      [](float min,
         float max,
         const std::vector<uint64_t>& bins,
         bool preserve_sparsity,
         int precision,
         const std::string& quant_scheme,
         float p99_threshold,
         bool is_weight) {
        using QFactory = dnnlowp::QuantizationFactory;

        dnnlowp::Histogram hist(min, max, bins);

        // Scheme names recognized by this binding and the kinds they select.
        struct SchemeEntry {
          const char* name;
          QFactory::QuantizationKind kind;
        };
        const SchemeEntry known_schemes[] = {
            {"L2_MIN_QUANTIZATION", QFactory::L2_MIN_QUANTIZATION},
            {"L2_MIN_QUANTIZATION_APPROX",
             QFactory::L2_MIN_QUANTIZATION_APPROX},
            {"KL_MIN_QUANTIZATION", QFactory::KL_MIN_QUANTIZATION},
            {"P99_QUANTIZATION", QFactory::P99_QUANTIZATION},
            {"L1_MIN_QUANTIZATION", QFactory::L1_MIN_QUANTIZATION},
        };

        QFactory::QuantizationKind quant_kind = QFactory::MIN_MAX_QUANTIZATION;
        bool scheme_recognized = false;
        for (const SchemeEntry& entry : known_schemes) {
          if (quant_scheme == entry.name) {
            quant_kind = entry.kind;
            scheme_recognized = true;
            break;
          }
        }
        if (!scheme_recognized) {
          LOG(INFO) << "Using DNNLOWP default MIN_MAX_QUANTIZATION";
        }

        QFactory* qfactory = QFactory::GetDefaultInstance();
        if (is_weight) {
          qfactory->SetWeightP99Threshold(p99_threshold);
        } else {
          qfactory->SetActivationP99Threshold(p99_threshold);
        }
        return qfactory->ChooseQuantizationParams(
            hist, quant_kind, precision, preserve_sparsity, is_weight);
      },
      pybind11::arg("min"),
      pybind11::arg("max"),
      pybind11::arg("bins"),
      pybind11::arg("preserve_sparsity") = true,
      pybind11::arg("precision") = 8,
      pybind11::arg("quant_scheme") = "min_max",
      pybind11::arg("p99_threshold") = 0.99,
      pybind11::arg("is_weight") = false);
  // Python: ObserveFp16FCPackedWeights(blob_name, weights_out_file)
  // Reads the fbgemm::PackedGemmMatrixFP16 stored in blob `blob_name` of the
  // current workspace and writes its values, converted to float, to
  // `weights_out_file`: one text line per row, values separated by a space.
  // Raises (via CAFFE_ENFORCE) when there is no current workspace, the blob
  // is missing, or the blob holds a null matrix pointer. A failure to open
  // the output file is only logged as a warning.
  m.def(
      "ObserveFp16FCPackedWeights",
      [](const string& blob_name, const string& weights_out_file) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);
        const auto* blob = gWorkspace->GetBlob(blob_name);
        CAFFE_ENFORCE(blob, "Can't find blob ", blob_name);
        fbgemm::PackedGemmMatrixFP16* packedGemmMatrixPtr =
            blob->template Get<unique_ptr<fbgemm::PackedGemmMatrixFP16>>()
                .get();
        // The blob may hold an empty unique_ptr; fail with an error instead
        // of dereferencing it.
        CAFFE_ENFORCE(
            packedGemmMatrixPtr != nullptr,
            "Blob ",
            blob_name,
            " holds a null packed fp16 matrix");

        const uint64_t nrow = packedGemmMatrixPtr->numRows();
        const uint64_t ncol = packedGemmMatrixPtr->numCols();
        const uint64_t size = nrow * ncol;
        fbgemm::float16* unpacked_mat_ptr = nullptr;
        // Backing storage, only used when the matrix has to be unpacked.
        vector<fbgemm::float16> unpacked_mat;

        if (!packedGemmMatrixPtr->packed()) {
          unpacked_mat_ptr = packedGemmMatrixPtr->pmat();
        } else {
          unpacked_mat.resize(size);
          packedGemmMatrixPtr->unpack(
              unpacked_mat.data(), fbgemm::matrix_op_t::Transpose);
          unpacked_mat_ptr = unpacked_mat.data();
        }

        ofstream fout;
        fout.open(weights_out_file);
        if (!fout) {
          LOG(WARNING) << "Can't open output file to dump fp16 weights "
                       << weights_out_file;
          return;
        }
        // Indices use the same unsigned 64-bit type as the bounds to avoid
        // signed/unsigned comparison and int overflow on large matrices.
        for (uint64_t i = 0; i < nrow; ++i) {
          for (uint64_t j = 0; j < ncol; ++j) {
            if (j > 0) {
              fout << " ";
            }
            // Element (i, j) is read at offset i + nrow * j.
            fout << fbgemm::cpu_half2float(unpacked_mat_ptr[i + nrow * j]);
          }
          fout << endl;
        }
        LOG(INFO) << "Written unpacked blob " << blob_name << " to "
                  << weights_out_file;
      },
      pybind11::arg("blob_name"),
      pybind11::arg("weights_out_file"));
  // Python: ObserveInt8FCPackedWeights(blob_name, weights_out_file)
  // Reads the Int8FCDNNLowPPackedWeightBlob stored in blob `blob_name` of the
  // current workspace and writes it as text to `weights_out_file`:
  //   - first line: "scale zero_point" for every entry of qparams, separated
  //     by spaces;
  //   - then one line per row of the unpacked int8 weights, values separated
  //     by spaces.
  // A missing blob or an output file that cannot be opened is only logged as
  // a warning. Raises (via CAFFE_ENFORCE) when there is no current workspace,
  // the packed matrix handle is null, or the original tensor is not 2-D.
  m.def(
      "ObserveInt8FCPackedWeights",
      [](const string& blob_name, const string& weights_out_file) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);
        const auto* blob = gWorkspace->GetBlob(blob_name);
        if (blob == nullptr) {
          LOG(WARNING) << "Can't find blob " << blob_name;
          return;
        }
        const Int8FCDNNLowPPackedWeightBlob& packedInt8Blob =
            blob->template Get<Int8FCDNNLowPPackedWeightBlob>();
        const auto& qparams = packedInt8Blob.qparams;
        const auto& unpacked_tensor = packedInt8Blob.original_tensor;
        const auto& packed_tensor = packedInt8Blob.W;
        // Fail with an error instead of dereferencing a null matrix handle.
        CAFFE_ENFORCE(
            packed_tensor != nullptr,
            "Blob ",
            blob_name,
            " has no packed int8 weight matrix");

        auto shape = unpacked_tensor.sizes();
        CAFFE_ENFORCE(shape.size() == 2);
        const int64_t rows = shape[0];
        const int64_t cols = shape[1];
        // The destination buffer is sized from the original tensor's shape.
        vector<int8_t> unpacked_int8_data;
        unpacked_int8_data.resize(rows * cols);
        packed_tensor->unpack(unpacked_int8_data.data());

        ofstream fout;
        fout.open(weights_out_file);
        if (!fout) {
          LOG(WARNING) << "Can't open output file to dump int8 weights "
                       << weights_out_file;
          return;
        }
        for (size_t i = 0; i < qparams.size(); ++i) {
          if (i > 0) {
            fout << " ";
          }
          fout << to_string(qparams[i].scale) << " "
               << to_string(qparams[i].zero_point);
        }
        fout << endl;
        // 64-bit indices match the shape's element type and avoid int
        // overflow in the row-major offset computation.
        for (int64_t i = 0; i < rows; ++i) {
          for (int64_t j = 0; j < cols; ++j) {
            if (j > 0) {
              fout << " ";
            }
            // to_string prints the int8 value as a number, not a character.
            fout << to_string(unpacked_int8_data[i * cols + j]);
          }
          fout << endl;
        }
        LOG(INFO) << "Written int8 qparams and weights for " << blob_name
                  << " to " << weights_out_file;
      },
      pybind11::arg("blob_name"),
      pybind11::arg("weights_out_file"));
  // Python: CreateInt8QuantSchemeBlob(quant_scheme_blob_name,
  //                                   quantization_kind, preserve_sparsity)
  // Stores a freshly constructed Int8QuantSchemeBlob in the named blob of the
  // current workspace, creating the blob first when GetBlob does not find it.
  m.def(
      "CreateInt8QuantSchemeBlob",
      [](std::string quant_scheme_blob_name,
         std::string quantization_kind,
         bool preserve_sparsity) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);

        auto* target_blob = gWorkspace->GetBlob(quant_scheme_blob_name);
        if (!target_blob) {
          target_blob = gWorkspace->CreateBlob(quant_scheme_blob_name);
        }

        // The blob stores a unique_ptr; assigning a new one replaces
        // whatever scheme object it held before.
        auto* scheme_slot =
            target_blob->GetMutable<unique_ptr<Int8QuantSchemeBlob>>();
        *scheme_slot = make_unique<Int8QuantSchemeBlob>(
            quantization_kind, preserve_sparsity);
      },
      pybind11::arg("quant_scheme_blob_name"),
      pybind11::arg("quantization_kind"),
      pybind11::arg("preserve_sparsity"));
  // Python: CreateInt8QuantParamsBlob(quant_param_blob_name, scale,
  //                                   zero_point)
  // Stores a freshly constructed Int8QuantParamsBlob(scale, zero_point) in
  // the named blob of the current workspace, creating the blob first when
  // GetBlob does not find it.
  // NOTE(review): the Python keyword is "quant_param_blob_name" (singular
  // "param"), unlike ObserveInt8QuantParamsBlob's "quant_params_blob_name".
  // It is left unchanged because renaming would break keyword callers.
  m.def(
      "CreateInt8QuantParamsBlob",
      [](std::string quant_params_blob_name, float scale, int zero_point) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);

        auto* target_blob = gWorkspace->GetBlob(quant_params_blob_name);
        if (!target_blob) {
          target_blob = gWorkspace->CreateBlob(quant_params_blob_name);
        }

        // The blob stores a unique_ptr; assigning a new one replaces
        // whatever params object it held before.
        auto* params_slot =
            target_blob->GetMutable<unique_ptr<Int8QuantParamsBlob>>();
        *params_slot = make_unique<Int8QuantParamsBlob>(scale, zero_point);
      },
      pybind11::arg("quant_param_blob_name"),
      pybind11::arg("scale"),
      pybind11::arg("zero_point"));
  // Python: ObserveInt8QuantParamsBlob(quant_params_blob_name)
  //         -> (scale, zero_point)
  // Reads the Int8QuantParamsBlob stored in the named blob of the current
  // workspace and returns its qparam.scale and qparam.zero_point.
  // Raises (via CAFFE_ENFORCE) when there is no current workspace, the blob
  // is missing, or the blob holds a null pointer.
  m.def(
      "ObserveInt8QuantParamsBlob",
      [](std::string quant_params_blob_name) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);
        auto* quant_params_blob = gWorkspace->GetBlob(quant_params_blob_name);
        CAFFE_ENFORCE(
            quant_params_blob, "Can't find blob ", quant_params_blob_name);
        auto* quant_params_blob_data =
            quant_params_blob->Get<unique_ptr<Int8QuantParamsBlob>>().get();
        // The stored unique_ptr may be empty; report that instead of
        // dereferencing a null pointer.
        CAFFE_ENFORCE(
            quant_params_blob_data != nullptr,
            "Blob ",
            quant_params_blob_name,
            " holds no Int8QuantParamsBlob");
        return std::tuple<float, int>(
            quant_params_blob_data->qparam.scale,
            quant_params_blob_data->qparam.zero_point);
      },
      pybind11::arg("quant_params_blob_name"));
  // Python: ObserveInt8QuantSchemeBlob(quant_scheme_blob_name)
  //         -> (quantization_kind, preserve_sparsity)
  // Reads the Int8QuantSchemeBlob stored in the named blob of the current
  // workspace and returns its quantization_kind_ and preserve_sparsity_.
  // Raises (via CAFFE_ENFORCE) when there is no current workspace, the blob
  // is missing, or the blob holds a null pointer.
  m.def(
      "ObserveInt8QuantSchemeBlob",
      [](std::string quant_scheme_blob_name) {
        Workspace* gWorkspace = caffe2::python::GetCurrentWorkspace();
        CAFFE_ENFORCE(gWorkspace);
        auto* quant_scheme_blob = gWorkspace->GetBlob(quant_scheme_blob_name);
        CAFFE_ENFORCE(
            quant_scheme_blob, "Can't find blob ", quant_scheme_blob_name);
        auto* quant_scheme_blob_data =
            quant_scheme_blob->Get<unique_ptr<Int8QuantSchemeBlob>>().get();
        // The stored unique_ptr may be empty; report that instead of
        // dereferencing a null pointer.
        CAFFE_ENFORCE(
            quant_scheme_blob_data != nullptr,
            "Blob ",
            quant_scheme_blob_name,
            " holds no Int8QuantSchemeBlob");
        return std::tuple<std::string, bool>(
            quant_scheme_blob_data->quantization_kind_,
            quant_scheme_blob_data->preserve_sparsity_);
      },
      pybind11::arg("quant_scheme_blob_name"));
}