/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_TEST_UTIL_H_
#define TENSORFLOW_LITE_KERNELS_TEST_UTIL_H_

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <cmath>
#include <complex>
#include <cstdlib>
#include <functional>
#include <initializer_list>
#include <limits>
#include <map>
#include <memory>
#include <optional>
#include <ostream>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/algorithm/container.h"
#include "absl/types/span.h"
#include "Eigen/Core"  // from @eigen_archive
#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
#include "tensorflow/lite/core/api/op_resolver.h"
#include "tensorflow/lite/core/c/common.h"
#include "tensorflow/lite/core/interpreter.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/utils/sparsity_format_converter.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/portable_type_to_tflitetype.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/string_type.h"
#include "tensorflow/lite/string_util.h"
#include "tensorflow/lite/testing/util.h"  // IWYU pragma: keep
#include "tensorflow/lite/tools/optimize/quantization_utils.h"
#include "tensorflow/lite/type_to_tflitetype.h"
#include "tensorflow/lite/util.h"
#include "tsl/platform/logging.h"

namespace tflite {

// This constant indicates that the error bound is derived automatically in
// functions like ArrayFloatNear.
constexpr float kFpErrorAuto = -1;

// Returns whether we allow FP16 precision for FP32 operations, i.e. whether we
// are in FP16 mode.
bool AllowFp16PrecisionForFp32();

// Checks if the actual number almost equals the expected number, with a
// tolerance of 4 FP16 ULPs in FP16 mode and 4 FP32 ULPs in FP32 mode.
// Given a float x with 2^e <= |x| <= 2^(e+1), ULP(x) = 2^(max(e, e_min)-p+1),
// where e_min is -24 for FP16 and -126 for FP32, and p is 10 for FP16 and 23
// for FP32.
::testing::Matcher<std::tuple<float, float>> FloatingPointAlmostEq();

// In FP32 mode, this is equivalent to Eq(), i.e. the error bound is zero (no
// error allowed); in FP16 mode, it checks if the actual number almost equals
// the expected number, with a tolerance of 4 FP16 ULPs.
::testing::Matcher<std::tuple<float, float>> FloatingPointEq();

// A gmock matcher that checks that elements of a float vector match to a given
// tolerance. In FP32 mode, the tolerance is max(max_abs_err, value *
// max_rel_err). In FP16 mode, the tolerance is max(fp16_max_abs_err, value *
// fp16_max_rel_err). If fp16_max_abs_err is kFpErrorAuto, it is set to
// std::max(max_abs_err, sqrt(max_abs_err)) automatically.
std::vector<::testing::Matcher<float>> ArrayFloatNear(
    const std::vector<float>& values, float max_abs_err = 1e-5,
    float fp16_max_abs_err = kFpErrorAuto, float max_rel_err = 0,
    float fp16_max_rel_err = 0.01);
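
// Usage sketch (illustrative): ArrayFloatNear is typically combined with
// ElementsAreArray to compare a whole output vector against expected values
// with a tolerance, e.g.
//
//   std::vector<float> output = m.ExtractVector<float>(out);
//   EXPECT_THAT(output,
//               ::testing::ElementsAreArray(ArrayFloatNear(
//                   {0.1f, 0.2f, 0.3f}, /*max_abs_err=*/1e-4)));
//
// Here `m` and `out` stand for a SingleOpModel (defined below) and one of its
// output tensor indices.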

// TODO(b/280061335): Add FP16 logic as ArrayFloatNear does.
// A gmock matcher that checks that elements of a complex vector match to a
// given tolerance.
std::vector<::testing::Matcher<std::complex<float>>> ArrayComplex64Near(
    const std::vector<std::complex<float>>& values, float max_abs_error = 1e-5);

template <typename T>
inline std::vector<T> Quantize(const std::vector<float>& data, float scale,
                               int32_t zero_point,
                               TfLiteType type = kTfLiteNoType) {
  std::vector<T> q;

  T min = std::numeric_limits<T>::min();
  T max = std::numeric_limits<T>::max();

  if (type == kTfLiteInt4) {
    min = -7;
    max = 7;
  }

  q.reserve(data.size());
  for (const auto& f : data) {
    q.push_back(static_cast<T>(std::max<float>(
        min, std::min<float>(max, std::round(zero_point + (f / scale))))));
  }
  return q;
}

template <typename T>
inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
                                     int32_t zero_point) {
  std::vector<float> f;
  f.reserve(data.size());
  for (const T& q : data) {
    f.push_back(scale * (q - zero_point));
  }
  return f;
}
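
// Worked example (illustrative): with scale = 0.5 and zero_point = -1,
// Quantize<int8_t>({-2.0f, 0.0f, 1.5f}, 0.5f, -1) yields {-5, -1, 2}, since
// each value maps to round(zero_point + value / scale) clamped to the int8
// range. Dequantize<int8_t>({-5, -1, 2}, 0.5f, -1) maps each q back to
// scale * (q - zero_point), recovering {-2.0f, 0.0f, 1.5f} exactly because
// those inputs lie on the quantization grid.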

template <>
constexpr TfLiteType typeToTfLiteType<Eigen::half>() {
  return kTfLiteFloat16;
}

template <>
constexpr TfLiteType typeToTfLiteType<Eigen::bfloat16>() {
  return kTfLiteBFloat16;
}

// A test model that contains a single operator. All operator inputs and
// output are external to the model, so the tests can directly access them.
// Typical usage:
//   SingleOpModel m;
//   int a = m.AddInput({TensorType_FLOAT32, a_shape});
//   int b = m.AddInput({TensorType_FLOAT32, b_shape});
//   int c = m.AddOutput({TensorType_FLOAT32, {}});
//   m.SetBuiltinOp(...);
//   m.BuildInterpreter({GetShape(a), GetShape(b)});
//   m.PopulateTensor(a, {...});
//   m.PopulateTensor(b, {...});
//   m.Invoke();
//   EXPECT_THAT(m.ExtractVector<float>(c), ArrayFloatNear({...}));
//

// A helper struct to construct test tensors. This is particularly useful for
// quantized tensors, which must have their scale and zero_point defined before
// the actual data is known. This mimics what happens in practice: quantization
// parameters are calculated during training or post-training.
struct TensorData {
  // NOLINTNEXTLINE
  TensorData(TensorType type = TensorType_FLOAT32, std::vector<int> shape = {},
             float min = 0.0f, float max = 0.0f, float scale = 0.0f,
             int32_t zero_point = 0, bool per_channel_quantization = false,
             std::vector<float> per_channel_quantization_scales = {},
             std::vector<int64_t> per_channel_quantization_offsets = {},
             int32_t channel_index = 0, std::vector<int> traversal_order = {},
             std::vector<TfLiteDimensionType> format = {},
             std::vector<int> block_size = {}, std::vector<int> block_map = {},
             std::vector<int> shape_signature = {})
      : type(type),
        shape(shape),
        min(min),
        max(max),
        scale(scale),
        zero_point(zero_point),
        per_channel_quantization(per_channel_quantization),
        per_channel_quantization_scales(
            std::move(per_channel_quantization_scales)),
        per_channel_quantization_offsets(
            std::move(per_channel_quantization_offsets)),
        channel_index(channel_index),
        traversal_order(traversal_order),
        format(format),
        block_size(block_size),
        block_map(block_map),
        shape_signature(shape_signature) {}
  TensorType type;
  std::vector<int> shape;
  float min;
  float max;
  float scale;
  int32_t zero_point;
  bool per_channel_quantization;
  std::vector<float> per_channel_quantization_scales;
  std::vector<int64_t> per_channel_quantization_offsets;
  int32_t channel_index;
  std::vector<int> traversal_order;
  std::vector<TfLiteDimensionType> format;
  std::vector<int> block_size;
  std::vector<int> block_map;
  std::vector<int> shape_signature;
};
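
// Construction sketch (illustrative): a float tensor only needs a type and
// shape, while an asymmetrically quantized tensor is usually described by its
// real-valued min/max (scale and zero_point are then derived when the tensor
// is added to a model), and a per-channel quantized tensor carries one scale
// and offset per channel, e.g.
//
//   TensorData plain = {TensorType_FLOAT32, {1, 2, 2, 1}};
//   TensorData asym = {TensorType_UINT8, {1, 2, 2, 1}, /*min=*/-1.0f,
//                      /*max=*/1.0f};
//   TensorData per_channel = {TensorType_INT8,
//                             {2, 2, 2, 1},
//                             /*min=*/0.0f,
//                             /*max=*/0.0f,
//                             /*scale=*/0.0f,
//                             /*zero_point=*/0,
//                             /*per_channel_quantization=*/true,
//                             /*per_channel_quantization_scales=*/{0.5f, 0.25f},
//                             /*per_channel_quantization_offsets=*/{0, 0},
//                             /*channel_index=*/0};
//
// The shapes and quantization parameters above are hypothetical test values.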

class SingleOpResolver : public OpResolver {
 public:
  SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration,
                   int version = 1)
      : op_(op), registration_(*registration) {
    registration_.builtin_code = static_cast<int32_t>(op);
    registration_.version = version;
  }
  const TfLiteRegistration* FindOp(BuiltinOperator op,
                                   int version) const override {
    if (op == op_) {
      return &registration_;
    }
    return nullptr;
  }
  const TfLiteRegistration* FindOp(const char* op, int version) const override {
    return nullptr;
  }

 private:
  const BuiltinOperator op_;
  TfLiteRegistration registration_;
};
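
// Usage sketch (illustrative): a kernel test that exercises one specific
// registration can install a SingleOpResolver before building the
// interpreter. `Register_MY_OP()` and `BuiltinOperator_MY_OP` stand for
// whatever registration getter and builtin code the test links in (e.g. a
// reference or optimized kernel variant).
//
//   SingleOpModel m;
//   ...  // add inputs/outputs, SetBuiltinOp(BuiltinOperator_MY_OP, ...)
//   m.SetResolver(std::make_unique<SingleOpResolver>(BuiltinOperator_MY_OP,
//                                                    Register_MY_OP()));
//   m.BuildInterpreter(...);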

class SingleOpModel;
class AccelerationValidator {
 public:
  using Callback = std::function<void(const SingleOpModel& model)>;

  // Returns a global AccelerationValidator instance.
  static AccelerationValidator* Get();

  // Adds a callback function that will be invoked at the end of a kernel test
  // to validate acceleration.
  void AddCallback(Callback callback);

  // Performs acceleration validation with all registered callbacks.
  void Validate(const SingleOpModel& model) const;

 private:
  std::vector<Callback> callbacks_;
};
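
// Registration sketch (illustrative): a test binary that wants to assert
// delegation can hook the validator once, e.g. from a test environment setup,
// and every kernel test built on SingleOpModel will then run the check.
//
//   AccelerationValidator::Get()->AddCallback([](const SingleOpModel& model) {
//     EXPECT_GT(model.GetNumberOfAppliedDelegates(), 0);
//   });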

class SingleOpModel {
 public:
  SingleOpModel() = default;
  ~SingleOpModel();

  // Set a delegate that is applied right after the graph is prepared. This is
  // useful for testing other runtimes like NN API or GPU.
  // Note: the caller still owns the memory of the passed-in `delegate`.
  void SetDelegate(TfLiteDelegate* delegate) {
    delegate_ = delegate;
    // As this is a manually-set TF Lite delegate, we assume the intention of
    // the test is to test against that particular delegate, hence bypassing
    // the application of TF Lite default delegates (i.e. the XNNPACK delegate).
    if (delegate_ != nullptr) {
      SetBypassDefaultDelegates();
    }
  }

  TfLiteStatus ApplyDelegate();

  // Copying or assignment is disallowed to simplify ownership semantics.
  SingleOpModel(const SingleOpModel&) = delete;
  SingleOpModel& operator=(const SingleOpModel&) = delete;

  // Add a TensorType input tensor and return its index.
  int AddInput(const TensorData& t);
  int AddVariableInput(const TensorData& t);

  int AddIntermediate(TensorType type, const std::vector<float>& scale,
                      const std::vector<int64_t>& zero_point);

  // Returns the input tensor at position `index`.
  TfLiteTensor* GetInputTensor(int index) {
    return interpreter_->input_tensor(index);
  }

  // Returns the output tensor at position `index`.
  TfLiteTensor* GetOutputTensor(int index) {
    return interpreter_->output_tensor(index);
  }
  // Templated version of AddConstInput() taking pointer and size.
  template <typename T>
  int AddConstInput(const TensorData& t, const T* data, size_t size) {
    int id = 0;
    if (t.per_channel_quantization) {
      id = AddTensorPerChannelQuant(t, data, size);
    } else {
      id = AddTensor(t, data, size);
    }
    inputs_.push_back(id);
    return id;
  }

  // Templated version of AddConstInput() taking vector and shape.
  template <typename T>
  int AddConstInput(TensorType type, const std::vector<T>& data,
                    std::initializer_list<int> shape) {
    return AddConstInput(TensorData{type, shape}, data.data(), data.size());
  }

  // Templated version of AddConstInput() taking TensorType, initializer_list
  // and shape.
  template <typename T>
  int AddConstInput(TensorType type, std::initializer_list<T> data,
                    std::initializer_list<int> shape) {
    return AddConstInput<T>(TensorData{type, shape}, data.begin(), data.size());
  }

  // Templated version of AddConstInput() taking TensorData and
  // initializer_list.
  template <typename T>
  int AddConstInput(const TensorData& t, std::initializer_list<T> data) {
    return AddConstInput(t, data.begin(), data.size());
  }

  // Templated version of AddConstInput() taking TensorData and vector.
  template <typename T>
  int AddConstInput(const TensorData& t, const std::vector<T>& data) {
    return AddConstInput(t, data.data(), data.size());
  }

  // TODO(b/166202747): Use a better way to do type specialization. Reduce
  // duplicate code in the two functions below.
  int AddConstSparseInput(const TensorData& t,
                          const std::vector<int8_t>& data) {
    int id = tensors_.size();
    const int dims_count = t.traversal_order.size();
    std::vector<int8_t> dense_data(data);

    tflite::internal::sparsity::FormatConverter<int8_t> converter(
        t.shape, t.traversal_order, t.format, t.block_size, t.block_map);
    converter.DenseToSparse(dense_data.data());

    const auto& dim_metadata = converter.GetDimMetadata();
    const auto& sparse_data = converter.GetData();

    // Build sparsity parameter.
    std::vector<flatbuffers::Offset<DimensionMetadata>> fb_dim_metadata(
        dims_count);
    for (int i = 0; i < dims_count; i++) {
      const int metadata_idx = 2 * i;
      if (i < t.shape.size() &&
          t.format[t.traversal_order[i]] == kTfLiteDimSparseCSR) {
        auto array_segments =
            CreateInt32Vector(builder_, builder_.CreateVector<int>(
                                            dim_metadata[metadata_idx]))
                .Union();
        auto array_indices =
            CreateInt32Vector(builder_, builder_.CreateVector<int>(
                                            dim_metadata[metadata_idx + 1]))
                .Union();
        fb_dim_metadata[i] = CreateDimensionMetadata(
            builder_, DimensionType_SPARSE_CSR, 0,
            SparseIndexVector_Int32Vector, array_segments,
            SparseIndexVector_Int32Vector, array_indices);
      } else {
        fb_dim_metadata[i] = CreateDimensionMetadata(
            builder_, DimensionType_DENSE, dim_metadata[metadata_idx][0]);
      }
    }

    flatbuffers::Offset<SparsityParameters> s_param = CreateSparsityParameters(
        builder_, builder_.CreateVector<int>(t.traversal_order),
        builder_.CreateVector<int>(t.block_map),
        builder_.CreateVector(fb_dim_metadata));

    int buffer_id = 0;
    if (!data.empty()) {
      // Initialize buffers list with empty buffer to allow for non-const
      // tensors.
      if (buffers_.empty()) {
        buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
      }

      // Add compressed data as a Buffer to buffers list.
      buffer_id = buffers_.size();
      auto data_buffer = builder_.CreateVector(
          reinterpret_cast<const uint8_t*>(sparse_data.data()),
          sparse_data.size());
      buffers_.push_back(CreateBuffer(builder_, data_buffer));
    }

    tensors_.push_back(CreateTensor(
        builder_, builder_.CreateVector<int>(t.shape), t.type,
        /*buffer=*/buffer_id,
        /*name=*/0, /*quantization=*/0, /*is_variable=*/false, s_param));

    inputs_.push_back(id);
    tensor_data_[id] = t;

    return id;
  }

  // Add a constant sparse tensor as input.
  template <typename T>
  int AddConstSparseInput(const TensorData& t, const std::vector<T>& data,
                          bool symmetric_quantize = false) {
    int id = tensors_.size();
    const int dims_count = t.traversal_order.size();
    std::vector<T> dense_data(data);

    tflite::internal::sparsity::FormatConverter<T> converter(
        t.shape, t.traversal_order, t.format, t.block_size, t.block_map);
    converter.DenseToSparse(dense_data.data());

    const auto dim_metadata = converter.GetDimMetadata();
    const auto sparse_data = converter.GetData();

    // Build sparsity parameter.
    std::vector<flatbuffers::Offset<DimensionMetadata>> fb_dim_metadata(
        dims_count);
    for (int i = 0; i < dims_count; i++) {
      const int metadata_idx = 2 * i;
      if (i < t.shape.size() &&
          t.format[t.traversal_order[i]] == kTfLiteDimSparseCSR) {
        auto array_segments =
            CreateInt32Vector(builder_,
                              builder_.CreateVector(dim_metadata[metadata_idx]))
                .Union();
        auto array_indices =
            CreateInt32Vector(
                builder_, builder_.CreateVector(dim_metadata[metadata_idx + 1]))
                .Union();
        fb_dim_metadata[i] = CreateDimensionMetadata(
            builder_, DimensionType_SPARSE_CSR, 0,
            SparseIndexVector_Int32Vector, array_segments,
            SparseIndexVector_Int32Vector, array_indices);
      } else {
        fb_dim_metadata[i] = CreateDimensionMetadata(
            builder_, DimensionType_DENSE, dim_metadata[metadata_idx][0]);
      }
    }

    flatbuffers::Offset<SparsityParameters> s_param = CreateSparsityParameters(
        builder_, builder_.CreateVector(t.traversal_order),
        builder_.CreateVector(t.block_map),
        builder_.CreateVector(fb_dim_metadata));

    flatbuffers::Offset<QuantizationParameters> q_params = 0;
    int buffer_id = 0;
    if (!data.empty()) {
      // Initialize buffers list with empty buffer to allow for non-const
      // tensors.
      if (buffers_.empty()) {
        buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
      }

      // Add compressed data as a Buffer to buffers list.
      buffer_id = buffers_.size();
      // When the quantization parameter is set for the added tensor, we
      // quantize the given data.
      bool is_quantized = (t.min != 0 || t.max != 0 || t.scale != 0 ||
                           (t.per_channel_quantization &&
                            !t.per_channel_quantization_scales.empty() &&
                            !t.per_channel_quantization_offsets.empty()));
      if (symmetric_quantize) {
        const int length = sparse_data.size();
        std::vector<int8_t> q(length);
        float min, max, scaling_factor;
        tensor_utils::SymmetricQuantizeFloats(
            sparse_data.data(), length, q.data(), &min, &max, &scaling_factor);
        std::vector<float> scales{scaling_factor};
        std::vector<int64_t> zero_points{0};
        q_params = CreateQuantizationParameters(
            builder_, 0, 0, builder_.CreateVector<float>(scales),
            builder_.CreateVector<int64_t>(zero_points));
        auto data_buffer = builder_.CreateVector(
            reinterpret_cast<const uint8_t*>(q.data()), q.size());
        buffers_.push_back(CreateBuffer(builder_, data_buffer));
      } else if (is_quantized) {
        CHECK_EQ(t.type, TensorType_INT8)
            << "The INT8 quantization is only supported for sparsified tensor";
        std::vector<int8_t> quantized_output(sparse_data.size());
        std::vector<float> scales;
        std::vector<int64_t> zero_points;
        if (t.per_channel_quantization) {
          CHECK_EQ(t.per_channel_quantization_scales.size(),  // NOLINT
                   t.per_channel_quantization_offsets.size())
              << "Per channel quantization scales and offsets should have the "
                 "same size";
          std::vector<int8_t> temp_data(dense_data.size());
          const int32_t num_channel = t.shape[t.channel_index];
          std::vector<float> scales_inv(num_channel);
          for (int i = 0; i < num_channel; ++i) {
            scales_inv[i] = 1.0f / t.per_channel_quantization_scales[i];
          }
          optimize::utils::SymmetricPerChannelQuantizeValues(
              dense_data.data(), scales_inv, t.shape, t.channel_index,
              &temp_data, kTfLiteInt8);

          tflite::internal::sparsity::FormatConverter<int8_t> quant_converter(
              t.shape, t.traversal_order, t.format, t.block_size, t.block_map);
          quant_converter.DenseToSparse(temp_data.data());
          quantized_output = std::move(quant_converter.GetData());
          scales = t.per_channel_quantization_scales;
          zero_points = t.per_channel_quantization_offsets;

        } else {
          quantized_output =
              std::move(Quantize<int8_t>(sparse_data, t.scale, t.zero_point));
          scales = {t.scale};
          zero_points = {0};
        }
        q_params = CreateQuantizationParameters(
            builder_, t.min, t.max, builder_.CreateVector<float>(scales),
            builder_.CreateVector<int64_t>(zero_points));
        auto data_buffer = builder_.CreateVector(
            reinterpret_cast<const uint8_t*>(quantized_output.data()),
            quantized_output.size());
        buffers_.push_back(CreateBuffer(builder_, data_buffer));
      } else {
        auto data_buffer = builder_.CreateVector(
            reinterpret_cast<const uint8_t*>(sparse_data.data()),
            sizeof(T) * sparse_data.size());
        buffers_.push_back(CreateBuffer(builder_, data_buffer));
      }
    }

    tensors_.push_back(
        CreateTensor(builder_, builder_.CreateVector<int>(t.shape),
                     symmetric_quantize ? TensorType_INT8 : t.type,
                     /*buffer=*/buffer_id,
                     /*name=*/0, q_params, /*is_variable=*/false, s_param));

    inputs_.push_back(id);
    tensor_data_[id] = t;

    return id;
  }

  // Add a null input tensor (optional input) and return kTfLiteOptionalTensor.
  int AddNullInput();

  // Add a TensorType output tensor and return its index.
  int AddOutput(const TensorData& t);

  template <typename T>
  void QuantizeAndPopulate(int index, const std::vector<float>& data) {
    TfLiteTensor* t = interpreter_->tensor(index);
    auto q = Quantize<T>(data, t->params.scale, t->params.zero_point, t->type);
    PopulateTensor(index, 0, q.data(), q.data() + q.size());
  }
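
  // Usage sketch (illustrative), assuming `m` is an instance of this class
  // (usually a small op-specific subclass): for a tensor added with real-valued
  // min/max (so its scale/zero_point were derived automatically), inputs are
  // populated from float data and outputs are read back and dequantized, e.g.
  //
  //   int in = m.AddInput({TensorType_UINT8, {1, 4}, -1.0f, 1.0f});
  //   int out = m.AddOutput({TensorType_UINT8, {1, 4}, -1.0f, 1.0f});
  //   ...  // SetBuiltinOp, BuildInterpreter
  //   m.QuantizeAndPopulate<uint8_t>(in, {-0.5f, 0.0f, 0.25f, 0.5f});
  //   ASSERT_EQ(m.Invoke(), kTfLiteOk);
  //   auto result = Dequantize<uint8_t>(m.ExtractVector<uint8_t>(out),
  //                                     m.GetScale(out), m.GetZeroPoint(out));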

  void QuantizeAndPopulate4bit(int index, const std::vector<float>& data) {
    TfLiteTensor* t = interpreter_->tensor(index);
    t->type = kTfLiteInt4;
    std::vector<int8_t> quantized_output =
        Quantize<int8_t>(data, t->params.scale, t->params.zero_point, t->type);
    PopulateTensor4bit(index, /*offset=*/0, quantized_output.data(),
                       quantized_output.data() + quantized_output.size());
  }

  void SymmetricQuantizeAndPopulate(int index, const std::vector<float>& data) {
    std::vector<int8_t> q = QuantizeTensor(index, data);
    PopulateTensor(index, /*offset=*/0, reinterpret_cast<uint8_t*>(q.data()),
                   reinterpret_cast<uint8_t*>(q.data() + q.size()));
  }

  void SignedSymmetricQuantizeAndPopulate(int index,
                                          const std::vector<float>& data) {
    TfLiteTensor* t = interpreter_->tensor(index);
    if (t->type == kTfLiteInt4) {
      std::vector<int8_t> q = Quantize<int8_t>(data, t->params.scale,
                                               t->params.zero_point, t->type);
      PopulateTensor4bit(index, /*offset=*/0, q.data(), q.data() + q.size());
    } else {
      std::vector<int8_t> q = QuantizeTensor(index, data);
      PopulateTensor(index, /*offset=*/0, q.data(), q.data() + q.size());
    }
  }

  // Quantize and populate data for filter with per channel quantization.
  void PerChannelSymmetricQuantizeAndPopulate(
      int index, const std::vector<float>& input_data) {
    TfLiteTensor* t = interpreter_->tensor(index);
    auto* params =
        reinterpret_cast<TfLiteAffineQuantization*>(t->quantization.params);
    const int channel_index = params->quantized_dimension;

    std::vector<int32_t> shape(t->dims->size);
    for (size_t i = 0; i < shape.size(); ++i) {
      shape[i] = t->dims->data[i];
    }
    const int32_t num_inputs = input_data.size();
    const int32_t num_channel = shape[channel_index];
    std::vector<int8_t> quantized_output(num_inputs);
    std::vector<float> scales_inv(num_channel);
    for (int i = 0; i < num_channel; ++i) {
      const float scale = params->scale->size == 1 ? params->scale->data[0]
                                                   : params->scale->data[i];
      scales_inv[i] = 1.0f / scale;
    }

    optimize::utils::SymmetricPerChannelQuantizeValues(
        input_data.data(), scales_inv, shape, channel_index, &quantized_output,
        t->type);

    if (t->type == kTfLiteInt4) {
      PopulateTensor4bit(index, /*offset=*/0, quantized_output.data(),
                         quantized_output.data() + quantized_output.size());

    } else {
      PopulateTensor(index, /*offset=*/0, quantized_output.data(),
                     quantized_output.data() + quantized_output.size());
    }
  }

  template <typename T>
  void PerChannelQuantizeBiasPopulateTensor(
      const std::vector<float>& input_data, int index,
      TfLiteAffineQuantization* params) {
    const int32_t num_inputs = input_data.size();
    std::vector<T> quantized_output(num_inputs);
    for (int i = 0; i < num_inputs; ++i) {
      const float scale = params->scale->size == 1 ? params->scale->data[0]
                                                   : params->scale->data[i];
      quantized_output[i] = input_data[i] / scale;
    }
  }

  template <typename T>
  void PerChannelQuantizeBiasPopulateTensor(
      int index, const std::vector<float>& input_data,
      const TfLiteAffineQuantization* params) {
    const int32_t num_inputs = input_data.size();
    std::vector<T> quantized_output(num_inputs);
    for (int i = 0; i < num_inputs; ++i) {
      const float scale = params->scale->size == 1 ? params->scale->data[0]
                                                   : params->scale->data[i];
      quantized_output[i] = input_data[i] / scale;
    }
    PopulateTensor(index, /*offset=*/0, quantized_output.data(),
                   quantized_output.data() + quantized_output.size());
  }

  // Quantize and populate data for bias with per channel quantization.
  void PerChannelQuantizeBias(int index, const std::vector<float>& input_data) {
    TfLiteTensor* t = interpreter_->tensor(index);
    auto* params =
        reinterpret_cast<TfLiteAffineQuantization*>(t->quantization.params);
    CHECK(t->type == kTfLiteInt32 || t->type == kTfLiteInt64);
    if (t->type == kTfLiteInt32) {
      PerChannelQuantizeBiasPopulateTensor<int32_t>(index, input_data, params);
    } else {
      PerChannelQuantizeBiasPopulateTensor<int64_t>(index, input_data, params);
    }
  }
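
  // Usage sketch (illustrative): in per-channel quantized conv-style tests the
  // filter is declared with per-channel scales/offsets and the bias with the
  // matching per-channel scales (input scale times filter scale), then both
  // are populated from float data:
  //
  //   m.PerChannelSymmetricQuantizeAndPopulate(filter, filter_float_data);
  //   m.PerChannelQuantizeBias(bias, bias_float_data);
  //
  // Here `filter` and `bias` are tensor indices returned by AddInput() for
  // TensorData with per_channel_quantization set, and the float vectors are
  // hypothetical test data.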

  const std::vector<int>& GetShape(int id) { return tensor_data_.at(id).shape; }

  float GetScale(int id) { return tensor_data_.at(id).scale; }
  int32_t GetZeroPoint(int id) { return tensor_data_.at(id).zero_point; }

  // Define the operator in this model.
  void SetBuiltinOp(BuiltinOperator type, BuiltinOptions builtin_options_type,
                    flatbuffers::Offset<void> builtin_options);
  void SetBuiltinOp(BuiltinOperator type,
                    BuiltinOptions2 builtin_options_2_type,
                    flatbuffers::Offset<void> builtin_options_2);
  void SetCustomOp(const string& name,
                   const std::vector<uint8_t>& custom_option,
                   const std::function<TfLiteRegistration*()>& registration);

  // Allocate tensors and apply delegate.
  // Note that this is called by default in BuildInterpreter().
  void AllocateAndDelegate(bool apply_delegate);

  // Build the interpreter for this model. Also, resize and allocate all
  // tensors given the shapes of the inputs.
  // Note: if `allocate_and_delegate` is `false`, then the value of
  // `apply_delegate` is ignored.
  void BuildInterpreter(std::vector<std::vector<int>> input_shapes,
                        int num_threads, bool allow_fp32_relax_to_fp16,
                        bool apply_delegate, bool allocate_and_delegate = true,
                        bool use_simple_allocator = false);

  void BuildInterpreter(std::vector<std::vector<int>> input_shapes,
                        bool use_simple_allocator = false);

  // Executes inference and returns the status code.
  TfLiteStatus Invoke();

  void PopulateStringTensor(int index, const std::vector<string>& content) {
    auto tensor = interpreter_->tensor(index);
    DynamicBuffer buf;
    for (const string& s : content) {
      buf.AddString(s.data(), s.length());
    }
    buf.WriteToTensor(tensor, /*new_shape=*/nullptr);
  }

  // Populates the tensor given its index.
  template <typename T>
  void PopulateTensor(int index, const std::initializer_list<T>& data) {
    PopulateTensorImpl<T>(index, /*offset=*/0, data);
  }

  // Populates the tensor given its index.
  template <typename T>
  void PopulateTensor(int index, const std::vector<T>& data) {
    PopulateTensorImpl<T>(index, /*offset=*/0, data);
  }

  // Populates the tensor given its index.
  template <typename T>
  void PopulateTensor(int index, absl::Span<const T> data) {
    PopulateTensorImpl<T>(index, /*offset=*/0, data);
  }

  // Partially populates the tensor, starting at the given offset.
  template <typename T>
  void PopulateTensor(int index, int offset, T* begin, T* end) {
    PopulateTensorImpl<T>(index, offset, absl::Span<T>(begin, end - begin));
  }

  // Return a vector with the flattened contents of a tensor.
  template <typename T>
  std::vector<T> ExtractVector(int index) const {
    const T* v = interpreter_->typed_tensor<T>(index);
    const auto* tensor = interpreter_->tensor(index);
    CHECK(v) << "Could not extract vector at index: " << index;
    int tensor_size;
    if (tensor->sparsity) {
      // Getting the size of the sparse buffer this way is based on the
      // assumption that the last dimension of the tensor is a compressed
      // dimension.
      tensor_size = tensor->sparsity
                        ->dim_metadata[tensor->sparsity->dim_metadata_size - 1]
                        .array_indices->size;
    } else {
      tensor_size = GetTensorSize(index);
    }

    return std::vector<T>(v, v + tensor_size);
  }

  // Return the TFLite model buffer, only available after BuildInterpreter.
  const uint8_t* GetModelBuffer() { return builder_.GetBufferPointer(); }

  std::vector<int> GetTensorShape(int index) {
    std::vector<int> result;
    TfLiteTensor* t = interpreter_->tensor(index);
    result.reserve(t->dims->size);
    for (int i = 0; i < t->dims->size; ++i) {
      result.push_back(t->dims->data[i]);
    }
    return result;
  }

  // Sets the number of threads available to the interpreter.
  // Reconstruct the interpreter if reset_interpreter is true.
  void SetNumThreads(int num_threads, bool reset_interpreter = false) {
    CHECK(interpreter_ != nullptr);
    if (reset_interpreter) {
      // Reconstruct interpreter as number of threads may affect internal state,
      // e.g. scratch buffer allocation.
      BuildInterpreter(input_shapes_, num_threads, allow_fp32_relax_to_fp16_,
                       apply_delegate_, allocate_and_delegate_);
    }
    interpreter_->SetNumThreads(num_threads);
  }

  void SetResolver(std::unique_ptr<OpResolver> resolver) {
    resolver_ = std::move(resolver);
  }

  // Indicate whether the test has the NNAPI delegate applied.
  static bool GetForceUseNnapi();
  int CountOpsExecutedByCpuKernel();
  int CountNumberOfDelegatedPartitions() const;
  int GetNumberOfAppliedDelegates() const { return num_applied_delegates_; }
  // Return the most recent return status of ApplyDelegate.
  std::optional<TfLiteStatus> GetDelegateApplicationStatus() const {
    return delegate_application_status_;
  }
  void SetDelegateApplicationStatus(std::optional<TfLiteStatus> status) {
    delegate_application_status_ = status;
  }

  // Tell TF Lite runtime to apply default delegates (i.e. XNNPACK delegate)
  // when handling this op-level model.
  void SetApplyDefaultDelegates() { bypass_default_delegates_ = false; }

 protected:
  int32_t GetTensorSize(int index) const;

  // Tell TF Lite runtime to skip applying default delegates (i.e. XNNPACK
  // delegate) when handling this op-level model.
  void SetBypassDefaultDelegates() { bypass_default_delegates_ = true; }

  flatbuffers::FlatBufferBuilder builder_;
  std::unique_ptr<tflite::Interpreter> interpreter_;
  std::unique_ptr<OpResolver> resolver_;

  std::vector<flatbuffers::Offset<OperatorCode>> opcodes_;
  std::vector<flatbuffers::Offset<Operator>> operators_;
  std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;

  template <typename T>
  int AddTensor(TensorData t, const T* data, size_t size,
                bool is_variable = false) {
    int id = tensors_.size();

    // This is slightly different depending on whether we are adding a
    // quantized or a regular tensor.
    bool is_quantized = (t.min != 0 || t.max != 0 || t.scale != 0);

    flatbuffers::Offset<QuantizationParameters> q_params = 0;

    if (is_quantized) {
      if (t.min != 0 || t.max != 0) {
        if (t.type == TensorType_UINT8) {
          std::tie(t.scale, t.zero_point) =
              QuantizationParams<uint8_t>(t.min, t.max);
        } else if (t.type == TensorType_INT8) {
          std::tie(t.scale, t.zero_point) =
              QuantizationParams<int8_t>(t.min, t.max);
        } else if (t.type == TensorType_INT32) {
          std::tie(t.scale, t.zero_point) =
              QuantizationParams<int32_t>(t.min, t.max);
        } else if (t.type == TensorType_INT16) {
          std::tie(t.scale, t.zero_point) =
              QuantizationParams<int16_t>(t.min, t.max);
        } else if (t.type == TensorType_INT4) {
          std::tie(t.scale, t.zero_point) =
              QuantizationParams<int8_t>(t.min, t.max, kTfLiteInt4);
        } else {
          LOG(FATAL) << "No support for the requested quantized type";
        }
        t.min = 0;
        t.max = 0;
      }

      std::vector<float> scales{t.scale};
      std::vector<int64_t> zero_points{t.zero_point};
      q_params = CreateQuantizationParameters(
          builder_, /*min=*/0, /*max=*/0, builder_.CreateVector<float>(scales),
          builder_.CreateVector<int64_t>(zero_points));
    }

    int buffer_id = 0;
    if (size) {
      // Initialize buffers list with empty buffer to allow for non-const
      // tensors.
      if (buffers_.empty()) {
        buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
      }

      builder_.ForceVectorAlignment(size, sizeof(T), 16);
      // Add data as a Buffer to buffers list.
      buffer_id = buffers_.size();
      auto data_buffer = builder_.CreateVector(
          reinterpret_cast<const uint8_t*>(data), sizeof(T) * size);
      buffers_.push_back(CreateBuffer(builder_, data_buffer));
    }

    tensors_.push_back(CreateTensor(
        builder_, builder_.CreateVector<int>(t.shape), t.type,
        /*buffer=*/buffer_id,
        /*name=*/0, q_params, is_variable,
        /*sparsity=*/0, builder_.CreateVector<int>(t.shape_signature)));

    tensor_data_[id] = t;

    return id;
  }

  template <typename T>
  std::pair<float, int32_t> QuantizationParams(
      float f_min, float f_max, TfLiteType type = kTfLiteNoType) {
    int32_t zero_point = 0;
    float scale = 0;
    T qmin;
    T qmax;

    if (type == kTfLiteInt4) {
      qmin = -7;
      qmax = 7;
    } else {
      qmin = std::numeric_limits<T>::min();
      qmax = std::numeric_limits<T>::max();
    }
    const float qmin_double = qmin;
    const float qmax_double = qmax;
    // 0 should always be a representable value. Let's assume that the initial
    // min,max range contains 0.
    CHECK_LE(f_min, 0);
    CHECK_GE(f_max, 0);
    if (f_min == f_max) {
      // Special case where the min,max range is a point. Should be {0}.
      CHECK_EQ(f_min, 0);
      CHECK_EQ(f_max, 0);
      return {scale, zero_point};
    }

    // General case.
    //
    // First determine the scale.
    scale = (f_max - f_min) / (qmax_double - qmin_double);

    // Zero-point computation.
    // First the initial floating-point computation. The zero-point can be
    // determined from solving an affine equation for any known pair
    // (real value, corresponding quantized value).
    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
    // The arithmetic error on the zero point computed from either pair
    // will be roughly machine_epsilon * (sum of absolute values of terms)
    // so we want to use the variant that adds the smaller terms.
    const float zero_point_from_min = qmin_double - f_min / scale;
    const float zero_point_from_max = qmax_double - f_max / scale;

    const float zero_point_from_min_error =
        std::abs(qmin_double) + std::abs(f_min / scale);

    const float zero_point_from_max_error =
        std::abs(qmax_double) + std::abs(f_max / scale);

    const float zero_point_double =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;

    // Now we need to nudge the zero point to be an integer
    // (our zero points are integer, and this is motivated by the requirement
    // to be able to represent the real value "0" exactly as a quantized value,
    // which is required in multiple places, for example in Im2col with SAME
    // padding).

    T nudged_zero_point = 0;
    if (zero_point_double < qmin_double) {
      nudged_zero_point = qmin;
    } else if (zero_point_double > qmax_double) {
      nudged_zero_point = qmax;
    } else {
      nudged_zero_point = static_cast<T>(std::round(zero_point_double));
    }

    // The zero point should always be in the range of quantized values, i.e.
    // [qmin, qmax].
    CHECK_GE(nudged_zero_point, qmin);
    CHECK_LE(nudged_zero_point, qmax);

    zero_point = nudged_zero_point;
    // Finally, return the values.
    return {scale, zero_point};
  }
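
  // Worked example (illustrative): QuantizationParams<uint8_t>(-1.0f, 1.0f)
  // uses qmin = 0 and qmax = 255, so scale = (1 - (-1)) / 255 ~= 0.00784.
  // Both candidate zero points evaluate to 127.5 with equal error, and the
  // nudge rounds this to 128, so the result is approximately {0.00784, 128}
  // and the real value 0 maps exactly to the quantized value 128.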

  void AddSubgraphs(int subgraphs_to_add,
                    int* first_new_subgraph_index = nullptr) {
    interpreter_->AddSubgraphs(subgraphs_to_add, first_new_subgraph_index);
  }

  // Partially populates the tensor, starting at the given offset.
  void PopulateTensor4bit(int index, int offset, const int8_t* begin,
                          const int8_t* end) {
    auto data = absl::Span<const int8_t>(begin, end - begin);
    TfLiteTensor* tensor_ptr = interpreter_->tensor(index);
    uint8_t* v = nullptr;
    if (tensor_ptr) {
      v = reinterpret_cast<uint8_t*>(tensor_ptr->data.data);
    }

    if (!v) {
      auto* t = interpreter_->tensor(index);
      CHECK(t) << "No tensor with index " << index << ".";
      CHECK(t->data.raw) << "Empty data for tensor with index " << index << ".";
      LOG(FATAL) << "Unknown tensor error.";
    }
    absl::c_copy(data, v + offset);
    PackInt4ValuesDenselyInPlace(v, ElementCount(*tensor_ptr->dims));
    tensor_ptr->bytes = ((ElementCount(*tensor_ptr->dims) + 1) / 2);
  }

 private:
  // Populates the tensor starting at offset using given data.
  template <typename T, typename Container>
  void PopulateTensorImpl(int index, int offset, const Container& data) {
    T* v = interpreter_->typed_tensor<T>(index);
    if (!v) {
      auto* t = interpreter_->tensor(index);
      CHECK(t) << "No tensor with index " << index << ".";
      CHECK(t->data.raw) << "Empty data for tensor with index " << index << ".";
      CHECK_EQ(t->type, typeToTfLiteType<T>())
          << "Type mismatch for tensor with index " << index << ". Requested "
          << TfLiteTypeGetName(typeToTfLiteType<T>()) << ", got "
          << TfLiteTypeGetName(t->type) << ".";
      LOG(FATAL) << "Unknown tensor error.";
    }
    absl::c_copy(data, v + offset);
  }

  void PackInt4ValuesDenselyInPlace(uint8_t* src_buffer, int buffer_size) {
    for (int i = 0; i < buffer_size; ++i) {
      if (i % 2 == 0) {
        src_buffer[i / 2] = src_buffer[i] & 0x0F;
      } else {
        src_buffer[i / 2] |= src_buffer[i] << 4;
      }
    }
    // the rest of the buffer should be empty since half of it is packed with
    // the values
    memset(src_buffer + (buffer_size + 1) / 2, 0, buffer_size / 2);
  }
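
  // Worked example (illustrative): for buffer_size = 5 with int4 values
  // {1, 2, 3, 4, 5}, the loop packs two values per byte (low nibble first),
  // producing bytes {0x21, 0x43, 0x05}, and the trailing memset clears the
  // now-unused upper half of the buffer.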

  int ElementCount(TfLiteIntArray& dims) {
    int result = 1;
    for (int i = 0; i < dims.size; ++i) {
      result *= dims.data[i];
    }
    return result;
  }

  int AddTensorPerChannelQuant(const TensorData& t) {
    // type does not matter when adding empty data.
    return AddTensorPerChannelQuant<uint8_t>(t, nullptr, 0);
  }

  template <typename T>
  int AddTensorPerChannelQuant(const TensorData& t, const T* data,
                               size_t size) {
    const int id = tensors_.size();
    flatbuffers::Offset<QuantizationParameters> q_params = 0;
    q_params = CreateQuantizationParameters(
        builder_, /*min=*/0, /*max=*/0,
        /*scale=*/
        builder_.CreateVector<float>(t.per_channel_quantization_scales),
        /*zero point=*/
        builder_.CreateVector<int64_t>(t.per_channel_quantization_offsets),
        QuantizationDetails_NONE, 0, t.channel_index);

    int buffer_id = 0;
    if (size) {
      // Initialize buffers list with empty buffer to allow for non-const
      // tensors.
      if (buffers_.empty()) {
        buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
      }

      // Add data as a Buffer to buffers list.
      buffer_id = buffers_.size();
      auto data_buffer = builder_.CreateVector(
          reinterpret_cast<const uint8_t*>(data), sizeof(T) * size);
      buffers_.push_back(CreateBuffer(builder_, data_buffer));
    }

    tensors_.push_back(
        CreateTensor(builder_, builder_.CreateVector<int>(t.shape), t.type,
                     /*buffer=*/buffer_id,
                     /*name=*/0, q_params, /*is_variable=*/false));
    tensor_data_[id] = t;
    return id;
  }

  std::vector<int8_t> QuantizeTensor(int index,
                                     const std::vector<float>& data) {
    TfLiteTensor* t = interpreter_->tensor(index);
    const int length = data.size();
    std::vector<int8_t> q(length);
    float min, max, scaling_factor;
    tensor_utils::SymmetricQuantizeFloats(data.data(), length, q.data(), &min,
                                          &max, &scaling_factor);
    // Update quantization params.
    t->params.scale = scaling_factor;
    t->params.zero_point = 0;
    // Populate the new quantization params.
    TfLiteQuantizationFree(&t->quantization);
    t->quantization.type = kTfLiteAffineQuantization;
    auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
        malloc(sizeof(TfLiteAffineQuantization)));
    affine_quantization->quantized_dimension = 0;
    affine_quantization->scale = TfLiteFloatArrayCreate(1);
    affine_quantization->zero_point = TfLiteIntArrayCreate(1);
    affine_quantization->scale->data[0] = scaling_factor;
    affine_quantization->zero_point->data[0] = 0;
    t->quantization.params = affine_quantization;
    return q;
  }

  // Checks if acceleration has been done as expected.
  // Currently supports only NNAPI.
  // It verifies if the test was configured to run with NNAPI acceleration
  // or not (SetForceUseNnapi(true)).
  // In affirmative case it checks if:
  // - the test case has been listed in the list of nnapi-accelerated cases
  // - the test is running on a device (NNAPI has been loaded)
  //
  // The list of nnapi-accelerated test cases is a file containing regex to
  // include or exclude specific test cases plus the minimum android SDK version
  // the acceleration should be enabled for. For example:
  // To enable the test BorderFloat in TopKV2OpTest only from
  // android_sdk_version 29:
  //
  // TopKV2OpTest/BorderFloat,29
  //
  // And to have it always excluded while enabling all other Float tests
  // (the order of the rules is important, the first one matching is used):
  //
  // -TopKV2OpTest/BorderFloat
  // TopKV2OpTest/.+Float

  void ValidateAcceleration();

  // If the test was configured to use NNAPI and NNAPI was actually loaded,
  // checks if the single operation in the model has been accelerated.
  void ExpectOpAcceleratedWithNnapi(const std::string& test_id);

  std::map<int, TensorData> tensor_data_;
  std::vector<int32_t> inputs_;
  std::vector<int32_t> intermediates_;
  std::vector<int32_t> outputs_;
  std::vector<flatbuffers::Offset<Tensor>> tensors_;
  std::vector<flatbuffers::Offset<Buffer>> buffers_;
  TfLiteDelegate* delegate_ = nullptr;  // Does not own the memory.
  std::optional<TfLiteStatus> delegate_application_status_ = std::nullopt;
  std::vector<std::vector<int>> input_shapes_;
  int num_applied_delegates_ = 0;
  bool allow_fp32_relax_to_fp16_ = false;
  bool apply_delegate_ = true;
  bool allocate_and_delegate_ = true;

  // Whether to bypass the application of TF Lite default delegates (i.e.
  // XNNPACK delegate) at runtime.
  // True by default as delegated graphs are tested elsewhere.
  bool bypass_default_delegates_ = true;
};

// Populate string tensors.
template <>
inline void SingleOpModel::PopulateTensor<string>(
    int index, const std::initializer_list<string>& data) {
  PopulateStringTensor(index, data);
}

// Base class for single op unit tests.
// The tests are parameterized to test multiple kernels for a single op.
// The parameters are strings like "optimized" and "reference" to have better
// readability in test reports.
//
// To use this class:
// * Define a constant map from strings to TfLiteRegistration.
// * Implement a test class that inherits SingleOpTest.
// * Instantiate the test cases with SingleOpTest::GetKernelTags helper
//   function.
// * Call GetRegistration to get the TfLiteRegistration to be used before
//   building the interpreter.
class SingleOpTest : public ::testing::TestWithParam<string> {
 public:
  static std::vector<string> GetKernelTags(
      const std::map<string, TfLiteRegistration*>& kernel_map) {
    std::vector<string> tags;
    tags.reserve(kernel_map.size());
    for (const auto& it : kernel_map) {
      tags.push_back(it.first);
    }
    return tags;
  }

 protected:
  virtual const std::map<string, TfLiteRegistration*>& GetKernelMap() = 0;
  TfLiteRegistration* GetRegistration() {
    return GetKernelMap().at(GetParam());
  }
};
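
// Usage sketch (illustrative): a parameterized kernel test built on
// SingleOpTest usually looks like the following, where `kKernelMap` is a map
// from tag ("reference", "optimized", ...) to the corresponding registration
// supplied by the op under test, and `MyOpTest`/`MyOpModel` are hypothetical.
//
//   class MyOpTest : public SingleOpTest {
//    protected:
//     const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
//       return *kKernelMap;
//     }
//   };
//
//   TEST_P(MyOpTest, SomeBehavior) {
//     MyOpModel m(GetRegistration(), ...);
//     ...
//   }
//
//   INSTANTIATE_TEST_SUITE_P(
//       MyOpTests, MyOpTest,
//       ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));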

// Maps the native C++ types to the corresponding TFLite tensor type enum
// values.
template <class T>
struct TensorTypeFor;

#define TFLITE_TENSOR_TYPE_ASSOC(CPP_TYPE, TENSORTYPE_VALUE) \
  template <>                                                \
  struct TensorTypeFor<CPP_TYPE> {                           \
    static constexpr TensorType value = TENSORTYPE_VALUE;    \
  };

TFLITE_TENSOR_TYPE_ASSOC(bool, TensorType_BOOL);
TFLITE_TENSOR_TYPE_ASSOC(int8_t, TensorType_INT8);
TFLITE_TENSOR_TYPE_ASSOC(int16_t, TensorType_INT16);
TFLITE_TENSOR_TYPE_ASSOC(int32_t, TensorType_INT32);
TFLITE_TENSOR_TYPE_ASSOC(int64_t, TensorType_INT64);
TFLITE_TENSOR_TYPE_ASSOC(uint8_t, TensorType_UINT8);
TFLITE_TENSOR_TYPE_ASSOC(uint16_t, TensorType_UINT16);
TFLITE_TENSOR_TYPE_ASSOC(uint32_t, TensorType_UINT32);
TFLITE_TENSOR_TYPE_ASSOC(uint64_t, TensorType_UINT64);
TFLITE_TENSOR_TYPE_ASSOC(TfLiteFloat16, TensorType_FLOAT16);
TFLITE_TENSOR_TYPE_ASSOC(Eigen::half, TensorType_FLOAT16);
TFLITE_TENSOR_TYPE_ASSOC(TfLiteBFloat16, TensorType_BFLOAT16);
TFLITE_TENSOR_TYPE_ASSOC(Eigen::bfloat16, TensorType_BFLOAT16);
TFLITE_TENSOR_TYPE_ASSOC(float, TensorType_FLOAT32);
TFLITE_TENSOR_TYPE_ASSOC(double, TensorType_FLOAT64);
TFLITE_TENSOR_TYPE_ASSOC(std::string, TensorType_STRING);

#undef TFLITE_TENSOR_TYPE_ASSOC

// Returns the corresponding TensorType given the type T.
template <typename T>
constexpr TensorType GetTensorType() {
  return TensorTypeFor<T>::value;
}

// Strings have a special implementation that is in test_util.cc
template <>
std::vector<string> SingleOpModel::ExtractVector(int index) const;

// The TypeUnion struct specializations hold a collection of related types.
// Each struct holds: 1. a primitive type (e.g. float), 2. a TensorType (e.g.
// TensorType_FLOAT32), and 3. a TfLiteType (e.g. kTfLiteFloat32). The latter
// two are actually enum values and not raw types, but these specializations
// make it easy to use gUnit Typed Test Suite:
// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md#typed-tests
template <typename T>
struct TypeUnion;

template <>
struct TypeUnion<float> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_FLOAT32;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteFloat32;
  typedef float ScalarType;
};

template <>
struct TypeUnion<int32_t> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_INT32;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteInt32;
  typedef int32_t ScalarType;
};

template <>
struct TypeUnion<uint32_t> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_UINT32;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteUInt32;
  typedef uint32_t ScalarType;
};

template <>
struct TypeUnion<int16_t> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_INT16;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteInt16;
  typedef int16_t ScalarType;
};

template <>
struct TypeUnion<uint16_t> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_UINT16;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteUInt16;
  typedef uint16_t ScalarType;
};

template <>
struct TypeUnion<int8_t> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_INT8;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteInt8;
  typedef int8_t ScalarType;
};

template <>
struct TypeUnion<uint8_t> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_UINT8;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteUInt8;
  typedef uint8_t ScalarType;
};

template <>
struct TypeUnion<Eigen::half> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_FLOAT16;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteFloat16;
  typedef Eigen::half ScalarType;
};

template <>
struct TypeUnion<Eigen::bfloat16> {
 public:
  // NOLINTNEXTLINE
  static constexpr TensorType tensor_type = TensorType::TensorType_BFLOAT16;
  // NOLINTNEXTLINE
  static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteBFloat16;
  typedef Eigen::bfloat16 ScalarType;
};
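
// Usage sketch (illustrative): the TypeUnion specializations are meant to back
// a gtest typed test suite, with the scalar type and the two enum values all
// available from the single test parameter, e.g.
//
//   template <typename TypeParam>
//   class MyTypedOpTest : public ::testing::Test {};
//
//   using MyTypes = ::testing::Types<TypeUnion<float>, TypeUnion<int8_t>>;
//   TYPED_TEST_SUITE(MyTypedOpTest, MyTypes);
//
//   TYPED_TEST(MyTypedOpTest, Works) {
//     using T = typename TypeParam::ScalarType;
//     SomeOpModel m({TypeParam::tensor_type, {1, 2}}, ...);
//     m.PopulateTensor<T>(0, {...});
//   }
//
// `MyTypedOpTest`, `MyTypes`, and `SomeOpModel` are hypothetical names.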

class MultiOpModel : public SingleOpModel {
 public:
  MultiOpModel() : SingleOpModel() {}
  ~MultiOpModel() = default;

  void AddBuiltinOp(BuiltinOperator type, BuiltinOptions builtin_options_type,
                    const flatbuffers::Offset<void>& builtin_options,
                    const std::vector<int32_t>& inputs,
                    const std::vector<int32_t>& outputs);

  void AddCustomOp(const string& name,
                   const std::vector<uint8_t>& custom_option,
                   const std::function<TfLiteRegistration*()>& registration,
                   const std::vector<int32_t>& inputs,
                   const std::vector<int32_t>& outputs);

  template <typename T>
  int AddInnerTensor(TensorData t) {
    return AddTensor<T>(t, {}, false);
  }
};
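
// Usage sketch (illustrative): multi-op tests typically subclass MultiOpModel
// and wire ops together in the constructor, using explicit tensor indices and
// AddInnerTensor() for the intermediates that connect one op to the next, e.g.
//
//   class MyTwoOpModel : public MultiOpModel {
//    public:
//     MyTwoOpModel() {
//       int in1 = AddInput({TensorType_FLOAT32, {1, 2}});
//       int in2 = AddInput({TensorType_FLOAT32, {1, 2}});
//       int mid = AddInnerTensor<float>({TensorType_FLOAT32, {1, 2}});
//       int out = AddOutput({TensorType_FLOAT32, {1, 2}});
//       AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
//                    CreateAddOptions(builder_).Union(), {in1, in2}, {mid});
//       AddBuiltinOp(BuiltinOperator_RELU, BuiltinOptions_NONE, 0,
//                    {mid}, {out});
//       BuildInterpreter({GetShape(in1), GetShape(in2)});
//     }
//   };
//
// The ops, options, and shapes above are hypothetical example choices; the
// builtin codes and option builders come from the generated schema.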

// This Matcher can be used to check the dimensions of a `TfLiteTensor`
// in a readable way. It will print a helpful error message in the
// case of a failure.
//
// EXAMPLE:
//   TfLiteTensor* t = (TfLiteTensor*)malloc(sizeof(TfLiteTensor));
//   t->dims = ConvertVectorToTfLiteIntArray({2, 2});
//   EXPECT_THAT(t, DimsAre({2, 2}));
//
// To check for a scalar, simply pass an empty initializer list: `DimsAre({})`.
class DimsAreMatcher {
 public:
  using is_gtest_matcher = void;
  explicit DimsAreMatcher(const std::vector<int>& dims) {
    dims_ = std::shared_ptr<TfLiteIntArray>(ConvertVectorToTfLiteIntArray(dims),
                                            TfLiteIntArrayFree);
  }

  // Required method to implement for matcher objects. We overload on
  // both `TfLiteTensor*` and `TfLiteIntArray` for flexibility.
  bool MatchAndExplain(const TfLiteIntArray* arg,
                       testing::MatchResultListener* result_listener) const {
    if (arg == nullptr) {
      *result_listener << "dims are null";
      return false;
    }
    if (TfLiteIntArrayEqual(arg, dims_.get())) {
      return true;
    }
    *result_listener << "has dims " << tflite::GetShapeDebugString(arg);
    return false;
  }

  bool MatchAndExplain(const TfLiteTensor* arg,
                       testing::MatchResultListener* result_listener) const {
    if (arg == nullptr) {
      *result_listener << "tensor is null";
      return false;
    }
    return MatchAndExplain(arg->dims, result_listener);
  }

  void DescribeTo(std::ostream* os) const {
    *os << "dims equal to ";
    *os << tflite::GetShapeDebugString(dims_.get());
  }

  void DescribeNegationTo(std::ostream* os) const {
    *os << "dims not equal to ";
    *os << tflite::GetShapeDebugString(dims_.get());
  }

 private:
  // gUnit uses the implicit copy constructor of `DimsAreMatcher`. Use
  // `std::shared_ptr` here so copies of `DimsAreMatcher` can share the
  // same `TfLiteIntArray`.
  std::shared_ptr<TfLiteIntArray> dims_;
};

inline DimsAreMatcher DimsAre(std::initializer_list<int> dims_list) {
  return DimsAreMatcher(dims_list);
}

inline DimsAreMatcher DimsAre(const std::vector<int>& dims) {
  return DimsAreMatcher(dims);
}

inline DimsAreMatcher DimsAre(absl::Span<int> dims) {
  return DimsAreMatcher(std::vector<int>(dims.begin(), dims.end()));
}

}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_TEST_UTIL_H_