#ifndef CAFFE2_CORE_QTENSOR_H_
#define CAFFE2_CORE_QTENSOR_H_

#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"

#include <c10/util/accumulate.h>
#include <c10/util/irange.h>
#include <c10/util/typeid.h>

#include <algorithm>
#include <climits>
#include <cstddef>
#include <limits> // std::numeric_limits, used by dim32()
#include <vector>

namespace caffe2 {

template <class Context>
class C10_EXPORT QTensor {
 public:
  QTensor() {}
  virtual ~QTensor() {}
  /**
   * @brief Creates a quantized tensor of the given dimension.
   *
   * Note that the actual data allocation is not going to be carried out until
   * the first time mutable_data() is called.
   *
   * The underlying storage of the quantized tensor interleaves elements
   * by bit depth.
   *
   * Labeled memory for tensor of size 6, precision 3
   * [ E1[0] E2[0] E3[0] E4[0] E5[0] E6[0] ] // Least significant Bits
   * [ E1[1] E2[1] E3[1] E4[1] E5[1] E6[1] ]
   * [ E1[2] E2[2] E3[2] E4[2] E5[2] E6[2] ]
   *
   * In the case of sign bits (see the `signbit` argument), an extra bit
   * per element is added:
   *
   * Labeled memory for tensor of size 6, precision 3, sign bit enabled
   * [ E1[0] E2[0] E3[0] E4[0] E5[0] E6[0] ]
   * [ E1[1] E2[1] E3[1] E4[1] E5[1] E6[1] ]
   * [ E1[2] E2[2] E3[2] E4[2] E5[2] E6[2] ]
   * [ E1[s] E2[s] E3[s] E4[s] E5[s] E6[s] ]
   * where 's' is 1 if E is negative.
   *
   * The reason for this layout is the ability to efficiently multiply
   * many low precision integers as a sum of popcnt(A & B) * (1 << bit).
   * Explained here: https://arxiv.org/abs/1606.06160
   */
  // TODO: changing at::ArrayRef<int> to at::ArrayRef<int64_t>?
  explicit QTensor(
      at::ArrayRef<int> dims,
      const unsigned char precision,
      const bool signbit = false)
      : precision_(precision), signed_(signbit) {
    Resize(dims);
  }
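
  // Usage sketch (illustrative, not part of the original API surface):
  // construct a signed 3-bit tensor and write one element bit by bit.
  // `qt`, `i`, and `v` are hypothetical names; CPUContext comes from
  // caffe2/core/context.h.
  //
  //   QTensor<CPUContext> qt({2, 3}, /*precision=*/3, /*signbit=*/true);
  //   const size_t i = 4;         // element index within the 2x3 tensor
  //   const unsigned char v = 5;  // magnitude, must fit in `precision` bits
  //   for (unsigned char b = 0; b < qt.precision(); ++b) {
  //     qt.SetBitAtIndex(b, i, (v >> b) & 1);     // plane b stores bit b
  //   }
  //   qt.SetBitAtIndex(qt.precision(), i, true);  // trailing sign plane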

  void Resize(at::ArrayRef<int> dim_source) {
    if (dims_ != dim_source) {
      const auto source_size = c10::multiply_integers(dim_source);
      // Only drop the existing allocation when the new contents (in bits)
      // would exceed the current capacity; otherwise reuse the buffer.
      if (static_cast<size_t>(source_size * (precision_ + signed_)) >
          capacity_) {
        data_ptr_.clear();
        capacity_ = 0;
      }
      dims_ = dim_source.vec();
      size_ = source_size;
    }
  }

  void
  SetBitAtIndex(const unsigned char bit, const size_t index, const bool value) {
    // Get the mutable data at bit depth `bit`.
    unsigned char* d = mutable_data();

    CAFFE_ENFORCE(
        bit < precision_ + signed_,
        "Attempted to set a bit that is not allocated.");
    CAFFE_ENFORCE(bit * aligned_size() < capacity_);

    // Advance to the start of bit plane `bit`.
    auto idx = (aligned_size() * bit) / CHAR_BIT;
    d = &d[idx];

    // Within a plane, bits are packed most-significant-first in each byte.
    idx = index / CHAR_BIT;
    auto shift = CHAR_BIT - (index % CHAR_BIT) - 1;

    if (value) {
      d[idx] |= 1 << shift;
    } else {
      d[idx] &= ~(1 << shift);
    }
  }

  bool GetBitAtIndex(const unsigned char bit, const size_t index) const {
    // Get the data at bit depth `bit`.
    const unsigned char* d = data();
    auto idx = (aligned_size() * bit) / CHAR_BIT;
    d = &d[idx];

    idx = index / CHAR_BIT;
    auto shift = CHAR_BIT - (index % CHAR_BIT) - 1;

    return d[idx] & (1 << shift);
  }
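
  // Read-back sketch (illustrative): reconstruct the stored integer for
  // element `i` by summing the bit planes, then dequantize with the affine
  // mapping used by this class (value = scale_ * (x + bias_)). The sign
  // handling assumes the sign-magnitude layout described above ("'s' is 1
  // if E is negative"); `qt` and `i` are hypothetical names.
  //
  //   int x = 0;
  //   for (unsigned char b = 0; b < qt.precision(); ++b) {
  //     x |= static_cast<int>(qt.GetBitAtIndex(b, i)) << b;
  //   }
  //   if (qt.is_signed() && qt.GetBitAtIndex(qt.precision(), i)) {
  //     x = -x;  // sign plane set => negative element
  //   }
  //   const double value = qt.scale() * (x + qt.bias());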

  void SetPrecision(const unsigned char precision) {
    precision_ = precision;
    data_ptr_.clear();
  }

  void SetSigned(const bool make_signed = true) {
    signed_ = make_signed;
    data_ptr_.clear();
  }

  void SetScale(const double scale) {
    scale_ = scale;
  }

  void SetBias(const double bias) {
    bias_ = bias;
  }

  unsigned char* mutable_data() {
    // Allocation is deferred until the first call to mutable_data().
    if (!data_ptr_) {
      data_ptr_ = Context::New(nbytes());
      capacity_ = nbytes() * CHAR_BIT;
    }
    CAFFE_ENFORCE(capacity_ == nbytes() * CHAR_BIT);
    return static_cast<unsigned char*>(data_ptr_.get());
  }

  inline const unsigned char* data() const {
    return static_cast<unsigned char*>(data_ptr_.get());
  }

  inline size_t size() const {
    return size_;
  }

  inline unsigned char alignment() const {
    return alignment_;
  }

  inline unsigned char precision() const {
    return precision_;
  }

  inline at::ArrayRef<int> sizes() const {
    return dims_;
  }

  // TODO: deprecate?
  inline at::ArrayRef<int> dims() const {
    return dims_;
  }

  inline bool is_signed() const {
    return signed_;
  }

  /**
   * Returns the number of dimensions of the data.
   */
  inline int ndim() const {
    return dims_.size();
  }

  inline size_t aligned_size() const {
    // Round size_ up to the next multiple of alignment_ (in bits).
    return alignment_ * ((size_ + alignment_ - 1) / alignment_);
  }

  inline size_t nbytes() const {
    return (aligned_size() * (precision_ + signed_)) / CHAR_BIT;
  }
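
  // Worked example: with size_ = 6 and the default alignment_ of CHAR_BIT
  // (8), each bit plane is padded to aligned_size() = 8 * ((6 + 7) / 8) = 8
  // bits, so a 3-bit signed tensor occupies nbytes() = (8 * (3 + 1)) / 8 = 4
  // bytes: one byte per plane, three magnitude planes plus one sign plane.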

  inline double scale() const {
    return scale_;
  }

  inline double bias() const {
    return bias_;
  }

  /**
   * Returns the i-th dimension of the qtensor in int.
   */
  inline int dim32(const int i) const {
    TORCH_DCHECK_LT(i, static_cast<int>(dims_.size()))
        << "Exceeding ndim limit " << dims_.size();
    TORCH_DCHECK_GE(i, 0) << "Cannot have negative index";
    CAFFE_ENFORCE_LT(dims_[i], std::numeric_limits<int>::max());
    return static_cast<int>(dims_[i]);
  }

  /**
   * Returns the 'canonical' version of a (usually) user-specified axis,
   * allowing for negative indexing (e.g., -1 for the last axis).
   *
   * @param axis_index the axis index.
   * If 0 <= index < ndim(), return index.
   * If -ndim <= index <= -1, return (ndim() - (-index)),
   * e.g., the last axis index (ndim() - 1) if index == -1,
   * the second to last if index == -2, etc.
   * Dies on out of range index.
   */
  inline int canonical_axis_index(int axis_index) const {
    CAFFE_ENFORCE_GE(axis_index, -ndim());
    CAFFE_ENFORCE_LT(axis_index, ndim());
    if (axis_index < 0) {
      return axis_index + ndim();
    }
    return axis_index;
  }

  /**
   * Returns the product of all dimensions starting from dimension k.
   */
  inline int64_t size_from_dim(int k) const {
    int64_t r = 1;
    for (const auto i : c10::irange(k, dims_.size())) {
      r *= dims_[i];
    }
    return r;
  }

  /**
   * Returns the product of all dimensions up to (but not including)
   * dimension k.
   */
  inline int64_t size_to_dim(int k) const {
    CAFFE_ENFORCE(k < dims_.size());
    int64_t r = 1;
    for (const auto i : c10::irange(k)) {
      r *= dims_[i];
    }
    return r;
  }
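
  // Example: with dims_ = {2, 3, 4}, size_from_dim(1) = 3 * 4 = 12 and
  // size_to_dim(2) = 2 * 3 = 6, while size_from_dim(0) gives the total
  // element count of 24.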

 protected:
  std::vector<int> dims_;
  size_t size_ = 0;

  // Precision in bits.
  unsigned char precision_ = CHAR_BIT;
  // Bit alignment.
  unsigned char alignment_ = CHAR_BIT;

  // Allocated data.
  at::DataPtr data_ptr_;

  // value = scale_ * (x + bias_)
  double scale_;
  double bias_;
  bool signed_ = false;

  // Capacity in bits.
  size_t capacity_ = 0;
};

} // namespace caffe2

#endif // CAFFE2_CORE_QTENSOR_H_