mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 00:21:07 +01:00
Summary: While the use of memcpy as part of the byte swapping sequence looks funky, all major compilers recognize and optimize this pattern reliably, resulting in essentially optimal code generation. For example, decodeUInt32LE goes from this on iOS arm64: > ldrb w8, [x0, #3] > ldrb w9, [x0, #2] > bfi w8, w9, #8, #8 > ldrb w9, [x0, #1] > bfi w8, w9, #16, #8 > ldrb w9, [x0] > bfi w8, w9, #24, #8 > mov x0, x8 > ret To this: > ldr w8, [x0] > rev w0, w8 > ret Pull Request resolved: https://github.com/pytorch/pytorch/pull/11394 Reviewed By: SsnL Differential Revision: D9728659 Pulled By: resistor fbshipit-source-id: 9afbd4adfad1d1fb7b01f1179e6707ee21fa726f
209 lines
5.5 KiB
C++
209 lines
5.5 KiB
C++
#include "byte_order.h"
|
|
|
|
#include <string.h>
|
|
|
|
#if defined(_MSC_VER)
|
|
#include <stdlib.h>
|
|
#endif
|
|
|
|
// Reverses, in place, the order of the two bytes stored at `ptr`.
//
// The bytes are copied into a local integer and back with memcpy, so `ptr`
// is only ever accessed as raw bytes.
static inline void swapBytes16(void *ptr)
{
  uint16_t value;
  memcpy(&value, ptr, sizeof(value));
#if defined(_MSC_VER) && !defined(_DEBUG)
  value = _byteswap_ushort(value);
#elif defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC))
  value = __builtin_bswap16(value);
#else
  // Portable fallback: exchange the high and the low byte by shifting.
  value = (uint16_t)((value >> 8) | (value << 8));
#endif
  memcpy(ptr, &value, sizeof(value));
}
|
|
|
|
// Reverses, in place, the order of the four bytes stored at `ptr`.
//
// The bytes are copied into a local integer and back with memcpy, so `ptr`
// is only ever accessed as raw bytes.
static inline void swapBytes32(void *ptr)
{
  uint32_t word;
  memcpy(&word, ptr, sizeof(word));
#if defined(_MSC_VER) && !defined(_DEBUG)
  word = _byteswap_ulong(word);
#elif defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC))
  word = __builtin_bswap32(word);
#else
  // Portable fallback: move every byte to its mirrored position.
  word = ((word & 0x000000FF) << 24) |
         ((word & 0x0000FF00) << 8) |
         ((word & 0x00FF0000) >> 8) |
         ((word & 0xFF000000) >> 24);
#endif
  memcpy(ptr, &word, sizeof(word));
}
|
|
|
|
// Reverses, in place, the order of the eight bytes stored at `ptr`.
//
// The bytes are copied into a local integer and back with memcpy, so `ptr`
// is only ever accessed as raw bytes.
static inline void swapBytes64(void *ptr)
{
  uint64_t output;
  memcpy(&output, ptr, sizeof(uint64_t));
#if defined(_MSC_VER) && !defined(_DEBUG)
  output = _byteswap_uint64(output);
#elif defined(__llvm__) || defined(__GNUC__) && !defined(__ICC)
  output = __builtin_bswap64(output);
#else
  // Portable fallback. The previous version of this branch referred to an
  // undeclared `value` and `SwapByteOrder_32` and returned a value from a
  // void function, so it could not compile whenever this branch was chosen.
  // Move every byte to its mirrored position and keep the result in
  // `output` so the write-back below still runs.
  output = ((output & 0x00000000000000FFULL) << 56) |
           ((output & 0x000000000000FF00ULL) << 40) |
           ((output & 0x0000000000FF0000ULL) << 24) |
           ((output & 0x00000000FF000000ULL) << 8) |
           ((output & 0x000000FF00000000ULL) >> 8) |
           ((output & 0x0000FF0000000000ULL) >> 24) |
           ((output & 0x00FF000000000000ULL) >> 40) |
           ((output & 0xFF00000000000000ULL) >> 56);
#endif
  memcpy(ptr, &output, sizeof(uint64_t));
}
|
|
|
|
// Decodes the 16-bit unsigned integer whose little-endian encoding is the
// two bytes at `data`.
//
// The bytes are fetched with memcpy, so `data` needs no particular
// alignment. A raw copy produces the value in host byte order, which equals
// the little-endian interpretation only on little-endian hosts; on targets
// the compiler reports as big-endian the bytes are therefore reversed.
// Toolchains that do not define __BYTE_ORDER__ are treated as little-endian,
// which is what this function assumed unconditionally before.
static inline uint16_t decodeUInt16LE(const uint8_t *data) {
  uint16_t output;
  memcpy(&output, data, sizeof(uint16_t));
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  swapBytes16(&output);
#endif
  return output;
}
|
|
|
|
static inline uint16_t decodeUInt16BE(const uint8_t *data) {
|
|
uint16_t output = decodeUInt16LE(data);
|
|
swapBytes16(&output);
|
|
return output;
|
|
}
|
|
|
|
// Decodes the 32-bit unsigned integer whose little-endian encoding is the
// four bytes at `data`.
//
// The bytes are fetched with memcpy, so `data` needs no particular
// alignment. A raw copy produces the value in host byte order, which equals
// the little-endian interpretation only on little-endian hosts; on targets
// the compiler reports as big-endian the bytes are therefore reversed.
// Toolchains that do not define __BYTE_ORDER__ are treated as little-endian,
// which is what this function assumed unconditionally before.
static inline uint32_t decodeUInt32LE(const uint8_t *data) {
  uint32_t output;
  memcpy(&output, data, sizeof(uint32_t));
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  swapBytes32(&output);
#endif
  return output;
}
|
|
|
|
static inline uint32_t decodeUInt32BE(const uint8_t *data) {
|
|
uint32_t output = decodeUInt32LE(data);
|
|
swapBytes32(&output);
|
|
return output;
|
|
}
|
|
|
|
// Decodes the 64-bit unsigned integer whose little-endian encoding is the
// eight bytes at `data`.
//
// The bytes are fetched with memcpy, so `data` needs no particular
// alignment. A raw copy produces the value in host byte order, which equals
// the little-endian interpretation only on little-endian hosts; on targets
// the compiler reports as big-endian the bytes are therefore reversed.
// Toolchains that do not define __BYTE_ORDER__ are treated as little-endian,
// which is what this function assumed unconditionally before.
static inline uint64_t decodeUInt64LE(const uint8_t *data) {
  uint64_t output;
  memcpy(&output, data, sizeof(uint64_t));
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  swapBytes64(&output);
#endif
  return output;
}
|
|
|
|
static inline uint64_t decodeUInt64BE(const uint8_t *data) {
|
|
uint64_t output = decodeUInt64LE(data);
|
|
swapBytes64(&output);
|
|
return output;
|
|
}
|
|
|
|
// Reports the byte order of the host at run time.
//
// Stores the 32-bit value 1 and inspects the byte at its lowest address:
// a non-zero byte there yields THP_LITTLE_ENDIAN, otherwise THP_BIG_ENDIAN.
THPByteOrder THP_nativeByteOrder()
{
  const uint32_t probe = 1;
  uint8_t lowest_address_byte;
  memcpy(&lowest_address_byte, &probe, sizeof(lowest_address_byte));
  if (lowest_address_byte) {
    return THP_LITTLE_ENDIAN;
  }
  return THP_BIG_ENDIAN;
}
|
|
|
|
void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
|
{
|
|
for (size_t i = 0; i < len; i++) {
|
|
dst[i] = (int16_t) (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
|
|
src += sizeof(int16_t);
|
|
}
|
|
}
|
|
|
|
void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
|
{
|
|
for (size_t i = 0; i < len; i++) {
|
|
dst[i] = (int32_t) (order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
|
|
src += sizeof(int32_t);
|
|
}
|
|
}
|
|
|
|
void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
|
{
|
|
for (size_t i = 0; i < len; i++) {
|
|
dst[i] = (int64_t) (order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
|
|
src += sizeof(int64_t);
|
|
}
|
|
}
|
|
|
|
// Decodes `len` half-precision values from the byte stream `src` into `dst`.
//
// dst   - receives `len` decoded values.
// src   - `len * sizeof(uint16_t)` encoded bytes, read two at a time.
// order - THP_BIG_ENDIAN selects decodeUInt16BE; any other value selects
//         decodeUInt16LE.
// len   - number of elements (not bytes) to decode.
void THP_decodeHalfBuffer(THHalf* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const uint8_t* element = src + i * sizeof(uint16_t);
    // The decoded 16-bit pattern is written through one union member and
    // read back through the other to reinterpret it as a THHalf.
    // NOTE(review): presumably THHalf is a 16-bit type; its definition is
    // not visible from this file, so confirm before changing this.
    union { uint16_t bits; THHalf as_half; };
    if (order == THP_BIG_ENDIAN) {
      bits = decodeUInt16BE(element);
    } else {
      bits = decodeUInt16LE(element);
    }
    dst[i] = as_half;
  }
}
|
|
|
|
void THP_decodeFloatBuffer(float* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
|
{
|
|
for (size_t i = 0; i < len; i++) {
|
|
union { uint32_t x; float f; };
|
|
x = (order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
|
|
dst[i] = f;
|
|
src += sizeof(float);
|
|
}
|
|
}
|
|
|
|
void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order, size_t len)
|
|
{
|
|
for (size_t i = 0; i < len; i++) {
|
|
union { uint64_t x; double d; };
|
|
x = (order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
|
|
dst[i] = d;
|
|
src += sizeof(double);
|
|
}
|
|
}
|
|
|
|
void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order, size_t len)
|
|
{
|
|
memcpy(dst, src, sizeof(int16_t) * len);
|
|
if (order != THP_nativeByteOrder()) {
|
|
for (size_t i = 0; i < len; i++) {
|
|
swapBytes16(dst);
|
|
dst += sizeof(int16_t);
|
|
}
|
|
}
|
|
}
|
|
|
|
void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order, size_t len)
|
|
{
|
|
memcpy(dst, src, sizeof(int32_t) * len);
|
|
if (order != THP_nativeByteOrder()) {
|
|
for (size_t i = 0; i < len; i++) {
|
|
swapBytes32(dst);
|
|
dst += sizeof(int32_t);
|
|
}
|
|
}
|
|
}
|
|
|
|
void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order, size_t len)
|
|
{
|
|
memcpy(dst, src, sizeof(int64_t) * len);
|
|
if (order != THP_nativeByteOrder()) {
|
|
for (size_t i = 0; i < len; i++) {
|
|
swapBytes64(dst);
|
|
dst += sizeof(int64_t);
|
|
}
|
|
}
|
|
}
|
|
|
|
void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, size_t len)
|
|
{
|
|
memcpy(dst, src, sizeof(float) * len);
|
|
if (order != THP_nativeByteOrder()) {
|
|
for (size_t i = 0; i < len; i++) {
|
|
swapBytes32(dst);
|
|
dst += sizeof(float);
|
|
}
|
|
}
|
|
}
|
|
|
|
void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order, size_t len)
|
|
{
|
|
memcpy(dst, src, sizeof(double) * len);
|
|
if (order != THP_nativeByteOrder()) {
|
|
for (size_t i = 0; i < len; i++) {
|
|
swapBytes64(dst);
|
|
dst += sizeof(double);
|
|
}
|
|
}
|
|
}
|