pytorch/torch/csrc/byte_order.cpp
Owen Anderson 0b78ae86c5 Cleanup byte swapping utilities to generate optimal code on the platforms we care about. (#11394)
Summary:
While the use of memcpy as part of the byte swapping sequence looks funky, all major
compilers recognize and optimize this pattern reliably, resulting in essentially
optimal code generation.

For example, decodeUInt32LE goes from this on iOS arm64:
>         ldrb    w8, [x0, #3]
>         ldrb    w9, [x0, #2]
>         bfi     w8, w9, #8, #8
>         ldrb    w9, [x0, #1]
>         bfi     w8, w9, #16, #8
>         ldrb            w9, [x0]
>         bfi     w8, w9, #24, #8
>         mov      x0, x8
>         ret

To this:
>         ldr             w8, [x0]
>         rev     w0, w8
>         ret
Pull Request resolved: https://github.com/pytorch/pytorch/pull/11394

Reviewed By: SsnL

Differential Revision: D9728659

Pulled By: resistor

fbshipit-source-id: 9afbd4adfad1d1fb7b01f1179e6707ee21fa726f
2018-09-10 15:40:24 -07:00

209 lines
5.5 KiB
C++

#include "byte_order.h"
#include <string.h>
#if defined(_MSC_VER)
#include <stdlib.h>
#endif
// Reverses, in place, the two bytes of the 16-bit value stored at `ptr`.
// The value is moved in and out with memcpy, so `ptr` does not have to be
// aligned for uint16_t.
static inline void swapBytes16(void *ptr)
{
  uint16_t value;
  memcpy(&value, ptr, sizeof(value));
#if defined(_MSC_VER) && !defined(_DEBUG)
  // MSVC intrinsic; this branch is only taken when _DEBUG is not defined.
  value = _byteswap_ushort(value);
#elif defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC))
  value = __builtin_bswap16(value);
#else
  // Portable fallback: exchange the high and low bytes by shifting.
  value = (uint16_t)((uint16_t)(value >> 8) | (uint16_t)(value << 8));
#endif
  memcpy(ptr, &value, sizeof(value));
}
// Reverses, in place, the four bytes of the 32-bit value stored at `ptr`.
// The value is moved in and out with memcpy, so `ptr` does not have to be
// aligned for uint32_t.
static inline void swapBytes32(void *ptr)
{
  uint32_t value;
  memcpy(&value, ptr, sizeof(value));
#if defined(_MSC_VER) && !defined(_DEBUG)
  // MSVC intrinsic; this branch is only taken when _DEBUG is not defined.
  value = _byteswap_ulong(value);
#elif defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC))
  value = __builtin_bswap32(value);
#else
  // Portable fallback: move each byte to its mirrored position.
  const uint32_t toTop = value << 24;                    // byte 0 -> byte 3
  const uint32_t toUpper = (value << 8) & 0x00FF0000u;   // byte 1 -> byte 2
  const uint32_t toLower = (value >> 8) & 0x0000FF00u;   // byte 2 -> byte 1
  const uint32_t toBottom = value >> 24;                 // byte 3 -> byte 0
  value = toTop | toUpper | toLower | toBottom;
#endif
  memcpy(ptr, &value, sizeof(value));
}
// Reverses, in place, the eight bytes of the 64-bit value stored at `ptr`.
// The value is moved in and out with memcpy, so `ptr` does not have to be
// aligned for uint64_t.
static inline void swapBytes64(void *ptr)
{
  uint64_t output;
  memcpy(&output, ptr, sizeof(uint64_t));
#if defined(_MSC_VER) && !defined(_DEBUG)
  // MSVC intrinsic; this branch is only taken when _DEBUG is not defined.
  output = _byteswap_uint64(output);
#elif defined(__llvm__) || (defined(__GNUC__) && !defined(__ICC))
  output = __builtin_bswap64(output);
#else
  // Portable fallback (e.g. MSVC with _DEBUG defined): peel bytes off the
  // low end of the input and push them onto the low end of the result, so
  // the first byte taken ends up most significant.
  uint64_t reversed = 0;
  for (size_t i = 0; i < sizeof(uint64_t); i++) {
    reversed = (reversed << 8) | (output & 0xFF);
    output >>= 8;
  }
  output = reversed;
#endif
  memcpy(ptr, &output, sizeof(uint64_t));
}
// Decodes the two bytes at `data` as a little-endian unsigned 16-bit integer.
// `data` does not have to be aligned.
static inline uint16_t decodeUInt16LE(const uint8_t *data) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  // Big-endian host: a raw memcpy would yield the bytes in the wrong order,
  // so assemble the value explicitly (data[0] is the least significant byte).
  return (uint16_t)((uint16_t)data[0] | (uint16_t)((uint16_t)data[1] << 8));
#else
  // Little-endian host (or unknown, matching the historical assumption):
  // the in-memory layout already is the value, so a plain copy suffices.
  uint16_t output;
  memcpy(&output, data, sizeof(uint16_t));
  return output;
#endif
}
// Decodes the two bytes at `data` as a big-endian unsigned 16-bit integer:
// the bytes are first read with decodeUInt16LE and the result is then
// byte-reversed.
static inline uint16_t decodeUInt16BE(const uint8_t *data) {
  uint16_t value = decodeUInt16LE(data);
  swapBytes16(&value);
  return value;
}
// Decodes the four bytes at `data` as a little-endian unsigned 32-bit
// integer. `data` does not have to be aligned.
static inline uint32_t decodeUInt32LE(const uint8_t *data) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  // Big-endian host: a raw memcpy would yield the bytes in the wrong order,
  // so assemble the value explicitly (data[0] is the least significant byte).
  return (uint32_t)data[0] |
         ((uint32_t)data[1] << 8) |
         ((uint32_t)data[2] << 16) |
         ((uint32_t)data[3] << 24);
#else
  // Little-endian host (or unknown, matching the historical assumption):
  // the in-memory layout already is the value, so a plain copy suffices.
  uint32_t output;
  memcpy(&output, data, sizeof(uint32_t));
  return output;
#endif
}
// Decodes the four bytes at `data` as a big-endian unsigned 32-bit integer:
// the bytes are first read with decodeUInt32LE and the result is then
// byte-reversed.
static inline uint32_t decodeUInt32BE(const uint8_t *data) {
  uint32_t value = decodeUInt32LE(data);
  swapBytes32(&value);
  return value;
}
// Decodes the eight bytes at `data` as a little-endian unsigned 64-bit
// integer. `data` does not have to be aligned.
static inline uint64_t decodeUInt64LE(const uint8_t *data) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  // Big-endian host: a raw memcpy would yield the bytes in the wrong order.
  // Walk from the most significant byte (data[7]) down to data[0], shifting
  // the accumulator left by one byte each step.
  uint64_t value = 0;
  for (size_t i = sizeof(uint64_t); i-- > 0;) {
    value = (value << 8) | (uint64_t)data[i];
  }
  return value;
#else
  // Little-endian host (or unknown, matching the historical assumption):
  // the in-memory layout already is the value, so a plain copy suffices.
  uint64_t output;
  memcpy(&output, data, sizeof(uint64_t));
  return output;
#endif
}
// Decodes the eight bytes at `data` as a big-endian unsigned 64-bit integer:
// the bytes are first read with decodeUInt64LE and the result is then
// byte-reversed.
static inline uint64_t decodeUInt64BE(const uint8_t *data) {
  uint64_t value = decodeUInt64LE(data);
  swapBytes64(&value);
  return value;
}
// Reports the byte order of the machine this code is running on.
// Stores the value 1 in a 32-bit word and inspects the byte at the lowest
// address: non-zero means the least significant byte comes first.
THPByteOrder THP_nativeByteOrder()
{
  uint32_t probe = 1;
  const uint8_t lowestByte = *(uint8_t*)&probe;
  if (lowestByte) {
    return THP_LITTLE_ENDIAN;
  }
  return THP_BIG_ENDIAN;
}
// Decodes `len` 16-bit integers from the byte stream `src` into `dst`.
// `order` is the byte order of the data in `src`: THP_BIG_ENDIAN selects the
// big-endian decoder, any other value selects the little-endian decoder.
// `src` must provide len * sizeof(int16_t) bytes.
void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
  // The order is the same for every element, so pick the decoder once.
  if (order == THP_BIG_ENDIAN) {
    for (size_t idx = 0; idx < len; ++idx) {
      dst[idx] = (int16_t) decodeUInt16BE(src + idx * sizeof(int16_t));
    }
  } else {
    for (size_t idx = 0; idx < len; ++idx) {
      dst[idx] = (int16_t) decodeUInt16LE(src + idx * sizeof(int16_t));
    }
  }
}
// Decodes `len` 32-bit integers from the byte stream `src` into `dst`.
// `order` is the byte order of the data in `src`: THP_BIG_ENDIAN selects the
// big-endian decoder, any other value selects the little-endian decoder.
// `src` must provide len * sizeof(int32_t) bytes.
void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
  // The order is the same for every element, so pick the decoder once.
  if (order == THP_BIG_ENDIAN) {
    for (size_t idx = 0; idx < len; ++idx) {
      dst[idx] = (int32_t) decodeUInt32BE(src + idx * sizeof(int32_t));
    }
  } else {
    for (size_t idx = 0; idx < len; ++idx) {
      dst[idx] = (int32_t) decodeUInt32LE(src + idx * sizeof(int32_t));
    }
  }
}
// Decodes `len` 64-bit integers from the byte stream `src` into `dst`.
// `order` is the byte order of the data in `src`: THP_BIG_ENDIAN selects the
// big-endian decoder, any other value selects the little-endian decoder.
// `src` must provide len * sizeof(int64_t) bytes.
void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
  // The order is the same for every element, so pick the decoder once.
  if (order == THP_BIG_ENDIAN) {
    for (size_t idx = 0; idx < len; ++idx) {
      dst[idx] = (int64_t) decodeUInt64BE(src + idx * sizeof(int64_t));
    }
  } else {
    for (size_t idx = 0; idx < len; ++idx) {
      dst[idx] = (int64_t) decodeUInt64LE(src + idx * sizeof(int64_t));
    }
  }
}
// Decodes `len` half-precision values from the byte stream `src` into `dst`.
// Each element is read as a 16-bit pattern in the byte order given by
// `order` (THP_BIG_ENDIAN selects the big-endian decoder, anything else the
// little-endian one) and stored as a THHalf with the same bits.
void THP_decodeHalfBuffer(THHalf* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const uint8_t* elem = src + idx * sizeof(uint16_t);
    // Reinterpret the decoded 16-bit pattern as a THHalf through a union.
    // NOTE(review): relies on the compiler permitting reads of the union
    // member that was not last written — confirm for new toolchains.
    union { uint16_t bits; THHalf half; };
    if (order == THP_BIG_ENDIAN) {
      bits = decodeUInt16BE(elem);
    } else {
      bits = decodeUInt16LE(elem);
    }
    dst[idx] = half;
  }
}
// Decodes `len` single-precision floats from the byte stream `src` into
// `dst`. Each element is read as a 32-bit pattern in the byte order given by
// `order` (THP_BIG_ENDIAN selects the big-endian decoder, anything else the
// little-endian one); the pattern is then stored bit-for-bit as a float.
// `src` must provide len * sizeof(float) bytes.
void THP_decodeFloatBuffer(float* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const uint32_t bits = (order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
    // Copy the bit pattern with memcpy rather than reading the inactive
    // member of a union, which is undefined behaviour in C++.
    memcpy(&dst[i], &bits, sizeof(bits));
    src += sizeof(float);
  }
}
// Decodes `len` double-precision floats from the byte stream `src` into
// `dst`. Each element is read as a 64-bit pattern in the byte order given by
// `order` (THP_BIG_ENDIAN selects the big-endian decoder, anything else the
// little-endian one); the pattern is then stored bit-for-bit as a double.
// `src` must provide len * sizeof(double) bytes.
void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const uint64_t bits = (order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
    // Copy the bit pattern with memcpy rather than reading the inactive
    // member of a union, which is undefined behaviour in C++.
    memcpy(&dst[i], &bits, sizeof(bits));
    src += sizeof(double);
  }
}
// Encodes `len` 16-bit integers from `src` into the byte buffer `dst` using
// byte order `order`. `dst` must have room for len * sizeof(int16_t) bytes.
void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order, size_t len)
{
  // Start from a verbatim copy, which is already correct for native order.
  memcpy(dst, src, sizeof(int16_t) * len);
  if (order == THP_nativeByteOrder()) {
    return;
  }
  // Requested order differs from the host's: reverse each element in place.
  for (size_t idx = 0; idx < len; ++idx) {
    swapBytes16(dst + idx * sizeof(int16_t));
  }
}
// Encodes `len` 32-bit integers from `src` into the byte buffer `dst` using
// byte order `order`. `dst` must have room for len * sizeof(int32_t) bytes.
void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order, size_t len)
{
  // Start from a verbatim copy, which is already correct for native order.
  memcpy(dst, src, sizeof(int32_t) * len);
  if (order == THP_nativeByteOrder()) {
    return;
  }
  // Requested order differs from the host's: reverse each element in place.
  for (size_t idx = 0; idx < len; ++idx) {
    swapBytes32(dst + idx * sizeof(int32_t));
  }
}
// Encodes `len` 64-bit integers from `src` into the byte buffer `dst` using
// byte order `order`. `dst` must have room for len * sizeof(int64_t) bytes.
void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order, size_t len)
{
  // Start from a verbatim copy, which is already correct for native order.
  memcpy(dst, src, sizeof(int64_t) * len);
  if (order == THP_nativeByteOrder()) {
    return;
  }
  // Requested order differs from the host's: reverse each element in place.
  for (size_t idx = 0; idx < len; ++idx) {
    swapBytes64(dst + idx * sizeof(int64_t));
  }
}
// Encodes `len` single-precision floats from `src` into the byte buffer
// `dst` using byte order `order`. `dst` must have room for
// len * sizeof(float) bytes.
void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, size_t len)
{
  // Start from a verbatim copy, which is already correct for native order.
  memcpy(dst, src, sizeof(float) * len);
  if (order == THP_nativeByteOrder()) {
    return;
  }
  // Requested order differs from the host's: reverse each 4-byte element
  // in place.
  for (size_t idx = 0; idx < len; ++idx) {
    swapBytes32(dst + idx * sizeof(float));
  }
}
// Encodes `len` double-precision floats from `src` into the byte buffer
// `dst` using byte order `order`. `dst` must have room for
// len * sizeof(double) bytes.
void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order, size_t len)
{
  // Start from a verbatim copy, which is already correct for native order.
  memcpy(dst, src, sizeof(double) * len);
  if (order == THP_nativeByteOrder()) {
    return;
  }
  // Requested order differs from the host's: reverse each 8-byte element
  // in place.
  for (size_t idx = 0; idx < len; ++idx) {
    swapBytes64(dst + idx * sizeof(double));
  }
}