mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 00:21:07 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/39277 This PR contains initial changes that makes PyTorch build with Ampere GPU, CUDA 11, and cuDNN 8. TF32 related features will not be included in this PR. Test Plan: Imported from OSS Differential Revision: D21832814 Pulled By: malfet fbshipit-source-id: 37f9c6827e0c26ae3e303580f666584230832d06
166 lines
4.1 KiB
Plaintext
#ifndef CAFFE2_UTILS_GPU_DEFS_H_
|
|
#define CAFFE2_UTILS_GPU_DEFS_H_
|
|
|
|
#include <cuda_runtime.h>

#include <climits>
#include <cstdint>
|
|
|
|
namespace caffe2 {
|
|
|
|
// Static definition of GPU warp size for unrolling and code generation
#ifdef __CUDA_ARCH__
// Every NVIDIA architecture to date uses a warp size of 32. Do not cap
// __CUDA_ARCH__ here: a "<= 800" check would turn into a hard #error on
// any newer compute capability (e.g. sm_86 = 860, sm_90 = 900) even
// though the warp size is unchanged.
constexpr int kWarpSize = 32;
#elif defined(__HIP_PLATFORM_HCC__)
constexpr int kWarpSize = warpSize; // = 64 (Defined in hip_runtime.h)
#else
// dummy value for host compiler
constexpr int kWarpSize = 32;
#endif // __CUDA_ARCH__
|
|
|
|
//
// Interfaces to PTX instructions for which there appears to be no
// intrinsic
//

// Primary template, intentionally empty: only the unsigned 32-bit and
// 64-bit specializations below provide getBitfield/setBitfield, so using
// Bitfield<T> with any other T fails to compile at the call site.
template <typename T>
struct Bitfield {};
|
|
|
|
// Bit-level extract / insert on 32-bit words.
// CUDA: lowers to the PTX bfe.u32 / bfi.b32 instructions, which have no
// C-level intrinsic. ROCm: the same semantics emulated with shifts/masks.
template <>
struct Bitfield<unsigned int> {
  // Returns the len-bit field of val starting at bit pos, zero-extended.
  static __device__ __forceinline__
  unsigned int getBitfield(unsigned int val, int pos, int len) {
#if defined(__HIP_PLATFORM_HCC__)
    // PTX bfe reads only the low 8 bits of pos/len; mirror that here.
    pos &= 0xff;
    len &= 0xff;

    // NOTE(review): (1u << len) is undefined for len >= 32 -- presumably
    // callers always pass len < 32; confirm against call sites.
    unsigned int m = (1u << len) - 1u;
    return (val >> pos) & m;
#else
    unsigned int ret;
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
    return ret;
#endif // __HIP_PLATFORM_HCC__
  }

  // Returns val with its len-bit field at bit pos replaced by the low
  // len bits of toInsert; all other bits of val are preserved.
  static __device__ __forceinline__
  unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
#if defined(__HIP_PLATFORM_HCC__)
    // PTX bfi reads only the low 8 bits of pos/len; mirror that here.
    pos &= 0xff;
    len &= 0xff;

    unsigned int m = (1u << len) - 1u;
    toInsert &= m;
    toInsert <<= pos;
    m <<= pos;

    return (val & ~m) | toInsert;
#else
    unsigned int ret;
    asm("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return ret;
#endif // __HIP_PLATFORM_HCC__
  }
};
|
|
|
|
// Bit-level extract / insert on 64-bit words.
// CUDA: lowers to the PTX bfe.u64 / bfi.b64 instructions, which have no
// C-level intrinsic. ROCm: the same semantics emulated with shifts/masks.
template <>
struct Bitfield<unsigned long long int> {
  // Returns the len-bit field of val starting at bit pos, zero-extended.
  static __device__ __forceinline__
  unsigned long long int getBitfield(unsigned long long int val, int pos, int len) {
#if defined(__HIP_PLATFORM_HCC__)
    // PTX bfe reads only the low 8 bits of pos/len; mirror that here.
    pos &= 0xff;
    len &= 0xff;

    // The mask must be built with a 64-bit shift: (1u << len) would be
    // computed in 32 bits, truncating the mask (and invoking undefined
    // behavior for len >= 32) on wide fields.
    unsigned long long int m = (1ull << len) - 1ull;
    return (val >> pos) & m;
#else
    unsigned long long int ret;
    asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
    return ret;
#endif // __HIP_PLATFORM_HCC__
  }

  // Returns val with its len-bit field at bit pos replaced by the low
  // len bits of toInsert; all other bits of val are preserved.
  static __device__ __forceinline__
  unsigned long long int setBitfield(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
#if defined(__HIP_PLATFORM_HCC__)
    // PTX bfi reads only the low 8 bits of pos/len; mirror that here.
    pos &= 0xff;
    len &= 0xff;

    // 64-bit shift for the same reason as in getBitfield above.
    unsigned long long int m = (1ull << len) - 1ull;
    toInsert &= m;
    toInsert <<= pos;
    m <<= pos;

    return (val & ~m) | toInsert;
#else
    unsigned long long int ret;
    asm("bfi.b64 %0, %1, %2, %3, %4;" :
        "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
    return ret;
#endif // __HIP_PLATFORM_HCC__
  }
};
|
|
|
|
// Returns this thread's lane index within its warp/wavefront
// (0 .. kWarpSize-1). CUDA: reads the %laneid PTX special register;
// ROCm: uses the __lane_id() device builtin.
__device__ __forceinline__ int getLaneId() {
#if defined(__HIP_PLATFORM_HCC__)
  return __lane_id();
#else
  int laneId;
  asm("mov.s32 %0, %%laneid;" : "=r"(laneId) );
  return laneId;
#endif // __HIP_PLATFORM_HCC__
}
|
|
|
|
#if defined(__HIP_PLATFORM_HCC__)
// ROCm wavefronts are 64 lanes wide, so lane masks are 64-bit and must be
// computed from getLaneId(); CUDA (the #else branch) instead reads the
// 32-bit %lanemask_* PTX special registers directly.

// Mask of all lanes with index strictly less than this thread's lane.
__device__ __forceinline__ unsigned long long int getLaneMaskLt() {
  unsigned long long int m = (1ull << getLaneId()) - 1ull;
  return m;
}

// Mask of all lanes with index less than or equal to this thread's lane.
// NOTE(review): relies on UINT64_MAX / std::uint64_t / CHAR_BIT, which
// come from <cstdint>/<climits> -- presumably pulled in transitively by
// hip_runtime; confirm the includes.
__device__ __forceinline__ unsigned long long int getLaneMaskLe() {
  unsigned long long int m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1));
  return m;
}

// Mask of all lanes with index strictly greater than this thread's lane.
__device__ __forceinline__ unsigned long long int getLaneMaskGt() {
  unsigned long long int m = getLaneMaskLe();
  // NOTE(review): the Le mask always contains this lane's own bit, so m is
  // nonzero for every valid lane and the conditional appears redundant --
  // ~m alone would give the same result.
  return m ? ~m : m;
}

// Mask of all lanes with index greater than or equal to this thread's lane.
__device__ __forceinline__ unsigned long long int getLaneMaskGe() {
  unsigned long long int m = getLaneMaskLt();
  return ~m;
}
#else
// Mask of all lanes with index strictly less than this thread's lane.
__device__ __forceinline__ unsigned getLaneMaskLt() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
  return mask;
}

// Mask of all lanes with index less than or equal to this thread's lane.
__device__ __forceinline__ unsigned getLaneMaskLe() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
  return mask;
}

// Mask of all lanes with index strictly greater than this thread's lane.
__device__ __forceinline__ unsigned getLaneMaskGt() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
  return mask;
}

// Mask of all lanes with index greater than or equal to this thread's lane.
__device__ __forceinline__ unsigned getLaneMaskGe() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
  return mask;
}
#endif // __HIP_PLATFORM_HCC__
|
|
|
|
} // namespace caffe2
|
|
|
|
#endif // CAFFE2_UTILS_GPU_DEFS_H_
|