pytorch/caffe2/utils/GpuBitonicSort.cuh

#ifndef CAFFE2_UTILS_GPU_BITONIC_SORT_H_
#define CAFFE2_UTILS_GPU_BITONIC_SORT_H_

#include "caffe2/utils/math.h"
#include "caffe2/utils/GpuDefs.cuh"

namespace caffe2 {

// Returns true if the given integer type is a power-of-2 (positive only)
// Note(jiayq): windows reported an error per
//     https://github.com/caffe2/caffe2/issues/997
// and as a result will make it a macro.
#ifdef _MSC_VER
#define integerIsPowerOf2(v) ((v) && !((v) & ((v) - 1)))
#else // _MSC_VER
template <typename T>
constexpr bool integerIsPowerOf2(T v) {
  return (v && !(v & (v - 1)));
}
#endif // _MSC_VER

/// The maximum in-block bitonic sort we support
constexpr int kMaxBitonicSortSize = 4096;

template <typename T>
__device__ inline void swapVars(T& t1, T& t2) {
  T tmp = t1;
  t1 = t2;
  t2 = tmp;
}

template <typename Comparator, typename K, typename V>
__device__ inline void bitonicSwap(K& kA, V& vA,
                                   K& kB, V& vB,
                                   bool dir,
                                   const Comparator& comp) {
  bool swap = comp(kA, vA, kB, vB);
  if (swap == dir) {
    swapVars(kA, kB);
    swapVars(vA, vB);
  }
};

template <typename Comparator, typename K, typename V,
          int Power2SortSize,
          int ThreadsPerBlock>
__device__ inline void bitonicSort(K* keys,
                                   V* values,
                                   const Comparator& comp) {
  static_assert(Power2SortSize <= kMaxBitonicSortSize,
                "sort size <= 4096 only supported");
  // Assume the sort is taking place in shared memory
  // static_assert(Power2SortSize * (sizeof(K) + sizeof(V)) < 32768,
  //               "sort data too large (>32768 bytes)");
  static_assert(integerIsPowerOf2(Power2SortSize),
                "sort size must be power of 2");
  static_assert(integerIsPowerOf2(ThreadsPerBlock),
                "threads in block must be power of 2");

  // If what we are sorting is too small, then not all threads
  // participate
  constexpr int numThreadsForSort = Power2SortSize / 2;
  constexpr bool allThreads = numThreadsForSort >= ThreadsPerBlock;

  // If what we are sorting is too large, then threads must loop more
  // than once
  constexpr int loopPerThread =
    allThreads ? numThreadsForSort / ThreadsPerBlock : 1;

#pragma unroll
  for (int size = 2; size < Power2SortSize; size *= 2) {

#pragma unroll
    for (int stride = size / 2; stride > 0; stride /= 2) {

#pragma unroll
      for (int loop = 0; loop < loopPerThread; ++loop) {
        int threadId = loop * ThreadsPerBlock + threadIdx.x;
        bool flag = ((threadId & (size / 2)) != 0);

        int pos = 2 * threadId - (threadId & (stride - 1));

        if (allThreads || (threadId < numThreadsForSort)) {
          bitonicSwap<Comparator, K, V>(
            keys[pos], values[pos],
            keys[pos + stride], values[pos + stride],
            flag, comp);
        }

        __syncthreads();
      }
    }
  }

#pragma unroll
  for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {

#pragma unroll
    for (int loop = 0; loop < loopPerThread; ++loop) {
      int threadId = loop * ThreadsPerBlock + threadIdx.x;

      int pos = 2 * threadId - (threadId & (stride - 1));

      if (allThreads || (threadId < numThreadsForSort)) {
        bitonicSwap<Comparator, K, V>(
          keys[pos], values[pos],
          keys[pos + stride], values[pos + stride],
          false, comp);
      }

      __syncthreads();
    }
  }
}

template <typename Comparator, typename K, typename V, int Power2SortSize>
__device__ inline void warpBitonicSort(K* keys,
                                       V* values,
                                       const Comparator& comp) {
  // Smaller sorts should use a warp shuffle sort
  static_assert(Power2SortSize > kWarpSize,
                "sort not large enough");
  static_assert(integerIsPowerOf2(Power2SortSize),
                "sort size must be power of 2");
  static_assert(Power2SortSize <= kMaxBitonicSortSize,
                "sort size <= 4096 only supported");

  // If what we are sorting is too large, then lanes must loop more
  // than once
  constexpr int loopPerThread = (Power2SortSize / 2) / kWarpSize;
  int laneId = getLaneId();

#pragma unroll
  for (int size = 2; size < Power2SortSize; size *= 2) {

#pragma unroll
    for (int stride = size / 2; stride > 0; stride /= 2) {

#pragma unroll
      for (int loop = 0; loop < loopPerThread; ++loop) {
        int threadId = loop * kWarpSize + laneId;
        bool flag = ((threadId & (size / 2)) != 0);

        int pos = 2 * threadId - (threadId & (stride - 1));

        bitonicSwap<Comparator, K, V>(
          keys[pos], values[pos],
          keys[pos + stride], values[pos + stride],
          flag, comp);

        __threadfence_block();
      }
    }
  }

#pragma unroll
  for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {

#pragma unroll
    for (int loop = 0; loop < loopPerThread; ++loop) {
      int threadId = loop * kWarpSize + laneId;

      int pos = 2 * threadId - (threadId & (stride - 1));

      bitonicSwap<Comparator, K, V>(
        keys[pos], values[pos],
        keys[pos + stride], values[pos + stride],
        false, comp);

      __threadfence_block();
    }
  }
}


}  // namespace caffe2

#endif  // CAFFE2_UTILS_GPU_BITONIC_SORT_H_