| #ifndef CAFFE2_UTILS_GPU_BITONIC_SORT_H_ |
| #define CAFFE2_UTILS_GPU_BITONIC_SORT_H_ |
| |
| #include "caffe2/utils/math.h" |
| #include "caffe2/utils/GpuDefs.cuh" |
| |
| namespace caffe2 { |
| |
| // Returns true if the given integer type is a power-of-2 (positive only) |
| // Note(jiayq): windows reported an error per |
| // https://github.com/caffe2/caffe2/issues/997 |
| // and as a result will make it a macro. |
| #ifdef _MSC_VER |
| #define integerIsPowerOf2(v) ((v) && !((v) & ((v) - 1))) |
| #else // _MSC_VER |
// True iff v has exactly one bit set, i.e. v is a positive power of two.
// Kept as a single-return constexpr so it remains usable in static_assert
// under C++11.
template <typename T>
constexpr bool integerIsPowerOf2(T v) {
  return (v != 0) && ((v & (v - 1)) == 0);
}
| #endif // _MSC_VER |
| |
/// The maximum number of elements the in-block bitonic sorts in this
/// header support (enforced via static_assert in the sort routines).
constexpr int kMaxBitonicSortSize = 4096;
| |
// Exchanges the contents of t1 and t2 through a temporary copy.
template <typename T>
__device__ inline void swapVars(T& t1, T& t2) {
  T held = t2;
  t2 = t1;
  t1 = held;
}
| |
// One bitonic compare/exchange step: the (key, value) pairs (kA, vA) and
// (kB, vB) are swapped in place exactly when comp(kA, vA, kB, vB) == dir.
// `dir` selects the desired ordering direction for this step, so the same
// comparator serves both the ascending and descending halves of a bitonic
// network.
template <typename Comparator, typename K, typename V>
__device__ inline void bitonicSwap(K& kA, V& vA,
                                   K& kB, V& vB,
                                   bool dir,
                                   const Comparator& comp) {
  bool swap = comp(kA, vA, kB, vB);
  if (swap == dir) {
    swapVars(kA, kB);
    swapVars(vA, vB);
  }
} // fixed: removed stray ';' after the function body (extraneous empty
  // declaration; warns under -Wextra-semi / -pedantic)
| |
// Block-wide bitonic sort of Power2SortSize (key, value) pairs, sorted in
// tandem. The final merge pass passes dir == false to bitonicSwap, so the
// resulting order is the one in which comp(...) reports false for adjacent
// pairs.
//
// NOTE(review): the commented-out static_assert below suggests keys/values
// are expected to live in shared memory — confirm at call sites. Also
// assumes the block runs with exactly ThreadsPerBlock threads and that all
// of them enter this function (required, since __syncthreads() below is
// executed unconditionally) — TODO confirm.
template <typename Comparator, typename K, typename V,
          int Power2SortSize,
          int ThreadsPerBlock>
__device__ inline void bitonicSort(K* keys,
                                   V* values,
                                   const Comparator& comp) {
  static_assert(Power2SortSize <= kMaxBitonicSortSize,
                "sort size <= 4096 only supported");
  // Assume the sort is taking place in shared memory
  // static_assert(Power2SortSize * (sizeof(K) + sizeof(V)) < 32768,
  // "sort data too large (>32768 bytes)");
  static_assert(integerIsPowerOf2(Power2SortSize),
                "sort size must be power of 2");
  static_assert(integerIsPowerOf2(ThreadsPerBlock),
                "threads in block must be power of 2");

  // Each compare/exchange handles a pair of elements, so only
  // Power2SortSize / 2 logical threads are needed per step.
  // If what we are sorting is too small, then not all threads
  // participate
  constexpr int numThreadsForSort = Power2SortSize / 2;
  constexpr bool allThreads = numThreadsForSort >= ThreadsPerBlock;

  // If what we are sorting is too large, then threads must loop more
  // than once
  constexpr int loopPerThread =
      allThreads ? numThreadsForSort / ThreadsPerBlock : 1;

  // Phase 1: build progressively larger bitonic sequences of length
  // `size` (2, 4, ..., Power2SortSize / 2).
#pragma unroll
  for (int size = 2; size < Power2SortSize; size *= 2) {

#pragma unroll
    for (int stride = size / 2; stride > 0; stride /= 2) {

#pragma unroll
      for (int loop = 0; loop < loopPerThread; ++loop) {
        int threadId = loop * ThreadsPerBlock + threadIdx.x;
        // Direction alternates between adjacent `size`-element
        // subsequences, producing the bitonic shape.
        bool flag = ((threadId & (size / 2)) != 0);

        // First element of this thread's pair; its partner is `stride`
        // elements away.
        int pos = 2 * threadId - (threadId & (stride - 1));

        if (allThreads || (threadId < numThreadsForSort)) {
          bitonicSwap<Comparator, K, V>(
            keys[pos], values[pos],
            keys[pos + stride], values[pos + stride],
            flag, comp);
        }

        // Barrier is outside the guard above, so every thread —
        // including non-participants — reaches it.
        __syncthreads();
      }
    }
  }

  // Phase 2: merge the full bitonic sequence into sorted order
  // (uniform direction: dir == false for every exchange).
#pragma unroll
  for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {

#pragma unroll
    for (int loop = 0; loop < loopPerThread; ++loop) {
      int threadId = loop * ThreadsPerBlock + threadIdx.x;

      int pos = 2 * threadId - (threadId & (stride - 1));

      if (allThreads || (threadId < numThreadsForSort)) {
        bitonicSwap<Comparator, K, V>(
          keys[pos], values[pos],
          keys[pos + stride], values[pos + stride],
          false, comp);
      }

      __syncthreads();
    }
  }
}
| |
// Warp-level bitonic sort of Power2SortSize (key, value) pairs, performed
// by the lanes of a single warp (indexing is derived from getLaneId())
// without any __syncthreads(). Structure mirrors bitonicSort above: a
// sequence-building phase followed by a final merge with dir == false.
//
// NOTE(review): lane interactions are ordered only by
// __threadfence_block(), which is a memory fence, not an execution
// barrier — this relies on implicit warp-synchronous execution. Under
// independent thread scheduling (compute capability 7.0+), __syncwarp()
// would be required instead; confirm which architectures this targets.
template <typename Comparator, typename K, typename V, int Power2SortSize>
__device__ inline void warpBitonicSort(K* keys,
                                       V* values,
                                       const Comparator& comp) {
  // Smaller sorts should use a warp shuffle sort
  static_assert(Power2SortSize > kWarpSize,
                "sort not large enough");
  static_assert(integerIsPowerOf2(Power2SortSize),
                "sort size must be power of 2");
  static_assert(Power2SortSize <= kMaxBitonicSortSize,
                "sort size <= 4096 only supported");

  // If what we are sorting is too large, then lanes must loop more
  // than once. (Power2SortSize > kWarpSize guarantees this is >= 1.)
  constexpr int loopPerThread = (Power2SortSize / 2) / kWarpSize;
  int laneId = getLaneId();

  // Phase 1: build bitonic sequences of increasing length `size`.
#pragma unroll
  for (int size = 2; size < Power2SortSize; size *= 2) {

#pragma unroll
    for (int stride = size / 2; stride > 0; stride /= 2) {

#pragma unroll
      for (int loop = 0; loop < loopPerThread; ++loop) {
        int threadId = loop * kWarpSize + laneId;
        // Direction alternates per `size`-element subsequence.
        bool flag = ((threadId & (size / 2)) != 0);

        // First element of this lane's pair; partner is `stride` away.
        int pos = 2 * threadId - (threadId & (stride - 1));

        bitonicSwap<Comparator, K, V>(
          keys[pos], values[pos],
          keys[pos + stride], values[pos + stride],
          flag, comp);

        // Memory fence only — see NOTE(review) in the header comment.
        __threadfence_block();
      }
    }
  }

  // Phase 2: merge into fully sorted order (dir == false throughout).
#pragma unroll
  for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {

#pragma unroll
    for (int loop = 0; loop < loopPerThread; ++loop) {
      int threadId = loop * kWarpSize + laneId;

      int pos = 2 * threadId - (threadId & (stride - 1));

      bitonicSwap<Comparator, K, V>(
        keys[pos], values[pos],
        keys[pos + stride], values[pos + stride],
        false, comp);

      __threadfence_block();
    }
  }
}
| |
| |
| } // namespace caffe2 |
| |
| #endif // CAFFE2_UTILS_GPU_BITONIC_SORT_H_ |