diff --git a/src/CudaUtil.cu b/src/CudaUtil.cu index 5068fe80f16696579318b445d9b44acfac076dc8..8a023397f9f7a705eb8a16ca15259fe23d0f0c5a 100644 --- a/src/CudaUtil.cu +++ b/src/CudaUtil.cu @@ -32,26 +32,6 @@ __device__ int atomicAggInc(int *ctr, int warpLane) { res = warp_bcast(res,leader); return res + __popc( mask & ((1 << warpLane) - 1) ); } -#else -__inline__ __device__ uint __lanemask_lt() -{ - uint mask; - asm( "mov.u32 %0, %lanemask_lt;" : "=r"( mask ) ); - return mask; -} -__device__ int atomicAggInc(int *ctr, int warpLane) -{ - // unsigned int active = __ballot_sync(0xFFFFFFFF, 1); - unsigned int active = __activemask(); - int leader = __ffs(active) - 1; - int change = __popc(active); - unsigned int rank = __popc(active & __lanemask_lt()); - int warp_res; - if(rank == 0) - warp_res = atomicAdd(ctr, change); - warp_res = __shfl_sync(active, warp_res, leader); - return warp_res + rank; -} #endif __global__ diff --git a/src/CudaUtil.cuh b/src/CudaUtil.cuh index cf2fbabd121ffa3cc18d3a2633984cfeb986abbc..61f1811b85ad320d373d1c36b305855675dc8c30 100644 --- a/src/CudaUtil.cuh +++ b/src/CudaUtil.cuh @@ -2,9 +2,18 @@ #include "useful.h" #define WARPSIZE 32 - extern __device__ int warp_bcast(int v, int leader); + +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! +#elif (CUDART_VERSION < 9000) extern __device__ int atomicAggInc(int *ctr, int warpLane); +#else +__device__ inline int atomicAggInc(int *ctr, int warpLane) { + return atomicAdd(ctr, 1); +} +#endif + extern __global__ void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector);