Don't use warp aggregated intrinsics; the compiler for cuda9 and on can do a...

Don't use warp-aggregated intrinsics; the compiler for CUDA 9 and later can do a better job. Older versions of CUDA might also perform better without them.

Don't use warp aggregated intrinsics; the compiler for cuda9 and on can do a...
7352affc · cmaffeo2 · 367fb57f · 7352affc · 7352affc
Commit 7352affc authored 4 years ago by cmaffeo2
--- a/src/CudaUtil.cu
+++ b/src/CudaUtil.cu
@@ -32,26 +32,6 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
 	res = warp_bcast(res,leader);
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }
-#else
-__inline__ __device__ uint __lanemask_lt()
-{
-    uint mask;
-    asm( "mov.u32 %0, %lanemask_lt;" : "=r"( mask ) );
-    return mask;
-}
-__device__ int atomicAggInc(int *ctr, int warpLane) 
-{
-    // unsigned int active = __ballot_sync(0xFFFFFFFF, 1);
-    unsigned int active = __activemask();
-    int leader = __ffs(active) - 1;
-    int change = __popc(active);
-    unsigned int rank = __popc(active & __lanemask_lt());
-    int warp_res;
-    if(rank == 0)
-        warp_res = atomicAdd(ctr, change);
-    warp_res = __shfl_sync(active, warp_res, leader);
-    return warp_res + rank;
-}
 #endif

 __global__

--- a/src/CudaUtil.cuh
+++ b/src/CudaUtil.cuh
@@ -2,9 +2,18 @@
 #include "useful.h"
 #define WARPSIZE 32

-
 extern __device__ int warp_bcast(int v, int leader);
+
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION < 9000)
 extern __device__ int atomicAggInc(int *ctr, int warpLane);
+#else
+__device__ inline int atomicAggInc(int *ctr, int warpLane) {
+    return atomicAdd(ctr, 1);
+}
+#endif
+
 extern __global__
 void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector);