From 7352affcc8b46f25ef0a99ed7876f006af856b2f Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Wed, 30 Dec 2020 18:07:49 -0600
Subject: [PATCH] Don't use warp aggregated intrinsics; the compiler for cuda9
 and on can do a better job\n\nMaybe older versions of cuda would also perform
 better

---
 src/CudaUtil.cu  | 20 --------------------
 src/CudaUtil.cuh | 11 ++++++++++-
 2 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/src/CudaUtil.cu b/src/CudaUtil.cu
index 5068fe8..8a02339 100644
--- a/src/CudaUtil.cu
+++ b/src/CudaUtil.cu
@@ -32,26 +32,6 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
 	res = warp_bcast(res,leader);
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }
-#else
-__inline__ __device__ uint __lanemask_lt()
-{
-    uint mask;
-    asm( "mov.u32 %0, %lanemask_lt;" : "=r"( mask ) );
-    return mask;
-}
-__device__ int atomicAggInc(int *ctr, int warpLane) 
-{
-    // unsigned int active = __ballot_sync(0xFFFFFFFF, 1);
-    unsigned int active = __activemask();
-    int leader = __ffs(active) - 1;
-    int change = __popc(active);
-    unsigned int rank = __popc(active & __lanemask_lt());
-    int warp_res;
-    if(rank == 0)
-        warp_res = atomicAdd(ctr, change);
-    warp_res = __shfl_sync(active, warp_res, leader);
-    return warp_res + rank;
-}
 #endif
 
 __global__
diff --git a/src/CudaUtil.cuh b/src/CudaUtil.cuh
index cf2fbab..61f1811 100644
--- a/src/CudaUtil.cuh
+++ b/src/CudaUtil.cuh
@@ -2,9 +2,18 @@
 #include "useful.h"
 #define WARPSIZE 32
 
-
 extern __device__ int warp_bcast(int v, int leader);
+
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION < 9000)
 extern __device__ int atomicAggInc(int *ctr, int warpLane);
+#else
+__device__ inline int atomicAggInc(int *ctr, int warpLane) {
+    return atomicAdd(ctr, 1);
+}
+#endif
+
 extern __global__
 void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector);
 
-- 
GitLab