diff --git a/src/CudaUtil.cu b/src/CudaUtil.cu
index 5068fe80f16696579318b445d9b44acfac076dc8..8a023397f9f7a705eb8a16ca15259fe23d0f0c5a 100644
--- a/src/CudaUtil.cu
+++ b/src/CudaUtil.cu
@@ -32,26 +32,6 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
 	res = warp_bcast(res,leader);
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }
-#else
-__inline__ __device__ uint __lanemask_lt()
-{
-    uint mask;
-    asm( "mov.u32 %0, %lanemask_lt;" : "=r"( mask ) );
-    return mask;
-}
-__device__ int atomicAggInc(int *ctr, int warpLane) 
-{
-    // unsigned int active = __ballot_sync(0xFFFFFFFF, 1);
-    unsigned int active = __activemask();
-    int leader = __ffs(active) - 1;
-    int change = __popc(active);
-    unsigned int rank = __popc(active & __lanemask_lt());
-    int warp_res;
-    if(rank == 0)
-        warp_res = atomicAdd(ctr, change);
-    warp_res = __shfl_sync(active, warp_res, leader);
-    return warp_res + rank;
-}
 #endif
 
 __global__
diff --git a/src/CudaUtil.cuh b/src/CudaUtil.cuh
index cf2fbabd121ffa3cc18d3a2633984cfeb986abbc..61f1811b85ad320d373d1c36b305855675dc8c30 100644
--- a/src/CudaUtil.cuh
+++ b/src/CudaUtil.cuh
@@ -2,9 +2,18 @@
 #include "useful.h"
 #define WARPSIZE 32
 
-
 extern __device__ int warp_bcast(int v, int leader);
+
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION < 9000)
 extern __device__ int atomicAggInc(int *ctr, int warpLane);
+#else
+__device__ inline int atomicAggInc(int *ctr, int warpLane) { // CUDART_VERSION >= 9000 path: increments *ctr by 1; warpLane is unused here (kept so the signature matches the pre-9.0 declaration)
+    return atomicAdd(ctr, 1); // one atomic per calling thread; returns the value *ctr held before the add. NOTE(review): presumably relies on nvcc (CUDA 9+) aggregating warp atomics itself -- confirm
+}
+#endif
+
 extern __global__
 void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector);