From 7352affcc8b46f25ef0a99ed7876f006af856b2f Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Wed, 30 Dec 2020 18:07:49 -0600 Subject: [PATCH] Don't use warp aggregated intrinsics; the compiler for cuda9 and on can do a better job\n\nMaybe older versions of cuda would also perform better --- src/CudaUtil.cu | 20 -------------------- src/CudaUtil.cuh | 11 ++++++++++- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/src/CudaUtil.cu b/src/CudaUtil.cu index 5068fe8..8a02339 100644 --- a/src/CudaUtil.cu +++ b/src/CudaUtil.cu @@ -32,26 +32,6 @@ __device__ int atomicAggInc(int *ctr, int warpLane) { res = warp_bcast(res,leader); return res + __popc( mask & ((1 << warpLane) - 1) ); } -#else -__inline__ __device__ uint __lanemask_lt() -{ - uint mask; - asm( "mov.u32 %0, %lanemask_lt;" : "=r"( mask ) ); - return mask; -} -__device__ int atomicAggInc(int *ctr, int warpLane) -{ - // unsigned int active = __ballot_sync(0xFFFFFFFF, 1); - unsigned int active = __activemask(); - int leader = __ffs(active) - 1; - int change = __popc(active); - unsigned int rank = __popc(active & __lanemask_lt()); - int warp_res; - if(rank == 0) - warp_res = atomicAdd(ctr, change); - warp_res = __shfl_sync(active, warp_res, leader); - return warp_res + rank; -} #endif __global__ diff --git a/src/CudaUtil.cuh b/src/CudaUtil.cuh index cf2fbab..61f1811 100644 --- a/src/CudaUtil.cuh +++ b/src/CudaUtil.cuh @@ -2,9 +2,18 @@ #include "useful.h" #define WARPSIZE 32 - extern __device__ int warp_bcast(int v, int leader); + +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! +#elif (CUDART_VERSION < 9000) extern __device__ int atomicAggInc(int *ctr, int warpLane); +#else +__device__ inline int atomicAggInc(int *ctr, int warpLane) { + return atomicAdd(ctr, 1); +} +#endif + extern __global__ void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector); -- GitLab