From 0553b235847975a31bd2af329bb82e0f431d24a0 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Wed, 11 Dec 2019 15:32:26 -0600
Subject: [PATCH] CudaUtil.cu works for CUDA versions < 9

---
 src/CudaUtil.cu | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/CudaUtil.cu b/src/CudaUtil.cu
index a5f131a..eb23592 100644
--- a/src/CudaUtil.cu
+++ b/src/CudaUtil.cu
@@ -1,6 +1,11 @@
 #include "CudaUtil.cuh"
+#include <cuda_runtime_api.h>
 
-/*#if __CUDA_ARCH__ < 300
+#ifndef CUDART_VERSION
+#error CUDART_VERSION Undefined!
+#elif (CUDART_VERSION < 9000)
+
+#if __CUDA_ARCH__ < 300
 volatile extern __shared__ int sh[];
 __device__ int warp_bcast(int v, int leader) {
     // WARNING: might not be safe to call in divergent branches
@@ -12,18 +17,10 @@ __device__ int warp_bcast(int v, int leader) {
 }
 #elif __CUDA_ARCH__ < 700
 __device__ int warp_bcast(int v, int leader) {return __shfl(v, leader); }
-#else*/
-//__device__ int warp_bcast(int v, int leader, int srcLane) {return __shfl_sync(v, leader, srcLane); }
-__inline__ __device__ uint __lanemask_lt()
-{
-    uint mask;
-    asm( "mov.u32 %0, %lanemask_lt;" : "=r"( mask ) );
-    return mask;
-}
-//#endif
-
+#else
+__device__ int warp_bcast(int v, int leader) {return __shfl(v, leader); }
+#endif
 
-/*#if __CUDA_ARCH__ < 700
 __device__ int atomicAggInc(int *ctr, int warpLane) {
     // https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
     int mask = __ballot(1);
@@ -35,10 +32,16 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
     res = warp_bcast(res,leader);
     return res + __popc( mask & ((1 << warpLane) - 1) );
 }
-#else*/
+#else
+__inline__ __device__ uint __lanemask_lt()
+{
+    uint mask;
+    asm( "mov.u32 %0, %lanemask_lt;" : "=r"( mask ) );
+    return mask;
+}
 __device__ int atomicAggInc(int *ctr, int warpLane)
 {
-    unsigned int active = __activemask();
+    unsigned int active = __ballot_sync(0xFFFFFFFF, 1);
     int leader = __ffs(active) - 1;
     int change = __popc(active);
     unsigned int rank = __popc(active & __lanemask_lt());
@@ -48,7 +51,8 @@ __device__ int atomicAggInc(int *ctr, int warpLane)
     warp_res = __shfl_sync(active, warp_res, leader);
     return warp_res + rank;
 }
-//#endif
+#endif
+
 __global__ void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector) {
     extern __shared__ Vector3 blockVector[];
-- 
GitLab
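
For context, the atomicAggInc guarded by this patch is the warp-aggregated atomic increment from the NVIDIA blog post cited in the diff: the active lanes of a warp elect a leader, the leader performs a single atomicAdd on behalf of the whole warp, and each lane receives a distinct slot (the leader's result plus that lane's rank among the participants). Below is a minimal usage sketch, not part of the patch; the kernel and buffer names are hypothetical, and the launch is assumed to keep every warp fully converged at the call site, since the CUDA >= 9 path builds its lane mask with __ballot_sync(0xFFFFFFFF, 1):

// Hypothetical usage sketch (not from this patch): each thread claims a
// unique slot in `out` using one atomicAdd per warp rather than one per
// thread. Assumes the patched atomicAggInc(int*, int) is in scope and that
// n is a multiple of warpSize, so no warp is partially active when the
// __ballot_sync(0xFFFFFFFF, 1) variant votes.
__global__ void claimSlots(const int* __restrict__ src,
                           int* __restrict__ out,
                           int* counter, const int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;                          // warp-uniform when n % 32 == 0
    const int warpLane = threadIdx.x % warpSize; // lane index expected by atomicAggInc
    const int slot = atomicAggInc(counter, warpLane); // unique index across the grid
    out[slot] = src[i];
}

On the pre-CUDA-9 path, __ballot(1) restricts the vote to the lanes that actually reach the call, so the same helper also supports divergent call sites, such as the filtering example in the linked blog post.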