Skip to content
Snippets Groups Projects
Commit 7352affc authored by cmaffeo2's avatar cmaffeo2
Browse files

Don't use warp aggregated intrinsics; the compiler for cuda9 and on can do a...

Don't use warp-aggregated intrinsics; the compiler for CUDA 9 and later can do a better job. Maybe older versions of CUDA would also perform better.
parent 367fb57f
No related branches found
No related tags found
No related merge requests found
......@@ -32,26 +32,6 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
res = warp_bcast(res,leader);
return res + __popc( mask & ((1 << warpLane) - 1) );
}
#else
/**
 * Read the PTX special register %lanemask_lt for the calling thread.
 *
 * @return The 32-bit value of %lanemask_lt. Per the PTX ISA this is a mask
 *         with a bit set for every lane whose id is lower than the caller's.
 */
__inline__ __device__ uint __lanemask_lt()
{
    uint lowerLanes;
    // Copy the special register into an ordinary 32-bit register operand.
    asm( "mov.u32 %0, %lanemask_lt;" : "=r"( lowerLanes ) );
    return lowerLanes;
}
/**
 * Warp-aggregated atomic increment of a counter.
 *
 * The lanes that are currently converged at this call elect the lowest-numbered
 * one as leader. The leader performs a single atomicAdd of the number of
 * participating lanes, and every lane then derives its own unique slot from
 * the leader's result.
 *
 * @param ctr       Counter to increment; one slot is reserved per calling lane.
 * @param warpLane  Unused by this variant; kept so the signature matches the
 *                  other atomicAggInc implementation.
 * @return The value a per-thread atomicAdd(ctr, 1) would have handed to this
 *         lane (leader's old counter value plus this lane's rank).
 *
 * NOTE(review): __activemask() reports whichever lanes happen to be converged
 * here, which under independent thread scheduling may be a subset of the lanes
 * that will eventually call this function. The result is still correct (each
 * converged group performs its own atomicAdd), only less aggregated.
 */
__device__ int atomicAggInc(int *ctr, int warpLane)
{
    (void)warpLane;  // not needed: the rank is derived from the active mask

    // A full-warp __ballot_sync(0xFFFFFFFF, 1) is deliberately not used here;
    // the set of participating lanes is taken from __activemask() instead.
    const unsigned int active = __activemask();
    const int leader = __ffs(active) - 1;   // __ffs is 1-based; lowest active lane leads
    const int change = __popc(active);      // number of participating lanes
    const unsigned int rank = __popc(active & __lanemask_lt());  // active lanes below this one

    // Initialise so that non-leader lanes do not feed an indeterminate value
    // into __shfl_sync (every lane passes its own copy as the source operand).
    int warp_res = 0;
    if (rank == 0)
        warp_res = atomicAdd(ctr, change);

    // Broadcast the leader's pre-increment counter value to all participants.
    warp_res = __shfl_sync(active, warp_res, leader);
    return warp_res + rank;
}
#endif
__global__
......
......@@ -2,9 +2,18 @@
#include "useful.h"
#define WARPSIZE 32
extern __device__ int warp_bcast(int v, int leader);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION < 9000)
extern __device__ int atomicAggInc(int *ctr, int warpLane);
#else
/**
 * Increment a counter by one with a plain per-thread atomicAdd.
 *
 * This definition is selected when CUDART_VERSION >= 9000; no explicit warp
 * aggregation is performed here.
 *
 * @param ctr       Counter to increment atomically.
 * @param warpLane  Unused in this variant; present for signature compatibility
 *                  with the pre-9.0 declaration.
 * @return The value of *ctr before the increment.
 */
__device__ inline int atomicAggInc(int *ctr, int warpLane) {
    const int previous = atomicAdd(ctr, 1);
    return previous;
}
#endif
extern __global__
void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment