From fd77880007ff51885a67efb2e0b9d515d88a45f6 Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Tue, 17 May 2016 17:28:57 -0500 Subject: [PATCH] added __restrict__ for pairlist speedup --- ComputeForce.cu | 2 +- ComputeForce.cuh | 13 +++++++------ GPUController.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 7 deletions(-) create mode 100644 GPUController.h diff --git a/ComputeForce.cu b/ComputeForce.cu index 6566354..d1cfc0b 100644 --- a/ComputeForce.cu +++ b/ComputeForce.cu @@ -413,7 +413,7 @@ void ComputeForce::decompose(Vector3* pos, int type[]) { int tmp = 0; gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp, sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaDeviceSynchronize()); /* RBTOOD: maybe unnecessary */ + gpuErrchk(cudaDeviceSynchronize()); } diff --git a/ComputeForce.cuh b/ComputeForce.cuh index 10454f2..715df18 100644 --- a/ComputeForce.cuh +++ b/ComputeForce.cuh @@ -286,10 +286,10 @@ void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas, __global__ void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, - BaseGrid* sys, CellDecomposition* __restrict__ decomp, + const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp, const int nCells, int* g_numPairs, int2* g_pair, - int numParts, int type[], int* __restrict__ g_pairTabPotType, + int numParts, const int* __restrict__ type, int* __restrict__ g_pairTabPotType, float pairlistdist2) { // Loop over threads searching for atom pairs // Each thread has designated values in shared memory as a buffer @@ -299,7 +299,7 @@ void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, const int split = 32; /* numblocks should be divisible by split */ /* const int blocksPerCell = gridDim.x/split; */ - const CellDecomposition::cell_t* pairs = decomp->getCells(); + const CellDecomposition::cell_t* __restrict__ pairs = decomp->getCells(); for (int cID = 0 + (blockIdx.x % split); cID < nCells; cID += split) { // for (int cID = blockIdx.x/blocksPerCell; cID < nCells; cID += split ) { for (int repID = 0; repID < numReplicas; repID++) { @@ -314,8 +314,8 @@ void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, const int ai = pairs[ci].particle; // const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai); const CellDecomposition::cell_t celli = pairs[ci]; - const Vector3 posi = pos[ai]; - + // Vector3 posi = pos[ai]; + for (int x = -1; x <= 1; ++x) { for (int y = -1; y <= 1; ++y) { for (int z = -1; z <= 1; ++z) { @@ -329,7 +329,8 @@ void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, if (aj <= ai) continue; // skip ones that are too far away - float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2(); + const float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2(); + // const float dr = (sys->wrapDiff(pos[aj] - posi)).length2(); if (dr > pairlistdist2) continue; int gid = atomicAggInc( g_numPairs, warpLane ); diff --git a/GPUController.h b/GPUController.h new file mode 100644 index 0000000..088a3dd --- /dev/null +++ b/GPUController.h @@ -0,0 +1,44 @@ +// provides interface between main CPU loop and various GPUs +// -- holds data for each GPU + +#pragma once +#include "useful.h" + +class GPUcontroller { +public: + GPUcontroller(const Configuration& c, const long int randomSeed, + bool debug, int numReplicas = 0); + ~GPUcontroller(); + + static bool DEBUG; + +private: + + void copyToCUDA(); + + +private: + const Configuration& conf; + int numReplicas; + + // Integrator variables + BaseGrid* sys; + ComputeForce* internal; + Vector3* forceInternal; + + // CUDA device variables + Vector3 *pos_d, *forceInternal_d, *force_d; + int *type_d; + BrownianParticleType **part_d; + BaseGrid *sys_d, *kTGrid_d; + Random *randoGen_d; + Bond* bonds_d; + int2* bondMap_d; + Exclude* excludes_d; + int2* excludeMap_d; + Angle* angles_d; + Dihedral* dihedrals_d; + +} + + -- GitLab