Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
arbd
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tbgl
tools
arbd
Commits
8c67cfb9
Commit
8c67cfb9
authored
9 years ago
by
cmaffeo2
Browse files
Options
Downloads
Patches
Plain Diff
pairlist kernel being rewritten; ammend this broken commit
parent
8733d418
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
ComputeForce.cu
+4
-3
4 additions, 3 deletions
ComputeForce.cu
ComputeForce.cuh
+59
-21
59 additions, 21 deletions
ComputeForce.cuh
CudaUtil.cuh
+1
-1
1 addition, 1 deletion
CudaUtil.cuh
makefile
+1
-1
1 addition, 1 deletion
makefile
with
65 additions
and
26 deletions
ComputeForce.cu
+
4
−
3
View file @
8c67cfb9
...
...
@@ -369,17 +369,18 @@ void ComputeForce::decompose(Vector3* pos) {
// initializePairlistArrays
int
nCells
=
decomp
.
nCells
.
x
*
decomp
.
nCells
.
y
*
decomp
.
nCells
.
z
;
int
blocksPerCell
=
10
;
if
(
newDecomp
)
{
initializePairlistArrays
<<<
1
,
32
>>>
(
nCells
);
initializePairlistArrays
<<<
1
,
32
>>>
(
nCells
*
blocksPerCell
);
gpuErrchk
(
cudaDeviceSynchronize
());
}
const
int
NUMTHREADS
=
128
;
//const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
const
size_t
nBlocks
=
nCells
;
const
size_t
nBlocks
=
nCells
*
blocksPerCell
;
/* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d, decomp_d); */
/* gpuErrchk(cudaDeviceSynchronize()); */
createPairlists
<<<
nBlocks
,
NUMTHREADS
>>>
(
pos
,
num
,
numReplicas
,
sys_d
,
decomp_d
,
nCells
);
createPairlists
<<<
nBlocks
,
NUMTHREADS
>>>
(
pos
,
num
,
numReplicas
,
sys_d
,
decomp_d
,
nCells
,
blocksPerCell
);
gpuErrchk
(
cudaDeviceSynchronize
());
...
...
This diff is collapsed.
Click to expand it.
ComputeForce.cuh
+
59
−
21
View file @
8c67cfb9
...
...
@@ -6,7 +6,6 @@
#include
"CudaUtil.cuh"
#include
<assert.h>
__host__
__device__
EnergyForce
ComputeForce
::
coulombForce
(
Vector3
r
,
float
alpha
,
float
start
,
float
len
)
{
...
...
@@ -201,27 +200,29 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
__device__
int
*
g_numPairs
;
__device__
int
**
g_pairI
;
__device__
int
**
g_pairJ
;
__device__
bool
g_pairsSet
=
false
;
__device__
int
g_nextPairlist
;
const
__device__
int
maxPairs
=
1
<<
12
;
__global__
void
initializePairlistArrays
(
const
int
n
Cell
s
)
{
void
initializePairlistArrays
(
const
int
n
List
s
)
{
const
int
tid
=
threadIdx
.
x
;
const
int
maxPairs
=
1
<<
1
6
;
/* ~
6
0,000 per cell */
//
const int maxPairs = 1 << 1
2
; /* ~
12
0,000 per cell */
if
(
blockIdx
.
x
>
0
)
return
;
// RBTODO: free later
if
(
tid
==
0
)
{
printf
(
"Initializing device pairlists for %d cells
\n
"
,
nCells
);
g_numPairs
=
(
int
*
)
malloc
(
nCells
*
sizeof
(
int
)
);
g_pairI
=
(
int
**
)
malloc
(
nCells
*
sizeof
(
int
*
));
g_pairJ
=
(
int
**
)
malloc
(
nCells
*
sizeof
(
int
*
));
printf
(
"Initializing device pairlists for %d cells
\n
"
,
nLists
);
g_numPairs
=
(
int
*
)
malloc
(
nLists
*
sizeof
(
int
)
);
g_pairI
=
(
int
**
)
malloc
(
nLists
*
sizeof
(
int
*
));
g_pairJ
=
(
int
**
)
malloc
(
nLists
*
sizeof
(
int
*
));
g_nextPairlist
=
0
;
}
__syncthreads
();
assert
(
g_numPairs
!=
NULL
);
assert
(
g_pairI
!=
NULL
);
assert
(
g_pairJ
!=
NULL
);
for
(
int
i
=
tid
;
i
<
n
Cell
s
;
i
+=
blockDim
.
x
)
{
for
(
int
i
=
tid
;
i
<
n
List
s
;
i
+=
blockDim
.
x
)
{
g_pairI
[
i
]
=
(
int
*
)
malloc
(
maxPairs
*
sizeof
(
int
));
g_pairJ
[
i
]
=
(
int
*
)
malloc
(
maxPairs
*
sizeof
(
int
));
g_numPairs
[
i
]
=
0
;
...
...
@@ -232,20 +233,27 @@ void initializePairlistArrays(const int nCells) {
__global__
void
createPairlists
(
Vector3
pos
[],
int
num
,
int
numReplicas
,
BaseGrid
*
sys
,
CellDecomposition
*
decomp
,
int
nCells
)
{
BaseGrid
*
sys
,
CellDecomposition
*
decomp
,
const
int
nCells
,
const
int
blocksPerCell
)
{
// Loop over threads searching for atom pairs
// Each thread has designated values in shared memory as a buffer
// A sync operation periodically moves data from shared to global
const
int
NUMTHREADS
=
128
;
/* RBTODO: fix */
const
int
tid
=
threadIdx
.
x
;
const
int
bid
=
blockIdx
.
x
;
const
int
cID
=
blockIdx
.
x
;
// * blockDim.x + threadIdx.x;
// const int cID = blockIdx.x * blockDim.x + threadIdx.x; /* atom index 0 */
const
int
cID
=
bid
/
blocksPerCell
;
const
int
warpLane
=
tid
%
WARPSIZE
;
/* RBTODO: optimize */
const
int
wid
=
tid
/
WARPSIZE
;
const
int
blockLane
=
bid
%
blocksPerCell
;
__shared__
int
pid
[
NUMTHREADS
/
WARPSIZE
];
if
(
warpLane
==
0
)
pid
[
wid
]
=
0
;
/* if (warpLane == 0) */
/* pid = atomicAdd( &g_nextPairlist, 1 ); */
/* res = warp_bcast(res,leader); */
/* const int3& nCells = decomp->nCells; */
if
(
cID
>=
nCells
)
return
;
int
count
=
0
;
/* debug */
const
CellDecomposition
::
cell_t
*
pairs
=
decomp
->
getCells
();
...
...
@@ -255,7 +263,7 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
if
(
tid
==
0
)
printf
(
" Cell%d: Working on %d atoms for repID %d
\n
"
,
cID
,
rangeI
.
last
-
rangeI
.
first
,
repID
);
for
(
int
ci
=
rangeI
.
first
;
ci
<
rangeI
.
last
;
ci
+
+
)
{
for
(
int
ci
=
rangeI
.
first
+
blockLane
;
ci
<
rangeI
.
last
;
ci
+
=
blocksPerCell
)
{
// ai - index of the particle in the original, unsorted array
const
int
ai
=
pairs
[
ci
].
particle
;
// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
...
...
@@ -276,11 +284,41 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
const
int
aj
=
pairs
[
n
].
particle
;
if
(
aj
<=
ai
)
continue
;
// RBTODO: skip exclusions
int
wid
=
atomicAggInc
(
&
g_numPairs
[
cID
],
warpLane
);
// fails
// int wid = atomicAdd( &g_numPairs[cID], 1 ); // works
g_pairI
[
cID
][
wid
]
=
ai
;
g_pairJ
[
cID
][
wid
]
=
aj
;
/* int gid; */
/* { // per-warp atomic add to get global inices */
/* int t_active = __ballot(1); */
/* int leader = __ffs(t_active)-1; */
/* int res; */
/* // RBTODO: see if performance improves with __any(gid >= maxPairs */
/* if ( warpLane == leader ) { */
/* const int t_count = __popc(t_active); */
/* res = atomicAdd( &g_numPairs[pid], t_count ); */
/* if ( res + t_count >= maxPairs ) { // went too far; mark invalid and go again */
/* int tmp = atomicSub( &g_numPairs[pid], t_count ); */
/* assert( tmp == res + t_count ); */
/* pid++; */
/* res = atomicAdd( &g_numPairs[pid], t_count ); */
/* } */
/* } */
/* pid = warp_bcast(pid,leader); */
/* res = warp_bcast(res,leader); */
/* gid = res + __popc( t_active & ((1 << warpLane) - 1) ); */
/* } */
int
gid
=
atomicAggInc
(
&
g_numPairs
[
pid
],
warpLane
);
// fails
if
(
__any
(
gid
>=
maxPairs
))
{
// a little inefficient, but important
g_pairI
[
pid
][
gid
]
=
-
1
;
g_pairJ
[
pid
][
gid
]
=
-
1
;
pid
++
;
/* needs to apply to ALL warp threads */
// we assume arrays at pid are nearly empty (no while loop)
gid
=
atomicAggInc
(
&
g_numPairs
[
pid
],
warpLane
);
/* assume this hasn't filled */
}
// int wid = atomicAdd( &g_numPairs[pid], 1 ); // works
g_pairI
[
pid
][
gid
]
=
ai
;
g_pairJ
[
pid
][
gid
]
=
aj
;
}
// atoms J
}
// z
}
// y
...
...
This diff is collapsed.
Click to expand it.
CudaUtil.cuh
+
1
−
1
View file @
8c67cfb9
...
...
@@ -15,7 +15,7 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
return
res
+
__popc
(
mask
&
((
1
<<
warpLane
)
-
1
)
);
}
__device__
inline
void
exclIntCumSum
(
int
*
in
,
const
int
n
)
{
// 1) int* in must point to shared memory
...
...
This diff is collapsed.
Click to expand it.
makefile
+
1
−
1
View file @
8c67cfb9
...
...
@@ -11,7 +11,7 @@ include ./findcudalib.mk
INCLUDE
=
$(
CUDA_PATH
)
/include
#
DEBUG = -g -O0
DEBUG
=
-g
-O0
CC_FLAGS
=
-Wall
-Wno-write-strings
-I
$(
INCLUDE
)
$(
DEBUG
)
-std
=
c++0x
-pedantic
# NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no
NV_FLAGS
=
-Xptxas
-v
# -v,-abi=no
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment