Something went wrong on our end
CellDecomposition.cu 6.62 KiB
// CellDecomposition.cu
//
// Terrance Howard <heyterrance@gmail.com>
#include "CellDecomposition.h"
// *****************************************************************************
// Error Check
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"CUDA Error: %s %s %d\n",
cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define gpuKernelCheck() {kernelCheck( __FILE__, __LINE__); }
inline void kernelCheck(const char* file, int line)
{
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
std::fprintf(stderr,"Error: %s in %s %d\n", cudaGetErrorString(err),file, line);
assert(1==2);
}
//gpuErrchk(cudaDeviceSynchronize());
}
// *****************************************************************************
// CUDA Kernel Definitions
__global__ void decomposeKernel(Vector3 pos[],
CellDecomposition::cell_t cells[],
Vector3 origin, float cutoff,
int3 nCells, size_t num, int numReplicas);
__global__
void make_rangesKernel(CellDecomposition::cell_t cells[], int tmp[],
size_t num, int numCells, int numReplicas);
__global__
void bind_rangesKernel(CellDecomposition::range_t ranges[], int tmp[],
int numCells, int numReplicas);
// *****************************************************************************
// CellDecomposition Implementations
CellDecomposition::CellDecomposition(Matrix3 box, Vector3 origin,
float cutoff, int numReplicas) :
BaseGrid(box, origin, cutoff), cutoff(cutoff), numReplicas(numReplicas),
cells(NULL), cells_d(NULL), unsorted_cells(NULL), unsorted_cells_d(NULL),
ranges(NULL), ranges_d(NULL) {
const Vector3 dim = getExtent();
nCells.x = int((dim.x - 1) / cutoff) + 1;
nCells.y = int((dim.y - 1) / cutoff) + 1;
nCells.z = int((dim.z - 1) / cutoff) + 1;
numCells = nCells.x * nCells.y * nCells.z;
printf("Created Cell Decomposition (%lu, %lu, %lu)\n",
nCells.x, nCells.y, nCells.z);
}
CellDecomposition* CellDecomposition::copyToCUDA() {
cell_t* tmp_cells = this->cells;
cell_t* tmp_unsorted = this->unsorted_cells;
this->cells = this->cells_d;
this->unsorted_cells = this->unsorted_cells_d;
const size_t sz = sizeof(CellDecomposition);
CellDecomposition *c_d = NULL;
gpuErrchk(cudaMalloc(&c_d, sz));
gpuErrchk(cudaMemcpy(c_d, this, sz, cudaMemcpyHostToDevice));
this->cells = tmp_cells;
this->unsorted_cells = tmp_unsorted;
return c_d;
}
void CellDecomposition::decompose_d(Vector3 pos_d[], size_t num) {
const size_t cells_sz = sizeof(cell_t) * num * numReplicas;
const size_t numCellRep = numCells * numReplicas;
if (cells_d == NULL) {
gpuErrchk(cudaMalloc(&cells_d, cells_sz));
gpuErrchk(cudaMalloc(&unsorted_cells_d, cells_sz));
gpuErrchk(cudaMalloc(&ranges_d, sizeof(range_t) * numCellRep));
unsorted_cells = new cell_t[num * numReplicas];
cells = new cell_t[num * numReplicas];
ranges = new range_t[numCellRep];
}
// Pair particles with cells.
size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
decomposeKernel<<< nBlocks, NUM_THREADS >>>(pos_d, cells_d, origin, cutoff,
nCells, num, numReplicas);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(unsorted_cells_d, cells_d, cells_sz,
cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpyAsync(unsorted_cells, unsorted_cells_d, cells_sz,
cudaMemcpyDeviceToHost));
// Sort cells.
thrust::device_ptr<cell_t> c_d(cells_d);
thrust::sort(c_d, c_d + num * numReplicas);
gpuErrchk(cudaMemcpyAsync(cells, cells_d, cells_sz, cudaMemcpyDeviceToHost));
//Han-Yi Chou
//gpuErrchk(cudaMemcpy(cells, cells_d, cells_sz, cudaMemcpyDeviceToHost));
const size_t nMax = std::max(2lu * numCells, num);
nBlocks = (nMax * numReplicas) / NUM_THREADS + 1;
// Create ranges for cells.
int* temp_ranges = NULL;
gpuErrchk(cudaMalloc(&temp_ranges, 2 * sizeof(int) * numCellRep));
gpuErrchk(cudaMemset(temp_ranges, -1, 2 * sizeof(int) * numCellRep));
make_rangesKernel<<< nBlocks, NUM_THREADS >>>(cells_d, temp_ranges,
num, numCells, numReplicas);
gpuErrchk( cudaDeviceSynchronize() );
// Copy temp_ranges to ranges_d
bind_rangesKernel<<< nBlocks, NUM_THREADS >>>(ranges_d, temp_ranges,
numCells, numReplicas);
gpuErrchk(cudaMemcpy(ranges, ranges_d, numCellRep, cudaMemcpyDeviceToHost));
gpuErrchk( cudaFree(temp_ranges) );
}
// *****************************************************************************
// CUDA Kernels
__global__
void decomposeKernel(Vector3 *pos, CellDecomposition::cell_t *cells,
Vector3 origin, float cutoff, int3 nCells,
size_t num, int numReplicas) {
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num * numReplicas) {
const int repID = idx / num;
const Vector3& p = pos[idx];
const int id = CellDecomposition::getCellID(p, origin, cutoff, nCells);
const int3 r = CellDecomposition::getCellPos(id, nCells);
cells[idx] = CellDecomposition::cell_t(idx, id, r, repID);
}
}
__global__
void make_rangesKernel(CellDecomposition::cell_t cells[], int tmp[],
size_t num, int numCells, int numReplicas) {
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num * numReplicas) {
const int repID = cells[idx].repID;
assert(repID == idx/num);
const int cellID = cells[idx].id + repID * numCells; // cellID in tmp array
// Get positions in tmp array.
const int first = cellID * 2;
const int last = first + 1;
const int particle = idx % num;
if (particle == 0)
tmp[first] = idx;
if (particle == num - 1)
tmp[last] = idx + 1;
const int prev_id = idx - 1;
if (prev_id >= 0
and cells[prev_id].repID == repID
and cellID != cells[prev_id].id + repID * numCells)
tmp[first] = idx;
const int next_id = idx + 1;
if (next_id < num * numReplicas
and cells[next_id].repID == repID
and cellID != cells[next_id].id + repID * numCells)
tmp[last] = idx + 1;
}
}
__global__
void bind_rangesKernel(CellDecomposition::range_t ranges[], int tmp[],
int numCells, int numReplicas) {
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < numCells * numReplicas)
ranges[idx] = CellDecomposition::range_t(tmp[2*idx], tmp[2*idx+1]);
/* Print range of each cell. Skip over empty cells
__syncthreads();
if (idx == 0) {
for (int i = 0; i < numCells * numReplicas; ++i) {
if (ranges[i].first == -1 and ranges[i].last == -1) continue;
printf("cell %d : [%d, %d)\n", i, ranges[i].first, ranges[i].last);
}
}
// */
}