diff --git a/src/CellDecomposition.cu b/src/CellDecomposition.cu index a94137c764c9635a917e6d7c3bc22d803f213760..45591f789a71f2abc490910dbb29a7bcfb884dce 100644 --- a/src/CellDecomposition.cu +++ b/src/CellDecomposition.cu @@ -17,6 +17,18 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +#define gpuKernelCheck() {kernelCheck( __FILE__, __LINE__); } +inline void kernelCheck(const char* file, int line) /* Reads the last CUDA runtime error via cudaGetLastError(); if it is not cudaSuccess, prints it with the caller's file/line to stderr and terminates the process. Intended to be invoked through gpuKernelCheck(), which supplies __FILE__/__LINE__. */ +{ + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::fprintf(stderr,"Error: %s in %s %d\n", cudaGetErrorString(err),file, line); + std::abort(); /* was assert(1==2): assert is compiled out under NDEBUG, so release builds would keep running after a CUDA error */ + } + //gpuErrchk(cudaDeviceSynchronize()); +} + // ***************************************************************************** // CUDA Kernel Definitions @@ -96,7 +108,8 @@ void CellDecomposition::decompose_d(Vector3 pos_d[], size_t num) { thrust::device_ptr<cell_t> c_d(cells_d); thrust::sort(c_d, c_d + num * numReplicas); gpuErrchk(cudaMemcpyAsync(cells, cells_d, cells_sz, cudaMemcpyDeviceToHost)); - + //Han-Yi Chou + //gpuErrchk(cudaMemcpy(cells, cells_d, cells_sz, cudaMemcpyDeviceToHost)); const size_t nMax = std::max(2lu * numCells, num); nBlocks = (nMax * numReplicas) / NUM_THREADS + 1; @@ -172,7 +185,7 @@ void make_rangesKernel(CellDecomposition::cell_t cells[], int tmp[], __global__ void bind_rangesKernel(CellDecomposition::range_t ranges[], int tmp[], int numCells, int numReplicas) { - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < numCells * numReplicas) ranges[idx] = CellDecomposition::range_t(tmp[2*idx], tmp[2*idx+1]); /* Print range of each cell. 
Skip over empty cells diff --git a/src/CellDecomposition.h b/src/CellDecomposition.h index 81f2e41947d4eeda47798205886a1f6584025959..48b6f3c54c4dc3ac100136dfe3f8632c6379fb07 100644 --- a/src/CellDecomposition.h +++ b/src/CellDecomposition.h @@ -23,6 +23,15 @@ #include "useful.h" // Vector3, Matrix3 #include "BaseGrid.h" +#if defined(__CUDACC__) // NVCC + #define MY_ALIGN(n) __align__(n) +#elif defined(__GNUC__) // GCC + #define MY_ALIGN(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) // MSVC + #define MY_ALIGN(n) __declspec(align(n)) +#else + #error "Please provide a definition for MY_ALIGN macro for your host compiler!" +#endif class CellDecomposition : public BaseGrid { public: @@ -95,6 +104,11 @@ public: inline const cell_t* getCells() const { return cells; } + //Han-Yi Chou: accessor returning the cells_d member, the device-side buffer that decompose_d sorts and copies into the host array cells; the pointer is not dereferenced here + HOST DEVICE + inline const cell_t* getCells_d() const { + return cells_d; + } /* HOST DEVICE @@ -159,7 +173,28 @@ public: if (nCells.z == 2 and (w < 0 || w > 1)) return -1; return getCellID(u, v, w, nCells); } - +/* Disabled (commented-out) getNeighborID variant taking a flat cell index idx and integer offsets (dx, dy, dz); kept for reference only, not compiled. + HOST DEVICE +inline int getNeighborID(int idx, int dx, int dy, int dz) const +{ + if(dx == 0 and dy == 0 and dz == 0) + return idx; + int idx_z = idx % nCells.z; + int idx_y = idx / nCells.z % nCells.y; + int idx_x = idx / (nCells.z * nCells.y); + + int u = (dx + idx_x + nCells.x) % nCells.x; + int v = (dy + idx_y + nCells.y) % nCells.y; + int w = (dz + idx_z + nCells.z) % nCells.z; + if (nCells.x == 1 and u != 0) return -1; + if (nCells.y == 1 and v != 0) return -1; + if (nCells.z == 1 and w != 0) return -1; + if (nCells.x == 2 and (u < 0 || u > 1)) return -1; + if (nCells.y == 2 and (v < 0 || v > 1)) return -1; + if (nCells.z == 2 and (w < 0 || w > 1)) return -1; + return getCellID(u, v, w, nCells); +} +*/ public: int3 nCells;