diff --git a/src/CellDecomposition.cu b/src/CellDecomposition.cu
index a94137c764c9635a917e6d7c3bc22d803f213760..45591f789a71f2abc490910dbb29a7bcfb884dce 100644
--- a/src/CellDecomposition.cu
+++ b/src/CellDecomposition.cu
@@ -17,6 +17,18 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 
+#define gpuKernelCheck() {kernelCheck( __FILE__, __LINE__); }
+inline void kernelCheck(const char* file, int line)
+{
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::fprintf(stderr,"Error: %s in %s %d\n", cudaGetErrorString(err),file, line);
+        assert(1==2);
+    }
+    //gpuErrchk(cudaDeviceSynchronize());
+}
+
 // *****************************************************************************
 // CUDA Kernel Definitions
 
@@ -96,7 +108,8 @@ void CellDecomposition::decompose_d(Vector3 pos_d[], size_t num) {
 	thrust::device_ptr<cell_t> c_d(cells_d);
 	thrust::sort(c_d, c_d + num * numReplicas);
 	gpuErrchk(cudaMemcpyAsync(cells, cells_d, cells_sz, cudaMemcpyDeviceToHost));
-	
+	//Han-Yi Chou
+        //gpuErrchk(cudaMemcpy(cells, cells_d, cells_sz, cudaMemcpyDeviceToHost));
 	const size_t nMax = std::max(2lu * numCells, num);
 	nBlocks = (nMax * numReplicas) / NUM_THREADS + 1;
 
@@ -172,7 +185,7 @@ void make_rangesKernel(CellDecomposition::cell_t cells[], int tmp[],
 __global__
 void bind_rangesKernel(CellDecomposition::range_t ranges[], int tmp[],
 											 int numCells, int numReplicas) {
-	const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+        const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx < numCells * numReplicas)
 		ranges[idx] = CellDecomposition::range_t(tmp[2*idx], tmp[2*idx+1]);
 	/* Print range of each cell. Skip over empty cells
diff --git a/src/CellDecomposition.h b/src/CellDecomposition.h
index 81f2e41947d4eeda47798205886a1f6584025959..48b6f3c54c4dc3ac100136dfe3f8632c6379fb07 100644
--- a/src/CellDecomposition.h
+++ b/src/CellDecomposition.h
@@ -23,6 +23,15 @@
 #include "useful.h" // Vector3, Matrix3
 #include "BaseGrid.h"
 
+#if defined(__CUDACC__) // NVCC
+   #define MY_ALIGN(n) __align__(n)
+#elif defined(__GNUC__) // GCC
+  #define MY_ALIGN(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER) // MSVC
+  #define MY_ALIGN(n) __declspec(align(n))
+#else
+  #error "Please provide a definition for MY_ALIGN macro for your host compiler!"
+#endif
 
 class CellDecomposition : public BaseGrid {
 public:
@@ -95,6 +104,11 @@ public:
 	inline const cell_t* getCells() const {
 		return cells;
 	}
+        //Han-Yi Chou
+        HOST DEVICE
+        inline const cell_t* getCells_d() const {
+                return cells_d;
+        }
 
 	/*
 	HOST DEVICE
@@ -159,7 +173,28 @@ public:
 		if (nCells.z == 2 and (w < 0 || w > 1)) return -1;
 		return getCellID(u, v, w, nCells);
 	}
-
+/*
+        HOST DEVICE
+inline int getNeighborID(int idx, int dx, int dy, int dz) const
+{
+    if(dx == 0 and dy == 0 and dz == 0)
+        return idx;
+    int idx_z = idx % nCells.z;
+    int idx_y = idx / nCells.z % nCells.y;
+    int idx_x = idx / (nCells.z * nCells.y);
+
+    int u = (dx + idx_x + nCells.x) % nCells.x;
+    int v = (dy + idx_y + nCells.y) % nCells.y;
+    int w = (dz + idx_z + nCells.z) % nCells.z;
+    if (nCells.x == 1 and u != 0) return -1;
+    if (nCells.y == 1 and v != 0) return -1;
+    if (nCells.z == 1 and w != 0) return -1;
+    if (nCells.x == 2 and (u < 0 || u > 1)) return -1;
+    if (nCells.y == 2 and (v < 0 || v > 1)) return -1;
+    if (nCells.z == 2 and (w < 0 || w > 1)) return -1;
+    return getCellID(u, v, w, nCells);
+}
+*/
 public:
 	int3 nCells;