diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 5ff658604f228061cd6c93daf8f4fdc8628743d9..3debd71d8a06d4892494d0b5a2799a58079b10a4 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -854,8 +854,8 @@ float ComputeForce::computeTabulated(bool get_energy) {
 		int start = floor( ((float) numPairs*i    )/ngpu );
 		int end   = floor( ((float) numPairs*(i+1))/ngpu );
 		if (i == ngpu-1) assert(end == numPairs);
-		computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d[i], sys_d[i],
-													    cutoff2, pairLists_d[i], pairTabPotType_d[i], tablePot_d[i], pairLists_tex[i], pos_tex[i], pairTabPotType_tex[i], start, end-start);
+		computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.gpus[i].get_next_stream() >>>(forceInternal_d[i], sys_d[i],
+														    cutoff2, pairLists_d[i], pairTabPotType_d[i], tablePot_d[i], pairLists_tex[i], pos_tex[i], pairTabPotType_tex[i], start, end-start);
                   gpuKernelCheck();
 	    }
 	    gpuman.use(0);
@@ -962,9 +962,9 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 {
 	const size_t tot_num = num * numReplicas;
 
-	gpuErrchk(cudaMalloc(&pos_d[0], sizeof(Vector3) * tot_num));
 	for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
 	    gpuman.use(i);
+	    gpuErrchk(cudaMalloc(&pos_d[i], sizeof(Vector3) * tot_num));
 	    //Han-Yi bind to the texture
 	    cudaResourceDesc resDesc;
 	    memset(&resDesc, 0, sizeof(resDesc));
@@ -990,7 +990,11 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 
 	gpuErrchk(cudaMemcpyAsync(pos_d[0], pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
 
-	gpuErrchk(cudaMalloc(&forceInternal_d[0], sizeof(Vector3) * num * numReplicas));
+	for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
+	    gpuman.use(i);
+	    gpuErrchk(cudaMalloc(&forceInternal_d[i], sizeof(Vector3) * num * numReplicas));
+	}
+	gpuman.use(0);
 	gpuErrchk(cudaMemcpyAsync(forceInternal_d[0], forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
 
 	gpuErrchk(cudaDeviceSynchronize());