From 019a404971b20531872fcca882686e9e5a391a1e Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Sun, 3 Jan 2021 21:03:20 -0600
Subject: [PATCH] Fixed tablePot copy to multiple GPUs; also slightly
 simplified (can get rid of tablePot_addr)

---
 src/ComputeForce.cu | 72 ++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 3debd71..be5e545 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -381,48 +381,48 @@ bool ComputeForce::addTabulatedPotential(String fileName, int type0, int type1)
 
 	TabulatedPotential* t = new TabulatedPotential(*tablePot[ind]);
 
-	// Copy tablePot[ind] to the device
-	float *v0, *v1, *v2, *v3;
-	size_t sz_n = sizeof(float) * tablePot[ind]->n;
-	gpuErrchk(cudaMalloc(&v0, sz_n));
-	gpuErrchk(cudaMalloc(&v1, sz_n));
-	gpuErrchk(cudaMalloc(&v2, sz_n));
-	gpuErrchk(cudaMalloc(&v3, sz_n));
-	gpuErrchk(cudaMemcpyAsync(v0, tablePot[ind]->v0, sz_n, cudaMemcpyHostToDevice));
-	gpuErrchk(cudaMemcpyAsync(v1, tablePot[ind]->v1, sz_n, cudaMemcpyHostToDevice));
-	gpuErrchk(cudaMemcpyAsync(v2, tablePot[ind]->v2, sz_n, cudaMemcpyHostToDevice));
-	gpuErrchk(cudaMemcpyAsync(v3, tablePot[ind]->v3, sz_n, cudaMemcpyHostToDevice));
-	t->v0 = v0; t->v1 = v1;
-	t->v2 = v2; t->v3 = v3;
-	gpuErrchk(cudaMalloc(&tablePot_addr[ind], sizeof(TabulatedPotential)));
-	gpuErrchk(cudaMemcpy(tablePot_addr[ind], t, sizeof(TabulatedPotential), cudaMemcpyHostToDevice));
-	t->v0 = NULL; t->v1 = NULL;
-	t->v2 = NULL; t->v3 = NULL;
-	delete t;
-	/** Same thing for ind1 **/
-	t = new TabulatedPotential(*tablePot[ind1]);
-	sz_n = sizeof(float) * tablePot[ind1]->n;
-	gpuErrchk(cudaMalloc(&v0, sz_n));
-	gpuErrchk(cudaMalloc(&v1, sz_n));
-	gpuErrchk(cudaMalloc(&v2, sz_n));
-	gpuErrchk(cudaMalloc(&v3, sz_n));
-	gpuErrchk(cudaMemcpyAsync(v0, tablePot[ind1]->v0, sz_n, cudaMemcpyHostToDevice));
-	gpuErrchk(cudaMemcpyAsync(v1, tablePot[ind1]->v1, sz_n, cudaMemcpyHostToDevice));
-	gpuErrchk(cudaMemcpyAsync(v2, tablePot[ind1]->v2, sz_n, cudaMemcpyHostToDevice));
-	gpuErrchk(cudaMemcpyAsync(v3, tablePot[ind1]->v3, sz_n, cudaMemcpyHostToDevice));
-	t->v0 = v0; t->v1 = v1;
-	t->v2 = v2; t->v3 = v3;
-	gpuErrchk(cudaMalloc(&tablePot_addr[ind1], sizeof(TabulatedPotential)));
-	gpuErrchk(cudaMemcpy(tablePot_addr[ind1], t, sizeof(TabulatedPotential), cudaMemcpyHostToDevice));
-	t->v0 = NULL; t->v1 = NULL;
-	t->v2 = NULL; t->v3 = NULL;
-	delete t;
 	for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
 	    gpuman.use(i);
+
+	    // Copy tablePot[ind] to the device
+	    float *v0, *v1, *v2, *v3;
+	    size_t sz_n = sizeof(float) * tablePot[ind]->n;
+	    gpuErrchk(cudaMalloc(&v0, sz_n));
+	    gpuErrchk(cudaMalloc(&v1, sz_n));
+	    gpuErrchk(cudaMalloc(&v2, sz_n));
+	    gpuErrchk(cudaMalloc(&v3, sz_n));
+	    gpuErrchk(cudaMemcpyAsync(v0, tablePot[ind]->v0, sz_n, cudaMemcpyHostToDevice));
+	    gpuErrchk(cudaMemcpyAsync(v1, tablePot[ind]->v1, sz_n, cudaMemcpyHostToDevice));
+	    gpuErrchk(cudaMemcpyAsync(v2, tablePot[ind]->v2, sz_n, cudaMemcpyHostToDevice));
+	    gpuErrchk(cudaMemcpyAsync(v3, tablePot[ind]->v3, sz_n, cudaMemcpyHostToDevice));
+	    t->v0 = v0; t->v1 = v1;
+	    t->v2 = v2; t->v3 = v3;
+	    // gpuErrchk(cudaMalloc(&tablePot_addr[ind], sizeof(TabulatedPotential)));
+	    gpuErrchk(cudaMemcpy(tablePot_d[i][ind], t, sizeof(TabulatedPotential), cudaMemcpyHostToDevice));
+
+	    /** Same thing for ind1 **/
+	    sz_n = sizeof(float) * tablePot[ind1]->n;
+	    gpuErrchk(cudaMalloc(&v0, sz_n));
+	    gpuErrchk(cudaMalloc(&v1, sz_n));
+	    gpuErrchk(cudaMalloc(&v2, sz_n));
+	    gpuErrchk(cudaMalloc(&v3, sz_n));
+	    gpuErrchk(cudaMemcpyAsync(v0, tablePot[ind1]->v0, sz_n, cudaMemcpyHostToDevice));
+	    gpuErrchk(cudaMemcpyAsync(v1, tablePot[ind1]->v1, sz_n, cudaMemcpyHostToDevice));
+	    gpuErrchk(cudaMemcpyAsync(v2, tablePot[ind1]->v2, sz_n, cudaMemcpyHostToDevice));
+	    gpuErrchk(cudaMemcpyAsync(v3, tablePot[ind1]->v3, sz_n, cudaMemcpyHostToDevice));
+	    t->v0 = v0; t->v1 = v1;
+	    t->v2 = v2; t->v3 = v3;
+	    // gpuErrchk(cudaMalloc(&tablePot_addr[ind1], sizeof(TabulatedPotential)));
+	    // gpuErrchk(cudaMemcpy(tablePot_addr[ind1], t, sizeof(TabulatedPotential), cudaMemcpyHostToDevice));
+	    gpuErrchk(cudaMemcpy(tablePot_d[i][ind1], t, sizeof(TabulatedPotential), cudaMemcpyHostToDevice));
+	    t->v0 = NULL; t->v1 = NULL;
+	    t->v2 = NULL; t->v3 = NULL;
 	    gpuErrchk(cudaMemcpy(tablePot_d[i], tablePot_addr,
 				 sizeof(TabulatedPotential*) * numParts * numParts, cudaMemcpyHostToDevice));
 	}
 	gpuman.use(0);
+	delete t;
+
 	return true;
 }
 
-- 
GitLab