From 2a5a326176a29d8aeebb1b818fdb57978197dd1b Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Wed, 20 Jan 2016 19:36:50 -0600
Subject: [PATCH] slightly improved tricubic interpolation; now 145 regs, a
 little faster

---
 ComputeGridGrid.cuh   |  6 ++--
 RigidBodyController.h |  4 +--
 RigidBodyGrid.cu      | 71 +++++++++++++++++++++++--------------------
 RigidBodyGrid.h       | 12 ++++++--
 4 files changed, 54 insertions(+), 39 deletions(-)

diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
index 35016cd..0e08ff7 100644
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -49,8 +49,10 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 	// RBTODO What about non-unit delta?
 	// RBTODO combine interp methods and reduce repetition! 
 	// Vector3 f = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
-	force[tid] = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
-	const float r_val = rho->val[r_id];
+	const ForceEnergy fe = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
+	force[tid] = fe.f;
+
+	const float r_val = rho->val[r_id]; /* move to beginning of script?  */
   // force[tid] = r_val*force[tid];
 	force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame */
 	// force[tid] = f;
diff --git a/RigidBodyController.h b/RigidBodyController.h
index 41cbcbf..b214722 100644
--- a/RigidBodyController.h
+++ b/RigidBodyController.h
@@ -5,8 +5,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-#define NUMTHREADS 128					/* try with 64, every 32+ */
-// #define NUMTHREADS 64
+// #define NUMTHREADS 128					/* try with 64, every 32+ */
+#define NUMTHREADS 64
 #define NUMSTREAMS 8
 
 class Configuration;
diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu
index 1917329..0013e58 100644
--- a/RigidBodyGrid.cu
+++ b/RigidBodyGrid.cu
@@ -364,7 +364,7 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(const Vector3& l) const {
 }
 
 /** interpolateForce() to be used on CUDA Device **/
-DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const {
+DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	Vector3 f;
 	// Vector3 l = basisInv.transform(pos - origin);
 	const int homeX = int(floor(l.x));
@@ -373,6 +373,10 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	const float wx = l.x - homeX;
 	const float wy = l.y - homeY;
 	const float wz = l.z - homeZ;
+	const float wx2 = wx*wx;
+
+	// RBTODO: test against cpu algorithm; also see if its the same algorithm used by NAMD
+	// RBTODO: test NAMD alg. for speed
 	
 	/* f.x */
 	float g3[3][4];
@@ -388,55 +392,54 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 				v[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ?
 					0 : val[ind];
 			}
-			const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3]);
-			const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3]);
+			const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3])*wx2;
+			const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3])*wx;
 			const float a1 = 0.5f*(-v[0] + v[2]);
-			const float a0 = v[1];
-
-			g2[0][iy] = 3.0f*a3*wx*wx + 2.0f*a2*wx + a1; /* f.x (derivative) */
-			g2[1][iy] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; /* f.y & f.z */
+			g2[0][iy] = 3.0f*a3 + 2.0f*a2 + a1;				/* f.x (derivative) */
+			g2[1][iy] = a3*wx + a2*wx + a1*wx + v[1]; /* f.y & f.z */
 		}
 
 		// Mix along y.
 		{
-			const float a3 = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3]);
-			const float a2 = 0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3]);
-			const float a1 = 0.5f*(-g2[0][0] + g2[0][2]);
-			const float a0 = g2[0][1];
-			g3[0][iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; /* f.x */
+			g3[0][iz] = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3])*wy*wy*wy +
+				0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3])      *wy*wy +
+				0.5f*(-g2[0][0] + g2[0][2])                                          *wy +
+				g2[0][1];
 		}
 
 		{
-			const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3]);
-			const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3]);
+			const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3])*wy*wy;
+			const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3])*wy;
 			const float a1 = 0.5f*(-g2[1][0] + g2[1][2]);
-			const float a0 = g2[1][1];
-			g3[1][iz] = 3.0f*a3*wy*wy + 2.0f*a2*wy + a1; /* f.y */
-			g3[2][iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0;  /* f.z */
+			g3[1][iz] = 3.0f*a3 + 2.0f*a2 + a1;						/* f.y */
+			g3[2][iz] = a3*wy + a2*wy + a1*wy + g2[1][1]; /* f.z */
 		}
 	}
+
 	// Mix along z.
 	{
-		const float a3 = 0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3]);
-		const float a2 = 0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3]);
-		const float a1 = 0.5f*(-g3[0][0] + g3[0][2]);
-		const float a0 = g3[0][1];
-		f.x = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0);
+		f.x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz +
+			-0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz +
+			-0.5f*(-g3[0][0] + g3[0][2])                                    *wz -
+			g3[0][1];
 	}
 	{
-		const float a3 = 0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3]);
-		const float a2 = 0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3]);
-		const float a1 = 0.5f*(-g3[1][0] + g3[1][2]);
-		const float a0 = g3[1][1];
-		f.y = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0);
+		f.y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz +
+			-0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz +
+			-0.5f*(-g3[1][0] + g3[1][2])                                    *wz -
+			g3[1][1];
 	}
 	{
-		const float a3 = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3]);
-		const float a2 = 0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]);
-		const float a1 = 0.5f*(-g3[2][0] + g3[2][2]);
-		f.z = -(3.0f*a3*wz*wz + 2.0f*a2*wz + a1);
+		f.z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz -
+			(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])      *wz -
+			0.5f*(-g3[2][0] + g3[2][2]);
 	}
-	return f;
+	float e = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz*wz +
+		0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])    *wz*wz +
+		0.5f*(-g3[2][0] + g3[2][2])                                        *wz +
+		g3[2][1];
+	
+	return ForceEnergy(f,e);
 }
 
 
@@ -472,7 +475,7 @@ DEVICE float RigidBodyGrid::interpolatePotentialLinearly(const Vector3& l) const
 	const float wz = l.z - homeZ;
 	return wz * (g3[1] - g3[0]) + g3[0];
 }
-DEVICE Vector3 RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const {
+DEVICE ForceEnergy RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const {
 	// Find the home node.
 	const int homeX = int(floor(l.x));
 	const int homeY = int(floor(l.y));
@@ -514,6 +517,8 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const
 	f.x = -(wz * (g3[0][1] - g3[0][0]) + g3[0][0]);
 	f.y = -(wz * (g3[1][1] - g3[1][0]) + g3[1][0]);
 	f.z = -      (g3[2][1] - g3[2][0]);
+	float e = wz * (g3[2][1] - g3[2][0]) + g3[2][0];
+	return ForceEnergy(f,e);
 }
 
 
diff --git a/RigidBodyGrid.h b/RigidBodyGrid.h
index ec514f4..d99a73f 100644
--- a/RigidBodyGrid.h
+++ b/RigidBodyGrid.h
@@ -28,6 +28,14 @@ using namespace std;
 
 // RBTODO: integrate with existing grid code?
 
+class ForceEnergy {
+public:
+	DEVICE ForceEnergy(Vector3 f, float e) :
+		f(f), e(e) {};
+	Vector3 f;
+	float e;
+};
+
 class RigidBodyGrid { 
 	friend class SparseGrid;
 private:
@@ -140,7 +148,7 @@ public:
   /* virtual float getPotential(Vector3 pos) const; */
 
 	DEVICE float interpolatePotentialLinearly(const Vector3& l) const;
-	DEVICE Vector3 interpolateForceDLinearly(const Vector3& l) const;
+	DEVICE ForceEnergy interpolateForceDLinearly(const Vector3& l) const;
 
   HOST DEVICE float interpolatePotential(const Vector3& l) const;
 
@@ -157,7 +165,7 @@ public:
 	}
 
 	/** interpolateForce() to be used on CUDA Device **/
-	DEVICE Vector3 interpolateForceD(Vector3 l) const;
+	DEVICE ForceEnergy interpolateForceD(Vector3 l) const;
 
   // Wrap coordinate: 0 <= x < l
   HOST DEVICE inline float wrapFloat(float x, float l) const {
-- 
GitLab