slightly improved tricubic interpolation; now 145 regs, a little faster

2a5a3261 · cmaffeo2 · 9b1b3888 · 2a5a3261 · 2a5a3261 · 2a5a3261
Commit 2a5a3261 authored 9 years ago by cmaffeo2
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -49,8 +49,10 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 	// RBTODO What about non-unit delta?
 	// RBTODO combine interp methods and reduce repetition! 
 	// Vector3 f = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
-	force[tid] = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
+	const ForceEnergy fe = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
-	const float r_val = rho->val[r_id];
+	force[tid] = fe.f;
+	const float r_val = rho->val[r_id]; /* move to beginning of script?  */
  // force[tid] = r_val*force[tid];
 	force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame */
 	// force[tid] = f;

--- a/RigidBodyController.h
+++ b/RigidBodyController.h
@@ -5,8 +5,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
-#define NUMTHREADS 128					/* try with 64, every 32+ */
+// #define NUMTHREADS 128					/* try with 64, every 32+ */
-// #define NUMTHREADS 64
+#define NUMTHREADS 64
 #define NUMSTREAMS 8
 class Configuration;

--- a/RigidBodyGrid.cu
+++ b/RigidBodyGrid.cu
@@ -364,7 +364,7 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(const Vector3& l) const {
 }
 /** interpolateForce() to be used on CUDA Device **/
-DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const {
+DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	Vector3 f;
 	// Vector3 l = basisInv.transform(pos - origin);
 	const int homeX = int(floor(l.x));
@@ -373,6 +373,10 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	const float wx = l.x - homeX;
 	const float wy = l.y - homeY;
 	const float wz = l.z - homeZ;
+	const float wx2 = wx*wx;
+	// RBTODO: test against cpu algorithm; also see if its the same algorithm used by NAMD
+	// RBTODO: test NAMD alg. for speed
 	/* f.x */
 	float g3[3][4];
@@ -388,55 +392,54 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 				v[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ?
 					0 : val[ind];
 			}
-			const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3]);
+			const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3])*wx2;
-			const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3]);
+			const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3])*wx;
 			const float a1 = 0.5f*(-v[0] + v[2]);
-			const float a0 = v[1];
+			g2[0][iy] = 3.0f*a3 + 2.0f*a2 + a1;				/* f.x (derivative) */
+			g2[1][iy] = a3*wx + a2*wx + a1*wx + v[1]; /* f.y & f.z */
-			g2[0][iy] = 3.0f*a3*wx*wx + 2.0f*a2*wx + a1; /* f.x (derivative) */
-			g2[1][iy] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; /* f.y & f.z */
 		}
 		// Mix along y.
 		{
-			const float a3 = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3]);
+			g3[0][iz] = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3])*wy*wy*wy +
-			const float a2 = 0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3]);
+				0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3])      *wy*wy +
-			const float a1 = 0.5f*(-g2[0][0] + g2[0][2]);
+				0.5f*(-g2[0][0] + g2[0][2])                                          *wy +
-			const float a0 = g2[0][1];
+				g2[0][1];
-			g3[0][iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; /* f.x */
 		}
 		{
-			const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3]);
+			const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3])*wy*wy;
-			const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3]);
+			const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3])*wy;
 			const float a1 = 0.5f*(-g2[1][0] + g2[1][2]);
-			const float a0 = g2[1][1];
+			g3[1][iz] = 3.0f*a3 + 2.0f*a2 + a1;						/* f.y */
-			g3[1][iz] = 3.0f*a3*wy*wy + 2.0f*a2*wy + a1; /* f.y */
+			g3[2][iz] = a3*wy + a2*wy + a1*wy + g2[1][1]; /* f.z */
-			g3[2][iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0;  /* f.z */
 		}
 	}
 	// Mix along z.
 	{
-		const float a3 = 0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3]);
+		f.x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz +
-		const float a2 = 0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3]);
+			-0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz +
-		const float a1 = 0.5f*(-g3[0][0] + g3[0][2]);
+			-0.5f*(-g3[0][0] + g3[0][2])                                    *wz -
-		const float a0 = g3[0][1];
+			g3[0][1];
-		f.x = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0);
 	}
 	{
-		const float a3 = 0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3]);
+		f.y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz +
-		const float a2 = 0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3]);
+			-0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz +
-		const float a1 = 0.5f*(-g3[1][0] + g3[1][2]);
+			-0.5f*(-g3[1][0] + g3[1][2])                                    *wz -
-		const float a0 = g3[1][1];
+			g3[1][1];
-		f.y = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0);
 	}
 	{
-		const float a3 = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3]);
+		f.z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz -
-		const float a2 = 0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]);
+			(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])      *wz -
-		const float a1 = 0.5f*(-g3[2][0] + g3[2][2]);
+			0.5f*(-g3[2][0] + g3[2][2]);
-		f.z = -(3.0f*a3*wz*wz + 2.0f*a2*wz + a1);
 	}
-	return f;
+	float e = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz*wz +
+		0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])    *wz*wz +
+		0.5f*(-g3[2][0] + g3[2][2])                                        *wz +
+		g3[2][1];
+	return ForceEnergy(f,e);
 }
@@ -472,7 +475,7 @@ DEVICE float RigidBodyGrid::interpolatePotentialLinearly(const Vector3& l) const
 	const float wz = l.z - homeZ;
 	return wz * (g3[1] - g3[0]) + g3[0];
 }
-DEVICE Vector3 RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const {
+DEVICE ForceEnergy RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const {
 	// Find the home node.
 	const int homeX = int(floor(l.x));
 	const int homeY = int(floor(l.y));
@@ -514,6 +517,8 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const
 	f.x = -(wz * (g3[0][1] - g3[0][0]) + g3[0][0]);
 	f.y = -(wz * (g3[1][1] - g3[1][0]) + g3[1][0]);
 	f.z = -      (g3[2][1] - g3[2][0]);
+	float e = wz * (g3[2][1] - g3[2][0]) + g3[2][0];
+	return ForceEnergy(f,e);
 }

--- a/RigidBodyGrid.h
+++ b/RigidBodyGrid.h
@@ -28,6 +28,14 @@ using namespace std;
 // RBTODO: integrate with existing grid code?
+class ForceEnergy {
+public:
+	DEVICE ForceEnergy(Vector3 f, float e) :
+		f(f), e(e) {};
+	Vector3 f;
+	float e;
+};
 class RigidBodyGrid { 
 	friend class SparseGrid;
 private:
@@ -140,7 +148,7 @@ public:
  /* virtual float getPotential(Vector3 pos) const; */
 	DEVICE float interpolatePotentialLinearly(const Vector3& l) const;
-	DEVICE Vector3 interpolateForceDLinearly(const Vector3& l) const;
+	DEVICE ForceEnergy interpolateForceDLinearly(const Vector3& l) const;
  HOST DEVICE float interpolatePotential(const Vector3& l) const;
@@ -157,7 +165,7 @@ public:
 	}
 	/** interpolateForce() to be used on CUDA Device **/
-	DEVICE Vector3 interpolateForceD(Vector3 l) const;
+	DEVICE ForceEnergy interpolateForceD(Vector3 l) const;
  // Wrap coordinate: 0 <= x < l
  HOST DEVICE inline float wrapFloat(float x, float l) const {