From 2a5a326176a29d8aeebb1b818fdb57978197dd1b Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Wed, 20 Jan 2016 19:36:50 -0600 Subject: [PATCH] slightly improved tricubic interpolation; now 145 regs, a little faster --- ComputeGridGrid.cuh | 6 ++-- RigidBodyController.h | 4 +-- RigidBodyGrid.cu | 71 +++++++++++++++++++++++-------------------- RigidBodyGrid.h | 12 ++++++-- 4 files changed, 54 insertions(+), 39 deletions(-) diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh index 35016cd..0e08ff7 100644 --- a/ComputeGridGrid.cuh +++ b/ComputeGridGrid.cuh @@ -49,8 +49,10 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, // RBTODO What about non-unit delta? // RBTODO combine interp methods and reduce repetition! // Vector3 f = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */ - force[tid] = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */ - const float r_val = rho->val[r_id]; + const ForceEnergy fe = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */ + force[tid] = fe.f; + + const float r_val = rho->val[r_id]; /* move to beginning of script? */ // force[tid] = r_val*force[tid]; force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame */ // force[tid] = f; diff --git a/RigidBodyController.h b/RigidBodyController.h index 41cbcbf..b214722 100644 --- a/RigidBodyController.h +++ b/RigidBodyController.h @@ -5,8 +5,8 @@ #include <cuda.h> #include <cuda_runtime.h> -#define NUMTHREADS 128 /* try with 64, every 32+ */ -// #define NUMTHREADS 64 +// #define NUMTHREADS 128 /* try with 64, every 32+ */ +#define NUMTHREADS 64 #define NUMSTREAMS 8 class Configuration; diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu index 1917329..0013e58 100644 --- a/RigidBodyGrid.cu +++ b/RigidBodyGrid.cu @@ -364,7 +364,7 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(const Vector3& l) const { } /** interpolateForce() to be used on CUDA Device **/ -DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { +DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const { Vector3 f; // Vector3 l = basisInv.transform(pos - origin); const int homeX = int(floor(l.x)); @@ -373,6 +373,10 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float wx = l.x - homeX; const float wy = l.y - homeY; const float wz = l.z - homeZ; + const float wx2 = wx*wx; + + // RBTODO: test against cpu algorithm; also see if its the same algorithm used by NAMD + // RBTODO: test NAMD alg. for speed /* f.x */ float g3[3][4]; @@ -388,55 +392,54 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { v[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ? 0 : val[ind]; } - const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3]); - const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3]); + const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3])*wx2; + const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3])*wx; const float a1 = 0.5f*(-v[0] + v[2]); - const float a0 = v[1]; - - g2[0][iy] = 3.0f*a3*wx*wx + 2.0f*a2*wx + a1; /* f.x (derivative) */ - g2[1][iy] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; /* f.y & f.z */ + g2[0][iy] = 3.0f*a3 + 2.0f*a2 + a1; /* f.x (derivative) */ + g2[1][iy] = a3*wx + a2*wx + a1*wx + v[1]; /* f.y & f.z */ } // Mix along y. { - const float a3 = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3]); - const float a2 = 0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3]); - const float a1 = 0.5f*(-g2[0][0] + g2[0][2]); - const float a0 = g2[0][1]; - g3[0][iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; /* f.x */ + g3[0][iz] = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3])*wy*wy*wy + + 0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3]) *wy*wy + + 0.5f*(-g2[0][0] + g2[0][2]) *wy + + g2[0][1]; } { - const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3]); - const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3]); + const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3])*wy*wy; + const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3])*wy; const float a1 = 0.5f*(-g2[1][0] + g2[1][2]); - const float a0 = g2[1][1]; - g3[1][iz] = 3.0f*a3*wy*wy + 2.0f*a2*wy + a1; /* f.y */ - g3[2][iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; /* f.z */ + g3[1][iz] = 3.0f*a3 + 2.0f*a2 + a1; /* f.y */ + g3[2][iz] = a3*wy + a2*wy + a1*wy + g2[1][1]; /* f.z */ } } + // Mix along z. { - const float a3 = 0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3]); - const float a2 = 0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3]); - const float a1 = 0.5f*(-g3[0][0] + g3[0][2]); - const float a0 = g3[0][1]; - f.x = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0); + f.x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz + + -0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz + + -0.5f*(-g3[0][0] + g3[0][2]) *wz - + g3[0][1]; } { - const float a3 = 0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3]); - const float a2 = 0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3]); - const float a1 = 0.5f*(-g3[1][0] + g3[1][2]); - const float a0 = g3[1][1]; - f.y = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0); + f.y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz + + -0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz + + -0.5f*(-g3[1][0] + g3[1][2]) *wz - + g3[1][1]; } { - const float a3 = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3]); - const float a2 = 0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]); - const float a1 = 0.5f*(-g3[2][0] + g3[2][2]); - f.z = -(3.0f*a3*wz*wz + 2.0f*a2*wz + a1); + f.z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz - + (2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]) *wz - + 0.5f*(-g3[2][0] + g3[2][2]); } - return f; + float e = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz*wz + + 0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3]) *wz*wz + + 0.5f*(-g3[2][0] + g3[2][2]) *wz + + g3[2][1]; + + return ForceEnergy(f,e); } @@ -472,7 +475,7 @@ DEVICE float RigidBodyGrid::interpolatePotentialLinearly(const Vector3& l) const const float wz = l.z - homeZ; return wz * (g3[1] - g3[0]) + g3[0]; } -DEVICE Vector3 RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const { +DEVICE ForceEnergy RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const { // Find the home node. const int homeX = int(floor(l.x)); const int homeY = int(floor(l.y)); @@ -514,6 +517,8 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const f.x = -(wz * (g3[0][1] - g3[0][0]) + g3[0][0]); f.y = -(wz * (g3[1][1] - g3[1][0]) + g3[1][0]); f.z = - (g3[2][1] - g3[2][0]); + float e = wz * (g3[2][1] - g3[2][0]) + g3[2][0]; + return ForceEnergy(f,e); } diff --git a/RigidBodyGrid.h b/RigidBodyGrid.h index ec514f4..d99a73f 100644 --- a/RigidBodyGrid.h +++ b/RigidBodyGrid.h @@ -28,6 +28,14 @@ using namespace std; // RBTODO: integrate with existing grid code? +class ForceEnergy { +public: + DEVICE ForceEnergy(Vector3 f, float e) : + f(f), e(e) {}; + Vector3 f; + float e; +}; + class RigidBodyGrid { friend class SparseGrid; private: @@ -140,7 +148,7 @@ public: /* virtual float getPotential(Vector3 pos) const; */ DEVICE float interpolatePotentialLinearly(const Vector3& l) const; - DEVICE Vector3 interpolateForceDLinearly(const Vector3& l) const; + DEVICE ForceEnergy interpolateForceDLinearly(const Vector3& l) const; HOST DEVICE float interpolatePotential(const Vector3& l) const; @@ -157,7 +165,7 @@ public: } /** interpolateForce() to be used on CUDA Device **/ - DEVICE Vector3 interpolateForceD(Vector3 l) const; + DEVICE ForceEnergy interpolateForceD(Vector3 l) const; // Wrap coordinate: 0 <= x < l HOST DEVICE inline float wrapFloat(float x, float l) const { -- GitLab