Skip to content
Snippets Groups Projects
RigidBodyGrid.cu 26.57 KiB
//////////////////////////////////////////////////////////////////////
// Grid base class that does just the basics.
// Author: Jeff Comer <jcomer2@illinois.edu>

#include "RigidBodyGrid.h"
#include <cuda.h>

#define STRLEN 512

	/*                               \
	| CONSTRUCTORS, DESTRUCTORS, I/O |
	\===============================*/

RigidBodyGrid::RigidBodyGrid() {
	RigidBodyGrid tmp(1,1,1);
	val = new float[1];
	*this = tmp;									// TODO: verify that this is OK
}

// The most obvious of constructors.
RigidBodyGrid::RigidBodyGrid(int nx0, int ny0, int nz0) {
	nx = abs(nx0);
	ny = abs(ny0);
	nz = abs(nz0);
	
	val = new float[nx*ny*nz];
	zero();
}

RigidBodyGrid::RigidBodyGrid(const BaseGrid& g) {
	nx = g.nx;
	ny = g.ny;
	nz = g.nz;
	
	val = new float[nx*ny*nz];
	for (int i = 0; i < nx*ny*nz; i++) val[i] = g.val[i];
}

// Make an exact copy of a grid.
RigidBodyGrid::RigidBodyGrid(const RigidBodyGrid& g) {
	nx = g.nx;
	ny = g.ny;
	nz = g.nz;
	
	val = new float[nx*ny*nz];
	for (int i = 0; i < nx*ny*nz; i++) val[i] = g.val[i];
}

RigidBodyGrid RigidBodyGrid::mult(const RigidBodyGrid& g) {
	for (int i = 0; i < nx*ny*nz; i++) val[i] *= g.val[i];
	return *this;
}

RigidBodyGrid& RigidBodyGrid::operator=(const RigidBodyGrid& g) {
	if(val!=NULL) 
            delete[] val;
	val = NULL;
	nx = g.nx;
	ny = g.ny;
	nz = g.nz;
	
	val = new float[nx*ny*nz];
	for (int i = 0; i < nx*ny*nz; i++) val[i] = g.val[i];

	return *this;
}

RigidBodyGrid::~RigidBodyGrid() {
	if (val != NULL)
        {
		delete[] val;
                val = NULL;
        }
}

void RigidBodyGrid::zero() {
	for (int i = 0; i < nx*ny*nz; i++) val[i] = 0.0f;
}

bool RigidBodyGrid::setValue(int j, float v) {
	if (j < 0 || j >= nx*ny*nz) return false;
	val[j] = v;
	return true;
}

bool RigidBodyGrid::setValue(int ix, int iy, int iz, float v) {
	if (ix < 0 || ix >= nx) return false;
	if (iy < 0 || iy >= ny) return false;
	if (iz < 0 || iz >= nz) return false;
	int j = iz + iy*nz + ix*ny*nz;

	val[j] = v;
	return true;
}

float RigidBodyGrid::getValue(int j) const {

	if (j < 0 || j >= nx*ny*nz) return 0.0f;
	return val[j];
/*
    Vector3 idx = getPosition(j)
    return getValue(idx.x,idx.y,idx.z);
*/
}

HOST DEVICE float RigidBodyGrid::getValue(int ix, int iy, int iz) const {
/*
           if(ix < 0) ix = 0;
           else if(ix >= nx) ix = nx -1;

           if(iy < 0) iy = 0;
           else if(iy >= ny) iy = ny-1;

           if(iz < 0) iz = 0;
           else if(iz >= nz) iz = nz-1;

           int j = iz + nz * (iy + ny * ix);
           return val[j];
*/

	if (ix < 0 || ix >= nx) return 0.0f;
	if (iy < 0 || iy >= ny) return 0.0f;
	if (iz < 0 || iz >= nz) return 0.0f;
	
	int j = iz + iy*nz + ix*ny*nz;
	return val[j];

}

Vector3 RigidBodyGrid::getPosition(const int j) const {
	/* const int iz = j%nz; */
	/* const int iy = (j/nz)%ny; */
	/* const int ix = j/(nz*ny); */
	const int jy = j/nz;
	const int jx = jy/ny;

	const int iz = j - jy*nz;
	const int iy = jy - jx*ny;
	// const int ix = jx;

	return Vector3(jx,iy,iz);
}

Vector3 RigidBodyGrid::getPosition(int j, Matrix3 basis, Vector3 origin) const {
	int iz = j%nz;
	int iy = (j/nz)%ny;
	int ix = j/(nz*ny);

	return basis.transform(Vector3(ix, iy, iz)) + origin;
}

IndexList RigidBodyGrid::index(int j) const {
	int iz = j%nz;
	int iy = (j/nz)%ny;
	int ix = j/(nz*ny);
	IndexList ret;
	ret.add(ix);
	ret.add(iy);
	ret.add(iz);
	return ret;
}
int RigidBodyGrid::indexX(int j) const { return j/(nz*ny); }
int RigidBodyGrid::indexY(int j) const { return (j/nz)%ny; }
int RigidBodyGrid::indexZ(int j) const { return j%nz; }
int RigidBodyGrid::index(int ix, int iy, int iz) const { return iz + iy*nz + ix*ny*nz; }

// Add a fixed value to the grid.
void RigidBodyGrid::shift(float s) {
	for (int i = 0; i < nx*ny*nz; i++) val[i] += s;
}

// Multiply the grid by a fixed value.
void RigidBodyGrid::scale(float s) {
	for (int i = 0; i < nx*ny*nz; i++) val[i] *= s;
}

/** interpolateForce() to be used on CUDA Device **/
DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const {
	Vector3 f;
	// Vector3 l = basisInv.transform(pos - origin);
	const int homeX = int(floor(l.x));
	const int homeY = int(floor(l.y));
	const int homeZ = int(floor(l.z));
	const float wx = l.x - homeX;
	const float wy = l.y - homeY;
	const float wz = l.z - homeZ;
	const float wx2 = wx*wx;

	/* f.x */
	float g3[3][4];
	for (int iz = 0; iz < 4; iz++) {
		float g2[2][4];
		const int jz = (iz + homeZ - 1);
		for (int iy = 0; iy < 4; iy++) {
			float v[4];
			const int jy = (iy + homeY - 1);
			for (int ix = 0; ix < 4; ix++) {
				const int jx = (ix + homeX - 1);
				const int ind = jz + jy*nz + jx*nz*ny;
				v[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ?
					0 : val[ind];
			}
			const float a3 = 0.5f*(-v[0] + 3.0f*v[1] - 3.0f*v[2] + v[3])*wx2;
			const float a2 = 0.5f*(2.0f*v[0] - 5.0f*v[1] + 4.0f*v[2] - v[3])*wx;
			const float a1 = 0.5f*(-v[0] + v[2]);
			g2[0][iy] = 3.0f*a3 + 2.0f*a2 + a1;				/* f.x (derivative) */
			g2[1][iy] = a3*wx + a2*wx + a1*wx + v[1]; /* f.y & f.z */
		}

		// Mix along y.
		{
			g3[0][iz] = 0.5f*(-g2[0][0] + 3.0f*g2[0][1] - 3.0f*g2[0][2] + g2[0][3])*wy*wy*wy +
				0.5f*(2.0f*g2[0][0] - 5.0f*g2[0][1] + 4.0f*g2[0][2] - g2[0][3])      *wy*wy +
				0.5f*(-g2[0][0] + g2[0][2])                                          *wy +
				g2[0][1];
		}

		{
			const float a3 = 0.5f*(-g2[1][0] + 3.0f*g2[1][1] - 3.0f*g2[1][2] + g2[1][3])*wy*wy;
			const float a2 = 0.5f*(2.0f*g2[1][0] - 5.0f*g2[1][1] + 4.0f*g2[1][2] - g2[1][3])*wy;
			const float a1 = 0.5f*(-g2[1][0] + g2[1][2]);
			g3[1][iz] = 3.0f*a3 + 2.0f*a2 + a1;						/* f.y */
			g3[2][iz] = a3*wy + a2*wy + a1*wy + g2[1][1]; /* f.z */
		}
	}

	// Mix along z.
	f.x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz +
		-0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz +
		-0.5f*(-g3[0][0] + g3[0][2])                                    *wz -
		g3[0][1];
	f.y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz +
		-0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz +
		-0.5f*(-g3[1][0] + g3[1][2])                                    *wz -
		g3[1][1];
	f.z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz -
		(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])      *wz -
		0.5f*(-g3[2][0] + g3[2][2]);
	float e = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz*wz +
		0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])    *wz*wz +
		0.5f*(-g3[2][0] + g3[2][2])                                        *wz +
		g3[2][1];
	
	return ForceEnergy(f,e);
}
//#define cubic
DEVICE ForceEnergy RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const {
//#ifdef cubic
//return interpolateForceD(l);
//#elif defined(cubic_namd)
//return interpolateForceDnamd(l);
//#else
	// Find the home node.
	const int homeX = int(floor(l.x));
	const int homeY = int(floor(l.y));
	const int homeZ = int(floor(l.z));

	Vector3 f;

	const float wx = l.x - homeX;
	const float wy = l.y - homeY;	
	const float wz = l.z - homeZ;

	float v[2][2][2];
	for (int iz = 0; iz < 2; iz++) {
		int jz = (iz + homeZ);
		for (int iy = 0; iy < 2; iy++) {
			int jy = (iy + homeY);
			for (int ix = 0; ix < 2; ix++) {
				int jx = (ix + homeX);
				int ind = jz + jy*nz + jx*nz*ny;
				v[ix][iy][iz] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ?
					0 : val[ind];
			}
		}
	}

	float g3[3][2];
	for (int iz = 0; iz < 2; iz++) {
		float g2[2][2];
		for (int iy = 0; iy < 2; iy++) {
			g2[0][iy] = (v[1][iy][iz] - v[0][iy][iz]); /* f.x */
			g2[1][iy] = wx * (v[1][iy][iz] - v[0][iy][iz]) + v[0][iy][iz]; /* f.y & f.z */
		}
		// Mix along y.
		g3[0][iz] = wy * (g2[0][1] - g2[0][0]) + g2[0][0];
		g3[1][iz] = (g2[1][1] - g2[1][0]);
		g3[2][iz] = wy * (g2[1][1] - g2[1][0]) + g2[1][0];
	}
	// Mix along z.
	f.x = -(wz * (g3[0][1] - g3[0][0]) + g3[0][0]);
	f.y = -(wz * (g3[1][1] - g3[1][0]) + g3[1][0]);
	f.z = -      (g3[2][1] - g3[2][0]);
	float e = wz * (g3[2][1] - g3[2][0]) + g3[2][0];
	return ForceEnergy(f,e);
//#endif
}
DEVICE ForceEnergy RigidBodyGrid::interpolateForceDnamd(const Vector3& l) const
{
                Vector3 f;
                //const Vector3 l = basisInv.transform(pos - origin);

                const int homeX = int(floor(l.x));
                const int homeY = int(floor(l.y));
                const int homeZ = int(floor(l.z));
                const float wx = l.x - homeX;
                const float wy = l.y - homeY;
                const float wz = l.z - homeZ;

                Vector3 dg = Vector3(wx,wy,wz);

                int inds[3];
                inds[0] = homeX;
                inds[1] = homeY;
                inds[2] = homeZ;

                // TODO: handle edges

                // Compute b
                                   float b[64];    // Matrix of values at 8 box corners
                compute_b(b, inds);

                // Compute a
                                   float a[64];
                compute_a(a, b);

                // Calculate powers of x, y, z for later use
                                   // e.g. x[2] = x^2
                                                      float x[4], y[4], z[4];
                x[0] = 1; y[0] = 1; z[0] = 1;
                for (int j = 1; j < 4; j++) {
                    x[j] = x[j-1] * dg.x;
                    y[j] = y[j-1] * dg.y;
                    z[j] = z[j-1] * dg.z;
                }

                float e = compute_V(a, x, y, z);
                f = compute_dV(a, x, y, z);

                //f = basisInv.transpose().transform(f);
                return ForceEnergy(f,e);
        }

DEVICE float RigidBodyGrid::compute_V(float *a, float *x, float *y, float *z) const
        {
            float V = 0.0;
            long int ind = 0;
            for (int l = 0; l < 4; l++) {
                for (int k = 0; k < 4; k++) {
                    for (int j = 0; j < 4; j++) {
                        V += a[ind] * x[j] * y[k] * z[l];
                        ind++;
                    }
                }
            }
            return V;
        }
DEVICE Vector3 RigidBodyGrid::compute_dV(float *a, float *x, float *y, float *z) const
        {
            Vector3 dV = Vector3(0.0f);
            long int ind = 0;
            for (int l = 0; l < 4; l++) {
                for (int k = 0; k < 4; k++) {
                    for (int j = 0; j < 4; j++) {
                        if (j > 0) dV.x += a[ind] * j * x[j-1] * y[k]   * z[l];         // dV/dx
                        if (k > 0) dV.y += a[ind] * k * x[j]   * y[k-1] * z[l];         // dV/dy
                        if (l > 0) dV.z += a[ind] * l * x[j]   * y[k]   * z[l-1];       // dV/dz
                        ind++;
                    }
                }
            }
            return dV*(-1.f);
        }
DEVICE void RigidBodyGrid::compute_a(float *a, float *b) const
        {
            // Static sparse 64x64 matrix times vector ... nicer looking way than this?
            a[0] = b[0];
            a[1] = b[8];
            a[2] = -3*b[0] + 3*b[1] - 2*b[8] - b[9];
            a[3] = 2*b[0] - 2*b[1] + b[8] + b[9];
            a[4] = b[16];
            a[5] = b[32];
            a[6] = -3*b[16] + 3*b[17] - 2*b[32] - b[33];
            a[7] = 2*b[16] - 2*b[17] + b[32] + b[33];
            a[8] = -3*b[0] + 3*b[2] - 2*b[16] - b[18];
            a[9] = -3*b[8] + 3*b[10] - 2*b[32] - b[34];
            a[10] = 9*b[0] - 9*b[1] - 9*b[2] + 9*b[3] + 6*b[8] + 3*b[9] - 6*b[10] - 3*b[11]
                + 6*b[16] - 6*b[17] + 3*b[18] - 3*b[19] + 4*b[32] + 2*b[33] + 2*b[34] + b[35];
            a[11] = -6*b[0] + 6*b[1] + 6*b[2] - 6*b[3] - 3*b[8] - 3*b[9] + 3*b[10] + 3*b[11]
                - 4*b[16] + 4*b[17] - 2*b[18] + 2*b[19] - 2*b[32] - 2*b[33] - b[34] - b[35];
            a[12] = 2*b[0] - 2*b[2] + b[16] + b[18];
            a[13] = 2*b[8] - 2*b[10] + b[32] + b[34];
            a[14] = -6*b[0] + 6*b[1] + 6*b[2] - 6*b[3] - 4*b[8] - 2*b[9] + 4*b[10] + 2*b[11]
                - 3*b[16] + 3*b[17] - 3*b[18] + 3*b[19] - 2*b[32] - b[33] - 2*b[34] - b[35];
            a[15] = 4*b[0] - 4*b[1] - 4*b[2] + 4*b[3] + 2*b[8] + 2*b[9] - 2*b[10] - 2*b[11]
                + 2*b[16] - 2*b[17] + 2*b[18] - 2*b[19] + b[32] + b[33] + b[34] + b[35];
            a[16] = b[24];
            a[17] = b[40];
            a[18] = -3*b[24] + 3*b[25] - 2*b[40] - b[41];
            a[19] = 2*b[24] - 2*b[25] + b[40] + b[41];
            a[20] = b[48];
            a[21] = b[56];
            a[22] = -3*b[48] + 3*b[49] - 2*b[56] - b[57];
            a[23] = 2*b[48] - 2*b[49] + b[56] + b[57];
            a[24] = -3*b[24] + 3*b[26] - 2*b[48] - b[50];
            a[25] = -3*b[40] + 3*b[42] - 2*b[56] - b[58];
            a[26] = 9*b[24] - 9*b[25] - 9*b[26] + 9*b[27] + 6*b[40] + 3*b[41] - 6*b[42] - 3*b[43]
                + 6*b[48] - 6*b[49] + 3*b[50] - 3*b[51] + 4*b[56] + 2*b[57] + 2*b[58] + b[59];
            a[27] = -6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 3*b[40] - 3*b[41] + 3*b[42] + 3*b[43]
                - 4*b[48] + 4*b[49] - 2*b[50] + 2*b[51] - 2*b[56] - 2*b[57] - b[58] - b[59];
            a[28] = 2*b[24] - 2*b[26] + b[48] + b[50];
            a[29] = 2*b[40] - 2*b[42] + b[56] + b[58];
            a[30] = -6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 4*b[40] - 2*b[41] + 4*b[42] + 2*b[43]
                - 3*b[48] + 3*b[49] - 3*b[50] + 3*b[51] - 2*b[56] - b[57] - 2*b[58] - b[59];
            a[31] = 4*b[24] - 4*b[25] - 4*b[26] + 4*b[27] + 2*b[40] + 2*b[41] - 2*b[42] - 2*b[43]
                + 2*b[48] - 2*b[49] + 2*b[50] - 2*b[51] + b[56] + b[57] + b[58] + b[59];
            a[32] = -3*b[0] + 3*b[4] - 2*b[24] - b[28];
            a[33] = -3*b[8] + 3*b[12] - 2*b[40] - b[44];
            a[34] = 9*b[0] - 9*b[1] - 9*b[4] + 9*b[5] + 6*b[8] + 3*b[9] - 6*b[12] - 3*b[13]
                + 6*b[24] - 6*b[25] + 3*b[28] - 3*b[29] + 4*b[40] + 2*b[41] + 2*b[44] + b[45];
            a[35] = -6*b[0] + 6*b[1] + 6*b[4] - 6*b[5] - 3*b[8] - 3*b[9] + 3*b[12] + 3*b[13]
                - 4*b[24] + 4*b[25] - 2*b[28] + 2*b[29] - 2*b[40] - 2*b[41] - b[44] - b[45];
            a[36] = -3*b[16] + 3*b[20] - 2*b[48] - b[52];
            a[37] = -3*b[32] + 3*b[36] - 2*b[56] - b[60];
            a[38] = 9*b[16] - 9*b[17] - 9*b[20] + 9*b[21] + 6*b[32] + 3*b[33] - 6*b[36] - 3*b[37]
                + 6*b[48] - 6*b[49] + 3*b[52] - 3*b[53] + 4*b[56] + 2*b[57] + 2*b[60] + b[61];
            a[39] = -6*b[16] + 6*b[17] + 6*b[20] - 6*b[21] - 3*b[32] - 3*b[33] + 3*b[36] + 3*b[37]
                - 4*b[48] + 4*b[49] - 2*b[52] + 2*b[53] - 2*b[56] - 2*b[57] - b[60] - b[61];
            a[40] = 9*b[0] - 9*b[2] - 9*b[4] + 9*b[6] + 6*b[16] + 3*b[18] - 6*b[20] - 3*b[22]
                + 6*b[24] - 6*b[26] + 3*b[28] - 3*b[30] + 4*b[48] + 2*b[50] + 2*b[52] + b[54];
            a[41] = 9*b[8] - 9*b[10] - 9*b[12] + 9*b[14] + 6*b[32] + 3*b[34] - 6*b[36] - 3*b[38]
                + 6*b[40] - 6*b[42] + 3*b[44] - 3*b[46] + 4*b[56] + 2*b[58] + 2*b[60] + b[62];
            a[42] = -27*b[0] + 27*b[1] + 27*b[2] - 27*b[3] + 27*b[4] - 27*b[5] - 27*b[6] + 27*b[7]
                - 18*b[8] - 9*b[9] + 18*b[10] + 9*b[11] + 18*b[12] + 9*b[13] - 18*b[14] - 9*b[15]
                - 18*b[16] + 18*b[17] - 9*b[18] + 9*b[19] + 18*b[20] - 18*b[21] + 9*b[22] - 9*b[23]
                - 18*b[24] + 18*b[25] + 18*b[26] - 18*b[27] - 9*b[28] + 9*b[29] + 9*b[30] - 9*b[31]
                - 12*b[32] - 6*b[33] - 6*b[34] - 3*b[35] + 12*b[36] + 6*b[37] + 6*b[38] + 3*b[39]
                - 12*b[40] - 6*b[41] + 12*b[42] + 6*b[43] - 6*b[44] - 3*b[45] + 6*b[46] + 3*b[47]
                - 12*b[48] + 12*b[49] - 6*b[50] + 6*b[51] - 6*b[52] + 6*b[53] - 3*b[54] + 3*b[55]
                - 8*b[56] - 4*b[57] - 4*b[58] - 2*b[59] - 4*b[60] - 2*b[61] - 2*b[62] - b[63];
            a[43] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
                + 9*b[8] + 9*b[9] - 9*b[10] - 9*b[11] - 9*b[12] - 9*b[13] + 9*b[14] + 9*b[15]
                + 12*b[16] - 12*b[17] + 6*b[18] - 6*b[19] - 12*b[20] + 12*b[21] - 6*b[22] + 6*b[23]
                + 12*b[24] - 12*b[25] - 12*b[26] + 12*b[27] + 6*b[28] - 6*b[29] - 6*b[30] + 6*b[31]
                + 6*b[32] + 6*b[33] + 3*b[34] + 3*b[35] - 6*b[36] - 6*b[37] - 3*b[38] - 3*b[39]
                + 6*b[40] + 6*b[41] - 6*b[42] - 6*b[43] + 3*b[44] + 3*b[45] - 3*b[46] - 3*b[47]
                + 8*b[48] - 8*b[49] + 4*b[50] - 4*b[51] + 4*b[52] - 4*b[53] + 2*b[54] - 2*b[55]
                + 4*b[56] + 4*b[57] + 2*b[58] + 2*b[59] + 2*b[60] + 2*b[61] + b[62] + b[63];
            a[44] = -6*b[0] + 6*b[2] + 6*b[4] - 6*b[6] - 3*b[16] - 3*b[18] + 3*b[20] + 3*b[22]
                - 4*b[24] + 4*b[26] - 2*b[28] + 2*b[30] - 2*b[48] - 2*b[50] - b[52] - b[54];
            a[45] = -6*b[8] + 6*b[10] + 6*b[12] - 6*b[14] - 3*b[32] - 3*b[34] + 3*b[36] + 3*b[38]
                - 4*b[40] + 4*b[42] - 2*b[44] + 2*b[46] - 2*b[56] - 2*b[58] - b[60] - b[62];
            a[46] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
                + 12*b[8] + 6*b[9] - 12*b[10] - 6*b[11] - 12*b[12] - 6*b[13] + 12*b[14] + 6*b[15]
                + 9*b[16] - 9*b[17] + 9*b[18] - 9*b[19] - 9*b[20] + 9*b[21] - 9*b[22] + 9*b[23]
                + 12*b[24] - 12*b[25] - 12*b[26] + 12*b[27] + 6*b[28] - 6*b[29] - 6*b[30] + 6*b[31]
                + 6*b[32] + 3*b[33] + 6*b[34] + 3*b[35] - 6*b[36] - 3*b[37] - 6*b[38] - 3*b[39]
                + 8*b[40] + 4*b[41] - 8*b[42] - 4*b[43] + 4*b[44] + 2*b[45] - 4*b[46] - 2*b[47]
                + 6*b[48] - 6*b[49] + 6*b[50] - 6*b[51] + 3*b[52] - 3*b[53] + 3*b[54] - 3*b[55]
                + 4*b[56] + 2*b[57] + 4*b[58] + 2*b[59] + 2*b[60] + b[61] + 2*b[62] + b[63];
            a[47] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
                - 6*b[8] - 6*b[9] + 6*b[10] + 6*b[11] + 6*b[12] + 6*b[13] - 6*b[14] - 6*b[15]
                - 6*b[16] + 6*b[17] - 6*b[18] + 6*b[19] + 6*b[20] - 6*b[21] + 6*b[22] - 6*b[23]
                - 8*b[24] + 8*b[25] + 8*b[26] - 8*b[27] - 4*b[28] + 4*b[29] + 4*b[30] - 4*b[31]
                - 3*b[32] - 3*b[33] - 3*b[34] - 3*b[35] + 3*b[36] + 3*b[37] + 3*b[38] + 3*b[39]
                - 4*b[40] - 4*b[41] + 4*b[42] + 4*b[43] - 2*b[44] - 2*b[45] + 2*b[46] + 2*b[47]
                - 4*b[48] + 4*b[49] - 4*b[50] + 4*b[51] - 2*b[52] + 2*b[53] - 2*b[54] + 2*b[55]
                - 2*b[56] - 2*b[57] - 2*b[58] - 2*b[59] - b[60] - b[61] - b[62] - b[63];
            a[48] = 2*b[0] - 2*b[4] + b[24] + b[28];
            a[49] = 2*b[8] - 2*b[12] + b[40] + b[44];
            a[50] = -6*b[0] + 6*b[1] + 6*b[4] - 6*b[5] - 4*b[8] - 2*b[9] + 4*b[12] + 2*b[13]
                - 3*b[24] + 3*b[25] - 3*b[28] + 3*b[29] - 2*b[40] - b[41] - 2*b[44] - b[45];
            a[51] = 4*b[0] - 4*b[1] - 4*b[4] + 4*b[5] + 2*b[8] + 2*b[9] - 2*b[12] - 2*b[13]
                + 2*b[24] - 2*b[25] + 2*b[28] - 2*b[29] + b[40] + b[41] + b[44] + b[45];
            a[52] = 2*b[16] - 2*b[20] + b[48] + b[52];
            a[53] = 2*b[32] - 2*b[36] + b[56] + b[60];
            a[54] = -6*b[16] + 6*b[17] + 6*b[20] - 6*b[21] - 4*b[32] - 2*b[33] + 4*b[36] + 2*b[37]
                - 3*b[48] + 3*b[49] - 3*b[52] + 3*b[53] - 2*b[56] - b[57] - 2*b[60] - b[61];
            a[55] = 4*b[16] - 4*b[17] - 4*b[20] + 4*b[21] + 2*b[32] + 2*b[33] - 2*b[36] - 2*b[37]
                + 2*b[48] - 2*b[49] + 2*b[52] - 2*b[53] + b[56] + b[57] + b[60] + b[61];
            a[56] = -6*b[0] + 6*b[2] + 6*b[4] - 6*b[6] - 4*b[16] - 2*b[18] + 4*b[20] + 2*b[22]
                - 3*b[24] + 3*b[26] - 3*b[28] + 3*b[30] - 2*b[48] - b[50] - 2*b[52] - b[54];
            a[57] = -6*b[8] + 6*b[10] + 6*b[12] - 6*b[14] - 4*b[32] - 2*b[34] + 4*b[36] + 2*b[38]
                - 3*b[40] + 3*b[42] - 3*b[44] + 3*b[46] - 2*b[56] - b[58] - 2*b[60] - b[62];
           a[58] = 18*b[0] - 18*b[1] - 18*b[2] + 18*b[3] - 18*b[4] + 18*b[5] + 18*b[6] - 18*b[7]
                + 12*b[8] + 6*b[9] - 12*b[10] - 6*b[11] - 12*b[12] - 6*b[13] + 12*b[14] + 6*b[15]
                + 12*b[16] - 12*b[17] + 6*b[18] - 6*b[19] - 12*b[20] + 12*b[21] - 6*b[22] + 6*b[23]
                + 9*b[24] - 9*b[25] - 9*b[26] + 9*b[27] + 9*b[28] - 9*b[29] - 9*b[30] + 9*b[31]
                + 8*b[32] + 4*b[33] + 4*b[34] + 2*b[35] - 8*b[36] - 4*b[37] - 4*b[38] - 2*b[39]
                + 6*b[40] + 3*b[41] - 6*b[42] - 3*b[43] + 6*b[44] + 3*b[45] - 6*b[46] - 3*b[47]
                + 6*b[48] - 6*b[49] + 3*b[50] - 3*b[51] + 6*b[52] - 6*b[53] + 3*b[54] - 3*b[55]
                + 4*b[56] + 2*b[57] + 2*b[58] + b[59] + 4*b[60] + 2*b[61] + 2*b[62] + b[63];
            a[59] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
                - 6*b[8] - 6*b[9] + 6*b[10] + 6*b[11] + 6*b[12] + 6*b[13] - 6*b[14] - 6*b[15]
                - 8*b[16] + 8*b[17] - 4*b[18] + 4*b[19] + 8*b[20] - 8*b[21] + 4*b[22] - 4*b[23]
                - 6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 6*b[28] + 6*b[29] + 6*b[30] - 6*b[31]
                - 4*b[32] - 4*b[33] - 2*b[34] - 2*b[35] + 4*b[36] + 4*b[37] + 2*b[38] + 2*b[39]
                - 3*b[40] - 3*b[41] + 3*b[42] + 3*b[43] - 3*b[44] - 3*b[45] + 3*b[46] + 3*b[47]
                - 4*b[48] + 4*b[49] - 2*b[50] + 2*b[51] - 4*b[52] + 4*b[53] - 2*b[54] + 2*b[55]
                - 2*b[56] - 2*b[57] - b[58] - b[59] - 2*b[60] - 2*b[61] - b[62] - b[63];
            a[60] = 4*b[0] - 4*b[2] - 4*b[4] + 4*b[6] + 2*b[16] + 2*b[18] - 2*b[20] - 2*b[22]
                + 2*b[24] - 2*b[26] + 2*b[28] - 2*b[30] + b[48] + b[50] + b[52] + b[54];
            a[61] = 4*b[8] - 4*b[10] - 4*b[12] + 4*b[14] + 2*b[32] + 2*b[34] - 2*b[36] - 2*b[38]
                + 2*b[40] - 2*b[42] + 2*b[44] - 2*b[46] + b[56] + b[58] + b[60] + b[62];
            a[62] = -12*b[0] + 12*b[1] + 12*b[2] - 12*b[3] + 12*b[4] - 12*b[5] - 12*b[6] + 12*b[7]
                - 8*b[8] - 4*b[9] + 8*b[10] + 4*b[11] + 8*b[12] + 4*b[13] - 8*b[14] - 4*b[15]
                - 6*b[16] + 6*b[17] - 6*b[18] + 6*b[19] + 6*b[20] - 6*b[21] + 6*b[22] - 6*b[23]
                - 6*b[24] + 6*b[25] + 6*b[26] - 6*b[27] - 6*b[28] + 6*b[29] + 6*b[30] - 6*b[31]
                - 4*b[32] - 2*b[33] - 4*b[34] - 2*b[35] + 4*b[36] + 2*b[37] + 4*b[38] + 2*b[39]
                - 4*b[40] - 2*b[41] + 4*b[42] + 2*b[43] - 4*b[44] - 2*b[45] + 4*b[46] + 2*b[47]
                - 3*b[48] + 3*b[49] - 3*b[50] + 3*b[51] - 3*b[52] + 3*b[53] - 3*b[54] + 3*b[55]
                - 2*b[56] - b[57] - 2*b[58] - b[59] - 2*b[60] - b[61] - 2*b[62] - b[63];
            a[63] = 8*b[0] - 8*b[1] - 8*b[2] + 8*b[3] - 8*b[4] + 8*b[5] + 8*b[6] - 8*b[7]
                + 4*b[8] + 4*b[9] - 4*b[10] - 4*b[11] - 4*b[12] - 4*b[13] + 4*b[14] + 4*b[15]
                + 4*b[16] - 4*b[17] + 4*b[18] - 4*b[19] - 4*b[20] + 4*b[21] - 4*b[22] + 4*b[23]
                + 4*b[24] - 4*b[25] - 4*b[26] + 4*b[27] + 4*b[28] - 4*b[29] - 4*b[30] + 4*b[31]
                + 2*b[32] + 2*b[33] + 2*b[34] + 2*b[35] - 2*b[36] - 2*b[37] - 2*b[38] - 2*b[39]
                + 2*b[40] + 2*b[41] - 2*b[42] - 2*b[43] + 2*b[44] + 2*b[45] - 2*b[46] - 2*b[47]
                + 2*b[48] - 2*b[49] + 2*b[50] - 2*b[51] + 2*b[52] - 2*b[53] + 2*b[54] - 2*b[55]
                + b[56] + b[57] + b[58] + b[59] + b[60] + b[61] + b[62] + b[63];
        }
DEVICE void RigidBodyGrid::compute_b(float * __restrict__ b, int * __restrict__ inds) const
        {
            int k[3];
            k[0] = nx;
            k[1] = ny;
            k[2] = nz;

            int inds2[3] = {0,0,0};

            for (int i0 = 0; i0 < 8; i0++) {
                inds2[0] = 0;
                inds2[1] = 0;
                inds2[2] = 0;

                /* printf("%d\n", inds2[0]); */
                /* printf("%d\n", inds2[1]); */
                /* printf("%d\n", inds2[2]); */

                bool zero_derivs = false;

                int bit = 1;    // bit = 2^i1 in the below loop
                for (int i1 = 0; i1 < 3; i1++) {
                    inds2[i1] = (inds[i1] + ((i0 & bit) ? 1 : 0)) % k[i1];
                    bit <<= 1;  // i.e. multiply by 2
                }
                //int d_hi[3] = {1, 1, 1};
                int d_lo[3] = {1, 1, 1};
                float voffs[3] = {0.0f, 0.0f, 0.0f};
                float dscales[3] = {0.5, 0.5, 0.5};

                for (int i1 = 0; i1 < 3; i1++) {
                    if (inds2[i1] == 0) {
                        zero_derivs = true;
                    }
                    else if (inds2[i1] == k[i1]-1) {
                        zero_derivs = true;
                    }
                    else {
                        voffs[i1] = 0.0;
                    }
                }

                // V
                b[i0] = getValue(inds2[0],inds2[1],inds2[2]);

                if (zero_derivs) {
                    b[8+i0] = 0.0;
                    b[16+i0] = 0.0;
                    b[24+i0] = 0.0;
                    b[32+i0] = 0.0;
                    b[40+i0] = 0.0;
                    b[48+i0] = 0.0;
                    b[56+i0] = 0.0;
                } else {
                    b[8+i0]  = dscales[0] * (getValue(inds2[0]+1,inds2[1],inds2[2]) - getValue(inds2[0]-d_lo[0],inds2[1],inds2[2]) + voffs[0]); //  dV/dx
                    b[16+i0] = dscales[1] * (getValue(inds2[0],inds2[1]+1,inds2[2]) - getValue(inds2[0],inds2[1]-d_lo[1],inds2[2]) + voffs[1]); //  dV/dy
                    b[24+i0] = dscales[2] * (getValue(inds2[0],inds2[1],inds2[2]+1) - getValue(inds2[0],inds2[1],inds2[2]-d_lo[2]) + voffs[2]); //  dV/dz
                    b[32+i0] = dscales[0] * dscales[1] *
                        (getValue(inds2[0]+1,inds2[1]+1,inds2[2]) - getValue(inds2[0]-d_lo[0],inds2[1]+1,inds2[2])
                       - getValue(inds2[0]+1,inds2[1]-d_lo[1],inds2[2]) + getValue(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]));      //  d2V/dxdy

                    b[40+i0] = dscales[0] * dscales[2] *
                              (getValue(inds2[0]+1,inds2[1],inds2[2]+1) - getValue(inds2[0]-d_lo[0],inds2[1],inds2[2]+1)
                             - getValue(inds2[0]+1,inds2[1],inds2[2]-d_lo[2]) + getValue(inds2[0]-d_lo[0],inds2[1],inds2[2]-d_lo[2]));      //  d2V/dxdz

                    b[48+i0] = dscales[1] * dscales[2] *
                               (getValue(inds2[0],inds2[1]+1,inds2[2]+1) - getValue(inds2[0],inds2[1]-d_lo[1],inds2[2]+1)
                              - getValue(inds2[0],inds2[1]+1,inds2[2]-d_lo[2]) + getValue(inds2[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]));      //  d2V/dydz

                    b[56+i0] = dscales[0] * dscales[1] * dscales[2] *                                    // d3V/dxdydz
                       (getValue(inds2[0]+1,inds2[1]+1,inds2[2]+1) - getValue(inds2[0]+1,inds2[1]+1,inds2[2]-d_lo[2]) -
                        getValue(inds2[0]+1,inds2[1]-d_lo[1],inds2[2]+1) - getValue(inds2[0]-d_lo[0],inds2[1]+1,inds2[2]+1) +
                        getValue(inds2[0]+1,inds2[1]-d_lo[1],inds2[2]-d_lo[2]) + getValue(inds2[0]-d_lo[0],inds2[1]+1,inds2[2]-d_lo[2]) +
                        getValue(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]+1) - getValue(inds2[0]-d_lo[0],inds2[1]-d_lo[1],inds2[2]-d_lo[2]));

                        }
                    }
                }