diff --git a/RigidBodyController.h b/RigidBodyController.h index 90a76b54f2e2ec395404aa6794d64b1f361f0243..41cbcbf524e06bc6e6234dce35a01b6956aaf88d 100644 --- a/RigidBodyController.h +++ b/RigidBodyController.h @@ -5,8 +5,8 @@ #include <cuda.h> #include <cuda_runtime.h> -// #define NUMTHREADS 256 /* try with 64, every 32+ */ -#define NUMTHREADS 64 +#define NUMTHREADS 128 /* try with 64, every 32+ */ +// #define NUMTHREADS 64 #define NUMSTREAMS 8 class Configuration; diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu index 3721144099a629fcef0210ed3d2ac6b22f1cea17..2a793feb790b8e2876510e5dec292bc3f1801060 100644 --- a/RigidBodyGrid.cu +++ b/RigidBodyGrid.cu @@ -370,18 +370,20 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const int homeX = int(floor(l.x)); const int homeY = int(floor(l.y)); const int homeZ = int(floor(l.z)); - + const float wx = l.x - homeX; + const float wy = l.y - homeY; + const float wz = l.z - homeZ; { /* f.x */ float g3[4]; // #pragma unroll {1,4} doesn't seem to matter - for (int iz = 0; iz < 4; iz++) { + for (int iz = -1; iz < 3; iz++) { float g2[4]; - for (int iy = 0; iy < 4; iy++) { + for (int iy = -1; iy < 3; iy++) { float g1[4]; - for (volatile int ix = 0; ix < 4; ix++) { /* RBTODO: unsigned/short? Best practices guide says no */ - volatile int jx = (ix-1 + homeX); - volatile int jy = (iy-1 + homeY); - volatile int jz = (iz-1 + homeZ); + for (volatile int ix = -1; ix < 3; ix++) { /* RBTODO: unsigned/short? Best practices guide says no */ + volatile int jx = (ix + homeX); + volatile int jy = (iy + homeY); + volatile int jz = (iz + homeZ); const int ind = jz + jy*nz + jx*nz*ny; g1[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ? 0 : val[ind]; @@ -391,7 +393,6 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float a2 = 0.5f*(2.0f*g1[0] - 5.0f*g1[1] + 4.0f*g1[2] - g1[3]); const float a1 = 0.5f*(-g1[0] + g1[2]); /* const float a0 = g1[1]; */ - const float wx = l.x - homeX; // g2[iy] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; g2[iy] = 3.0f*a3*wx*wx + 2.0f*a2*wx + a1; /* derivative */ } @@ -401,7 +402,6 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float a2 = 0.5f*(2.0f*g2[0] - 5.0f*g2[1] + 4.0f*g2[2] - g2[3]); const float a1 = 0.5f*(-g2[0] + g2[2]); const float a0 = g2[1]; - const float wy = l.y - homeY; g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; } @@ -410,21 +410,20 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]); const float a1 = 0.5f*(-g3[0] + g3[2]); const float a0 = g3[1]; - const float wz = l.z - homeZ; f.x = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0); - } + } { /* f.y */ float g3[4]; - for (short iz = 0; iz < 4; iz++) { + for (int iz = -1; iz < 3; iz++) { float g2[4]; - for (short iy = 0; iy < 4; iy++) { + for (int iy = -1; iy < 3; iy++) { // Fetch values from nearby float g1[4]; - for (volatile unsigned short ix = 0; ix < 4; ix++) { - volatile short jx = (ix-1 + homeX); - volatile short jy = (iy-1 + homeY); - volatile short jz = (iz-1 + homeZ); + for (volatile int ix = -1; ix < 3; ix++) { + volatile int jx = (ix + homeX); + volatile int jy = (iy + homeY); + volatile int jz = (iz + homeZ); const int ind = jz + jy*nz + jx*nz*ny; g1[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ? 0 : val[ind]; @@ -434,7 +433,6 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float a2 = 0.5f*(2.0f*g1[0] - 5.0f*g1[1] + 4.0f*g1[2] - g1[3]); const float a1 = 0.5f*(-g1[0] + g1[2]); const float a0 = g1[1]; - const float wx = l.x - homeX; g2[iy] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; } @@ -443,7 +441,6 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float a2 = 0.5f*(2.0f*g2[0] - 5.0f*g2[1] + 4.0f*g2[2] - g2[3]); const float a1 = 0.5f*(-g2[0] + g2[2]); /* const float a0 = g2[1]; */ - const float wy = l.y - homeY; // g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; g3[iz] = 3.0f*a3*wy*wy + 2.0f*a2*wy + a1; /* derivative */ @@ -453,21 +450,19 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]); const float a1 = 0.5f*(-g3[0] + g3[2]); const float a0 = g3[1]; - const float wz = l.z - homeZ; - f.y = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0); } { /* f.z */ float g3[4]; - for (short iz = 0; iz < 4; iz++) { + for (int iz = -1; iz < 3; iz++) { float g2[4]; - for (short iy = 0; iy < 4; iy++) { + for (int iy = -1; iy < 3; iy++) { // Fetch values from nearby float g1[4]; - for (volatile unsigned short ix = 0; ix < 4; ix++) { - volatile short jx = (ix-1 + homeX); - volatile short jy = (iy-1 + homeY); - volatile short jz = (iz-1 + homeZ); + for (volatile int ix = -1; ix < 3; ix++) { + volatile int jx = (ix + homeX); + volatile int jy = (iy + homeY); + volatile int jz = (iz + homeZ); const int ind = jz + jy*nz + jx*nz*ny; g1[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ? 0 : val[ind]; @@ -477,7 +472,6 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float a2 = 0.5f*(2.0f*g1[0] - 5.0f*g1[1] + 4.0f*g1[2] - g1[3]); const float a1 = 0.5f*(-g1[0] + g1[2]); const float a0 = g1[1]; - const float wx = l.x - homeX; g2[iy] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; } @@ -486,17 +480,13 @@ DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { const float a2 = 0.5f*(2.0f*g2[0] - 5.0f*g2[1] + 4.0f*g2[2] - g2[3]); const float a1 = 0.5f*(-g2[0] + g2[2]); const float a0 = g2[1]; - const float wy = l.y - homeY; - g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; } // Mix along z. const float a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]); const float a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]); const float a1 = 0.5f*(-g3[0] + g3[2]); - /* const float a0 = g3[1]; */ - const float wz = l.z - homeZ; - + /* const float a0 = g3[1]; */ f.z = -(3.0f*a3*wz*wz + 2.0f*a2*wz + a1); /* derivative */ } return f; diff --git a/makefile b/makefile index d7c89a1d3794ca395130588ca7265917b820d012..a7de250d1b0280edfcadf243c2f76c0bb5bbaf75 100644 --- a/makefile +++ b/makefile @@ -14,7 +14,8 @@ INCLUDE = $(CUDA_PATH)/include # DEBUG = -g -O0 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic # NV_FLAGS = -g -G #debug -NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no +# NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no +NV_FLAGS = -Xptxas -v # -v,-abi=no EX_FLAGS = -O3 -m$(OS_SIZE) # EX_FLAGS = -O0 -m$(OS_SIZE)