diff --git a/ComputeGridGrid.cu b/ComputeGridGrid.cu new file mode 100644 index 0000000000000000000000000000000000000000..e98c2f16389f15375bc32226b27d6933cc158336 --- /dev/null +++ b/ComputeGridGrid.cu @@ -0,0 +1,119 @@ +// Included in RigidBodyController.cu +#include "ComputeGridGrid.cuh" +#include "RigidBodyGrid.h" +#include "CudaUtil.cuh" + +//RBTODO handle periodic boundaries +//RBTODO: add __restrict__, benchmark (Q: how to restrict member data?) +__global__ +void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, + const Matrix3 basis_rho, const Matrix3 basis_u_inv, + const Vector3 origin_rho_minus_origin_u, + Vector3 * retForce, Vector3 * retTorque) { + + extern __shared__ Vector3 s[]; + Vector3 *force = s; + Vector3 *torque = &s[NUMTHREADS]; + + // RBTODO: http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops + const int tid = threadIdx.x; + const int r_id = blockIdx.x * blockDim.x + threadIdx.x; + + force[tid] = Vector3(0.0f); + torque[tid] = Vector3(0.0f); + if (r_id < rho->getSize()) { // skip threads with nothing to do + // RBTODO: reduce registers used; + // commenting out interpolateForceD still uses ~40 registers + // -- the innocuous-looking fn below is responsible; consumes ~17 registers! + Vector3 r_pos= rho->getPosition(r_id); /* i,j,k value of voxel */ + + r_pos = basis_rho.transform( r_pos ) + origin_rho_minus_origin_u; /* real space */ + const Vector3 u_ijk_float = basis_u_inv.transform( r_pos ); + + // RBTODO What about non-unit delta? + /* Vector3 tmpf = Vector3(0.0f); */ + /* float tmpe = 0.0f; */ + /* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */ + const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */ + force[tid] = fe.f; + + const float r_val = rho->val[r_id]; /* maybe move to beginning of function? */ + force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */ + + // Calculate torque about origin_u in the lab frame + torque[tid] = r_pos.cross(force[tid]); // RBTODO: test if sign is correct! + } + + // Reduce force and torques + // http://www.cuvilib.com/Reduction.pdf + // RBTODO optimize further, perhaps + __syncthreads(); + for (int offset = blockDim.x/2; offset > 0; offset >>= 1) { + if (tid < offset) { + int oid = tid + offset; + force[tid] = force[tid] + force[oid]; + torque[tid] = torque[tid] + torque[oid]; + } + __syncthreads(); + } + + if (tid == 0) { + retForce[blockIdx.x] = force[0]; + retTorque[blockIdx.x] = torque[0]; + } +} + +__global__ +void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForce, + const int num, const int* __restrict__ particleIds, + const RigidBodyGrid* __restrict__ u, + const Matrix3 basis_u_inv, const Vector3 origin_u, + Vector3* __restrict__ retForce, Vector3* __restrict__ retTorque) { + + extern __shared__ Vector3 s[]; + Vector3 *force = s; + Vector3 *torque = &s[NUMTHREADS]; + + const int tid = threadIdx.x; + const int i = blockIdx.x * blockDim.x + threadIdx.x; + + force[tid] = Vector3(0.0f); + torque[tid] = Vector3(0.0f); + if (i < num) { + const int& id = particleIds[i]; + Vector3 p = pos[id] - origin_u; + // TODO: wrap to center of u + const Vector3 u_ijk_float = basis_u_inv.transform( p ); + const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */ + force[tid] = fe.f; + force[tid] = basis_u_inv.transpose().transform( force[tid] ); /* transform to lab frame */ + atomicAdd( &particleForce[id], force[tid] ); // apply force to particle + + // Calculate torque about origin_u in the lab frame + torque[tid] = p.cross(force[tid]); // RBTODO: test if sign is correct! + } + + // Reduce force and torques + __syncthreads(); + for (int offset = blockDim.x/2; offset > 0; offset >>= 1) { + if (tid < offset) { + int oid = tid + offset; + force[tid] = force[tid] + force[oid]; + torque[tid] = torque[tid] + torque[oid]; + } + __syncthreads(); + } + + if (tid == 0) { + retForce[blockIdx.x] = force[0]; + retTorque[blockIdx.x] = torque[0]; + } +} + + +__global__ +void printRigidBodyGrid(const RigidBodyGrid* rho) { + printf("Printing an RB of size %d\n",rho->size); + for (int i=0; i < rho->size; i++) + printf(" val[%d] = %f\n", i, rho->val[i]); +} diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh index 066680dff4697656c13e89ab0825500a94b03dfd..20067121dc23b853cdf362ac0a650445062e0bd3 100644 --- a/ComputeGridGrid.cuh +++ b/ComputeGridGrid.cuh @@ -1,68 +1,22 @@ // Included in RigidBodyController.cu #pragma once +#include "useful.h" +#define NUMTHREADS 96 -//RBTODO: add __restrict__, benchmark (Q: how to restrict member data?) -__global__ -void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, - const Matrix3 basis_rho, const Matrix3 basis_u_inv, - const Vector3 origin_rho_minus_origin_u, - Vector3 * retForce, Vector3 * retTorque) { - - extern __shared__ Vector3 s[]; - Vector3 *force = s; - Vector3 *torque = &s[NUMTHREADS]; - - // RBTODO: http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops - const int tid = threadIdx.x; - const int r_id = blockIdx.x * blockDim.x + threadIdx.x; - - force[tid] = Vector3(0.0f); - torque[tid] = Vector3(0.0f); - if (r_id < rho->getSize()) { // skip threads with nothing to do - // RBTODO: reduce registers used; - // commenting out interpolateForceD still uses ~40 registers - // -- the innocuous-looking fn below is responsible; consumes ~17 registers! - Vector3 r_pos= rho->getPosition(r_id); /* i,j,k value of voxel */ - - r_pos = basis_rho.transform( r_pos ) + origin_rho_minus_origin_u; /* real space */ - const Vector3 u_ijk_float = basis_u_inv.transform( r_pos ); - - // RBTODO What about non-unit delta? - /* Vector3 tmpf = Vector3(0.0f); */ - /* float tmpe = 0.0f; */ - /* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */ - const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */ - force[tid] = fe.f; +class RigidBodyGrid; - const float r_val = rho->val[r_id]; /* maybe move to beginning of function? */ - force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */ - - // Calculate torque about origin_u in the lab frame - torque[tid] = r_pos.cross(force[tid]); // RBTODO: test if sign is correct! - } - - // Reduce force and torques - // http://www.cuvilib.com/Reduction.pdf - // RBTODO optimize further, perhaps - __syncthreads(); - for (int offset = blockDim.x/2; offset > 0; offset >>= 1) { - if (tid < offset) { - int oid = tid + offset; - force[tid] = force[tid] + force[oid]; - torque[tid] = torque[tid] + torque[oid]; - } - __syncthreads(); - } - - if (tid == 0) { - retForce[blockIdx.x] = force[0]; - retTorque[blockIdx.x] = torque[0]; - } -} - -__global__ -void printRigidBodyGrid(const RigidBodyGrid* rho) { - printf("Printing an RB of size %d\n",rho->size); - for (int i=0; i < rho->size; i++) - printf(" val[%d] = %f\n", i, rho->val[i]); -} +extern __global__ +void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, + const Matrix3 basis_rho, const Matrix3 basis_u_inv, + const Vector3 origin_rho_minus_origin_u, + Vector3 * retForce, Vector3 * retTorque); + +extern __global__ +void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForce, + const int num, const int* __restrict__ particleIds, + const RigidBodyGrid* __restrict__ u, + const Matrix3 basis_u_inv, const Vector3 origin_u, + Vector3* __restrict__ retForce, Vector3* __restrict__ retTorque); + +extern __global__ +void printRigidBodyGrid(const RigidBodyGrid* rho); diff --git a/GrandBrownTown.cuh b/GrandBrownTown.cuh index da0ffd97221b237b38bc0f1c33566c8fbb3e85c1..736970e4876bb08c3bacee56131c94d09701203a 100644 --- a/GrandBrownTown.cuh +++ b/GrandBrownTown.cuh @@ -4,6 +4,8 @@ #pragma once #include "CudaUtil.cuh" +#include "RigidBodyType.h" +#include "RigidBodyGrid.h" __device__ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion, @@ -32,7 +34,7 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, // TODO: Make this a grid-stride loop to make efficient reuse of RNG states - // Loop over ALL particles in ALL replicas + // Loop over ALL particles in ALL replicas if (idx < num * numReplicas) { const int t = type[idx]; Vector3& p = pos[idx]; diff --git a/RigidBody.h b/RigidBody.h index b6df0612475857b783c89f93c6a8f05b2671e1c5..59703c5f2e0f7094e787fe7bc79355dbdfe7e5d9 100644 --- a/RigidBody.h +++ b/RigidBody.h @@ -15,7 +15,7 @@ #define DEVICE #endif -class Configuration; /* forward decleration */ +class Configuration; typedef float BigReal; /* strip this out later */ typedef Vector3 Force; diff --git a/RigidBodyController.cu b/RigidBodyController.cu index 9db8d2d7261d19d5b9a45a868068e662c65208bf..e3744a2db8583ac62e8b59064db37f5ae3df110e 100644 --- a/RigidBodyController.cu +++ b/RigidBodyController.cu @@ -8,6 +8,7 @@ #include "RigidBodyController.h" #include "Configuration.h" #include "RigidBodyType.h" +#include "RigidBodyGrid.h" #include "ComputeGridGrid.cuh" #include <cuda_profiler_api.h> diff --git a/RigidBodyController.h b/RigidBodyController.h index ec31395e1021444b1fa923075adce64000cd8720..f04251f606ea6350abeffa54f2c10a9c2d4898cc 100644 --- a/RigidBodyController.h +++ b/RigidBodyController.h @@ -1,14 +1,20 @@ +#pragma once + #include <vector> #include <fstream> -#include "RigidBody.h" /* #include "RandomCUDA.h" */ #include <cuda.h> #include <cuda_runtime.h> +#include "useful.h" // #define NUMTHREADS 128 /* try with 64, every 32+ */ #define NUMTHREADS 96 #define NUMSTREAMS 8 +// #include "RigidBody.h" + +class RigidBodyType; +class RigidBody; class Configuration; class RandomCPU; diff --git a/RigidBodyType.cu b/RigidBodyType.cu index 2c3965adc8bb3c43de9a3fb70ed0240ab0307ba8..b8ea109855c6dc2cdfc80ddf98b4287acbcb232b 100644 --- a/RigidBodyType.cu +++ b/RigidBodyType.cu @@ -1,4 +1,7 @@ #include "RigidBodyType.h" +#include "Reservoir.h" +#include "BaseGrid.h" +#include "RigidBodyGrid.h" void RigidBodyType::clear() { num = 0; // RBTODO: not 100% sure about this diff --git a/RigidBodyType.h b/RigidBodyType.h index 4b793234afd950cb016dbd09da80d2451dda2551..44a813ded4ee4f6112a91f898b7ef88658a4eef7 100644 --- a/RigidBodyType.h +++ b/RigidBodyType.h @@ -5,13 +5,17 @@ #include <vector> /* #include <thrust/host_vector.h> */ /* #include <thrust/device_vector.h> */ -#include "Reservoir.h" -#include "BaseGrid.h" -#include "RigidBodyGrid.h" +/* #include "Reservoir.h" */ +/* #include "BaseGrid.h" */ +/* #include "RigidBodyGrid.h" */ #include "useful.h" #include <cstdio> +class Reservoir; +class BaseGrid; +class RigidBodyGrid; + class RigidBodyType { private: // Deletes all members