Something went wrong on our end
-
cmaffeo2 authored
Implemented "particleLists" for rigid body--particle interactions; made RigidBodyType class manage its own device data for particle interactions
cmaffeo2 authoredImplemented "particleLists" for rigid body--particle interactions; made RigidBodyType class manage its own device data for particle interactions
ComputeGridGrid.cu 4.66 KiB
// Included in RigidBodyController.cu
#include "ComputeGridGrid.cuh"
#include "RigidBodyGrid.h"
#include "CudaUtil.cuh"
//RBTODO handle periodic boundaries
//RBTODO: add __restrict__, benchmark (Q: how to restrict member data?)
__global__
void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
const Matrix3 basis_rho, const Matrix3 basis_u_inv,
const Vector3 origin_rho_minus_origin_u,
Vector3 * retForce, Vector3 * retTorque) {
extern __shared__ Vector3 s[];
Vector3 *force = s;
Vector3 *torque = &s[NUMTHREADS];
// RBTODO: http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops
const int tid = threadIdx.x;
const int r_id = blockIdx.x * blockDim.x + threadIdx.x;
force[tid] = Vector3(0.0f);
torque[tid] = Vector3(0.0f);
if (r_id < rho->getSize()) { // skip threads with nothing to do
// RBTODO: reduce registers used;
// commenting out interpolateForceD still uses ~40 registers
// -- the innocuous-looking fn below is responsible; consumes ~17 registers!
Vector3 r_pos= rho->getPosition(r_id); /* i,j,k value of voxel */
r_pos = basis_rho.transform( r_pos ) + origin_rho_minus_origin_u; /* real space */
const Vector3 u_ijk_float = basis_u_inv.transform( r_pos );
// RBTODO What about non-unit delta?
/* Vector3 tmpf = Vector3(0.0f); */
/* float tmpe = 0.0f; */
/* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */
const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
force[tid] = fe.f;
const float r_val = rho->val[r_id]; /* maybe move to beginning of function? */
force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */
// Calculate torque about origin_u in the lab frame
torque[tid] = r_pos.cross(force[tid]);
}
// Reduce force and torques
// http://www.cuvilib.com/Reduction.pdf
// RBTODO optimize further, perhaps
__syncthreads();
for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
if (tid < offset) {
int oid = tid + offset;
force[tid] = force[tid] + force[oid];
torque[tid] = torque[tid] + torque[oid];
}
__syncthreads();
}
if (tid == 0) {
retForce[blockIdx.x] = force[0];
retTorque[blockIdx.x] = torque[0];
}
}
__global__
void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForce,
const int num, const int* __restrict__ particleIds,
const RigidBodyGrid* __restrict__ u,
const Matrix3 basis_u_inv, const Vector3 origin_u,
Vector3* __restrict__ retForce, Vector3* __restrict__ retTorque) {
extern __shared__ Vector3 s[];
Vector3 *force = s;
Vector3 *torque = &s[NUMTHREADS];
const int tid = threadIdx.x;
const int i = blockIdx.x * blockDim.x + threadIdx.x;
force[tid] = Vector3(0.0f);
torque[tid] = Vector3(0.0f);
if (i < num) {
const int& id = particleIds[i];
Vector3 p = pos[id] - origin_u;
// TODO: wrap to center of u
const Vector3 u_ijk_float = basis_u_inv.transform( p );
const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
force[tid] = fe.f;
force[tid] = basis_u_inv.transpose().transform( force[tid] ); /* transform to lab frame */
atomicAdd( &particleForce[id], force[tid] ); // apply force to particle
// Calculate torque about origin_u in the lab frame
torque[tid] = p.cross(force[tid]); // RBTODO: test if sign is correct!
}
// Reduce force and torques
__syncthreads();
for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
if (tid < offset) {
int oid = tid + offset;
force[tid] = force[tid] + force[oid];
torque[tid] = torque[tid] + torque[oid];
}
__syncthreads();
}
if (tid == 0) {
retForce[blockIdx.x] = force[0];
retTorque[blockIdx.x] = torque[0];
}
}
__global__
void createPartlist(const Vector3* __restrict__ pos,
const int numTypeParticles, const int* __restrict__ typeParticles_d,
int* numParticles_d, int* particles_d,
const Vector3 gridCenter, const float radius2) {
const int tid = threadIdx.x;
const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
const int split = 32; /* numblocks should be divisible by split */
/* const int blocksPerCell = gridDim.x/split; */
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < numTypeParticles) {
int aid = typeParticles_d[i];
float dist = (pos[aid] - gridCenter).length2();
if (dist <= radius2) {
int tmp = atomicAggInc(numParticles_d, warpLane);
particles_d[tmp] = aid;
}
}
}
__global__
void printRigidBodyGrid(const RigidBodyGrid* rho) {
printf("Printing an RB of size %d\n",rho->size);
for (int i=0; i < rho->size; i++)
printf(" val[%d] = %f\n", i, rho->val[i]);
}