From ee3383f62c02c34e9de38c3c1ac7fdbe5f2ee48c Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Tue, 27 Sep 2016 22:08:51 +0000
Subject: [PATCH] fixed *char to const *char warnings; created list-based
 kernel for angles and dihedrals (untested)

---
 CellDecomposition.cu |   2 +-
 ComputeForce.cu      |  63 ++++++++++++++-------
 ComputeForce.cuh     |  67 ++++++++++++++++++++--
 ComputeForce.h       |   8 ++-
 Configuration.cpp    |   2 +-
 CudaUtil.cuh         |   2 +-
 GrandBrownTown.cu    |  28 +++++++---
 GrandBrownTown.h     |   3 +
 TabulatedAngle.h     |   7 ++-
 TabulatedDihedral.h  |   5 +-
 TabulatedMethods.cuh | 128 +++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 271 insertions(+), 44 deletions(-)
 create mode 100644 TabulatedMethods.cuh

diff --git a/CellDecomposition.cu b/CellDecomposition.cu
index 07a7b02..a17133f 100644
--- a/CellDecomposition.cu
+++ b/CellDecomposition.cu
@@ -7,7 +7,7 @@
 // *****************************************************************************
 // Error Check
 
-inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) {
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 	if (code != cudaSuccess) {
 		fprintf(stderr,"CUDA Error: %s %s %d\n",
 						cudaGetErrorString(code), file, line);
diff --git a/ComputeForce.cu b/ComputeForce.cu
index aee09cd..68224de 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -8,7 +8,7 @@
 
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) {
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
       fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
       if (abort) exit(code);
@@ -596,19 +596,22 @@ float ComputeForce::computeTabulated(bool get_energy) {
 	/* printPairForceCounter<<<1,32>>>(); */
 	/* gpuErrchk(cudaDeviceSynchronize()); */
 
-	computeAngles<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, angles_d,
-			tableAngle_d, numAngles, num, sys_d, energies_d, get_energy);
-
 	//Mlog: the commented function doesn't use bondList, uncomment for testing.
 	//if(bondMap_d != NULL && tableBond_d != NULL)
 	if(bondList_d != NULL && tableBond_d != NULL)
 
 	{
-		//computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bonds, bondMap_d, numBonds, numReplicas, energies_d, get_energy, tableBond_d);
-		computeTabulatedBonds <<<numBlocks, numThreads>>> ( forceInternal_d, pos_d, num, numParts, sys_d, bondList_d, (numBonds/2), numReplicas, energies_d, get_energy, tableBond_d);
+	    //computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bonds, bondMap_d, numBonds, numReplicas, energies_d, get_energy, tableBond_d);
+	    computeTabulatedBonds <<<numBlocks, numThreads>>> ( forceInternal_d, pos_d, num, numParts, sys_d, bondList_d, (numBonds/2), numReplicas, energies_d, get_energy, tableBond_d);
 	}
-	computeDihedrals<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, dihedrals_d,
-			tableDihedral_d, numDihedrals, num, sys_d, energies_d, get_energy);
+	if (angleList_d != NULL && tableAngle_d != NULL)
+	    computeTabulatedAngles<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, sys_d, numReplicas,
+							      numAngles, angleList_d, tableAngle_d);
+	
+	if (dihedralList_d != NULL && tableDihedral_d != NULL)
+	    computeTabulatedDihedrals<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, sys_d, numReplicas,
+								 numDihedrals, dihedralList_d, dihedralPotList_d, tableDihedral_d);
+	
 
 	// Calculate the energy based on the array created by the kernel
 	if (get_energy) {
@@ -709,17 +712,35 @@ void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap,
 	gpuErrchk(cudaDeviceSynchronize());
 }
 
-void ComputeForce::createBondList(int3 *bondList)
-{
-	size_t size = (numBonds / 2) * numReplicas * sizeof(int3);
-	gpuErrchk( cudaMalloc( &bondList_d, size ) );
-	gpuErrchk( cudaMemcpyAsync( bondList_d, bondList, size, cudaMemcpyHostToDevice) );
-
-	for(int i = 0 ; i < (numBonds / 2) * numReplicas ; i++)
-	{
-		cout << "Displaying: bondList_d["<< i <<"].x = " << bondList[i].x << ".\n"
-			<< "Displaying: bondList_d["<< i <<"].y = " << bondList[i].y << ".\n"
-			<< "Displaying: bondList_d["<< i <<"].z = " << bondList[i].z << ".\n";
-
-	}
+// void ComputeForce::createBondList(int3 *bondList)
+// {
+// 	size_t size = (numBonds / 2) * numReplicas * sizeof(int3);
+// 	gpuErrchk( cudaMalloc( &bondList_d, size ) );
+// 	gpuErrchk( cudaMemcpyAsync( bondList_d, bondList, size, cudaMemcpyHostToDevice) );
+
+// 	for(int i = 0 ; i < (numBonds / 2) * numReplicas ; i++)
+// 	{
+// 		cout << "Displaying: bondList_d["<< i <<"].x = " << bondList[i].x << ".\n"
+// 			<< "Displaying: bondList_d["<< i <<"].y = " << bondList[i].y << ".\n"
+// 			<< "Displaying: bondList_d["<< i <<"].z = " << bondList[i].z << ".\n";
+
+// 	}
+// }
+
+void ComputeForce::copyBondedListsToGPU(int3 *bondList, int4 *angleList, int4 *dihedralList, int *dihedralPotList) {
+    size_t size = (numBonds / 2) * numReplicas * sizeof(int3);
+    gpuErrchk( cudaMalloc( &bondList_d, size ) );
+    gpuErrchk( cudaMemcpyAsync( bondList_d, bondList, size, cudaMemcpyHostToDevice) );
+
+    size = numAngles * numReplicas * sizeof(int4);
+    gpuErrchk( cudaMalloc( &angleList_d, size ) );
+    gpuErrchk( cudaMemcpyAsync( angleList_d, angleList, size, cudaMemcpyHostToDevice) );
+    
+    size = numDihedrals * numReplicas * sizeof(int4);
+    gpuErrchk( cudaMalloc( &dihedralList_d, size ) );
+    gpuErrchk( cudaMemcpyAsync( dihedralList_d, dihedralList, size, cudaMemcpyHostToDevice) );
+
+    size = numDihedrals * numReplicas * sizeof(int);
+    gpuErrchk( cudaMalloc( &dihedralPotList_d, size ) );
+    gpuErrchk( cudaMemcpyAsync( dihedralPotList_d, dihedralPotList, size, cudaMemcpyHostToDevice) );
 }
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index cfb2b96..37b6264 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -3,8 +3,9 @@
 // Terrance Howard <heyterrance@gmail.com>
 
 #pragma once
-#include "CudaUtil.cuh"
 #include <assert.h>
+#include "CudaUtil.cuh"
+#include "TabulatedMethods.cuh"
 
 #define BD_PI 3.1415927f
 
@@ -228,7 +229,7 @@ void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas,
 	const int cID = bid / blocksPerCell; /* cellID */
 
 	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
-	const int wid = tid/WARPSIZE;
+	// const int wid = tid/WARPSIZE;
 	const int blockLane = bid % blocksPerCell;
 
 	if (cID >= nCells) return;
@@ -444,7 +445,6 @@ __global__ void computeTabulatedKernel(
 		const int2 pair = g_pair[i];
 		const int ai = pair.x;
 		const int aj = pair.y;
-		// BONDS: RBTODO: handle through an independent kernel call
 			
 		// Particle's type and position
 		const int ind = g_pairTabPotType[i];
@@ -619,7 +619,7 @@ void computeAngles(Vector3 force[], Vector3 pos[],
 			Angle& a = angles[i];
 			const int ind = a.getIndex(idx);
 			if (ind >= 0) {
-				EnergyForce ef = tableAngle[a.tabFileIndex]->compute(&a, pos, sys, ind);
+				EnergyForce ef = tableAngle[a.tabFileIndex]->computeOLD(&a, pos, sys, ind);
 				force_local += ef.f;
 				if (ind == 1 and get_energy)
 					energy_local += ef.e;
@@ -733,6 +733,30 @@ __global__ void computeTabulatedBonds( Vector3* force,		Vector3* pos,					int nu
 	}
 }
 
+// TODO: add kernel for energy calculation 
+__global__
+void computeTabulatedAngles(Vector3* __restrict__ force, Vector3* __restrict__ pos,
+			    BaseGrid* __restrict__ sys, int numReplicas,
+			    int numAngles, int4* angleList_d, TabulatedAnglePotential** tableAngle) {
+				
+    int currAngle = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
+
+    // Loop over ALL angles in ALL replicas
+    if( currAngle < (numAngles * numReplicas) ) {
+
+	int4& ids = angleList_d[currAngle];
+	computeAngle(tableAngle[ ids.w ], sys, force, pos, ids.x, ids.y, ids.z);
+
+	// if (get_energy)
+	// {
+	//     //TODO: clarification on energy computation needed, consider changing.
+	//     atomicAdd( &g_energies[i], energy_local);
+	//     //atomicAdd( &g_energies[j], energy_local);
+	// }
+    }
+}
+
+
 __global__
 void computeDihedrals(Vector3 force[], Vector3 pos[],
 											Dihedral dihedrals[],
@@ -740,7 +764,7 @@ void computeDihedrals(Vector3 force[], Vector3 pos[],
 											int numDihedrals, int num, BaseGrid* sys, float g_energies[],
 											bool get_energy) {
 	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	float energy_local = 0.0f;
+	// float energy_local = 0.0f;
 	Vector3 force_local(0.0f);
 
 	if (i < numDihedrals) {
@@ -813,3 +837,36 @@ void computeDihedrals(Vector3 force[], Vector3 pos[],
 		}
 	}
 }
+
+
+    // void computeTabulatedDihedrals(Vector3* __restrict__ force, Vector3* __restrict__ pos, int num,
+    // 			    int numParts, BaseGrid* __restrict__ sys, int4* __restrict__ dihedralList_d,
+    // 			    int* __restrict__ dihedralPotList_d,
+    // 			    int numDihedrals, int numReplicas, float* __restrict g_energies,
+    // 			    bool get_energy, TabulatedDihedralPotential** __restrict__ tableDihedral) {
+
+__global__
+void computeTabulatedDihedrals(Vector3* __restrict__ force, const Vector3* __restrict__ pos,
+			       const BaseGrid* __restrict__ sys, int numReplicas,
+			       int numDihedrals, const int4* __restrict__ dihedralList_d,
+			       const int* __restrict__ dihedralPotList_d, TabulatedDihedralPotential** tableDihedral) {
+
+    int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
+
+    // Loop over ALL dihedrals in ALL replicas
+    // TODO make grid stride loop
+    if( currDihedral < (numDihedrals * numReplicas) ) {
+	const int4& ids = dihedralList_d[currDihedral];
+	const int& id = dihedralPotList_d[currDihedral];
+
+	computeDihedral(tableDihedral[ id ], sys, force, pos, ids.x, ids.y, ids.z, ids.w);
+
+	// if (get_energy)
+	// {
+	//     //TODO: clarification on energy computation needed, consider changing.
+	//     atomicAdd( &g_energies[i], energy_local);
+	//     //atomicAdd( &g_energies[j], energy_local);
+	// }
+    }
+}
+
diff --git a/ComputeForce.h b/ComputeForce.h
index 22fd9ea..66d6e75 100644
--- a/ComputeForce.h
+++ b/ComputeForce.h
@@ -74,8 +74,9 @@ public:
 	void copyToCUDA(Vector3* forceInternal, Vector3* pos);
 	void copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals);
 	
-	void createBondList(int3 *bondList);
-
+	// void createBondList(int3 *bondList);
+	void copyBondedListsToGPU(int3 *bondList, int4 *angleList, int4 *dihedralList, int *dihedralPotList);
+	    
 	//MLog: because of the move of a lot of private variables, some functions get starved necessary memory access to these variables, below is a list of functions that return the specified private variable.
 	Vector3* getPos_d()
 	{
@@ -190,6 +191,9 @@ private:
 	Angle* angles_d;
 	Dihedral* dihedrals_d;
 	int3* bondList_d;
+	int4* angleList_d;
+	int4* dihedralList_d;
+	int* dihedralPotList_d;
 };
 
 #endif
diff --git a/Configuration.cpp b/Configuration.cpp
index 9332a16..ce253af 100644
--- a/Configuration.cpp
+++ b/Configuration.cpp
@@ -2,7 +2,7 @@
 
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) {
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
       fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), __FILE__, line);
       if (abort) exit(code);
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index 1daf958..49e0f83 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -86,7 +86,7 @@ __device__ inline void inclIntCumSum(int* in, const int n) {
 	__syncthreads();
 }
 
-__device__ void atomicAdd( Vector3* address, Vector3 val) {
+__device__ inline void atomicAdd(Vector3* address, Vector3 val) {
 	atomicAdd( &(address->x), val.x);
 	atomicAdd( &(address->y), val.y);
 	atomicAdd( &(address->z), val.z);
diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index b459fed..ca30125 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -3,7 +3,7 @@
 /* #include "ComputeGridGrid.cuh" */
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) {
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 	if (code != cudaSuccess) {
 		fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
 	if (abort) exit(code);
@@ -243,16 +243,28 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 			}
 		}
 	}
+	// internal->createBondList(bondList);
 
-	internal -> createBondList(bondList);
-
-	// TODO: copy data to device (not here)
-	// TODO: use bondList in computeTabulatedBonds kernel
-
+	angleList = new int4[ (numAngles) * numReplicas ];
+	for(int k = 0 ; k < numReplicas; k++) {
+	    for(int i = 0; i < numAngles; ++i) {
+		angleList[i] = make_int4( angles[i].ind1+k*num, angles[i].ind2+k*num, angles[i].ind3+k*num, angles[i].tabFileIndex );
+	    }
+	}
+	
+	dihedralList = new int4[ (numDihedrals) * numReplicas ];
+	dihedralPotList = new  int[ (numDihedrals) * numReplicas ];
+	for(int k = 0 ; k < numReplicas; k++) {
+	    for(int i = 0; i < numDihedrals; ++i) {
+		dihedralList[i] = make_int4( dihedrals[i].ind1+k*num, dihedrals[i].ind2+k*num, dihedrals[i].ind3+k*num, dihedrals[i].ind4+k*num);
+		dihedralPotList[i] = dihedrals[i].tabFileIndex;
+	    }
+	}
+	internal->copyBondedListsToGPU(bondList,angleList,dihedralList,dihedralPotList);
+	
 	forceInternal = new Vector3[num * numReplicas];
-
 	if (fullLongRange != 0)
-		printf("No cell decomposition created.\n");
+	    printf("No cell decomposition created.\n");
 
 	// Prepare the trajectory output writer.
 	for (int repID = 0; repID < numReplicas; ++repID) {
diff --git a/GrandBrownTown.h b/GrandBrownTown.h
index 57bd2c1..44a783e 100644
--- a/GrandBrownTown.h
+++ b/GrandBrownTown.h
@@ -241,10 +241,13 @@ private:
 	Angle* angles;
 	String* angleTableFile;
 	int numTabAngleFiles;
+	int4 *angleList;
 
 	Dihedral* dihedrals;
 	String* dihedralTableFile;
 	int numTabDihedralFiles;
+	int4 *dihedralList;
+	int  *dihedralPotList;
 
 	void updateNameList();
 
diff --git a/TabulatedAngle.h b/TabulatedAngle.h
index 777832b..0b53e65 100644
--- a/TabulatedAngle.h
+++ b/TabulatedAngle.h
@@ -7,7 +7,9 @@
 #include "Angle.h"
 #include "TabulatedPotential.h"
 #include "BaseGrid.h"
-#include <cuda.h>
+
+__device__ void atomicAdd( Vector3* address, Vector3 val);
+
 
 class TabulatedAnglePotential
 {
@@ -20,7 +22,8 @@ public:
 	float angle_step;	// 'step' angle in potential file. potential file might not go 1, 2, 3,...,360, it could be in steps of .5 or something smaller 
 	int size;			// The number of data points in the file
 	String fileName;
-	HOST DEVICE inline EnergyForce compute(Angle* a, Vector3* pos, BaseGrid* sys, int index) {
+
+	HOST DEVICE inline EnergyForce computeOLD(Angle* a, Vector3* pos, BaseGrid* sys, int index) {
 		// First, we must find the actual angle we're working with. 
 		// Grab the positions of each particle in the angle
 		const Vector3 posa = pos[a->ind1];
diff --git a/TabulatedDihedral.h b/TabulatedDihedral.h
index 84518c6..1a8e9ee 100644
--- a/TabulatedDihedral.h
+++ b/TabulatedDihedral.h
@@ -4,9 +4,8 @@
 #ifndef TABULATEDDIHEDRAL_H
 #define TABULATEDDIHEDRAL_H
 
-#include <cuda.h>
-
 #include "useful.h"
+
 #include "Dihedral.h"
 #include "TabulatedPotential.h"
 #include "BaseGrid.h"
@@ -29,7 +28,7 @@ public:
 	String fileName;
 
 	// RBTODO: deprecate
-	HOST DEVICE inline EnergyForce compute(Dihedral* d, Vector3* pos, BaseGrid* sys, int index) { 
+	HOST DEVICE inline EnergyForce computeOLD(Dihedral* d, Vector3* pos, BaseGrid* sys, int index) { 
 		const Vector3 posa = d->ind1;
 		const Vector3 posb = d->ind2;
 		const Vector3 posc = d->ind3;
diff --git a/TabulatedMethods.cuh b/TabulatedMethods.cuh
new file mode 100644
index 0000000..8986e1c
--- /dev/null
+++ b/TabulatedMethods.cuh
@@ -0,0 +1,128 @@
+#pragma once
+
+#define BD_PI 3.1415927f
+
+__device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* __restrict__ force, const Vector3* __restrict__ pos,
+				const int& i, const int& j, const int& k) {
+	    
+	    
+	// Particle's type and position
+	Vector3 posa = pos[i];
+	Vector3 posb = pos[j];
+	Vector3 posc = pos[k];
+		
+	// The vectors between each pair of particles
+	const Vector3 ab = sys->wrapDiff(posa - posb);
+	const Vector3 bc = sys->wrapDiff(posb - posc);
+	const Vector3 ac = sys->wrapDiff(posc - posa);
+  
+	// Find the distance between each pair of particles
+	const float distab = ab.length();
+	const float distbc = bc.length();
+	const float distac = ac.length();
+  
+	// Find the cosine of the angle we want - <ABC	
+	float cos = (distbc * distbc + distab * distab - distac * distac) / (2.0f * distbc * distab);
+  
+	// If the cosine is illegitimate, set it to 1 or -1 so that acos won't fail
+	if (cos < -1.0f) cos = -1.0f;
+	if (cos > 1.0f) cos = 1.0f;
+
+	// Find the sine while we're at it.
+	float sin = sqrtf(1.0f - cos*cos);
+
+	// Now we can use the cosine to find the actual angle (in radians)		
+	float angle = acos(cos);
+
+	// transform angle to units of tabulated array index
+	angle /= a->angle_step;
+
+	// tableAngle[0] stores the potential at angle_step
+	// tableAngle[1] stores the potential at angle_step * 2, etc.
+	// 'home' is the index after which 'convertedAngle' would appear if it were stored in the table	
+	int home = int(floor(angle));
+	assert(home >= 0);
+	assert(home < a->size);
+
+	// Make angle the distance from [0,1) from the first index in the potential array index
+	angle -= home;
+		
+	// Linearly interpolate the potential	
+	float U0 = a->pot[home];
+	float dUdx = (a->pot[(home+1)] - U0) / a->angle_step;
+	// float energy = (dUdx * angle) + U0;
+	dUdx /= sqrtf(1.0f-cos*cos);
+
+	// Calculate the forces
+	Vector3 force1 = (dUdx/distab) * (ab * (cos/distab) - bc / distbc); // force on particle 1
+	Vector3 force3 = (dUdx/distbc) * (bc * (cos/distbc) - ab / distab); // force on particle 3
+	atomicAdd( &force[i], force1 );
+	atomicAdd( &force[k], force3 );
+	atomicAdd( &force[j], -(force1 + force3) );
+}
+
+
+__device__ inline void computeDihedral(const TabulatedDihedralPotential* __restrict__ d,
+				const BaseGrid* __restrict__ sys, Vector3* __restrict__ forces, const Vector3* __restrict__ pos,
+				const int& i, const int& j, const int& k, const int& l) {
+	const Vector3 posa = pos[i];
+	const Vector3 posb = pos[j];
+	const Vector3 posc = pos[k];
+	const Vector3 posd = pos[l];
+		
+	const Vector3 ab = sys->wrapDiff(posa - posb);
+	const Vector3 bc = sys->wrapDiff(posb - posc);
+	const Vector3 cd = sys->wrapDiff(posc - posd);
+		
+	//const float distab = ab.length();
+	const float distbc = bc.length();
+	//const float distcd = cd.length();
+	
+	Vector3 crossABC = ab.cross(bc);
+	Vector3 crossBCD = bc.cross(cd);
+	Vector3 crossX = bc.cross(crossABC);
+
+	const float cos_phi = crossABC.dot(crossBCD) / (crossABC.length() * crossBCD.length());
+	const float sin_phi = crossX.dot(crossBCD) / (crossX.length() * crossBCD.length());
+		
+	const float angle = -atan2(sin_phi, cos_phi);
+
+	// float energy = 0.0f;
+	float force = 0.0f;
+	
+	Vector3 f1, f2, f3; // forces
+	f1 = -distbc * crossABC.rLength2() * crossABC;
+	f3 = -distbc * crossBCD.rLength2() * crossBCD;
+	f2 = -(ab.dot(bc) * bc.rLength2()) * f1 - (bc.dot(cd) * bc.rLength2()) * f3;
+	
+	// Shift "angle" by "PI" since    -PI < dihedral < PI
+	// And our tabulated potential data: 0 < angle < 2 PI
+	float t = (angle + BD_PI) / d->angle_step;
+	int home = (int) floorf(t);
+	t = t - home;
+
+	assert(home >= 0);
+	assert(home < d->size);
+	// home = home % size;
+	int home1 = (home + 1) >= d->size ? (home+1-d->size) : home+1;
+
+	//================================================
+	// Linear interpolation
+	float U0 = d->pot[home];       // Potential
+	float dU = d->pot[home1] - U0; // Change in potential
+		
+	// energy = dU * t + U0;
+	force = -dU / d->angle_step;
+
+	// avoid singularity when one angle is straight 
+	force = (crossABC.rLength() > 1.0f || crossBCD.rLength() > 1.0f) ? 0.0f : force;
+	    
+	f1 *= force;
+	f2 *= force;
+	f3 *= force;
+
+	atomicAdd( &forces[i], f1 );
+	atomicAdd( &forces[j], f2-f1 );
+	atomicAdd( &forces[k], f3-f2 );
+	atomicAdd( &forces[l], -f3 );
+}
-- 
GitLab