diff --git a/src/BaseGrid.cu b/src/BaseGrid.cu
index 75ccfbee4144e028bc69dc9c6e5236c5d0e64c90..0a8d9701061761b2a8105f3d95d0d2227bcdd91e 100644
--- a/src/BaseGrid.cu
+++ b/src/BaseGrid.cu
@@ -13,7 +13,6 @@
 // Also, allocate the main value array.
 void BaseGrid::init() {
 	basisInv = basis.inverse();
-	nynz = ny*nz;
 	size = nx*ny*nz;
 	val = new float[size];
 }
@@ -267,7 +266,6 @@ if(strcmp("origi", start) == 0) {
 	//printf("Size: %d %d %d\n", nx, ny, nz);
 	if (read == 3) {
 		size = nx*ny*nz;
-		nynz = ny*nz;
 		val = new float[size];
 		zero();
 	}
@@ -537,7 +535,7 @@ void BaseGrid::averageProfile(const char* fileName, int axis) {
 	int dir2 = (axis+2)%3;
 
 	int jump[3];
-	jump[0] = nynz;
+	jump[0] = ny*nz;
 	jump[1] = nz;
 	jump[2] = 1;
 
@@ -602,11 +600,10 @@ bool BaseGrid::crop(int x0, int y0, int z0, int x1, int y1, int z1, bool keep_or
 	float *new_val = new float[new_size];
 
 	int ind = 0;
-	int nynz = ny * nz;
 	for (int i = x0; i < x1; i++)
 		for (int j = y0; j < y1; j++)
 			for (int k = z0; k < z1; k++) {
-				int ind1 = k + j * nz + i * nynz;
+				int ind1 = k + j * nz + i * ny*nz;
 				new_val[ind++] = val[ind1];
 			}
 
@@ -658,7 +655,7 @@ void BaseGrid::getNeighbors(int j, int* indexBuffer) const {
 	for (int ix = -1; ix <= 1; ix++) {
 		for (int iy = -1; iy <= 1; iy++) {
 			for (int iz = -1; iz <= 1; iz++) {
-				int ind = wrap(jz+iz,nz) + nz*wrap(jy+iy,ny) + nynz*wrap(jx+ix,nx);
+				int ind = wrap(jz+iz,nz) + nz*wrap(jy+iy,ny) + ny*nz*wrap(jx+ix,nx);
 				indexBuffer[k] = ind;
 				k++;
 			}
@@ -673,7 +670,7 @@ void BaseGrid::getNeighborValues(NeighborList* neigh, int homeX, int homeY, int
 	for (int ix = -1; ix <= 1; ix++) {
 		for (int iy = -1; iy <= 1; iy++) {
 			for (int iz = -1; iz <= 1; iz++) {
-				int ind = wrap(homeZ+iz,nz) + nz*wrap(homeY+iy,ny) + nynz*wrap(homeX+ix,nx);
+				int ind = wrap(homeZ+iz,nz) + nz*wrap(homeY+iy,ny) + ny*nz*wrap(homeX+ix,nx);
 				neigh->v[ix+1][iy+1][iz+1] = val[ind];
 			}
 		}
diff --git a/src/BaseGrid.h b/src/BaseGrid.h
index 55669bd770225d88c910ce94f3f6c9e7e15dc348..b3a4cfa170d62790c73306dc3c659328b9208815 100644
--- a/src/BaseGrid.h
+++ b/src/BaseGrid.h
@@ -21,6 +21,14 @@
 #include <ctime>
 // #include <cuda.h>
 
+#ifndef gpuErrchk
+#define delgpuErrchk
+#define gpuErrchk(code) { if ((code) != cudaSuccess) {			                            \
+	    fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), __FILE__, __LINE__); \
+	}}
+#endif
+
+
 enum BoundaryCondition { dirichlet, neumann, periodic };
 enum InterpolationOrder { linear = 1, cubic = 3 };
 
@@ -1044,14 +1052,49 @@ public:
   void getNeighborValues(NeighborList* neigh, int homeX, int homeY, int homeZ) const;
   inline void setVal(float* v) { val = v; }
 
+    BaseGrid* copy_to_cuda() const {
+	BaseGrid* g_d = NULL;
+	BaseGrid g_tmp;
+	float* val_d = NULL;
+	size_t sz = sizeof(float) * size;
+	gpuErrchk(cudaMalloc(&g_d, sizeof(BaseGrid)));
+	gpuErrchk(cudaMalloc(&val_d, sz));
+	gpuErrchk(cudaMemcpy(val_d, val, sz, cudaMemcpyHostToDevice));
+	g_tmp.origin = origin;
+	g_tmp.basis = basis;
+	g_tmp.nx = nx;
+	g_tmp.ny = ny;
+	g_tmp.nz = nz;
+	g_tmp.size= size;
+	g_tmp.basisInv = basisInv;
+	g_tmp.val = val_d;
+	gpuErrchk(cudaMemcpy(g_d, &g_tmp, sizeof(BaseGrid), cudaMemcpyHostToDevice));
+	g_tmp.val = NULL;
+	return g_d;
+    }
+
+    static void remove_from_cuda(BaseGrid* g_d) {
+	BaseGrid g_tmp;
+	gpuErrchk(cudaMemcpy(&g_tmp, g_d, sizeof(BaseGrid), cudaMemcpyDeviceToHost));
+	gpuErrchk(cudaFree(&(g_tmp.val)));
+	g_tmp.val = NULL;
+	gpuErrchk(cudaMemcpy(g_d, &g_tmp, sizeof(BaseGrid), cudaMemcpyHostToDevice)); // copy NULL back to device
+	gpuErrchk(cudaFree(&g_d));
+    }
+
 public:
   Vector3 origin;
   Matrix3 basis;
   int nx, ny, nz;
-  int nynz;
   int size;
   Matrix3 basisInv;
 public:
   float* val;
 };
+
+#ifndef delgpuErrchk
+#undef  delgpuErrchk
+#undef  gpuErrchk
+#endif
+
 #endif
diff --git a/src/BrownParticlesKernel.h b/src/BrownParticlesKernel.h
index 36f7daa3ea95f0e3ddafdabde661b143382f17c7..5475a454d4a7b3ef4c70a904c8ce262a8d71780e 100644
--- a/src/BrownParticlesKernel.h
+++ b/src/BrownParticlesKernel.h
@@ -4,7 +4,7 @@
 
 template<const int BlockSize>
 static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], BrownianParticleType* part[], 
-                                                   float *vec_red, int n)
+                                                   float *vec_red, int num, int num_rb_attached_particles, int num_replicas)
 {
     __shared__ __align__(4) float sdata[BlockSize];
     
@@ -17,9 +17,12 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
 
     sdata[tid] = 0.f; 
 
+    int n = (num*num_replicas);
+
     while (i < n) 
-    { 
-        int t1 = type[i];
+    {
+	const int i1 = (i % num) +  (i/num)*(num+num_rb_attached_particles);
+        const int t1 = type[i1];
         const BrownianParticleType& pt1 = *part[t1];
 
         p1    = P_n[i];
@@ -27,7 +30,8 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
         
         if(i + BlockSize < n)
         {
-            int t2 = type[i+BlockSize];
+	    const int i2 = ((i+BlockSize) % num) +  ((i+BlockSize)/num)*(num+num_rb_attached_particles);
+            const int t2 = type[i2];
             const BrownianParticleType& pt2 = *part[t2];
 
             p2    = P_n[i+BlockSize];
@@ -38,7 +42,7 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
         else
             sdata[tid] += p1.length2() / mass1;
 
-        i += gridSize; 
+        i += gridSize;
     }
 
     sdata[tid] *= 0.50f;
diff --git a/src/BrownianParticleType.cpp b/src/BrownianParticleType.cpp
index e514500ec3b6b87058cf58389c2a99ab7ddd506d..226814419a20fef51671d3a17ada2539b445f738 100644
--- a/src/BrownianParticleType.cpp
+++ b/src/BrownianParticleType.cpp
@@ -5,6 +5,7 @@
 //////////////////////////////////////////
 void BrownianParticleType::clear() {
 	if (pmf != NULL) delete [] pmf;
+	if (pmf_scale != NULL) delete [] pmf_scale;
 	if (diffusionGrid != NULL) delete diffusionGrid;
 	if (forceXGrid != NULL) delete forceXGrid;
 	if (forceYGrid != NULL) delete forceYGrid;
@@ -12,6 +13,7 @@ void BrownianParticleType::clear() {
 	if (reservoir != NULL) delete reservoir;
         if (meanPmf != NULL) delete []  meanPmf;
 	pmf = NULL, diffusionGrid = NULL;
+	pmf_scale = NULL;
 	pmf_boundary_conditions = NULL;
 	forceXGrid = NULL, forceYGrid = NULL, forceZGrid = NULL;
 	reservoir = NULL, meanPmf = NULL;
@@ -26,6 +28,7 @@ void BrownianParticleType::copy(const BrownianParticleType& src) {
 	radius = src.radius;
 	eps = src.eps;
         pmf = src.pmf;
+        pmf_scale = src.pmf_scale;
         pmf_boundary_conditions = src.pmf_boundary_conditions;
 	meanPmf = src.meanPmf;
         numPartGridFiles = src.numPartGridFiles;
diff --git a/src/BrownianParticleType.h b/src/BrownianParticleType.h
index 524a5b66be452771f0aa8aec65d76dfddb60b7dd..8af1385a803768eabf9d3efe225ac01a31be38de 100644
--- a/src/BrownianParticleType.h
+++ b/src/BrownianParticleType.h
@@ -30,7 +30,7 @@ class BrownianParticleType {
 		BrownianParticleType(const String& name = "") :
 				name(name), num(0),
 				diffusion(0.0f), radius(1.0f), charge(0.0f), eps(0.0f), meanPmf(NULL),
-				numPartGridFiles(-1), reservoir(NULL), pmf(NULL), pmf_boundary_conditions(NULL),
+				numPartGridFiles(-1), reservoir(NULL), pmf(NULL), pmf_scale(NULL), pmf_boundary_conditions(NULL),
 				diffusionGrid(NULL),
 				forceXGrid(NULL), forceYGrid(NULL), forceZGrid(NULL){ }
 
@@ -62,8 +62,9 @@ public:
                 float mu; //for Nose-Hoover Langevin dynamics
 
 		Reservoir* reservoir;
-		BaseGrid* pmf;
-		BoundaryCondition* pmf_boundary_conditions;
+		BaseGrid** pmf;
+    float* pmf_scale;
+    BoundaryCondition* pmf_boundary_conditions;
 		BaseGrid* diffusionGrid;
 		BaseGrid* forceXGrid;
 		BaseGrid* forceYGrid;
diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 4f39b3fe3a905fca9703f03423b357a401bef198..440b2580a07b3bcb508913bdf40b0d56539a2442 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -40,14 +40,15 @@ void runSort(int2 *d1, int *d2, float *key,
 				unsigned int count);
 
 ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
-    num(c.num), numParts(c.numParts), sys(c.sys), switchStart(c.switchStart),
+    num(c.num), numParts(c.numParts), num_rb_attached_particles(c.num_rb_attached_particles),
+    sys(c.sys), switchStart(c.switchStart),
     switchLen(c.switchLen), electricConst(c.coulombConst),
     cutoff2((c.switchLen + c.switchStart) * (c.switchLen + c.switchStart)),
     decomp(c.sys->getBox(), c.sys->getOrigin(), c.switchStart + c.switchLen + c.pairlistDistance, numReplicas),
     numBonds(c.numBonds), numTabBondFiles(c.numTabBondFiles),
     numExcludes(c.numExcludes), numAngles(c.numAngles),
     numTabAngleFiles(c.numTabAngleFiles), numDihedrals(c.numDihedrals),
-    numTabDihedralFiles(c.numTabDihedralFiles), numRestraints(c.numRestraints), 
+    numTabDihedralFiles(c.numTabDihedralFiles), numRestraints(c.numRestraints),
     numGroupSites(c.numGroupSites),
     numReplicas(numReplicas) {
 
@@ -234,10 +235,10 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
 	restraintIds_d = NULL;
 
 	//Calculate the number of blocks the grid should contain
-	gridSize =  num / NUM_THREADS + 1;
+	gridSize =  (num+num_rb_attached_particles) / NUM_THREADS + 1;
 
 	// Create and allocate the energy arrays
-	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * (num+numGroupSites) * numReplicas));
+	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * (num+num_rb_attached_particles+numGroupSites) * numReplicas));
 	cudaEventCreate(&start);
 	cudaEventCreate(&stop);
 }
@@ -257,6 +258,7 @@ ComputeForce::~ComputeForce() {
 		    for (std::size_t g = 0; g < gpuman.gpus.size(); ++g) {
 			gpuman.use(g);
 			tablePot_addr[g][ind]->free_from_cuda(tablePot_addr[g][ind]);
+			tablePot_addr[g][ind] = NULL;
 		    }
 		    delete tablePot[ind];
 		}
@@ -341,7 +343,7 @@ void ComputeForce::updateNumber(int newNum) {
 
 	// Recalculate the number of blocks in the grid
 	gridSize = 0;
-	while ((int)sqrt(NUM_THREADS) * gridSize < num)
+	while ((int)sqrt(NUM_THREADS) * gridSize < num+num_rb_attached_particles)
 		++gridSize;
 
 	gpuErrchk(cudaFree(energies_d));
@@ -374,7 +376,7 @@ bool ComputeForce::addTabulatedPotential(String fileName, int type0, int type1)
 	    for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
 		gpuman.use(i);
 		tablePot_addr[i][ind]->free_from_cuda(tablePot_addr[i][ind]);
-		delete tablePot_addr[i][ind];
+		tablePot_addr[i][ind] = NULL;
 	    }
 	    gpuman.use(0);
 	    delete tablePot[ind];
@@ -500,7 +502,7 @@ void ComputeForce::decompose() {
             cudaFree(decomp_d);
             decomp_d = NULL;
 	}	
-	decomp.decompose_d(pos_d[0], num);
+	decomp.decompose_d(pos_d[0], num+num_rb_attached_particles);
 	decomp_d = decomp.copyToCUDA();
 
 	// Update pairlists using cell decomposition (not sure this is really needed or good) 
@@ -514,52 +516,25 @@ void ComputeForce::decompose() {
 	
 	// initializePairlistArrays
 	int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
-
-	// int blocksPerCell = 10;
-
 	
 	/* cuMemGetInfo(&free,&total); */
 	/* printf("Free memory: %zu / %zu\n", free, total); */
 	
-	// const int NUMTHREADS = 128;
-	//const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
-	// const size_t nBlocks = nCells*blocksPerCell;
-
-	/* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d[0], decomp_d); */
-	/* gpuErrchk(cudaDeviceSynchronize()); */
-	/* pairlistTest<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, */
-	/* 																					 sys_d[0], decomp_d, nCells, blocksPerCell, */
-	/* 																					 numPairs_d[0], pairListListI_d, pairListListJ_d); */
-	/* gpuErrchk(cudaDeviceSynchronize());	 */
-
 	int tmp = 0;
 	gpuErrchk(cudaMemcpyAsync(numPairs_d[0], &tmp,	sizeof(int), cudaMemcpyHostToDevice));
 	gpuErrchk(cudaDeviceSynchronize());
-	// printf("Pairlistdist: %f\n",sqrt(pairlistdist2));
 
 #ifdef DEBUGEXCLUSIONS
 	initExSum();
 	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
 #endif
-    //Han-Yi Chou bind texture
-    //printf("%d\n", sizeof(Vector3));
-    //gpuErrchk(cudaBindTexture(0,  PosTex, pos_d[0],sizeof(Vector3)*num*numReplicas));
-    //gpuErrchk(cudaBindTexture(0,CellsTex, decomp_d->getCells_d(),sizeof(CellDecomposition::cell_t)*num*numReplicas));
-   
-//#if __CUDA_ARCH__ >= 300
-	//createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2);
-    //#ifdef NEW
-   //for sm52
-    //createPairlists<32,64,1><<< dim3(256,128,numReplicas),dim3(32,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], 
-      //GTX 980
-      //Han-Yi Chou 2017 my code
       
       #if __CUDA_ARCH__ >= 520
-      createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
+      createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num+num_rb_attached_particles, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
                                                                              pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d,
 									   excludeMap_d, numExcludes, pairlistdist2, pos_tex[0], neighbors_tex);
       #else //__CUDA_ARCH__ == 300
-      createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
+      createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num+num_rb_attached_particles, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
                                                                            pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, 
                                                                            excludeMap_d, numExcludes, pairlistdist2, pos_tex[0], neighbors_tex);
       #endif
@@ -576,77 +551,6 @@ void ComputeForce::decompose() {
       }
       gpuman.sync();
       #endif
-
-    //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
-    //                                                                  pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d,
-    //                                                                  excludeMap_d, numExcludes, pairlistdist2);
-
-    //#else
-    //createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, 
-      //                            pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2);
-    //#endif
-//#else
-	// Use shared memory for warp_bcast function
-	//createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2);
-    //#ifdef NEW
-    //for sm52
-    //createPairlists<32,64,1><<<dim3(256,128,numReplicas),dim3(32,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], 
-      //GTX 980
-      //createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
-        //GTX 680
-        //createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
-        //                                                              pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, 
-        //                                                              excludeMap_d, numExcludes, pairlistdist2);
-    //createPairlists<64,64><<<dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
-    //                                                                  pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d,
-    //                                                                  excludeMap_d, numExcludes, pairlistdist2);
-
-    //#else
-    //createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d,
-      //                                             pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2, CellNeighborsList);
-    //#endif
-
-//#endif
-#if 0
-//////debug section			
-	// DEBUGING
-	gpuErrchk(cudaMemcpy(&tmp, numPairs_d[0],	sizeof(int), cudaMemcpyDeviceToHost));
-	//printf("CreatePairlist found %d pairs\n",tmp);
-        gpuErrchk(cudaDeviceSynchronize());
-
-        gpuErrchk( cudaProfilerStart() );
-
-        // Reset the cell decomposition.
-        if (decomp_d)
-            cudaFree(decomp_d);
-
-        decomp.decompose_d(pos_d[0], num);
-        decomp_d = decomp.copyToCUDA();
-
-	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
-        int tmp1 = 0;
-        gpuErrchk(cudaMemcpyAsync(numPairs_d[0], &tmp1,     sizeof(int), cudaMemcpyHostToDevice));
-        gpuErrchk(cudaDeviceSynchronize());
-        // printf("Pairlistdist: %f\n",sqrt(pairlistdist2));
-
-#ifdef DEBUGEXCLUSIONS
-        initExSum();
-        gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
-#endif
-        #if __CUDA_ARCH__ >= 300
-        createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2);
-#else
-        // Use shared memory for warp_bcast function
-        createPairlists_debug<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2);
-#endif
-    gpuErrchk(cudaMemcpy(&tmp1, numPairs_d[0],  sizeof(int), cudaMemcpyDeviceToHost));
-    printf("Difference CreatePairlist found %d pairs\n",tmp-tmp1);
-    gpuErrchk(cudaDeviceSynchronize());
-
-#ifdef DEBUGEXCLUSIONS
-	printf("Counted %d exclusions\n", getExSum());
-#endif
-#endif
 }
 
 IndexList ComputeForce::decompDim() const {
@@ -663,20 +567,20 @@ float ComputeForce::decompCutoff() { return decomp.getCutoff(); }
 
 float ComputeForce::computeFull(bool get_energy) {
 	float energy = 0.0f;
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
 	// Call the kernel to calculate forces
 	computeFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d, tableAlpha_d,
-		tableEps_d, tableRad6_d, num, numParts, sys_d[0], energies_d, gridSize,
+		tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0], energies_d, gridSize,
 		numReplicas, get_energy);
 
 	// Calculate energy based on the array created by the kernel
 	if (get_energy) {
 		gpuErrchk(cudaDeviceSynchronize());
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num + numGroupSites);
+		energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles + numGroupSites);
 	}
 
 	return energy;
@@ -684,20 +588,20 @@ float ComputeForce::computeFull(bool get_energy) {
 
 float ComputeForce::computeSoftcoreFull(bool get_energy) {
 	float energy = 0.0f;
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
 	// Call the kernel to calculate forces
 	computeSoftcoreFullKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d,
-			tableEps_d, tableRad6_d, num, numParts, sys_d[0], energies_d, gridSize,
+			tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0], energies_d, gridSize,
 			numReplicas, get_energy);
 
 	// Calculate energy based on the array created by the kernel
 	if (get_energy) {
 		cudaDeviceSynchronize();
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num);
+		energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles);
 	}
 
 	return energy;
@@ -729,13 +633,13 @@ float ComputeForce::computeElecFull(bool get_energy) {
 float ComputeForce::compute(bool get_energy) {
 	float energy = 0.0f;
 
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
 	// Call the kernel to calculate forces
 	computeKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d,
-			tableAlpha_d, tableEps_d, tableRad6_d, num, numParts, sys_d[0],
+			tableAlpha_d, tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0],
 			decomp_d, energies_d, switchStart, switchLen, gridSize, numReplicas,
 			get_energy);
 
@@ -744,7 +648,7 @@ float ComputeForce::compute(bool get_energy) {
 	if (get_energy) {
 		gpuErrchk(cudaDeviceSynchronize());
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num + numGroupSites);
+		energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles + numGroupSites);
 	}
 
 	return energy;
@@ -757,7 +661,7 @@ float ComputeForce::compute(bool get_energy) {
 float ComputeForce::computeTabulated(bool get_energy) {
 	float energy = 0.0f;
 
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 	
@@ -773,7 +677,7 @@ float ComputeForce::computeTabulated(bool get_energy) {
 	{
 		//clearEnergies<<< nb, numThreads >>>(energies_d,num);
 		//gpuErrchk(cudaDeviceSynchronize());
-	        cudaMemset((void*)energies_d, 0, sizeof(float)*(num+numGroupSites)*numReplicas);
+	        cudaMemset((void*)energies_d, 0, sizeof(float)*(num+num_rb_attached_particles+numGroupSites)*numReplicas);
 		computeTabulatedEnergyKernel<<< nb, numThreads >>>(forceInternal_d[0], pos_d[0], sys_d[0],
 						cutoff2, numPairs_d[0], pairLists_d[0], pairTabPotType_d[0], tablePot_d[0], energies_d);
 	}
@@ -866,27 +770,27 @@ float ComputeForce::computeTabulated(bool get_energy) {
 float ComputeForce::computeTabulatedFull(bool get_energy) {
 	energy = 0.0f;
 
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
 	// Call the kernel to calculate forces
-	computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d,	tablePot_d[0], tableBond_d, num, numParts, sys_d[0], bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d);
+	computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d,	tablePot_d[0], tableBond_d, num+num_rb_attached_particles, numParts, sys_d[0], bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d);
 	gpuErrchk(cudaDeviceSynchronize());
 
 	computeAngles<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], angles_d, tableAngle_d,
-																						 numAngles, num, sys_d[0], energies_d,
+																						 numAngles, num+num_rb_attached_particles, sys_d[0], energies_d,
 																						 get_energy);
 	gpuErrchk(cudaDeviceSynchronize());
 	computeDihedrals<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], dihedrals_d,
 																							  tableDihedral_d, numDihedrals,
-																								num, sys_d[0], energies_d,
+																								num+num_rb_attached_particles, sys_d[0], energies_d,
 																								get_energy);
 	// Calculate the energy based on the array created by the kernel
 	if (get_energy) {
 		gpuErrchk(cudaDeviceSynchronize());
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num);
+		energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles);
 	}
 
 	return energy;
@@ -894,7 +798,7 @@ float ComputeForce::computeTabulatedFull(bool get_energy) {
 
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 {
-    const size_t tot_num = (num+numGroupSites) * numReplicas;
+    const size_t tot_num = (num+num_rb_attached_particles+numGroupSites) * numReplicas;
 
 	for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
 	    gpuman.use(i);
@@ -935,11 +839,10 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 }
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom)
 {
-    const size_t tot_num = (num+numGroupSites) * numReplicas;
-    const size_t small_tot_num = num * numReplicas;
+    const size_t tot_num = num * numReplicas;
 
         gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
-        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * small_tot_num, cudaMemcpyHostToDevice));
+        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
 
 	copyToCUDA(forceInternal,pos);
         gpuErrchk(cudaDeviceSynchronize());
@@ -965,10 +868,11 @@ void ComputeForce::setForceInternalOnDevice(Vector3* f) {
 void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals, const Restraint* const restraints)
 {
     assert(simNum == numReplicas); // Not sure why we have both of these things
-    int tot_num = num * simNum;
+    int tot_num_with_rb = (num+num_rb_attached_particles) * simNum;
+    int tot_num_with_rb_group = (num+num_rb_attached_particles+numGroupSites) * simNum;
 	// type_d
-	gpuErrchk(cudaMalloc(&type_d, sizeof(int) * tot_num));
-	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * tot_num, cudaMemcpyHostToDevice));
+	gpuErrchk(cudaMalloc(&type_d, sizeof(int) * tot_num_with_rb));
+	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * tot_num_with_rb, cudaMemcpyHostToDevice));
 	
 	if (numBonds > 0)
 	{
@@ -977,8 +881,8 @@ void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap,
 		gpuErrchk(cudaMemcpyAsync(bonds_d, bonds, sizeof(Bond) * numBonds, cudaMemcpyHostToDevice));
 		
 		// bondMap_d
-		gpuErrchk(cudaMalloc(&bondMap_d, sizeof(int2) * num));
-		gpuErrchk(cudaMemcpyAsync(bondMap_d, bondMap, sizeof(int2) * num, cudaMemcpyHostToDevice));
+		gpuErrchk(cudaMalloc(&bondMap_d, sizeof(int2) * tot_num_with_rb_group));
+		gpuErrchk(cudaMemcpyAsync(bondMap_d, bondMap, sizeof(int2) * tot_num_with_rb_group, cudaMemcpyHostToDevice));
 	}
 
 	if (numExcludes > 0) {
@@ -990,8 +894,8 @@ void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap,
 				cudaMemcpyHostToDevice));
 		
 		// excludeMap_d
-		gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * num));
-		gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * num,
+		gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * tot_num_with_rb));
+		gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * tot_num_with_rb,
 				cudaMemcpyHostToDevice));
 	}
 
diff --git a/src/ComputeForce.cuh b/src/ComputeForce.cuh
index 8735799be098867866ddc865fa39579e3ff77bc5..2d6dac72262eb436802303010731b03040de542e 100644
--- a/src/ComputeForce.cuh
+++ b/src/ComputeForce.cuh
@@ -296,7 +296,6 @@ void createNeighborsList(const int3 *Cells,int* __restrict__ CellNeighborsList)
         }
     }
 }
-//#ifdef NEW
 template<const int BlockSize,const int Size,const int N>
 __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas,
                                 const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
@@ -424,101 +423,6 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
     }
 }
 
-#if 0
-template<const int BlockSize,const int Size> 
-__global__ void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas, 
-                                const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp, 
-                                const int nCells,int* g_numPairs, int2* g_pair, int numParts, const int* __restrict__ type, 
-                                int* __restrict__ g_pairTabPotType, const Exclude* __restrict__ excludes, 
-                                const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2) 
-{
-    const int TotalBlocks = gridDim.x * gridDim.y;
-    const int cells       = TotalBlocks / Size;
-    const int cell_start  = (blockIdx.x + gridDim.x  *  blockIdx.y) / Size;
-    const int pid_start   = (blockIdx.x + gridDim.x  *  blockIdx.y) % Size;
-    const int tid         = threadIdx.x + blockDim.x * threadIdx.y 
-                                        + blockDim.x *  blockDim.y * threadIdx.z;
-    const int warpLane    =  tid % WARPSIZE;
-    const int repID       =  blockIdx.z;
-
-    const CellDecomposition::cell_t* __restrict__ cellInfo = decomp->getCells_d();
-
-    for(int cellid_i = cell_start; cellid_i < nCells; cellid_i += cells)
-    {
-        CellDecomposition::range_t rangeI = decomp->getRange(cellid_i,repID);
-        int Ni = rangeI.last-rangeI.first;
-
-        for(int pid_i = pid_start; pid_i < Ni; pid_i += Size)
-        {
-            int ai      = cellInfo[rangeI.first+pid_i].particle;
-
-            float4 a = tex1Dfetch<float4>(PosTex,ai);
-            Vector3 A(a.x,a.y,a.z);
-
-            int2 ex_pair = make_int2(-1,-1); 
-            if(numExcludes > 0 && excludeMap != NULL)
-            {
-                ex_pair = excludeMap[ai];
-            }
-
-            int currEx = ex_pair.x;
-            int nextEx = (ex_pair.x >= 0) ? excludes[currEx].ind2 : -1;
- 
-            //loop over neighbor directions
-            for(int idx = 0; idx < 27; ++idx)
-            {
-                int neighbor_cell = tex1Dfetch(NeighborsTex,idx+27*cellid_i);
-
-                if(neighbor_cell < 0)
-                {
-                    continue;       
-                }
-
-                CellDecomposition::range_t rangeJ = decomp->getRange(neighbor_cell,repID);
-                int Nj = rangeJ.last-rangeJ.first;
-
-                // In each neighbor cell, loop over particles
-                for(int pid_j = tid; pid_j < Nj; pid_j += BlockSize)
-                {
-                    int aj = cellInfo[pid_j+rangeJ.first].particle;
-                    if(aj <= ai)
-                    {
-                        continue;
-                    }
-
-                    while (nextEx >= 0 && nextEx < ( aj - repID * num))
-                    {
-                        nextEx = (currEx < ex_pair.y - 1) ? excludes[++currEx].ind2 : -1;
-                    }
-
-                    if (nextEx == (aj - repID * num))
-                    {
-                        #ifdef DEBUGEXCLUSIONS
-                        atomicAggInc( exSum, warpLane );
-                        #endif
-                        nextEx = (currEx < ex_pair.y - 1) ? excludes[++currEx].ind2 : -1;
-                        continue;
-                    }
-
-                    float4 b = tex1Dfetch(PosTex,aj);
-                    Vector3 B(b.x,b.y,b.z);
-
-                    float dr = (sys->wrapDiff(A-B)).length2();
-
-                    if(dr < pairlistdist2)
-                    {
-                        int gid = atomicAggInc( g_numPairs, warpLane );
-                        int pairType = type[ai] + type[aj] * numParts;
-
-                        g_pair[gid] = make_int2(ai,aj);
-                        g_pairTabPotType[gid] = pairType;
-                    }
-                }    
-            }
-        }
-    }
-}
-#endif
 __global__
 void createPairlists_debug(Vector3* __restrict__ pos, const int num, const int numReplicas,
                                 const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
@@ -597,104 +501,6 @@ void createPairlists_debug(Vector3* __restrict__ pos, const int num, const int n
     }
 }
 
-//#else
-#if 0
-__global__
-void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas,
-				const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
-				const int nCells,
-				int* g_numPairs, int2* g_pair,
-				int numParts, const int* __restrict__ type, int* __restrict__ g_pairTabPotType,
-				const Exclude* __restrict__ excludes, const int2* __restrict__ excludeMap, const int numExcludes,
-				float pairlistdist2, const int* __restrict__ CellNeighborsList) {
-	// TODO: loop over all cells with edges within pairlistdist2
-
-	// Loop over threads searching for atom pairs
-	//   Each thread has designated values in shared memory as a buffer
-	//   A sync operation periodically moves data from shared to global
-	const int tid = threadIdx.x;
-	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
-	const int split = 32;					/* numblocks should be divisible by split */
-	/* const int blocksPerCell = gridDim.x/split;  */
-	const CellDecomposition::cell_t* __restrict__ cellInfo = decomp->getCells();
-	for (int cID = 0 + (blockIdx.x % split); cID < nCells; cID += split) {
-	// for (int cID = blockIdx.x/blocksPerCell; cID < nCells; cID += split ) {
-		for (int repID = 0; repID < numReplicas; repID++) {
-			const CellDecomposition::range_t rangeI = decomp->getRange(cID, repID);
-
-			for (int ci = rangeI.first + blockIdx.x/split; ci < rangeI.last; ci += gridDim.x/split) {
-			// ai - index of the particle in the original, unsorted array
-				const int ai = cellInfo[ci].particle;
-				// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
-				const CellDecomposition::cell_t celli = cellInfo[ci];
-				// Vector3 posi = pos[ai];
-
-				// Same as for bonds, but for exclusions now
-                                const int ex_start = (numExcludes > 0 && excludeMap != NULL) ? excludeMap[ai -repID*num].x : -1;
-                                const int ex_end   = (numExcludes > 0 && excludeMap != NULL) ? excludeMap[ai -repID*num].y : -1;
-				/*
-				for (int x = -1; x <= 1; ++x) {
-					for (int y = -1; y <= 1; ++y) {
-                                                #pragma unroll(3)
-						for (int z = -1; z <= 1; ++z) {					
-				*/
-		                for(int x = 0; x < 27; ++x) {	
-							//const int nID = decomp->getNeighborID(celli, x, y, z);
-							const int nID = CellNeighborsList[x+27*cID];//elli.id]; 
-							if (nID < 0) continue; 
-							
-							// Initialize exclusions
-							// TODO: optimize exclusion code (and entire kernel)
-							int currEx = ex_start;
-							int nextEx = (ex_start >= 0) ? excludes[currEx].ind2 : -1;
-							int ajLast = -1; // TODO: remove this sanity check
-							
-							const CellDecomposition::range_t range = decomp->getRange(nID, repID);
-
-							for (int n = range.first + tid; n < range.last; n+=blockDim.x) {
-							    const int aj = cellInfo[n].particle;
-							    if (aj <= ai) continue;
-								
-								// Skip excludes
-								//   Implementation requires that aj increases monotonically
-								assert( ajLast < aj ); ajLast = aj; // TODO: remove this sanity check
-								while (nextEx >= 0 && nextEx < (aj - repID * num)) // TODO get rid of this
-								    nextEx = (currEx < ex_end - 1) ? excludes[++currEx].ind2 : -1;
-								if (nextEx == (aj - repID * num)) {
-#ifdef DEBUGEXCLUSIONS
-								    atomicAggInc( exSum, warpLane );	
-#endif
-								    nextEx = (currEx < ex_end - 1) ? excludes[++currEx].ind2 : -1;
-								    continue;
-								} 
-								// TODO: Skip non-interacting types for efficiency
-
-								// Skip ones that are too far away
-								const float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2();
-								// const float dr = (sys->wrapDiff(pos[aj] - posi)).length2();
-								if (dr > pairlistdist2) continue;
-
-								// Add to pairlist
-								
-								int gid = atomicAggInc( g_numPairs, warpLane );
-								int pairType = type[ai] + type[aj] * numParts;
-								/* assert( ai >= 0 ); */
-								/* assert( aj >= 0 ); */
-
-								g_pair[gid] = make_int2(ai,aj);
-								g_pairTabPotType[gid] = pairType;
-                                                                
-								// g_pairDists[gid] = dr; 
-							} 	// atoms J
-						//} 		// z				
-					//} 			// y
-				} 				// x
-			} // atoms I					
-		} // replicas
-		/* if (tid == 0) printf("Cell%d: found %d pairs\n",cID,g_numPairs[cID]); */
-	}
-}
-#endif
 // TODO: deprecate?
 __global__
 void computeKernel(Vector3 force[], Vector3 pos[], int type[],
@@ -774,42 +580,6 @@ __global__ void printPairForceCounter() {
 		printf("Computed the force for %d pairs\n", pairForceCounter);
 }
 
-// ============================================================================
-// Kernel computes forces between Brownian particles (ions)
-// using cell decomposition
-//
-#if 0
-__global__ void computeTabulatedKernel(
-	Vector3* force, const Vector3* __restrict__ pos,
-	const BaseGrid* __restrict__ sys, float cutoff2,
-	const int* __restrict__ g_numPairs,	const int2* __restrict__ g_pair, const int* __restrict__ g_pairTabPotType, 	TabulatedPotential** __restrict__ tablePot) {
-//remove int* type. remove bond references	
-
-	const int numPairs = *g_numPairs;
-	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < numPairs; i+=blockDim.x*gridDim.x) {
-		const int2 pair = g_pair[i];
-		const int ai = pair.x;
-		const int aj = pair.y;
-			
-		// Particle's type and position
-		const int ind = g_pairTabPotType[i];
-
-	 	/* printf("tid,bid,pos[ai(%d)]: %d %d %f %f %f\n", ai, threadIdx.x, blockIdx.x, pos[ai].x, pos[ai].y, pos[ai].z); //*/
-
-		Vector3 dr = pos[aj] - pos[ai];
-		dr = sys->wrapDiff(dr);
-	
-    // Calculate the force based on the tabulatedPotential file
-		float d2 = dr.length2();
-		if (tablePot[ind] != NULL && d2 <= cutoff2) {
-			Vector3 f = tablePot[ind]->computef(dr,d2);
-			atomicAdd( &force[ai],  f );
-			atomicAdd( &force[aj], -f );
-		}
-	}
-}
-#endif
-//#if 0
 template<const int BlockSize>
 __device__ inline void _computeTabulatedKernel(Vector3* force, const BaseGrid* __restrict__ sys, 
 					       float cutoff2, const int numPairs, const int2* __restrict__ g_pair, 
@@ -870,11 +640,7 @@ __global__ void computeTabulatedKernel(Vector3* force, const BaseGrid* __restric
 				       cutoff2, numPairs, g_pair+start,
 				       g_pairTabPotType+start, tablePot,
 				       pairListsTex, PosTex, pairTabPotTypeTex);
-}
-
-    
-//#endif
- 
+} 
 
 __global__ void clearEnergies(float* __restrict__  g_energies, int num) {
 	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
diff --git a/src/ComputeForce.h b/src/ComputeForce.h
index 5d602e4a609482a73ba7c639f8a575fc3023b2a7..b3bc258541f9603f387221363f3c39fe7fbae881 100644
--- a/src/ComputeForce.h
+++ b/src/ComputeForce.h
@@ -174,6 +174,7 @@ private:
 	int numReplicas;
 	int num;
 	int numParts;
+	int num_rb_attached_particles;
 	int numBonds;
 	int numExcludes;
 	int numTabBondFiles;
diff --git a/src/ComputeGridGrid.cu b/src/ComputeGridGrid.cu
index 1ecfb9fad191cb1e70d787e384198b57b2a87883..f2eb6fde0bd9802b89537f39af10efd9f32618df 100644
--- a/src/ComputeGridGrid.cu
+++ b/src/ComputeGridGrid.cu
@@ -2,7 +2,6 @@
 #include "ComputeGridGrid.cuh"
 #include "RigidBodyGrid.h"
 #include "CudaUtil.cuh"
-//RBTODO handle periodic boundaries
 //RBTODO: add __restrict__, benchmark (Q: how to restrict member data?)
 
 class GridPositionTransformer {
@@ -17,6 +16,7 @@ private:
     const Vector3 c;
     const BaseGrid* s;
 };
+
 //class PmfPositionTransformer : public BasePositionTransformer {
 class PmfPositionTransformer {
 public:
@@ -52,8 +52,7 @@ inline void common_computeGridGridForce(const RigidBodyGrid* rho, const RigidBod
 	    Vector3 r_pos= rho->getPosition(r_id); /* i,j,k value of voxel */
 
 	    r_pos = basis_rho.transform( r_pos );
-	    r_pos = transformer(r_pos);
-		const Vector3 u_ijk_float = basis_u_inv.transform( r_pos );
+	    const Vector3 u_ijk_float = basis_u_inv.transform( transformer( r_pos ) );
 		// RBTODO: Test for non-unit delta
 		/* Vector3 tmpf  = Vector3(0.0f); */
 		/* float tmpe = 0.0f; */
@@ -71,7 +70,8 @@ inline void common_computeGridGridForce(const RigidBodyGrid* rho, const RigidBod
 		const float r_val = rho->val[r_id]; /* maybe move to beginning of function?  */
 		force[tid].f = basis_u_inv.transpose().transform( r_val*(force[tid].f) ); /* transform to lab frame, with correct scaling factor */
                 force[tid].e = r_val;
-		// Calculate torque about origin_u in the lab frame
+
+		// Calculate torque about origin_rho in the lab frame
 		torque[tid].f = r_pos.cross(force[tid].f);
 	}
 
@@ -171,9 +171,11 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 	}
 }
 
+
 __global__
 void createPartlist(const Vector3* __restrict__ pos,
 				const int numTypeParticles, const int* __restrict__ typeParticles_d,
+		    const int attached_particle_start, const int attached_particle_end,
 				int* numParticles_d, int* particles_d,
 				const Vector3 gridCenter, const float radius2, BaseGrid* sys_d) {
 	const int tid = threadIdx.x;
@@ -182,11 +184,13 @@ void createPartlist(const Vector3* __restrict__ pos,
 	const int i = blockIdx.x * blockDim.x + threadIdx.x;
 	if (i < numTypeParticles) {
 		int aid = typeParticles_d[i];
-		float dist = (sys_d->wrapDiff(pos[aid] - gridCenter)).length2();
-
-		if (dist <= radius2) {
+		if (aid < attached_particle_start || aid >= attached_particle_end) { 
+		    float dist = (sys_d->wrapDiff(pos[aid] - gridCenter)).length2();
+		
+		    if (dist <= radius2) {
 			int tmp = atomicAggInc(numParticles_d, warpLane);
 			particles_d[tmp] = aid;
+		    }
 		}
 	}
 }		
diff --git a/src/ComputeGridGrid.cuh b/src/ComputeGridGrid.cuh
index 739aa622e2514840f8586c47d0a277d306427ff4..27fc32ccb6b3b3fd43dd005fd9eda7eb1689c8f8 100644
--- a/src/ComputeGridGrid.cuh
+++ b/src/ComputeGridGrid.cuh
@@ -29,6 +29,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 extern __global__
 void createPartlist(const Vector3* __restrict__ pos,
 				const int numTypeParticles, const int* __restrict__ typeParticles_d,
+		    const int attached_particle_start, const int attached_particle_end,
 				int* numParticles_d, int* particles_d,
 				const Vector3 gridCenter, const float radius2, BaseGrid* sys_d);
 	
diff --git a/src/Configuration.cpp b/src/Configuration.cpp
index 41b4cf3a1dd9c97bd1bf9b28b2591058627b0287..e7bdfd71c57e163646c644d6ef3bb85485f4e7a7 100644
--- a/src/Configuration.cpp
+++ b/src/Configuration.cpp
@@ -6,6 +6,7 @@
 #include <cassert>
 #include <stdlib.h>     /* srand, rand */
 #include <time.h>       /* time */
+#include <string>
 #include <iostream>
 using namespace std;
 
@@ -96,8 +97,24 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
     }
   } // end result: variable "num" is set
 
+	// Count particles associated with rigid bodies
+	num_rb_attached_particles = 0;
+	if (numRigidTypes > 0) {
+	    // grow list of rbs
+	    for (int i = 0; i < numRigidTypes; i++) {
+		RigidBodyType &rbt = rigidBody[i];
+		rbt.attach_particles();
+		num_rb_attached_particles += rbt.num * rbt.num_attached_particles();
+	    }
+	}
+	assert( num_rb_attached_particles == 0 || simNum == 1 ); // replicas not yet implemented
+	// num = num+num_rb_attached_particles;
+
+
 	// Set the number capacity
 	printf("\n%d particles\n", num);
+	printf("%d particles attached to RBs\n", num_rb_attached_particles);
+
 	if (numCap <= 0) numCap = numCapFactor*num; // max number of particles
 	if (numCap <= 0) numCap = 20;
 
@@ -105,18 +122,35 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 	printf("%d groups\n", numGroupSites);
 
 	// Allocate particle variables.
-	pos = new Vector3[num * simNum];
+	// Each replica works with num+num_rb_attached_particles in array
+	pos = new Vector3[ (num+num_rb_attached_particles) * simNum];
+
         //Han-Yi Chou
         if (ParticleDynamicType == String("Langevin") || ParticleDynamicType == String("NoseHooverLangevin"))
-            momentum = new Vector3[num * simNum];
-
-	type   = new int[num * simNum];
-	serial = new int[num * simNum];
-	posLast = new Vector3[num * simNum];
+            momentum = new Vector3[(num+num_rb_attached_particles) * simNum];
+
+	type   = new int[(num+num_rb_attached_particles) * simNum];
+	serial = new int[(num+num_rb_attached_particles) * simNum];
+	posLast = new Vector3[(num+num_rb_attached_particles) * simNum];
+
+	{
+	    int pidx = 0;
+	    for (int i = 0; i < numRigidTypes; i++) { // Loop over RB types
+		RigidBodyType &rbt = rigidBody[i];
+		for (int j = 0; j < rbt.num; ++j) { // Loop over RBs
+		    for (const int& t: rbt.get_attached_particle_types()) {
+			type[num+pidx] = t;
+			serial[num+pidx] = num+pidx;
+			pidx++;
+		    }
+		}
+	    }
+	}	
+	
         //Han-Yi Chou
         if(ParticleDynamicType == String("Langevin") || ParticleDynamicType == String("NoseHooverLangevin"))
-           momLast = new Vector3[num * simNum];
-	name = new String[num * simNum];
+           momLast = new Vector3[(num+num_rb_attached_particles) * simNum];
+	name = new String[(num+num_rb_attached_particles) * simNum];
 	currSerial = 0;
 
 
@@ -163,7 +197,12 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 				String* tokenList = new String[numTokens];
 				partsFromFile[i].tokenize(tokenList);
 
-				int currType = 0;
+				int currType = find_particle_type(tokenList[2]);
+				if (currType == -1) {
+				    printf("Error: Unable to find particle type %s\n", tokenList[2].val());
+				    exit(1);
+
+				}
 				for (int j = 0; j < numParts; j++)
 					if (tokenList[2] == part[j].name)
 						currType = j;
@@ -215,7 +254,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
                                 printf("done!\n");
                         }
                         else
-                            loadedMomentum = Boltzmann(COM_Velocity, num * simNum);
+                            loadedMomentum = Boltzmann(COM_Velocity, (num * simNum));
                     }
                 }
             }
@@ -257,72 +296,66 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 	}
 
 	printf("\nFound %d particle types.\n", numParts);
-	// Load the potential grids.
+
 	printf("Loading the potential grids...\n");
+	// First load a single copy of each grid
 	for (int i = 0; i < numParts; i++) 
         {
-            String map = partGridFile[i][0];
-            int len = map.length();
-
-            if (len >= 3 && map[len-3]=='.' && map[len-2]=='d' && map[len-1]=='x') 
-            {
-                part[i].pmf     = new BaseGrid[part[i].numPartGridFiles];
-                part[i].meanPmf = new float[part[i].numPartGridFiles];
-                for(int j = 0; j < part[i].numPartGridFiles; ++j)
-                {
-                    map = partGridFile[i][j];
-                    len = map.length();
-                    if (!(len >= 3 && map[len-3]=='.' && map[len-2]=='d' && map[len-1]=='x'))
-                    {
-                        cout << "currently do not support different format " << endl;
-                        exit(1);
-                    }
-                    // Decide which type of grid is given.
-                    //String map = partGridFile[i];
- 
-	            // A dx file. Load the old-fashioned way.
-	            //part[i].pmf[j] = new BaseGrid(map.val());
-	            BaseGrid tmp(map.val());
-		    part[i].pmf[j] = tmp;
-		    if (partGridFileScale[i][j] != 1.0f) 
-                        part[i].pmf[j].scale(partGridFileScale[i][j]);
-
-			//part[i].meanPmf = part[i].pmf->mean();
-		    part[i].meanPmf[j] = part[i].pmf[j].mean();
-		    printf("Loaded dx potential grid `%s'.\n", map.val());
-		    printf("Grid size %s.\n", part[i].pmf[j].getExtent().toString().val());
-                }
-            } 
-	    else if  (len >= 4 && map[len-4]=='.' && map[len-3]=='d' && map[len-2]=='e' && map[len-1]=='f') 
-            {
-                OverlordGrid* over = new OverlordGrid[part[i].numPartGridFiles];
-                part[i].meanPmf = new float[part[i].numPartGridFiles];
-                for(int j = 0; j < part[i].numPartGridFiles; ++j)
-                {
-                    map = partGridFile[i][j];
-                    len = map.length();
-                    if (!(len >= 4 && map[len-4]=='.' && map[len-3]=='d' && map[len-2]=='e' && map[len-1]=='f'))
-                    {
-                        cout << "currently do not support different format " << endl;
-                        exit(1);
-                    }
-
-                    String rootGrid = OverlordGrid::readDefFirst(map);
-                    over[j] = OverlordGrid(rootGrid.val());
-		    int count = over->readDef(map);
-		    printf("Loaded system def file `%s'.\n", map.val());
-		    printf("Found %d unique grids.\n", over->getUniqueGridNum());
-		    printf("Linked %d subgrids.\n", count);
-                    part[i].meanPmf[j] = part[i].pmf[j].mean();
-                 }
-      		 part[i].pmf = static_cast<BaseGrid*>(over);
-	     } 
-             else 
-             {
-	         printf("WARNING: Unrecognized gridFile extension. Must be *.def or *.dx.\n");
-		 exit(-1);
-	     }
+	    for(int j = 0; j < part[i].numPartGridFiles; ++j)
+	    {
+		std::string fname(partGridFile[i][j].val(), partGridFile[i][j].length());
+		if (part_grid_dictionary.count( fname ) == 0)
+		{
+		    int len = fname.length();
+		    if (len >= 3 && fname[len-3]=='.' && fname[len-2]=='d' && fname[len-1]=='x')
+		    {
+			part_grid_dictionary.insert({fname, BaseGrid(fname.c_str())});
+		    }
+		    else if  (len >= 4 && fname[len-4]=='.' && fname[len-3]=='d' && fname[len-2]=='e' && fname[len-1]=='f')
+		    {
+			assert(1==2); // Throw exception because this implementation needs to be revisited
+/*                OverlordGrid* over = new OverlordGrid[part[i].numPartGridFiles];
+		  part[i].meanPmf = new float[part[i].numPartGridFiles];
+		  for(int j = 0; j < part[i].numPartGridFiles; ++j)
+		  {
+		  map = partGridFile[i][j];
+		  len = map.length();
+		  if (!(len >= 4 && map[len-4]=='.' && map[len-3]=='d' && map[len-2]=='e' && map[len-1]=='f'))
+		  {
+		  cout << "currently do not support different format " << endl;
+		  exit(1);
+		  }
+
+		  String rootGrid = OverlordGrid::readDefFirst(map);
+		  over[j] = OverlordGrid(rootGrid.val());
+		  int count = over->readDef(map);
+		  printf("Loaded system def file `%s'.\n", map.val());
+		  printf("Found %d unique grids.\n", over->getUniqueGridNum());
+		  printf("Linked %d subgrids.\n", count);
+		  part[i].meanPmf[j] = part[i].pmf[j].mean();
+		  }
+		  part[i].pmf = static_cast<BaseGrid*>(over);
+*/
+		    } else {
+			printf("WARNING: Unrecognized gridFile extension. Must be *.def or *.dx.\n");
+			exit(-1);
+		    }
+		}
+	    }
+	}
 
+	// Then assign grid addresses to particles
+	for (int i = 0; i < numParts; i++)
+        {
+	    part[i].pmf     = new BaseGrid*[part[i].numPartGridFiles];
+	    part[i].pmf_scale = new float[part[i].numPartGridFiles];
+	    part[i].meanPmf = new float[part[i].numPartGridFiles];
+	    for(int j = 0; j < part[i].numPartGridFiles; ++j)
+	    {
+		part[i].pmf[j] = &(part_grid_dictionary.find( std::string(partGridFile[i][j]) )->second);
+		part[i].pmf_scale[j] = partGridFileScale[i][j];
+		part[i].meanPmf[j] = part[i].pmf[j]->mean(); // TODO: review how this is used and decide whether to scale
+	    }
 		if (partForceXGridFile[i].length() != 0) {
 			part[i].forceXGrid = new BaseGrid(partForceXGridFile[i].val());
 			printf("Loaded `%s'.\n", partForceXGridFile[i].val());
@@ -385,7 +418,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 	sys = new BaseGrid( Matrix3(basis1,basis2,basis3), origin, 1, 1, 1 );
     } else {
 	// TODO: use largest system in x,y,z
-	sys = part[0].pmf;
+	sys = *part[0].pmf;
     }
     sysDim = sys->getExtent();
 
@@ -425,13 +458,45 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 			excludes = new Exclude[numExcludes];
 			for (int i = 0; i < oldNumExcludes; i++)
 				excludes[i] = tempExcludes[i];
-			delete tempExcludes;
+			delete [] tempExcludes;
 		}
 		for (int i = oldNumExcludes; i < numExcludes; i++)
 			excludes[i] = newExcludes[i - oldNumExcludes];
 		printf("Built %d exclusions.\n",numExcludes);
 	}
 
+	{ // Add exclusions for RB attached particles
+	    std::vector<Exclude> ex;
+	    int start = num;
+	    for (int i = 0; i < numRigidTypes; i++) { // Loop over RB types
+		RigidBodyType &rbt = rigidBody[i];
+		const int nap = rbt.num_attached_particles();
+		for (int j = 0; j < rbt.num; ++j) { // Loop over RBs
+		    for (int ai = 0; ai < nap-1; ++ai) {
+			for (int aj = ai+1; aj < nap; ++aj) {
+			    ex.push_back( Exclude( ai+start, aj+start ) );
+			}
+		    }
+		    start += nap;
+		}
+	    }
+	    // copy
+	    int oldNumExcludes = numExcludes;
+	    numExcludes = numExcludes + ex.size();
+	    if (excludes == NULL) {
+		excludes = new Exclude[numExcludes];
+	    } else if (numExcludes >= excludeCapacity) {
+		Exclude* tempExcludes = excludes;
+		excludes = new Exclude[numExcludes];
+		for (int i = 0; i < oldNumExcludes; i++)
+		    excludes[i] = tempExcludes[i];
+		delete [] tempExcludes;
+	    }
+	    for (int i = oldNumExcludes; i < numExcludes; i++)
+		excludes[i] = ex[i - oldNumExcludes];
+	}
+
+	printf("Built %d exclusions.\n",numExcludes);		
 	buildExcludeMap();
 
 	// Count number of particles of each type
@@ -439,7 +504,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 	for (int i = 0; i < numParts; ++i) {
 		numPartsOfType[i] = 0;
 	}
-	for (int i = 0; i < num; ++i) {
+	for (int i = 0; i < num+num_rb_attached_particles; ++i) {
 		++numPartsOfType[type[i]];
 	}
 
@@ -594,7 +659,17 @@ Configuration::~Configuration() {
 }
 
 void Configuration::copyToCUDA() {
-	printf("Copying particle data to GPU %d\n", GPUManager::current());
+    printf("Copying particle grids to GPU %d\n", GPUManager::current());
+    for (const auto& pair : part_grid_dictionary)
+    {
+	// Copy PMF
+	const BaseGrid& g = pair.second;
+	BaseGrid *g_d = g.copy_to_cuda();
+	part_grid_dictionary_d.insert({pair.first, g_d});
+	// printf("Assigning grid for %s to %p (originally %p)\n", pair.first.c_str(), (void *) part_grid_dictionary_d[pair.first], (void *) g_d);
+    }
+
+    printf("Copying particle data to GPU %d\n", GPUManager::current());
 
 	BrownianParticleType **part_addr = new BrownianParticleType*[numParts];
 
@@ -604,11 +679,31 @@ void Configuration::copyToCUDA() {
 	
 	for (int i = 0; i < numParts; i++) 
         {
-	        BaseGrid *pmf = NULL, *diffusionGrid = NULL;
 		BrownianParticleType *b = new BrownianParticleType(part[i]);
-		// Copy PMF
+		// Copy PMF pointers
 		if (part[i].pmf != NULL) 
                 {
+		    {
+			BaseGrid** tmp_d = new BaseGrid*[part[i].numPartGridFiles];
+			BaseGrid** tmp   = new BaseGrid*[part[i].numPartGridFiles];
+			for(int j = 0; j < part[i].numPartGridFiles; ++j) {
+			    // printf("Retrieving grid for %s (at %p)\n", partGridFile[i][j].val(), (void *) part_grid_dictionary_d[std::string(partGridFile[i][j])]);
+			    tmp[j] = part_grid_dictionary_d[std::string(partGridFile[i][j])];
+			}
+			gpuErrchk(cudaMalloc(&tmp_d, sizeof(BaseGrid*)*part[i].numPartGridFiles));
+			gpuErrchk(cudaMemcpy(tmp_d, tmp, sizeof(BaseGrid*)*part[i].numPartGridFiles,
+					     cudaMemcpyHostToDevice));
+			b->pmf = tmp_d;
+		    }
+
+		    {
+			float *tmp;
+			gpuErrchk(cudaMalloc(&tmp, sizeof(float)*part[i].numPartGridFiles));
+			gpuErrchk(cudaMemcpy(tmp, part[i].pmf_scale, sizeof(float)*part[i].numPartGridFiles,
+					     cudaMemcpyHostToDevice));
+			b->pmf_scale = tmp;
+		    }
+
 		    {
 			float *tmp;
 			gpuErrchk(cudaMalloc(&tmp, sizeof(float)*part[i].numPartGridFiles));
@@ -624,40 +719,17 @@ void Configuration::copyToCUDA() {
 			gpuErrchk(cudaMemcpy(tmp, part[i].pmf_boundary_conditions, s, cudaMemcpyHostToDevice));
 			b->pmf_boundary_conditions = tmp;
 		    }
-
-		    gpuErrchk(cudaMalloc(&pmf, sizeof(BaseGrid)*part[i].numPartGridFiles));
-                    for(int j = 0; j < part[i].numPartGridFiles; ++j)
-                    { 
-                        float *val = NULL;
-		        size_t sz = sizeof(float) * part[i].pmf[j].getSize();
-		        //gpuErrchk(cudaMalloc(pmf, sizeof(BaseGrid)));
-		        gpuErrchk(cudaMalloc(&val, sz));
-		        gpuErrchk(cudaMemcpyAsync(val, part[i].pmf[j].val, sz, cudaMemcpyHostToDevice));
-		        BaseGrid *pmf_h = new BaseGrid(part[i].pmf[j]);
-	                pmf_h->val = val;
-		        gpuErrchk(cudaMemcpy(pmf+j, pmf_h, sizeof(BaseGrid), cudaMemcpyHostToDevice));
-		        pmf_h->val = NULL;
-                    }
                     
 		}
-		b->pmf = pmf;
 
 		// Copy the diffusion grid
 		if (part[i].diffusionGrid != NULL) {
-			float *val = NULL;
-			size_t sz = sizeof(float) * part[i].diffusionGrid->getSize();
-		  BaseGrid *diffusionGrid_h = new BaseGrid(*part[i].diffusionGrid);
-		  gpuErrchk(cudaMalloc(&diffusionGrid, sizeof(BaseGrid)));
-		  gpuErrchk(cudaMalloc(&val, sz));
-			diffusionGrid_h->val = val;
-			gpuErrchk(cudaMemcpyAsync(diffusionGrid, diffusionGrid_h, sizeof(BaseGrid),
-					cudaMemcpyHostToDevice));
-		  gpuErrchk(cudaMemcpy(val, part[i].diffusionGrid->val, sz, cudaMemcpyHostToDevice));
-			diffusionGrid_h->val = NULL;
+		    b->diffusionGrid = part[i].diffusionGrid->copy_to_cuda();
+		} else {
+		    b->diffusionGrid = NULL;
 		}
 		
 		//b->pmf = pmf;
-		b->diffusionGrid = diffusionGrid;
 		gpuErrchk(cudaMalloc(&part_addr[i], sizeof(BrownianParticleType)));
 		gpuErrchk(cudaMemcpyAsync(part_addr[i], b, sizeof(BrownianParticleType),
 				cudaMemcpyHostToDevice));
@@ -677,7 +749,7 @@ void Configuration::copyToCUDA() {
 	gpuErrchk(cudaMalloc(&sys_d, sizeof(BaseGrid)));
 	gpuErrchk(cudaMemcpyAsync(sys_d, sys, sizeof(BaseGrid), cudaMemcpyHostToDevice));
 	/*gpuErrchk(cudaMalloc(&type_d, sizeof(int) * num * simNum));
-	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * num * simNum, cudaMemcpyHostToDevice));
+	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int+num_rb_attached_particles) * num * simNum, cudaMemcpyHostToDevice));
 	
 	if (numBonds > 0) {
 		// bonds_d
@@ -696,7 +768,7 @@ void Configuration::copyToCUDA() {
 				cudaMemcpyHostToDevice));
 		
 		// excludeMap_d
-		gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * num));
+		gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * (num));
 		gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * num,
 				cudaMemcpyHostToDevice));
 	}
@@ -1115,6 +1187,8 @@ int Configuration::readParameters(const char * config_file) {
 		else if (param == String("rotDamping"))
 			rigidBody[currRB].rotDamping = stringToVector3( value );
 
+		else if (param == String("attachedParticles"))
+			rigidBody[currRB].append_attached_particle_file(value);
 		else if (param == String("densityGrid"))
 			rigidBody[currRB].addDensityGrid(value);
 		else if (param == String("potentialGrid"))
@@ -1135,6 +1209,8 @@ int Configuration::readParameters(const char * config_file) {
                         rigidBody[currRB].initAngularMomentum = stringToVector3(value);
 		else if (param == String("inputRBCoordinates"))
 			inputRBCoordinates = value;
+		else if (param == String("restartRBCoordinates"))
+		        restartRBCoordinates = value;
 		
 		// COMMON
 		else if (param == String("num")) {
@@ -1271,6 +1347,7 @@ void Configuration::readAtoms() {
 		for (int i = 0; i < numParts; i++)
 			if (part[i].num == 0)
 				found = false;
+		// assert(false); // TODO probably relax constraint that particle must be found; could just be in RB
 		if (!found) {
 			printf("ERROR: Number of particles not specified in config file.\n");
 			exit(1);
@@ -1450,9 +1527,11 @@ void Configuration::readGroups() {
 		std::vector<int> tmp;
 		for (int i=1; i < numTokens; ++i) {
 		    const int ai = atoi(tokenList[i].val());
-		    if (ai >= num) {
+		    if (ai >= num+num_rb_attached_particles) {
 			printf("Error: Attempted to include invalid particle in group: %s\n", line);
 			exit(-1);
+		    } else if (ai >= num) {
+			printf("WARNING: including RB particles in group with line: %s\n", line);
 		    }
 		    tmp.push_back( ai );
 		}
@@ -1510,7 +1589,8 @@ void Configuration::readBonds() {
 			continue;
 		}
 
-		if (ind1 < 0 || ind1 >= num+numGroupSites || ind2 < 0 || ind2 >= num+numGroupSites) {
+		if (ind1 < 0 || ind1 >= num+num_rb_attached_particles+numGroupSites ||
+		    ind2 < 0 || ind2 >= num+num_rb_attached_particles+numGroupSites) {
 			printf("ERROR: Bond file line '%s' includes invalid index\n", line);
 			exit(1);
 		}
@@ -1558,8 +1638,8 @@ void Configuration::readBonds() {
 	 * bondMap[i].x is the index in the bonds array where the ith particle's bonds begin
 	 * bondMap[i].y is the index in the bonds array where the ith particle's bonds end
 	 */
-	bondMap = new int2[num+numGroupSites];
-	for (int i = 0; i < num+numGroupSites; i++) {	
+	bondMap = new int2[num+num_rb_attached_particles+numGroupSites];
+	for (int i = 0; i < num+num_rb_attached_particles+numGroupSites; i++) {
 		bondMap[i].x = -1;
 		bondMap[i].y = -1;
 	}
@@ -1617,8 +1697,8 @@ void Configuration::readExcludes()
 	}
 }
 void Configuration::addExclusion(int ind1, int ind2) {
-    if (ind1 >= num || ind2 >= num) {
-	printf("WARNING: Attempted to add an exclusion for an out-of-range particle index (%d or %d >= %d).\n", ind1, ind2, num);
+    if (ind1 >= num+num_rb_attached_particles || ind2 >= num+num_rb_attached_particles) {
+	printf("WARNING: Attempted to add an exclusion for an out-of-range particle index (%d or %d >= %d).\n", ind1, ind2, num+num_rb_attached_particles);
 	return;
     }
 		
@@ -1661,8 +1741,8 @@ void Configuration::buildExcludeMap() {
      * excludeMap[i].x is the index in the excludes array where the ith particle's excludes begin
      * excludeMap[i].y is the index in the excludes array where the ith particle's excludes end
      */
-    excludeMap = new int2[num];
-    for (int i = 0; i < num; i++) {	
+    excludeMap = new int2[num+num_rb_attached_particles];
+    for (int i = 0; i < num+num_rb_attached_particles; i++) {
 	excludeMap[i].x = -1;
 	excludeMap[i].y = -1;
     }
@@ -1671,7 +1751,7 @@ void Configuration::buildExcludeMap() {
     for (int i = 0; i < numExcludes; i++) {
 	if (excludes[i].ind1 != currPart) {
 	    currPart = excludes[i].ind1;
-	    assert(currPart < num);
+	    assert(currPart < num+num_rb_attached_particles);
 	    excludeMap[currPart].x = i;
 	    if (lastPart >= 0)
 		excludeMap[lastPart].y = i;
@@ -1720,7 +1800,7 @@ void Configuration::readAngles() {
 		int ind3 = atoi(tokenList[3].val());
 		String file_name = tokenList[4];
 		//printf("file_name %s\n", file_name.val());
-		if (ind1 >= num+numGroupSites or ind2 >= num+numGroupSites or ind3 >= num+numGroupSites)
+		if (ind1 >= num+num_rb_attached_particles+numGroupSites or ind2 >= num+num_rb_attached_particles+numGroupSites or ind3 >= num+num_rb_attached_particles+numGroupSites)
 			continue;
 
 		if (numAngles >= capacity) {
@@ -1781,8 +1861,10 @@ void Configuration::readDihedrals() {
 		int ind4 = atoi(tokenList[4].val());
 		String file_name = tokenList[5];
 		//printf("file_name %s\n", file_name.val());
-		if (ind1 >= num+numGroupSites or ind2 >= num+numGroupSites
-				or ind3 >= num+numGroupSites or ind4 >= num+numGroupSites)
+		if (ind1 >= num+num_rb_attached_particles+numGroupSites or
+		    ind2 >= num+num_rb_attached_particles+numGroupSites or
+		    ind3 >= num+num_rb_attached_particles+numGroupSites or
+		    ind4 >= num+num_rb_attached_particles+numGroupSites)
 			continue;
 
 		if (numDihedrals >= capacity) {
@@ -1841,7 +1923,7 @@ void Configuration::readRestraints() {
 		float y0 = (float) strtod(tokenList[4].val(), NULL);
 		float z0 = (float) strtod(tokenList[5].val(), NULL);
 
-		if (id >= num+numGroupSites) continue;
+		if (id >= num + num_rb_attached_particles + numGroupSites) continue;
 
 		if (numRestraints >= capacity) {
 			Restraint* temp = restraints;
diff --git a/src/Configuration.h b/src/Configuration.h
index debf4b9f6ea4d0ce8eaa9c52d70d6c57f1efdcbb..6199dd8b24bc019dd7044b2a3a6031312142f13d 100644
--- a/src/Configuration.h
+++ b/src/Configuration.h
@@ -11,6 +11,7 @@
 
 #include <algorithm> // sort
 #include <vector>
+#include <map>
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -64,6 +65,7 @@ class Configuration {
 	void readDihedrals();
 	void readRestraints();
 
+
 	bool readTableFile(const String& value, int currTab);
 	bool readBondFile(const String& value, int currBond);
 	bool readAngleFile(const String& value, int currAngle);
@@ -88,6 +90,15 @@ public:
 	Configuration(const char * config_file, int simNum = 0, bool debug=false);
 	~Configuration();
 
+    int find_particle_type(const char* s) const {
+	for (int j = 0; j < numParts; j++) {
+	    // printf("Searching particle %d (%s) =? %s\n", j, part[j].name.val(), s);
+	    if (strcmp(s,part[j].name.val()) == 0)
+		return j;
+	}
+	return -1;
+    }
+
 	void copyToCUDA();
 
 	// Output variables
@@ -121,7 +132,8 @@ public:
 	Bond* bonds;
 	int numCap; // max number of particles
 	int num; // current number of particles
-	Vector3* pos; //  position of each particle
+    int num_rb_attached_particles;
+        Vector3* pos; //  position of each particle
         Vector3* momentum; //momentum of each brownian particles Han-Yi Chou
         Vector3  COM_Velocity; //center of mass velocity Han-Yi Chou
 	int* type; // type of each particle
@@ -148,6 +160,7 @@ public:
 	String inputCoordinates;
         String inputMomentum; //Han-Yi Chou
 	String inputRBCoordinates;
+	String restartRBCoordinates;
 	int copyReplicaCoordinates;
 	String restartCoordinates;
         String restartMomentum; //Han-Yi Chou
@@ -208,6 +221,8 @@ public:
 	//float* partGridFileScale;
 	float **partGridFileScale;
         //int *numPartGridFiles;
+    std::map<std::string,BaseGrid> part_grid_dictionary;
+    std::map<std::string,BaseGrid*> part_grid_dictionary_d;
 	std::vector< std::vector<String> > partRigidBodyGrid;
 	String* partDiffusionGridFile;
 	String* partForceXGridFile;
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index a8576e54ab0e07474cdd18db2c6e138f27caf5da..fe55aad463b5b27ea6f095329ea14ed3917d498e 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -97,10 +97,11 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	bonds = c.bonds;
 	numCap = c.numCap;                      // max number of particles
 	num = c.num;                            // current number of particles
+	num_rb_attached_particles = c.num_rb_attached_particles;
 	numGroupSites = c.numGroupSites;
 
 	// Allocate arrays of positions, types and serial numbers
-	pos    = new Vector3[(num+numGroupSites) * numReplicas];  // [HOST] array of particles' positions.
+	pos    = new Vector3[(num+num_rb_attached_particles+numGroupSites) * numReplicas];  // [HOST] array of particles' positions.
         // Allocate arrays of momentum Han-Yi Chou
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
         {
@@ -112,8 +113,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
         //for debug
         //force = new Vector3[num * numReplicas];
 
-	type   = new     int[num * numReplicas];  // [HOST] array of particles' types.
-	serial = new     int[num * numReplicas];  // [HOST] array of particles' serial numbers.
+	type   = new     int[(num+num_rb_attached_particles) * numReplicas];  // [HOST] array of particles' types.
+	serial = new     int[(num+num_rb_attached_particles) * numReplicas];  // [HOST] array of particles' serial numbers.
 
 	// Allocate things for rigid body
 	// RBC = RigidBodyController(c);
@@ -127,8 +128,10 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	
 	// Replicate identical initial conditions across all replicas
 	for (int r = 0; r < numReplicas; ++r) {
-	  std::copy(c.type, c.type + num, type + r*num);
-	  std::copy(c.serial, c.serial + num, serial + r*num);
+	    std::copy(c.type, c.type + num+num_rb_attached_particles,
+		      type + r*(num+num_rb_attached_particles));
+	    std::copy(c.serial, c.serial + num + num_rb_attached_particles,
+		      serial + r*(num+num_rb_attached_particles));
 	  if (c.copyReplicaCoordinates > 0)
 	    std::copy(c.pos, c.pos + num, pos + r*num);
 	}
@@ -307,8 +310,12 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 
 	auto _get_index = [this](int idx, int replica) {
 	    // Convenient lambda function to deal with increasingly complicated indexing
-	    auto num = this->num; auto numReplicas = this->numReplicas; auto numGroupSites = this->numGroupSites;
-	    idx = (idx < num) ? idx + replica*num : (idx-num) + numReplicas*num + replica * numGroupSites;
+	    auto num = this->num;
+	    auto numReplicas = this->numReplicas;
+	    auto num_rb_attached_particles = this->num_rb_attached_particles;
+	    auto numGroupSites = this->numGroupSites;
+	    idx = (idx < num+num_rb_attached_particles) ? idx + replica*(num+num_rb_attached_particles)
+		: (idx-num-num_rb_attached_particles) + numReplicas*(num+num_rb_attached_particles) + replica * numGroupSites;
 	    return idx;
 	};
 
@@ -367,7 +374,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	}
 	internal->copyBondedListsToGPU(bondList,angleList,dihedralList,dihedralPotList);
 	
-	forceInternal = new Vector3[(num+numGroupSites)*numReplicas];
+	forceInternal = new Vector3[(num+num_rb_attached_particles+numGroupSites)*numReplicas];
 	if (fullLongRange != 0)
 	    printf("No cell decomposition created.\n");
 
@@ -488,20 +495,20 @@ GrandBrownTown::~GrandBrownTown() {
 	
 		
 }
-//temporary test for Nose-Hoover Langevin dynamics
+
 //Nose Hoover is now implement for particles.
-void GrandBrownTown::RunNoseHooverLangevin()
+void GrandBrownTown::run()
 {
-    //comment this out because this is the origin points Han-Yi Chou
+
     // Open the files for recording ionic currents
     for (int repID = 0; repID < numReplicas; ++repID) 
     {
-
-        writers[repID]->newFile(pos + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
+        writers[repID]->newFile(pos + repID*(num+num_rb_attached_particles), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             momentum_writers[repID]->newFile((momentum + repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
         //random_writers[repID]->newFile(random + (repID * num), name, 0.0f, num);
     }
+
     // Initialize timers (util.*)
     wkf_timerhandle timer0, timerS;
     timer0 = wkf_timer_create();
@@ -569,16 +576,16 @@ void GrandBrownTown::RunNoseHooverLangevin()
         #endif
         #pragma omp parallel for
         for(int i = 0; i < numReplicas; ++i)
-            RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+            RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+conf.num_rb_attached_particles), sys_d);
         gpuErrchk(cudaDeviceSynchronize());
     }
 
     float t; // simulation time
 
-    int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
+    int numBlocks = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + (((num+num_rb_attached_particles) * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
     int tl = temperatureGridFile.length();
     Vector3 *force_d;
-    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+numGroupSites) * numReplicas));
+    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+num_rb_attached_particles+numGroupSites) * numReplicas));
 
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
     for (int i=0; i< gpuman.gpus.size(); ++i) {
@@ -599,11 +606,23 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    internal->clear_force();
 	    internal->clear_energy();
 	    const std::vector<Vector3*>& _pos = internal->getPos_d();
-	    if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num, numGroupSites, numReplicas);
+
+	    if (num_rb_attached_particles > 0) {
+		#pragma omp parallel for
+		for(int i = 0; i < numReplicas; ++i) {
+		    RBC[i]->update_attached_particle_positions(
+			internal->getPos_d()[0]+num+i*(num+num_rb_attached_particles),
+			internal->getForceInternal_d()[0]+num+i*(num+num_rb_attached_particles),
+			internal->getEnergy()+num+i*(num+num_rb_attached_particles),
+			sys_d, num, num_rb_attached_particles, numReplicas);
+		}
+	    }
+
+	    if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas);
 
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
-		gpuman.nccl_broadcast(0, _pos, _pos, (num+numGroupSites)*numReplicas, -1);
+		gpuman.nccl_broadcast(0, _pos, _pos, (num+num_rb_attached_particles+numGroupSites)*numReplicas, -1);
 	    }
 	    #endif
 	    gpuman.sync();
@@ -633,7 +652,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 #endif
                                 #pragma omp parallel for
                                 for(int i = 0; i < numReplicas; ++i)
-                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                             }
                             internal -> computeTabulated(get_energy);
                             break;
@@ -657,7 +676,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 #endif
                                 #pragma omp parallel for
                                 for(int i = 0; i < numReplicas; ++i)
-                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                             }
                             internal->compute(get_energy);
                             break;
@@ -676,13 +695,32 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     }
                 }
             }//if inter-particle force
+
+	    if (get_energy) {
+		compute_position_dependent_force_for_rb_attached_particles
+		    <<< numBlocks, NUM_THREADS >>> (
+			internal -> getPos_d()[0], internal -> getForceInternal_d()[0],
+			internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
+	    } else {
+		compute_position_dependent_force_for_rb_attached_particles
+		    <<< numBlocks, NUM_THREADS >>> (
+			internal -> getPos_d()[0],
+			internal -> getForceInternal_d()[0], internal -> getEnergy(),
+			internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
+	    }
+
+
             #ifdef _OPENMP
             omp_set_num_threads(4);
             #endif
             #pragma omp parallel for
             for(int i = 0; i < numReplicas; ++i)
-                RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
-                                       RigidBodyInterpolationType, sys, sys_d);
+                RBC[i]->updateForces(internal->getPos_d()[0]+i*(num+num_rb_attached_particles),
+				     internal->getForceInternal_d()[0]+i*(num+num_rb_attached_particles),
+				     s,
+				     internal->getEnergy()+i*(num+num_rb_attached_particles),
+				     get_energy,
+				     RigidBodyInterpolationType, sys, sys_d, num, num_rb_attached_particles);
             if(rigidbody_dynamic == String("Langevin"))
             {
                 #ifdef _OPENMP
@@ -698,28 +736,29 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-		gpuman.nccl_reduce(0, _f, _f, (num+numGroupSites)*numReplicas, -1);
+		gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles+numGroupSites)*numReplicas, -1);
 	    }
 	    #endif
 
-	    if (numGroupSites > 0) distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    if (numGroupSites > 0) distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 
         }//if step == 1
 
 	internal->clear_energy();
 	gpuman.sync();
 
+
         if(particle_dynamic == String("Langevin"))
-            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
+            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
         else if(particle_dynamic == String("NoseHooverLangevin"))
             //kernel for Nose-Hoover Langevin dynamic
             updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), 
-            internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, 
+            internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d,
             randoGen_d, numReplicas, ParticleInterpolationType);
         ////For Brownian motion
         else
             updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getForceInternal_d()[0], internal -> getType_d(),
-                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, 
+                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas,
                                                        internal->getEnergy(), get_energy, ParticleInterpolationType);
 
         if(rigidbody_dynamic == String("Langevin"))
@@ -748,10 +787,21 @@ void GrandBrownTown::RunNoseHooverLangevin()
             }
         }
 
+	if (num_rb_attached_particles > 0) {
+	    #pragma omp parallel for
+	    for(int i = 0; i < numReplicas; ++i) {
+		RBC[i]->update_attached_particle_positions(
+		    internal->getPos_d()[0]+num+i*(num+num_rb_attached_particles),
+		    internal->getForceInternal_d()[0]+num+i*(num+num_rb_attached_particles),
+		    internal->getEnergy()+num+i*(num+num_rb_attached_particles),
+		    sys_d, num, num_rb_attached_particles, numReplicas);
+	    }
+	}
+	
         if (s % outputPeriod == 0) {
             // Copy particle positions back to CPU
 	    gpuErrchk(cudaDeviceSynchronize());
-            gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+            gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * (num+num_rb_attached_particles) * numReplicas, cudaMemcpyDeviceToHost));
 	}
         if (imd_on && clientsock && s % outputPeriod == 0)
         {
@@ -830,9 +880,10 @@ void GrandBrownTown::RunNoseHooverLangevin()
         for(int i = 0; i < numReplicas; ++i) 
             RBC[i]->clearForceAndTorque();
 
+	
 	if (numGroupSites > 0) {
 	    gpuman.sync();
-	    updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas);
 	    gpuman.sync();
 	}
 
@@ -844,7 +895,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _p = internal->getPos_d();
 		nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream();
-		gpuman.nccl_broadcast(0, _p, _p, (num+numGroupSites)*numReplicas, nccl_broadcast_streams);
+		gpuman.nccl_broadcast(0, _p, _p, (num+num_rb_attached_particles+numGroupSites)*numReplicas, nccl_broadcast_streams);
 	    }
 	    #endif
     	}
@@ -865,13 +916,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
                             #endif
                             #pragma omp parallel for
                             for(int i = 0; i < numReplicas; ++i)
-                                RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                         }
                         internal -> computeTabulated(get_energy);
 			#ifdef USE_NCCL
 			if (gpuman.gpus.size() > 1) {
 			    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-			    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
+			    gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles)*numReplicas, -1);
 			}
 			#endif
                         break;
@@ -912,28 +963,43 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 }
             }
         }
+
+	if (get_energy) {
+	    compute_position_dependent_force_for_rb_attached_particles
+		<<< numBlocks, NUM_THREADS >>> (
+		    internal -> getPos_d()[0], internal -> getForceInternal_d()[0],
+		    internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
+	} else {
+	    compute_position_dependent_force_for_rb_attached_particles
+		<<< numBlocks, NUM_THREADS >>> (
+		    internal -> getPos_d()[0],
+		    internal -> getForceInternal_d()[0], internal -> getEnergy(),
+		    internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
+	}
+
+
         //compute the force for rigid bodies
         #ifdef _OPENMP
         omp_set_num_threads(4);
         #endif
         #pragma omp parallel for
         for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition
-            RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
-                                 RigidBodyInterpolationType, sys, sys_d);
+            RBC[i]->updateForces((internal->getPos_d()[0])+i*(num+num_rb_attached_particles), (internal->getForceInternal_d()[0])+i*(num+num_rb_attached_particles), s, (internal->getEnergy())+i*(num+num_rb_attached_particles), get_energy,
+				 RigidBodyInterpolationType, sys, sys_d, num, num_rb_attached_particles);
 
 	if (numGroupSites > 0) {
 	    gpuman.sync();
 	    // if ((s%100) == 0) {
-	    distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 	// } else {
-	//     distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	//     distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 	// }
 	    gpuman.sync();
 	}
 
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), internal -> getForceInternal_d()[0], 
-            internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, 
+            internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy,
             ParticleInterpolationType);
             //gpuErrchk(cudaDeviceSynchronize());
 
@@ -957,7 +1023,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
         {
             if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             {
-                gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+                gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * (num) * numReplicas, cudaMemcpyDeviceToHost));
             }
             t = s*timestep;
             // Loop over all replicas
@@ -968,12 +1034,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     updateNameList(); // no need for it here if particles stay the same
 
                 // Write the trajectory.
-                writers[repID]->append(pos + (repID*num), name, serial, t, num);
+                writers[repID]->append(pos + repID*(num+num_rb_attached_particles), name, serial, t, num);
 
                 if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
                 {
-                    momentum_writers[repID]->append(momentum + (repID * num), name, serial, t, num);
-                    //force_writers[repID]->append(force + (repID * num), name, serial, t, num);
+                    momentum_writers[repID]->append(momentum + repID * (num+num_rb_attached_particles), name, serial, t, num);
+                    //force_writers[repID]->append(force + repID * (num+num_rb_attached_particles), name, serial, t, num);
                 }
             }
             // TODO: Currently, not compatible with replicas. Needs a fix.
@@ -1005,7 +1071,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 float e = 0.f;
                 float V = 0.f;
                 thrust::device_ptr<float> en_d(internal->getEnergy());
-                V = (thrust::reduce(en_d, en_d+num*numReplicas)) / numReplicas;
+                V = (thrust::reduce(en_d, en_d+(num+num_rb_attached_particles)*numReplicas)) / numReplicas;
                 if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
                 {
                     gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
@@ -1112,511 +1178,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
      gpuErrchk(cudaFree(force_d));
 } // GrandBrownTown::run()
 
-// Run the Brownian Dynamics steps.
-void GrandBrownTown::run() {
-
-        RunNoseHooverLangevin();
-#if 0
-	printf("\n\n");
-	Vector3 runningNetForce(0.0f);
-	
-	// Open the files for recording ionic currents
-	for (int repID = 0; repID < numReplicas; ++repID) {
-		writers[repID]->newFile(pos + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
-	}
-        //Han-Yi Chou
-        if(particle_dynamic == String("Langevin"))
-        {
-            for (int repID = 0; repID < numReplicas; ++repID)
-            {
-                momentum_writers[repID]->newFile(momentum + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
-                //force_writers[repID]->newFile(force + (repID * num), name, 0.0f, num);
-            }
-        }
-
-	// Save (remember) particle positions for later (analysis)
-	remember(0.0f);
-
-	// Initialize timers (util.*)
-	wkf_timerhandle timer0, timerS;
-	timer0 = wkf_timer_create();
-	timerS = wkf_timer_create();
-
-	copyToCUDA();
-        if(particle_dynamic == String("Langevin"))
-	    internal -> copyToCUDA(forceInternal, pos,momentum);
-        else
-            internal -> copyToCUDA(forceInternal, pos);
-
-	// IMD Stuff
-	void* sock = NULL;
-	void* clientsock = NULL;
-	int length;
-	if (imd_on) {
-		printf("Setting up incoming socket\n");
-		vmdsock_init();
-		sock = vmdsock_create();
-		clientsock = NULL;
-		vmdsock_bind(sock, imd_port);
-
-		printf("Waiting for IMD connection on port %d...\n", imd_port);
-		vmdsock_listen(sock);
-		while (!clientsock) {
-			if (vmdsock_selread(sock, 0) > 0) {
-				clientsock = vmdsock_accept(sock);
-				if (imd_handshake(clientsock))
-					clientsock = NULL;
-			}
-		}
-		sleep(1);
-		if (vmdsock_selread(clientsock, 0) != 1 ||
-				imd_recv_header(clientsock, &length) != IMD_GO) {
-			clientsock = NULL;
-		}
-		imdForces = new Vector3[num*numReplicas];
-		for (size_t i = 0; i < num; ++i) // clear old forces
-			imdForces[i] = Vector3(0.0f);
-
-	} // endif (imd_on)
-
-	// Start timers
-	wkf_timer_start(timer0);
-	wkf_timer_start(timerS);
-
-	// We haven't done any steps yet.
-	// Do decomposition if we have to
-	if (fullLongRange == 0)
-	{
-		// cudaSetDevice(0);
-		internal->decompose();
-		gpuErrchk(cudaDeviceSynchronize());
-		RBC.updateParticleLists( internal->getPos_d() );
-	}
-
-	float t; // simulation time
-
-        int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
-        int tl = temperatureGridFile.length();
-        Vector3 *force_d;
-        gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
-
-	printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
-
-	// Main loop over Brownian dynamics steps
-	for (long int s = 1; s < steps; s++) 
-        {
-            // Compute the internal forces. Only calculate the energy when we are about to output.
-	    bool get_energy = ((s % outputEnergyPeriod) == 0);
-            //At the very first time step, the force is computed
-            if(s == 1) 
-            {
-                // 'interparticleForce' - determines whether particles interact with each other
-                gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
-                RBC.clearForceAndTorque(); //Han-Yi Chou
-
-		if (interparticleForce) 
-                {
-                    //gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
-                    //RBC.clearForceAndTorque(); //Han-Yi Chou
-
-	            // 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas
-		    if (tabulatedPotential) 
-                    {
-		        // Using tabulated potentials
-		        // 'fullLongRange' - determines whether 'cutoff' is used
-		        // 0 - use cutoff (cell decomposition) [ N*log(N) ]
-		        // 1 - do not use cutoff [ N^2 ]
-		        switch (fullLongRange) 
-                        {
-		            case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
-                            //I want to replace this
-                            // by checking how far particles move to decide whether the pair list should be updated
-                            //if(NeedUpdatePairList()) //ToDo, Han-Yi Chou
-                            //{
-                            //internal-> decompose();
-                            //  RBC.updateParticleLists( internal->getPos_d() );
-                            //}
-			        if (s % decompPeriod == 0)
-		                {
-		                    // cudaSetDevice(0);
-				    internal -> decompose();
-				    RBC.updateParticleLists( internal->getPos_d() );
-			        }
-						
-			        //MLog: added Bond* bondList to the list of passed in variables.
-			        /*energy = internal->computeTabulated(forceInternal_d, pos_d, type_d, bonds_d, bondMap_d, excludes_d, excludeMap_d,	angles_d, dihedrals_d, get_energy);*/
-		               // energy = internal -> computeTabulated(get_energy);
-			        internal -> computeTabulated(get_energy);
-			        break;
-			    default: 
-                               // [ N^2 ] interactions, no cutoff | decompositions
-			        internal->computeTabulatedFull(get_energy);
-			        break;
-		        }
-                    } 
-                    else 
-                    {
-		        // Not using tabulated potentials.
-		        switch (fullLongRange) 
-                        {
-                            case 0: // Use cutoff | cell decomposition.
-		               if (s % decompPeriod == 0)
-		               {
-		                   // cudaSetDevice(0);
-				   internal->decompose();
-				   RBC.updateParticleLists( internal->getPos_d() );
-			       }
-			       internal->compute(get_energy);
-			       break;
-
-			    case 1: // Do not use cutoff
-		                internal->computeFull(get_energy);
-				break;
-
-			    case 2: // Compute only softcore forces.
-		                internal->computeSoftcoreFull(get_energy);
-				break;
-
-		            case 3: // Compute only electrostatic forces.
-				internal->computeElecFull(get_energy);
-				break;
-		        }
-	            }
-		}//if inter-particle force
-
-	        gpuErrchk(cudaDeviceSynchronize());
-		RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, RigidBodyInterpolationType);
-                if(rigidbody_dynamic == String("Langevin"))
-                {
-                    RBC.SetRandomTorques();
-                    RBC.AddLangevin();
-                }
-		gpuErrchk(cudaDeviceSynchronize());
-            }//if step == 1
-
-          
-	    //Han-Yi Chou
-            //update the rigid body positions and orientation
-            //So far only brownian dynamics is used
-            //RBC.integrate(sys, s);
-
-	    if(particle_dynamic == String("Langevin"))
-                updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
-            //For Brownian motion
-            else
-                updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), 
-                                                           part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
-
-
-            if(rigidbody_dynamic == String("Langevin"))
-            {
-                RBC.integrateDLM(sys,0);
-                RBC.integrateDLM(sys,1);
-            }
-            else
-                RBC.integrate(sys,s);
-
-
-            if (s % outputPeriod == 0) {
-                // Copy particle positions back to CPU
-		gpuErrchk(cudaDeviceSynchronize());
-                gpuErrchk(cudaMemcpy(pos, internal ->  getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
-	    }
-
-            //compute again the new force with new positions.
-            //reset the internal force, I hope. Han-Yi Chou
-            //First clear the old force
-            if (imd_on && clientsock && s % outputPeriod == 0) 
-            {
-                float* coords = new float[num*3]; // TODO: move allocation out of run loop
-                int* atomIds = new int[num]; // TODO: move allocation out of run loop
-                int length;
-
-                bool paused = false;
-                while (vmdsock_selread(clientsock, 0) > 0 || paused) 
-                {
-                    switch (imd_recv_header(clientsock, &length)) 
-                    {
-                        case IMD_DISCONNECT:
-                            printf("[IMD] Disconnecting...\n");
-                            imd_disconnect(clientsock);
-                            clientsock = NULL;
-                            sleep(5);
-                            break;
-                        case IMD_KILL:
-                            printf("[IMD] Killing...\n");
-                            imd_disconnect(clientsock);
-                            clientsock = NULL;
-                            steps = s; // Stop the simulation at this step
-                            sleep(5);
-                            break;
-                        case IMD_PAUSE:
-                            paused = !paused;
-                            break;
-                        case IMD_GO:
-                            printf("[IMD] Caught IMD_GO\n");
-                            break;
-                        case IMD_MDCOMM:
-                            for (size_t i = 0; i < num; ++i) // clear old forces
-                                imdForces[i] = Vector3(0.0f);
-
-                            if (imd_recv_mdcomm(clientsock, length, atomIds, coords)) 
-                            {
-                                printf("[IMD] Error receiving forces\n");
-                            } 
-                            else 
-                            {
-                                for (size_t j = 0; j < length; ++j) 
-                                {
-                                    int i = atomIds[j];
-                                    imdForces[i] = Vector3(coords[j*3], coords[j*3+1], coords[j*3+2]) * conf.imdForceScale;
-                                }
-                            }
-                            break;
-                        default:
-                            printf("[IMD] Something weird happened. Disconnecting..\n");
-                            break;
-                    }
-                }
-                if (clientsock) 
-                {
-                    // float* coords = new float[num*3]; // TODO: move allocation out of run loop
-                    for (size_t i = 0; i < num; i++) 
-                    {
-                        const Vector3& p = pos[i];
-                        coords[3*i] = p.x;
-                        coords[3*i+1] = p.y;
-                        coords[3*i+2] = p.z;
-                    }
-                    imd_send_fcoords(clientsock, num, coords);
-                }
-                delete[] coords;
-                delete[] atomIds;
-            }
-            if (imd_on && clientsock) 
-                internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD
-            //else
-            //{ 
-                //int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
-                //MLog: along with calls to internal (ComputeForce class) this function should execute once per GPU.
-                //clearInternalForces<<< numBlocks, NUM_THREADS >>>(internal->getForceInternal_d(), num*numReplicas);
-                //use cudaMemset instead
-                //gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
-                //RBC.clearForceAndTorque();
-            //}
-            //compute the new force for particles
-            RBC.clearForceAndTorque();
-            gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
-            //RBC.clearForceAndTorque();
-
-            if (interparticleForce) 
-            {
-                // 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas
-                if (tabulatedPotential)
-                {
-                    switch (fullLongRange) 
-                    {
-                        case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
-                            if (s % decompPeriod == 0)
-                            {
-                                internal -> decompose();
-                                RBC.updateParticleLists( internal->getPos_d() );
-                            }
-                            internal -> computeTabulated(get_energy);
-                            break;
-                        default: // [ N^2 ] interactions, no cutoff | decompositions
-                            internal->computeTabulatedFull(get_energy);
-                            break;
-                    }
-                } 
-                else 
-                {
-                    // Not using tabulated potentials.
-                    switch (fullLongRange) 
-                    {
-                        case 0: // Use cutoff | cell decomposition.
-                            if (s % decompPeriod == 0)
-                            {
-                               internal->decompose();
-                               RBC.updateParticleLists( internal->getPos_d() );
-                            }
-                            internal->compute(get_energy);
-                            break;
-                        case 1: // Do not use cutoff
-                            internal->computeFull(get_energy);
-                            break;
-
-                        case 2: // Compute only softcore forces.
-                            internal->computeSoftcoreFull(get_energy);
-                            break;
-
-                        case 3: // Compute only electrostatic forces.
-                            internal->computeElecFull(get_energy);
-                            break;
-                    }
-                }
-            }
-
-            //compute the force for rigid bodies
-            RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, RigidBodyInterpolationType);
-            //Han-Yi Chou
-            //For BAOAB, the last update is only to update the momentum
-            // gpuErrchk(cudaDeviceSynchronize());
-
-            if(particle_dynamic == String("Langevin"))
-                LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
-
-           //gpuErrchk(cudaDeviceSynchronize());
-
-           if(rigidbody_dynamic == String("Langevin"))
-           {
-               RBC.SetRandomTorques();
-               RBC.AddLangevin();
-               RBC.integrateDLM(sys,2);
-               RBC.print(s);
-           }
- 
-           if (s % outputPeriod == 0) 
-           {
-                if(particle_dynamic == String("Langevin"))
-                {
-		    // TODO: make async
-                    gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
-                    /*
-                    gpuErrchk(cudaMemcpy(force, force_d, sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
-                    Vector3 f(0.f), p(0.f), r0(0.f);
-                    double total_mass = 0.;
-                    for(int i = 0; i < num * numReplicas; ++i)
-                    {
-                        f = f + force[i];
-                        p = p + momentum[i];
-                        total_mass += part[type[i]].mass;
-                        r0 = r0 + part[type[i]].mass * pos[i];
-                    }
-                    printf("The COM is %f %f %f\n",r0.x / total_mass, r0.y / total_mass, r0.z / total_mass);
-                    printf("The total momentum is %f %f %f\n",p.x, p.y, p.z);
-                    printf("The total force %f %f %f\n",f.x, f.y, f.z);
-                    */
-		    // Output trajectories (to files)
-		}
-                //RBC.print(s);
-                t = s*timestep;
-
-		// Loop over all replicas
-		for (int repID = 0; repID < numReplicas; ++repID) 
-                {
-
-                    if (numberFluct == 1) 
-                        updateNameList(); // no need for it here if particles stay the same
-
-                    // Write the trajectory.
-		    writers[repID]->append(pos + (repID*num), name, serial, t, num);
-
-                    if(particle_dynamic == String("Langevin"))
-                    {
-                        momentum_writers[repID]->append(momentum + (repID * num), name, serial, t, num);
-                        //force_writers[repID]->append(force + (repID * num), name, serial, t, num);
-                    }
-
-	        }
-
-                // TODO: Currently, not compatible with replicas. Needs a fix.
-		if (numberFluct == 1) 
-                    updateReservoirs();
-
-		remember(t);
-            }
-	    // Output energy.
-            if (get_energy) 
-            {
-	        wkf_timer_stop(timerS);
-	        t = s * timestep;
-	        // Simulation progress and statistics.
-	        float percent = (100.0f * s) / steps;
-	        float msPerStep = wkf_timer_time(timerS) * 1000.0f / outputEnergyPeriod;
-	        float nsPerDay = numReplicas * timestep / msPerStep * 864E5f;
-
-	        // Nice thousand separator
-	        setlocale(LC_NUMERIC, "");
-
-	        // Do the output
-	        printf("\rStep %ld [%.2f%% complete | %.3f ms/step | %.3f ns/day]",s, percent, msPerStep, nsPerDay);
-
-	        // Copy positions from GPU to CPU.
-	        gpuErrchk(cudaMemcpy(pos, internal->getPos_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
-                if(particle_dynamic == String("Langevin"))
-                {
-                    //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
-                    gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
-                    float e = KineticEnergy();
-                    printf(" The kinetic energy is %f \n",e*(2.388458509e-1));
-                }
-                if(rigidbody_dynamic == String("Langevin"))
-                {
-                    //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
-                    float e = RotKineticEnergy();
-                    printf(" The Rotational kinetic energy is %f \n",e*(2.388458509e-1));
-                }
-       
-                // Write restart files for each replica.
-                for (int repID = 0; repID < numReplicas; ++repID)
-                    writeRestart(repID);
-                
-	        wkf_timer_start(timerS);
-            } // s % outputEnergyPeriod
-	} // done with all Brownian dynamics steps
-
-	// If IMD is on & our socket is still open.
-	if (imd_on and clientsock) 
-        {
-            if (vmdsock_selread(clientsock, 0) == 1) 
-            {
-	        int length;
-		switch (imd_recv_header(clientsock, &length)) 
-                {
-		    case IMD_DISCONNECT:
-		        printf("\n[IMD] Disconnecting...\n");
-			imd_disconnect(clientsock);
-			clientsock = NULL;
-			sleep(5);
-			break;
-		    case IMD_KILL:
-		        printf("\n[IMD] Killing...\n");
-			imd_disconnect(clientsock);
-			clientsock = NULL;
-			sleep(5);
-			break;
-		    default:
-		        printf("\n[IMD] Something weird happened. Disconnecting..\n");
-			break;
-	        }
-            }
-	}
-	// Stop the main timer.
-	wkf_timer_stop(timer0);
-
-	// Compute performance data.
-	const float elapsed = wkf_timer_time(timer0); // seconds
-	int tot_hrs = (int) std::fmod(elapsed / 3600.0f, 60.0f);
-	int tot_min = (int) std::fmod(elapsed / 60.0f, 60.0f);
-	float tot_sec	= std::fmod(elapsed, 60.0f);
-
-	printf("\nFinal Step: %d\n", (int) steps);
-
-	printf("Total Run Time: ");
-	if (tot_hrs > 0) printf("%dh%dm%.1fs\n", tot_hrs, tot_min, tot_sec);
-	else if (tot_min > 0) printf("%dm%.1fs\n", tot_min, tot_sec);
-	else printf("%.2fs\n", tot_sec);
-
-        gpuErrchk(cudaFree(force_d));
-#endif
-} // GrandBrownTown::run()
 // --------------------------------------------
 // Populate lists of types and serial numbers.
 //
 void GrandBrownTown::populate() {
 	for (int repID = 0; repID < numReplicas; repID++) {
-		const int offset = repID * num;
+	    const int offset = repID * (num+num_rb_attached_particles);
 		int pn = 0;
 		int p = 0;
 		for (int i = 0; i < num; i++) {
@@ -1636,7 +1203,7 @@ void GrandBrownTown::populate() {
 void GrandBrownTown::writeRestart(int repID) const 
 {
     FILE* out   = fopen(restartFiles[repID].c_str(), "w");
-    const int offset = repID * num;
+    const int offset = repID * (num+num_rb_attached_particles);
 
     for (int i = 0; i < num; ++i) 
     {
@@ -1664,14 +1231,14 @@ void GrandBrownTown::writeRestart(int repID) const
 /*the center is defined by the first pmf*/
 void GrandBrownTown::initialCondCen() {
 	for (int i = 0; i < num; i++)
-		pos[i] = part[ type[i] ].pmf->getCenter();
+		pos[i] = part[ type[i] ].pmf[0]->getCenter();
 }
 
 
 // Set random initial positions for all particles and replicas
 void GrandBrownTown::initialCond() {
 	for (int repID = 0; repID < numReplicas; repID++) {
-		const int offset = repID * num;
+	    const int offset = repID * (num+num_rb_attached_particles);
 		for (int i = 0; i < num; i++) {
 			pos[i + offset] = findPos(type[i + offset]);
 		}
@@ -1688,7 +1255,7 @@ Vector3 GrandBrownTown::findPos(int typ) {
 		const float ry = sysDim.y * randoGen->uniform();
 		const float rz = sysDim.z * randoGen->uniform();
 		r = sys->wrap( Vector3(rx, ry, rz) );
-	} while (pt.pmf->interpolatePotential(r) > *pt.meanPmf);
+	} while (pt.pmf[0]->interpolatePotential(r) > *pt.meanPmf);
 	return r;
 }
 
@@ -1701,7 +1268,7 @@ Vector3 GrandBrownTown::findPos(int typ, float minZ) {
 		const float ry = sysDim.y * randoGen->uniform();
 		const float rz = sysDim.z * randoGen->uniform();
 		r = sys->wrap( Vector3(rx, ry, rz) );
-	} while (pt.pmf->interpolatePotential(r) > *pt.meanPmf and fabs(r.z) > minZ);
+	} while (pt.pmf[0]->interpolatePotential(r) > *pt.meanPmf and fabs(r.z) > minZ);
 	return r;
 }
 
@@ -1715,7 +1282,7 @@ float GrandBrownTown::KineticEnergy()
     gpuErrchk(cudaMalloc((void**)&energy, sizeof(float)));
     gpuErrchk(cudaMemset((void*)energy,0,sizeof(float)));
 
-    BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, numReplicas*num);
+    BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, num, num_rb_attached_particles, numReplicas);
     gpuErrchk(cudaDeviceSynchronize());
 
     Reduction<64><<<dim3(1),dim3(64)>>>(vec_red, energy, 512);
@@ -1763,7 +1330,7 @@ void GrandBrownTown::init_cuda_group_sites()
     }
 
     // Create GPU-friendly data structure
-    // TODO make this work for replicas
+    assert(numReplicas == 1);    // TODO make this work for replicas
     int* tmp = new int[numGroupSites+1+num_particles];
     num_particles = 0;
     int i = 0;
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index 4a2211c6c407f16871f93ab7aa9e33f5defa9a2b..44ffe6f82c1c3dde464d8039ac6fef26dd97f84f 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -14,86 +14,50 @@ __device__
 Vector3 step(Vector3& r0, float kTlocal, Vector3 force, float diffusion, Vector3 diffGrad,
 						 float timestep, BaseGrid *sys, Random *randoGen, int num);
 
-//update both position and momentum for Langevin dynamics
-//Han-Yi Chou
-#if 0
-__global__ void updateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal, 
-                             int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField, 
-                             int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas) 
+inline __device__
+ForceEnergy compute_position_dependent_force(
+    const Vector3* __restrict__ pos, Vector3* __restrict__ forceInternal,
+    const int* __restrict__ type, BrownianParticleType** part,
+    const float electricField, const int scheme, const int idx)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int t = type[idx];
+    Vector3 r0 = pos[idx];
+    const BrownianParticleType& pt = *part[t];
+    Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
 
-    if (idx < num * numReplicas)
+    ForceEnergy fe(0.f, 0.f);
+    for(int i = 0; i < pt.numPartGridFiles; ++i)
     {
-        int t = type[idx];
-        //Vector3 r0(tex1Dfetch(PosTex, idx));
-        //Vector3 p0(tex1Dfetch(MomTex, idx));
-        Vector3 r0 = pos[idx];
-        Vector3 p0 = momentum[idx];
-        
-        const BrownianParticleType& pt = *part[t];
-        float mass      = pt.mass;
-        r0 = r0 + (timestep * 5e3 * p0) / mass;
-        r0 = sys->wrap(r0);
-        pos[idx]      = r0;
-
-#if 0
-        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
-
-        // Compute PMF
-        // TODO: maybe make periodic and nonPeriodic versions of this kernel
-        ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
+	ForceEnergy tmp(0.f, 0.f);
+	if(!scheme) {
+	    BoundaryCondition bc = pt.pmf_boundary_conditions[i];
+	    INTERPOLATE_FORCE(tmp, pt.pmf[i]->interpolateForceDLinearly, bc, r0)
+		} else
+	    tmp = pt.pmf[i]->interpolateForceD(r0);
+	fe.f += tmp.f * pt.pmf_scale[i];
+	fe.e += tmp.e * pt.pmf_scale[i];
+    }
+    // if(get_energy)
+    // 	energy[idx] += fe.e;
 
 #ifndef FORCEGRIDOFF
-        // Add a force defined via 3D FORCE maps (not 3D potential maps)
-        if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
-        if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
-        if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
-#endif
-        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
-
-        // Get local kT value
-        float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */
-
-        // Update the particle's position using the calculated values for time, force, etc.
-        float mass      = pt.mass;
-        Vector3 gamma   = pt.transDamping;
-
-        if (pt.diffusionGrid != NULL) 
-        {
-
-            Vector3 gridCenter = pt.diffusionGrid->origin +
-            pt.diffusionGrid->basis.transform( Vector3(0.5*pt.diffusionGrid->nx, 0.5*pt.diffusionGrid->ny,
-                                                       0.5*pt.diffusionGrid->nz));
-            Vector3 p2 = r0 - gridCenter;
-            p2 = sys->wrapDiff( p2 ) + gridCenter;
-            ForceEnergy diff = pt.diffusionGrid->interpolateForceDLinearlyPeriodic(p2);
-            gamma = Vector3(kTlocal / (mass * diff.e));
-            //diffGrad = diff.f;
-        }
-
-        float tmp = sqrtf(0.50f * kTlocal * mass * timestep);
-        Vector3 rando = randoGen->gaussian_vector(idx, num * numReplicas);
-
-        #ifdef MDSTEP
-        force = Vector3(-r0.x, -r0.y, -r0.z);
-        #endif
-
-        p0.x = (1.0f - 0.50f * timestep * gamma.x) * p0.x + (0.50f * timestep * force.x * Unit1 + tmp * sqrtf(gamma.x) * rando.x * Unit2);
-        p0.y = (1.0f - 0.50f * timestep * gamma.y) * p0.y + (0.50f * timestep * force.y * Unit1 + tmp * sqrtf(gamma.y) * rando.y * Unit2);
-        p0.z = (1.0f - 0.50f * timestep * gamma.z) * p0.z + (0.50f * timestep * force.z * Unit1 + tmp * sqrtf(gamma.z) * rando.z * Unit2);
-
-        r0 = r0 + (timestep * 1e4 * p0) / mass;
-        r0 = sys->wrap(r0);
- 
-        //step(r0, p0, kTlocal, force, tmp, -diffGrad, mass, gamma, timestep, sys, randoGen, num * numReplicas);
-        pos[idx]      = r0;
-        momentum[idx] = p0;
-#endif
-
+    // Add a force defined via 3D FORCE maps (not 3D potential maps)
+    if(!scheme)
+    {
+	if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
+	if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
+	if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
+    }
+    else
+    {
+	if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0);
+	if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0);
+	if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0);
     }
-}
 #endif
+    fe.f = fe.f + forceExternal;
+    return fe;
+}
 
 
 ////The kernel is for Nose-Hoover Langevin dynamics
@@ -101,48 +65,24 @@ __global__ void
 updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__ momentum, float* random, 
                                Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[], 
                                float kT, BaseGrid* kTGrid, float electricField, int tGridLength, float timestep, 
-                               int num, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme)
+                               int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < num * numReplicas)
     {
-        int t = type[idx];
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
 
+        int t = type[idx];
         Vector3 r0  = pos[idx];
         Vector3 p0  = momentum[idx];
         float   ran = random[idx];
 
         const BrownianParticleType& pt = *part[t];
-        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
-        ForceEnergy fe(0.f, 0.f);
-        for(int i = 0; i < pt.numPartGridFiles; ++i)
-        {
-            ForceEnergy tmp(0.f,0.f);
-            if(!scheme) {
-		const BoundaryCondition& bc = pt.pmf_boundary_conditions[i];
-		INTERPOLATE_FORCE(tmp, pt.pmf[i].interpolateForceDLinearly, bc, r0)
-	    } else 
-                tmp = pt.pmf[i].interpolateForceD(r0);
-            fe.f += tmp.f;
-        }
-        //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
 
-#ifndef FORCEGRIDOFF
-        // Add a force defined via 3D FORCE maps (not 3D potential maps)
-        if(!scheme)
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
-        }
-        else
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0);
-        }
-#endif
-        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+        Vector3 force = forceInternal[idx] + fe.f;
         #ifdef Debug
         forceInternal[idx] = -force;
         #endif
@@ -214,50 +154,27 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__
 //Han-Yi Chou
 __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
                                   int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
-                                  float electricField,int tGridLength, float timestep, int num, BaseGrid* sys, 
+                                  float electricField,int tGridLength, float timestep,
+				  int num, int num_rb_attached_particles, BaseGrid* sys,
                                   Random* randoGen, int numReplicas, int scheme)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
 
     if (idx < num * numReplicas)
     {
-        int t = type[idx];
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
 
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
+	// if (get_energy) energy[idx] += fe.e;
+
+        int t = type[idx];
         Vector3 r0 = pos[idx];
         Vector3 p0 = momentum[idx];
-
         const BrownianParticleType& pt = *part[t];
-        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
 
-        //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
-        ForceEnergy fe(0.f, 0.f);
-        for(int i = 0; i < pt.numPartGridFiles; ++i)
-        {
-            ForceEnergy tmp(0.f, 0.f);
-            if(!scheme) {
-		BoundaryCondition bc = pt.pmf_boundary_conditions[i];
-		INTERPOLATE_FORCE(tmp, pt.pmf[i].interpolateForceDLinearly, bc, r0)
-	    } else 
-                tmp = pt.pmf[i].interpolateForceD(r0);
-            fe.f += tmp.f;
-        }
-
-#ifndef FORCEGRIDOFF
-        // Add a force defined via 3D FORCE maps (not 3D potential maps)
-        if(!scheme)
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
-        }
-        else
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0);
-        }
-#endif
-        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+        Vector3 force = forceInternal[idx] + fe.f;
 #ifdef Debug
         forceInternal[idx] = -force;
 #endif
@@ -315,52 +232,23 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
 //update momentum in the last step of BAOAB integrator for the Langevin dynamics. Han-Yi Chou
 __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* __restrict__ forceInternal,
                                       int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
-                                      float electricField, int tGridLength, float timestep, int num, 
+                                      float electricField, int tGridLength, float timestep, int num, int num_rb_attached_particles,
                                       BaseGrid* sys, Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy,int scheme)
 {
-    const int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+    int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
 
     if (idx < num * numReplicas)
     {
-        int t = type[idx];
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
+	if (get_energy) energy[idx] += fe.e;
+
         Vector3 r0 = pos[idx];
         Vector3 p0 = momentum[idx];
 
-        const BrownianParticleType& pt = *part[t];
-        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
-
-        //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
-        ForceEnergy fe(0.f, 0.f);
-        for(int i = 0; i < pt.numPartGridFiles; ++i)
-        {
-            ForceEnergy tmp(0.f, 0.f);
-            if(!scheme) {
-		BoundaryCondition bc = pt.pmf_boundary_conditions[i];
-		INTERPOLATE_FORCE(tmp, pt.pmf[i].interpolateForceDLinearly, bc, r0)
-	    } else 
-                tmp = pt.pmf[i].interpolateForceD(r0);
-
-            fe += tmp;
-            //fe.e += tmp.e;
-        }
-        if(get_energy)
-            energy[idx] += fe.e;
-#ifndef FORCEGRIDOFF
-        // Add a force defined via 3D FORCE maps (not 3D potential maps)
-        if(!scheme)
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
-        }
-        else
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0);
-        }
-#endif
-        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+        Vector3 force = forceInternal[idx] + fe.f;
 #ifdef Debug
         forceInternal[idx] = -force;
 #endif
@@ -373,82 +261,22 @@ __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* _
         momentum[idx] = p0;
     }
 }
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//The last step of BBK integrator to update the momentum for Langevin dynamics.
-#if 0
-__global__ void LastUpdateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
-                                 int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField,
-                                 int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas)
-{
-    const int idx  = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (idx < num * numReplicas)
-    {
-        int t = type[idx];
-        Vector3 r0 = pos[idx];
-        Vector3 p0 = momentum[idx];
-
-        const BrownianParticleType& pt = *part[t];
-        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
-
-        ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
-
-#ifndef FORCEGRIDOFF
-        // Add a force defined via 3D FORCE maps (not 3D potential maps)
-        if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
-        if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
-        if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
-#endif
-        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
-
-        float mass      = pt.mass;
-        Vector3 gamma   = pt.transDamping;
-        
-        float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */
-
-        float tmp = sqrtf(0.50f * kTlocal * mass * timestep);
-
-        if (pt.diffusionGrid != NULL)
-        {
-
-            Vector3 gridCenter = pt.diffusionGrid->origin +
-            pt.diffusionGrid->basis.transform( Vector3(0.5*pt.diffusionGrid->nx, 0.5*pt.diffusionGrid->ny,
-                                                       0.5*pt.diffusionGrid->nz));
-            Vector3 p2 = r0 - gridCenter;
-            p2 = sys->wrapDiff( p2 ) + gridCenter;
-            ForceEnergy diff = pt.diffusionGrid->interpolateForceDLinearlyPeriodic(p2);
-            gamma = Vector3(kTlocal / (mass * diff.e));
-        }
-
-        Vector3 rando = randoGen->gaussian_vector(idx, num * numReplicas);
-        #ifdef MDSTEP
-        force = Vector3(-r0.x, -r0.y, -r0.z);
-        #endif
-
-        //step(p0, kTlocal, force, tmp, -diffGrad, mass, gamma, timestep, sys, randoGen, num * numReplicas);
-        p0.x = (p0.x + 0.50 * timestep * force.x * Unit1 + tmp * sqrtf(gamma.x) * rando.x * Unit2) / (1.0f+0.5f*timestep*gamma.x);
-        p0.y = (p0.y + 0.50 * timestep * force.y * Unit1 + tmp * sqrtf(gamma.y) * rando.y * Unit2) / (1.0f+0.5f*timestep*gamma.y);
-        p0.z = (p0.z + 0.50 * timestep * force.z * Unit1 + tmp * sqrtf(gamma.z) * rando.z * Unit2) / (1.0f+0.5f*timestep*gamma.z);
-
-        momentum[idx] = p0;
-    }
-}
-#endif
 
 //Update kernel for Brownian dynamics
 __global__
 void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[], 
                   BrownianParticleType* part[],float kT, BaseGrid* kTGrid, float electricField, 
-                  int tGridLength, float timestep, int num, BaseGrid* sys,
+                  int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys,
 		  Random* randoGen, int numReplicas, float* energy, bool get_energy, int scheme) 
 {
 	// Calculate this thread's ID
-	const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+	int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
         
 	// TODO: Make this a grid-stride loop to make efficient reuse of RNG states 
 	// Loop over ALL particles in ALL replicas
 	if (idx < num * numReplicas) 
         {
+	    idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
 		const int t = type[idx];
 		Vector3   p = pos[idx];
 
@@ -456,44 +284,14 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[],
                 
 	 	/* printf("atom %d: forceInternal: %f %f %f\n", idx, forceInternal[idx].x, forceInternal[idx].y, forceInternal[idx].z);  */
 
-		// Compute external force
-		Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
-
-		// Compute PMF
-		// TODO: maybe make periodic and nonPeriodic versions of this kernel
-		//ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(p);
-		ForceEnergy fe(0.f, 0.f);
-                for(int i = 0; i < pt.numPartGridFiles; ++i)
-                {
-                    ForceEnergy tmp(0.f,0.f);
-		    if(!scheme) {
-			BoundaryCondition bc = pt.pmf_boundary_conditions[i];
-			INTERPOLATE_FORCE(tmp, pt.pmf[i].interpolateForceDLinearly, bc, p)
-		    } else 
-                        tmp = pt.pmf[i].interpolateForceD(p);
-                    fe += tmp;
-                }
-#ifndef FORCEGRIDOFF
-		// Add a force defined via 3D FORCE maps (not 3D potential maps)
-		if(!scheme)
-                {
-		    if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(p);
-		    if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(p);
-		    if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(p);
-                }
-                else
-                {
-                    if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(p);
-                    if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(p);
-                    if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(p);
-                }
-#endif
+		ForceEnergy fe = compute_position_dependent_force(
+		    pos, forceInternal, type, part, electricField, scheme, idx );
 
 		// Compute total force:
 		//	  Internal:  interaction between particles
 		//	  External:  electric field (now this is basically a constant vector)
 		//	  forceGrid: ADD force due to PMF or other potentials defined in 3D space
-		Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+		Vector3 force = forceInternal[idx] + fe.f;
                 #ifdef Debug
                 forceInternal[idx] = -force;
                 #endif
@@ -550,10 +348,12 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[],
                     float en_local = 0.f;
                     for(int i = 0; i < pt.numPartGridFiles; ++i)
                     {
+			float en_tmp = 0.0f;
                         if(!scheme)
-                            en_local += pt.pmf[i].interpolatePotentialLinearly(tmp);
+                            en_tmp = pt.pmf[i]->interpolatePotentialLinearly(tmp);
                         else
-                            en_local += pt.pmf[i].interpolatePotential(tmp);
+                            en_tmp = pt.pmf[i]->interpolatePotential(tmp);
+			en_tmp *= pt.pmf_scale[i];
                     }
                     energy[idx] += en_local;
                 }		
@@ -703,3 +503,37 @@ __global__ void devicePrint(RigidBodyType* rb[]) {
 //     cudaMalloc( &totalForce_h, sizeof(Vector3) );
 //     cudaMemcpyToSymbol(totalForce, &totalForce_h, sizeof(totalForce_h));
 // }
+
+__global__ void compute_position_dependent_force_for_rb_attached_particles(
+    const Vector3* __restrict__ pos,
+    Vector3* __restrict__ forceInternal, float* __restrict__ energy,
+    const int* __restrict__ type, BrownianParticleType** __restrict__ part,
+    const float electricField, const int num, const int num_rb_attached_particles,
+    const int numReplicas, const int scheme)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < num_rb_attached_particles * numReplicas)
+    {
+	idx = num + (idx % num_rb_attached_particles) + (idx/num_rb_attached_particles) * (num+num_rb_attached_particles);
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
+	atomicAdd( &forceInternal[idx], fe.f );
+	atomicAdd( &energy[idx], fe.e );
+    }
+}
+__global__ void compute_position_dependent_force_for_rb_attached_particles(
+    const Vector3* __restrict__ pos, Vector3* __restrict__ forceInternal,
+    const int* __restrict__ type, BrownianParticleType** __restrict__ part,
+    const float electricField, const int num, const int num_rb_attached_particles,
+    const int numReplicas, const int scheme)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < num_rb_attached_particles * numReplicas)
+    {
+	idx = num + (idx % num_rb_attached_particles) + (idx/num_rb_attached_particles) * (num+num_rb_attached_particles);
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
+	atomicAdd( &forceInternal[idx], fe.f );
+    }
+}
diff --git a/src/GrandBrownTown.h b/src/GrandBrownTown.h
index c6479a081fb2b4461493f49269c1e67b29eb7323..56d4c5a3b28c7b765542ba87eb5efa2445768c82 100644
--- a/src/GrandBrownTown.h
+++ b/src/GrandBrownTown.h
@@ -62,7 +62,6 @@ public:
 	~GrandBrownTown();
 
 	void run();
-        void RunNoseHooverLangevin();
 	static bool DEBUG;
 
 private:  
@@ -230,6 +229,8 @@ private:
 	int numAngles;
 	int numDihedrals;
 
+    int num_rb_attached_particles;
+
 	int numGroupSites;
 	int* groupSiteData_d;
 
diff --git a/src/Makefile b/src/Makefile
index b0356b86521cc4ca5448f5fdb8b0b35ea106b2dc..68598f462bf925e748a759b91ab9bd5ab09d6afc 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,6 +1,7 @@
 ### Paths, libraries, includes, options
 include ./findcudalib.mk
 
+SUFFIX ?= ""
 
 ifeq ($(dbg),1)
 	NV_FLAGS += -g -G -O0
@@ -99,13 +100,18 @@ ifeq ($(omp),1)
 TARGET := $(TARGET)_omp
 endif
 
-all: $(TARGET)
-	@echo "Done ->" $(TARGET)
+all: $(TARGET)$(SUFFIX)
+	@echo "Done ->" $(TARGET)$(SUFFIX)
 
-$(TARGET): $(CU_OBJ) $(CC_OBJ) arbd.cpp vmdsock.c imd.c imd.h
-	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
+$(TARGET)_link.o: $(CU_OBJ)
+#	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
 	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o
-	$(CC) $(CC_FLAGS) $(EX_FLAGS) arbd.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS)  -o $(TARGET)
+
+$(TARGET)$(SUFFIX): $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) arbd.cpp vmdsock.c imd.c imd.h
+#	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
+#	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o
+	$(CC) $(CC_FLAGS) $(EX_FLAGS) arbd.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS)  -o $(TARGET)$(SUFFIX)
+
 
 .SECONDEXPANSION:
 $(CU_OBJ): %.o: %.cu $$(wildcard %.h) $$(wildcard %.cuh)
diff --git a/src/RigidBody.cu b/src/RigidBody.cu
index 94e726112c97e8fc6eddde9d2585f8aa01157df5..8aeb7aa65711c72fa02149e73cf969a611f4078f 100644
--- a/src/RigidBody.cu
+++ b/src/RigidBody.cu
@@ -18,10 +18,10 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 }
 #endif
 
-RigidBody::RigidBody(String name, const Configuration& cref, const RigidBodyType& tref, RigidBodyController* RBCref) 
-    : name(name), c(&cref), t(&tref), RBC(RBCref), impulse_to_momentum(4.1867999435271e4) /*impulse_to_momentum(4.184e8f)*/ { init(); }
+RigidBody::RigidBody(String name, const Configuration& cref, const RigidBodyType& tref, RigidBodyController* RBCref, int attached_particle_start, int attached_particle_end) 
+    : name(name), c(&cref), t(&tref), RBC(RBCref), attached_particle_start(attached_particle_start), attached_particle_end(attached_particle_end), impulse_to_momentum(4.1867999435271e4) /*impulse_to_momentum(4.184e8f)*/ { init(); }
 RigidBody::RigidBody(const RigidBody& rb)
-    : name(rb.name), c(rb.c), t(rb.t), RBC(rb.RBC), impulse_to_momentum(4.1867999435271e4)/*impulse_to_momentum(4.184e8f)*/ { init(); }
+    : name(rb.name), c(rb.c), t(rb.t), RBC(rb.RBC), attached_particle_start(rb.attached_particle_start), attached_particle_end(rb.attached_particle_end), impulse_to_momentum(4.1867999435271e4)/*impulse_to_momentum(4.184e8f)*/ { init(); }
 void RigidBody::init() {
 	// units "(kcal_mol/AA) * ns" "dalton AA/ns" * 4.184e+08	
 	timestep = c->timestep;
@@ -69,8 +69,8 @@ void RigidBody::Boltzmann(unsigned long int seed)
     angularMomentum.x *= sigma[1];
     angularMomentum.y *= sigma[2];
     angularMomentum.z *= sigma[3];
-    printf("%f\n", Temp);
-    printf("%f\n", Temperature());
+    // printf("%f\n", Temp);
+    // printf("%f\n", Temperature());
 }
 
 RigidBody::~RigidBody() {
@@ -103,6 +103,23 @@ int RigidBody::appendNumParticleBlocks( std::vector<int>* blocks ) {
     return ret;
 }
 
+__global__
+void update_particle_positions_kernel(Vector3* __restrict__ pos, const int start, const int num,
+			       const Vector3* __restrict__ pos_rb,
+			       const Vector3 center, const Matrix3 orientation) {
+	const int i = blockIdx.x * blockDim.x + threadIdx.x;
+	if (i < num) {
+	    const int aid = i+start;
+	    pos[aid] = orientation*pos_rb[i] + center;
+	}
+}		
+void RigidBody::update_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d) {
+    int num_attached = attached_particle_end - attached_particle_start;
+    int nb = floor(num_attached/NUMTHREADS) + 1;
+    update_particle_positions_kernel<<<nb,NUMTHREADS>>>(pos_d, attached_particle_start, num_attached,
+						 t->attached_particle_positions_d, position, orientation);
+}
+
 void RigidBody::addForce(Force f) { 
 	force += f; 
 } 
@@ -113,6 +130,141 @@ void RigidBody::addEnergy(float e)
 {
     energy += e;
 }
+
+// TODO move code snippet to more general location
+template<class C, class T>
+auto contains(const C& v, const T& x)
+-> decltype(end(v), true)
+{
+    return end(v) != std::find(begin(v), end(v), x);
+}
+/*
+void RigidBody::initialize_possible_particles()
+{
+    std::vector<int> atomic_ids;
+
+    // Loop over potential grids
+    for (int i = 0; i < t->numPotGrids; ++i) {
+	atomic_ids.clear();
+
+	String& gridName = t->potentialGridKeys[i];
+	
+	// Loop over particle types to count the number of particles
+	for (int j = 0; j < conf->numParts; ++j) {
+	    if (contains(conf->partRigidBodyGrid[j], gridName)) {
+		// gridNames contained gridName, so add the particles to atomic_ids
+		for (int aid = 0; aid < conf->num + conf->num_rb_attached_particles; ++aid) {
+		    if (conf->type[aid] == j && (aid < exclude_start || aid > exclude_end)) {
+			atomic_ids.push_back(aid);
+		    }
+		}
+	    }
+	}
+
+	// Initialize device data
+	size_t sz = sizeof(int) * atomic_ids.size();
+	gpuErrchk(cudaMalloc( &(possible_particles_d[i]), sz ));
+	gpuErrchk(cudaMemcpyAsync( possible_particles_d[i], &atomic_ids[0], sz, cudaMemcpyHostToDevice))
+
+	// // Add particles attached to OTHER RBs
+	// int rb_particle_offset = conf->num;
+	// for (const auto& rbs: RBC->rigidBodyByType)
+	// {
+	//     int rb_type
+	//     const RigidBodyType& rb_type = rbs[0].t;
+	//     if (rbs.t != 
+	//     for (int& ptype: rb_type->attached_particle_types) {
+	// 	// This could be made much more efficient
+	// 	if (contains(conf->partRigidBodyGrid[ptype], gridName)) {
+	// 	// Add particles
+	// 	rb
+	// 	rbs[0].sum
+	// 	    }
+	//     rb_particle_offset += rb_type.attached_particle_types
+	    
+	//     for (int k = 0; k < t->gridNames.size(); ++k) {
+	// 	    if (t->gridNames[k] == gridName) {
+	// 		t->numParticles[i] += conf->numPartsOfType[j];			
+	// 	    }
+
+	    
+	// 		// Loop over rigid body grid names associated with particle type
+	// 		const std::vector<String>& gridNames = conf->partRigidBodyGrid[j];
+	// 		for (int k = 0; k < gridNames.size(); ++k) {
+	// 		    if (gridNames[k] == gridName) {
+	// 			// Copy type j particles to particles[i]
+	// 			memcpy( &(particles[i][pid]), tmp, sizeof(int)*currId );
+	// 			assert(currId == conf->numPartsOfType[j]);
+	// 			pid += conf->numPartsOfType[j];
+	// 		    }
+	// 		}
+	// 	    }
+
+	// numParticles[i] = 0;
+
+	    // Count the particles interacting with potential grid i
+	    // Loop over particle types
+	    for (int j = 0; j < conf->numParts; ++j) {
+
+		// Loop over rigid body grid names associated with particle type
+		const std::vector<String>& gridNames = conf->partRigidBodyGrid[j];
+		for (int k = 0; k < t->gridNames.size(); ++k) {
+		    if (t->gridNames[k] == gridName) {
+			numParticles[i] += conf->numPartsOfType[j];			
+		    }
+		}
+	    }
+
+	    // Add RB particles
+	    for (const auto& rbv: RBC->rigidBodyByType)
+	    {
+		int ptype = rbv[0].t->attached_particle_types;
+		const std::vector<String>& gridNames = conf->partRigidBodyGrid[ptype];
+		for (int k = 0; k < t->gridNames.size(); ++k) {
+		    if (t->gridNames[k] == gridName) {
+			t->numParticles[i] += conf->numPartsOfType[j];			
+		    }
+		
+	    attached_particle_
+	    
+	    if (numParticles[i] > 0) {
+
+		    // allocate array of particle ids for the potential grid 
+		    particles[i] = new int[numParticles[i]];
+		    int pid = 0;
+		
+		    // Loop over particle types to count the number of particles
+		    for (int j = 0; j < conf->numParts; ++j) {
+
+			// Build temporary id array of type j particles
+			int tmp[conf->numPartsOfType[j]];
+			int currId = 0;
+			for (int aid = 0; aid < conf->num; ++aid) {
+			    if (conf->type[aid] == j)
+				tmp[currId++] = aid;
+			}
+			if (currId == 0) continue;
+
+			// Loop over rigid body grid names associated with particle type
+			const std::vector<String>& gridNames = conf->partRigidBodyGrid[j];
+			for (int k = 0; k < gridNames.size(); ++k) {
+			    if (gridNames[k] == gridName) {
+				// Copy type j particles to particles[i]
+				memcpy( &(particles[i][pid]), tmp, sizeof(int)*currId );
+				assert(currId == conf->numPartsOfType[j]);
+				pid += conf->numPartsOfType[j];
+			    }
+			}
+		    }
+
+		    // Initialize device data
+		    size_t sz = sizeof(int) * numParticles[i];
+		    gpuErrchk(cudaMalloc( &(particles_d[i]), sz ));
+		    gpuErrchk(cudaMemcpyAsync( particles_d[i], particles[i], sz, cudaMemcpyHostToDevice));
+		}
+	}
+}
+*/
 void RigidBody::updateParticleList(Vector3* pos_d, BaseGrid* sys_d) {
 	for (int i = 0; i < t->numPotGrids; ++i) {
 		numParticles[i] = 0;
@@ -131,10 +283,12 @@ void RigidBody::updateParticleList(Vector3* pos_d, BaseGrid* sys_d) {
 			int nb = floor(tnp/NUMTHREADS) + 1;
 #if __CUDA_ARCH__ >= 300
 			createPartlist<<<nb,NUMTHREADS>>>(pos_d, tnp, t->particles_d[i],
+							  attached_particle_start, attached_particle_end,
 							tmp_d, particles_d[i],
 							gridCenter + position, cutoff*cutoff, sys_d);
 #else
 			createPartlist<<<nb,NUMTHREADS,NUMTHREADS/WARPSIZE>>>(pos_d, tnp, t->particles_d[i],
+							  attached_particle_start, attached_particle_end,
 							tmp_d, particles_d[i],
 							gridCenter + position, cutoff*cutoff, sys_d);
 #endif			
@@ -179,6 +333,20 @@ void RigidBody::callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, in
 	}
 }
 
+void RigidBody::apply_attached_particle_forces(const Vector3* force) {
+    const auto &rb_pos = t->attached_particle_positions;
+    int num = rb_pos.size();
+    Vector3 total_force = Vector3(0.0f);
+    Vector3 torque = Vector3(0.0f);
+    for (int i = 0; i < num; ++i) {
+	const int j = i + attached_particle_start;
+	torque = torque + (orientation*rb_pos[i]).cross(force[j]);
+	total_force = total_force + force[j];
+    }
+    addForce(total_force);
+    addTorque(torque);
+}
+
 void RigidBody::applyGridParticleForces(BaseGrid* sys, ForceEnergy* forcestorques, const std::vector<int>& forcestorques_offset, int& fto_idx) {
 	// loop over potential grids 
 	for (int i = 0; i < t->numPotGrids; ++i) {
diff --git a/src/RigidBody.h b/src/RigidBody.h
index 5854a082a29f975d11281b9ad45ffd5bc8cc0fbf..e3e81737ae1d7aee45f16e915d44f6aa413e1749 100644
--- a/src/RigidBody.h
+++ b/src/RigidBody.h
@@ -31,7 +31,9 @@ class RigidBody { // host side representation of rigid bodies
 	| splitting methods for rigid body molecular dynamics". J Chem Phys. (1997) |
 	`––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––*/
 		public:
-	RigidBody(String name, const Configuration& c, const RigidBodyType& t, RigidBodyController* RBC);
+    RigidBody(String name, const Configuration& c, const RigidBodyType& t, RigidBodyController* RBC,
+	      int attached_particle_start, int attached_particle_end);
+
     RigidBody(const RigidBody& rb);
     // RigidBody(const RigidBody& rb) : RigidBody(rb.name, *rb.c, *rb.t) {};
 	void init();
@@ -40,6 +42,8 @@ class RigidBody { // host side representation of rigid bodies
 
 	int appendNumParticleBlocks( std::vector<int>* blocks );
 
+    void update_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d);
+
 	HOST DEVICE void addForce(Force f); 
 	HOST DEVICE void addTorque(Force t);
         HOST DEVICE void addEnergy(float e);
@@ -76,8 +80,10 @@ class RigidBody { // host side representation of rigid bodies
               return Vector3( angularMomentum.x, angularMomentum.y, angularMomentum.z);
         }
 
+        void initializeParticleLists();
 	void updateParticleList(Vector3* pos_d, BaseGrid* sys_d);
 	void callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, ForceEnergy* forcestorques_d, const std::vector<int>& forcestorques_offset, int& fto_idx);
+	void apply_attached_particle_forces(const Vector3* force);
 	void applyGridParticleForces(BaseGrid* sys, ForceEnergy* forcestorques, const std::vector<int>& forcestorques_offset, int& fto_idx);
 	
 	bool langevin;
@@ -122,9 +128,12 @@ private:
 	bool isFirstStep; 
 	
 	int* numParticles;		  /* particles affected by potential grids */
+	int** possible_particles_d;		 	
 	int** particles_d;		 	
 	const cudaStream_t** particleForceStreams;
-	
+
+    int attached_particle_start, attached_particle_end;
+    
 	/*–––––––––––––––––––––––––––––––––––––––––.
 	| units "kcal_mol/AA * ns" "(AA/ns) * amu" |
 	`–––––––––––––––––––––––––––––––––––––––––*/
diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu
index 5c89ba26f440bbc2659aea47b31671ecf7ce9cd0..13438b8c758384996f3c72309d2bc62071a1f7c7 100644
--- a/src/RigidBodyController.cu
+++ b/src/RigidBodyController.cu
@@ -45,15 +45,14 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* pre
 
 	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */
 	construct_grids();
-	for (int i = 0; i < conf.numRigidTypes; i++)
-		conf.rigidBody[i].initializeParticleLists();
 
 	int numRB = 0;
 	// grow list of rbs
 
-
+	int attached_particle_offset = 0;
 	for (int i = 0; i < conf.numRigidTypes; i++) {			
 		numRB += conf.rigidBody[i].num;
+		int attached_particle_in_type = conf.rigidBody[i].num_attached_particles();
 		std::vector<RigidBody> tmp;
 		// RBTODO: change conf.rigidBody to conf.rigidBodyType
 		const int jmax = conf.rigidBody[i].num;
@@ -64,13 +63,17 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* pre
 			    snprintf(stmp, 128, "#%d", j);
 			    name.add( stmp );
 			}
-			RigidBody r(name, conf, conf.rigidBody[i], this);
+			RigidBody r(name, conf, conf.rigidBody[i], this,
+				    attached_particle_offset, attached_particle_offset+attached_particle_in_type);
+			attached_particle_offset += attached_particle_in_type;
+
 			int nb = r.appendNumParticleBlocks( &particleForceNumBlocks );
 			tmp.push_back( r );
 		}
 		rigidBodyByType.push_back(tmp);
 	}
-	
+	attached_particle_forces = new Vector3[attached_particle_offset];
+
 	totalParticleForceNumBlocks = 0;
 	for (int i=0; i < particleForceNumBlocks.size(); ++i) {
 	    particleForce_offset.push_back(2*totalParticleForceNumBlocks);
@@ -80,30 +83,24 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* pre
 	gpuErrchk(cudaMallocHost(&(particleForces), sizeof(ForceEnergy) * 2*totalParticleForceNumBlocks))
 	gpuErrchk(cudaMalloc(&(particleForces_d), sizeof(ForceEnergy) * 2*totalParticleForceNumBlocks))
 
-	if (conf.inputRBCoordinates.length() > 0)
-		loadRBCoordinates(conf.inputRBCoordinates.val());
+	if (conf.restartRBCoordinates.length() > 0)
+	    load_restart_coordinates(conf.restartRBCoordinates.val());
+	else if (conf.inputRBCoordinates.length() > 0)
+	    loadRBCoordinates(conf.inputRBCoordinates.val());
 	
 	random = new RandomCPU(conf.seed + repID + 1); /* +1 to avoid using same seed as RandomCUDA */
 	
+	
 	initializeForcePairs();	// Must run after construct_grids()
 	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */
-
-        //Boltzmann distribution
-        for (int i = 0; i < rigidBodyByType.size(); i++)
-        {
-            for (int j = 0; j < rigidBodyByType[i].size(); j++)
-            {
-                RigidBody& rb = rigidBodyByType[i][j];
-                // thermostat
-                rb.Boltzmann(seed);
-            }
-        }
 }
 
 RigidBodyController::~RigidBodyController() {
 	for (int i = 0; i < rigidBodyByType.size(); i++)
 		rigidBodyByType[i].clear();
 	rigidBodyByType.clear();
+
+	delete [] attached_particle_forces;
 	delete random;
 }
 
@@ -337,6 +334,9 @@ bool RigidBodyController::loadRBCoordinates(const char* fileName) {
                     rb.momentum = Vector3((float)strtod(tokenList[12],NULL), (float) strtod(tokenList[13],NULL), (float) strtod(tokenList[14],NULL));
                     rb.angularMomentum = Vector3((float)strtod(tokenList[15],NULL), (float) strtod(tokenList[16],NULL), (float) strtod(tokenList[17],NULL));
                 }
+
+	        if(conf.RigidBodyDynamicType == String("Langevin") && numTokens < 18)
+		    rb.Boltzmann(conf.seed);
                
 		delete[] tokenList;
 
@@ -352,6 +352,82 @@ bool RigidBodyController::loadRBCoordinates(const char* fileName) {
 	fclose(inp);
 	return true;
 }
+bool RigidBodyController::load_restart_coordinates(const char* filename) {
+	char line[STRLEN];
+	FILE* inp = fopen(filename, "r");
+
+	if (inp == NULL) {
+		printf("ARBD: load RB coordinates: File '%s' does not exist\n", filename);
+		exit(-1);	   
+	}
+
+	int imax = rigidBodyByType.size();
+	int i = 0;
+	int jmax = rigidBodyByType[i].size();
+	int j = 0;
+
+	while (fgets(line, STRLEN, inp) != NULL) {
+		// Ignore comments.
+		int len = strlen(line);
+		if (line[0] == '#') continue;
+		if (len < 2) continue;
+
+		String s(line);
+		int numTokens = s.tokenCount();
+		if (numTokens < 2+3+9) {
+			printf("ARBD: invalid RB restart coordinate line: %s\n", line);
+			fclose(inp);	
+			exit(-1);
+		}
+                if(conf.RigidBodyDynamicType == String("Langevin") && numTokens < 20)
+                {
+                    std::cout << "Warning the initial momentum set by random number" << std::endl;
+                }
+
+		String* tokenList = new String[numTokens];
+		s.tokenize(tokenList);
+		if (tokenList == NULL) {
+			printf("ARBD: invalid RB restart coordinate line: %s\n", line);
+			fclose(inp);
+			exit(-1);
+		}
+
+		RigidBody& rb = rigidBodyByType[i][j];
+		printf("Assinging positions %d %d %f\n",i,j, (float) strtod(tokenList[2],NULL));
+		rb.position = Vector3(
+			(float) strtod(tokenList[2],NULL), (float) strtod(tokenList[3],NULL), (float) strtod(tokenList[4],NULL));
+		rb.orientation = Matrix3(
+			(float) strtod(tokenList[5],NULL), (float) strtod(tokenList[6],NULL), (float) strtod(tokenList[7],NULL),
+			(float) strtod(tokenList[8],NULL), (float) strtod(tokenList[9],NULL), (float) strtod(tokenList[10],NULL),
+			(float) strtod(tokenList[11],NULL), (float) strtod(tokenList[12],NULL), (float) strtod(tokenList[13],NULL));
+
+	        if(conf.RigidBodyDynamicType == String("Langevin") && numTokens >= 20)
+                {
+                    rb.momentum = Vector3((float)strtod(tokenList[14],NULL), (float) strtod(tokenList[15],NULL), (float) strtod(tokenList[16],NULL));
+                    rb.angularMomentum = Vector3((float)strtod(tokenList[17],NULL), (float) strtod(tokenList[18],NULL), (float) strtod(tokenList[19],NULL));
+                }
+
+	        if(conf.RigidBodyDynamicType == String("Langevin") && numTokens < 20)
+		    rb.Boltzmann(conf.seed);
+
+		delete[] tokenList;
+
+		j++;
+		if (j == jmax) {
+			i++;
+			if (i == imax)
+				break;
+			j=0;
+			jmax = rigidBodyByType[i].size();
+		}
+	}
+	fclose(inp);
+	if (i < imax || j < jmax) {
+	    printf("ARBD: RB restart file did not contain the correct number of lines: %s\n", filename);
+	    exit(-1);
+	}
+	return true;
+}
 
 		
 
@@ -454,6 +530,17 @@ void RigidBodyController::initializeForcePairs() {
 			
 }
 
+void RigidBodyController::update_attached_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d, BaseGrid* sys_d, int num, int num_rb_attached_particles, int numReplicas) {
+    for (int i = 0; i < rigidBodyByType.size(); i++) {
+	for (int j = 0; j < rigidBodyByType[i].size(); j++) {
+	    rigidBodyByType[i][j].update_particle_positions(pos_d, force_d, energy_d);
+	}
+    }
+    gpuErrchk(cudaMemset((void*) force_d, 0, num_rb_attached_particles*sizeof(Vector3)));
+    gpuErrchk(cudaMemset((void*) energy_d, 0, num_rb_attached_particles*sizeof(float)));
+}
+
+
 void RigidBodyController::updateParticleLists(Vector3* pos_d, BaseGrid* sys_d) {
 	for (int i = 0; i < rigidBodyByType.size(); i++) {
 		for (int j = 0; j < rigidBodyByType[i].size(); j++) {
@@ -476,7 +563,7 @@ void RigidBodyController::clearForceAndTorque()
     }
 }
 
-void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d) 
+void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, int num, int num_rb_attached_particles)
 {
 	//if (s <= 1)
 		//gpuErrchk( cudaProfilerStart() );
@@ -493,6 +580,7 @@ void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s,
 	// RBTODO: launch kernels ahead of time and sync using event and memcpyAsync 
 	gpuErrchk( cudaDeviceSynchronize() );
 	cudaMemcpy(particleForces, particleForces_d, sizeof(ForceEnergy)*2*totalParticleForceNumBlocks, cudaMemcpyDeviceToHost);
+	cudaMemcpyAsync(attached_particle_forces, &force_d[num], sizeof(Vector3)*(num_rb_attached_particles), cudaMemcpyDeviceToHost);
 
 	pfo_idx=0;
 	for (int i = 0; i < rigidBodyByType.size(); i++) {
@@ -503,6 +591,16 @@ void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s,
 		}
 	}
 
+	{
+	    gpuErrchk( cudaDeviceSynchronize() );
+	    for (auto &rbv: rigidBodyByType) {
+		for (auto &rb: rbv) {
+		    rb.apply_attached_particle_forces( attached_particle_forces );
+		}
+	    }
+	}
+
+
 	// Grid–Grid forces
 	if ( ((s % conf.rigidBodyGridGridPeriod) == 0 || s == 1 ) && forcePairs.size() > 0) {
 		for (int i=0; i < forcePairs.size(); i++) {
@@ -804,10 +902,10 @@ void RigidBodyForcePair::processGPUForces(BaseGrid* sys) {
 			tmpT = tmpT + torques[i][j];
 		}
 		
-		// tmpT is the torque calculated about the origin of grid k2 (e.g. c2)
+		// tmpT is the torque calculated about the origin of density grid
 		//   so here we transform torque to be about rb1
-		Vector3 o2 = getOrigin2(i);
-		tmpT = tmpT - sys->wrapDiff(rb1->getPosition() - o2).cross( tmpF.f ); 
+		Vector3 o1 = getOrigin1(i);
+		tmpT = tmpT - (rb1->getPosition() - o1).cross( tmpF.f );
 
 		// clear forces on GPU
 		gpuErrchk(cudaMemset((void*)(forces_d[i]),0,nb*sizeof(ForceEnergy)));
diff --git a/src/RigidBodyController.h b/src/RigidBodyController.h
index ff99091b3b49f0ca5d68edf07f66538821cd9946..fa2fa4e5d1efd6b8bf43a83e9e34b456d5f67245 100644
--- a/src/RigidBodyController.h
+++ b/src/RigidBodyController.h
@@ -108,8 +108,9 @@ public:
         void SetRandomTorques();
 	void integrate(BaseGrid* sys, int step);
         void integrateDLM(BaseGrid* sys, int step);
-	void updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d);
+	void updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, int num, int num_rb_attached_particles);
 	void updateParticleLists(Vector3* pos_d, BaseGrid* sys_d);
+    void update_attached_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d, BaseGrid* sys_d, int num, int num_rb_attached_particles, int numReplicas);
         void clearForceAndTorque(); 
         void KineticEnergy();
         void print(int step);
@@ -117,6 +118,7 @@ public:
         float getEnergy(float (RigidBody::*get)());
 private:
 	bool loadRBCoordinates(const char* fileName);
+    bool load_restart_coordinates(const char* filename);
 	void initializeForcePairs();
 
 	//void print(int step);
@@ -150,7 +152,8 @@ private:
 	void construct_grids();
 	void destruct_grids();
 
-        //float* rb_energy;	
+        //float* rb_energy;
+	Vector3* attached_particle_forces;
 	ForceEnergy* particleForces;
 	ForceEnergy* particleForces_d;
 	std::vector<int> particleForceNumBlocks;
diff --git a/src/RigidBodyType.cu b/src/RigidBodyType.cu
index 969a16ad7d6c3183d9a0f69ee0c4bffd142faf0f..2137160943aaf63fa2345f48e0b115373a242704 100644
--- a/src/RigidBodyType.cu
+++ b/src/RigidBodyType.cu
@@ -81,6 +81,58 @@ void RigidBodyType::setDampingCoeffs(float timestep) { /* MUST ONLY BE CALLED ON
 
 }
 	
+void RigidBodyType::attach_particles() {
+    for (const auto& filename: attached_particle_files) {
+	const size_t line_char = 256;
+	FILE* inp = fopen(filename.val(), "r");
+	char line[line_char];
+
+	// If the particle file cannot be found, exit the program
+	if (inp == NULL) {
+	    printf("ERROR: Could not open `%s'.\n", filename.val());
+	    fclose(inp);
+	    exit(1);
+	}
+
+	// Get and process all lines of input
+	while (fgets(line, line_char, inp) != NULL) {
+		// Lines in the particle file that begin with # are comments
+		if (line[0] == '#') continue;
+
+		String s(line);
+		int numTokens = s.tokenCount();
+
+		// Break the line down into pieces (tokens) so we can process them individually
+		String* tokenList = new String[numTokens];
+		s.tokenize(tokenList);
+
+		// Legitimate GROUP input lines have at least 3 tokens:
+		// Particle_type | x | y | z
+		// A line without exactly six tokens should be discarded.
+		if (numTokens != 4) {
+		    printf("Error: Invalid attached particle file line: %s\n", tokenList[0].val());
+		    fclose(inp);
+		    exit(-1);
+		}
+
+		// Make sure the index of this particle is unique.
+		// NOTE: The particle list is sorted by index.
+		int type_idx = conf->find_particle_type( tokenList[0].val() );
+		if (type_idx < 0) {
+		    printf("Error: Unrecognized particle type: %s\n", line);
+		    fclose(inp);
+		    exit(-1);
+		}
+		attached_particle_types.push_back( type_idx );
+		attached_particle_positions.push_back( Vector3(atof(tokenList[1].val()), atof(tokenList[2].val()), atof(tokenList[3].val())) );
+	}
+	fclose(inp);
+    }
+    size_t sz = sizeof(Vector3)*attached_particle_positions.size();
+    gpuErrchk(cudaMalloc( &(attached_particle_positions_d), sz ));
+    gpuErrchk(cudaMemcpyAsync( attached_particle_positions_d, &attached_particle_positions[0], sz, cudaMemcpyHostToDevice));
+}
+
 void RigidBodyType::addGrid(String s, std::vector<String> &keys, std::vector<String> &files) {
 	// tokenize and return
 	int numTokens = s.tokenCount();
diff --git a/src/RigidBodyType.h b/src/RigidBodyType.h
index 575a31348eb24dd305e913bbf46bf0b095bb8649..5cc1442052a102da7198e1e6eeda806c4cdaa8b7 100644
--- a/src/RigidBodyType.h
+++ b/src/RigidBodyType.h
@@ -17,9 +17,10 @@ class BaseGrid;
 class RigidBodyGrid;
 class Configuration;
 class RigidBodyController;
+class RigidBody;
 
 class RigidBodyType {
-	
+    friend class RigidBody;
 public:
 RigidBodyType(const String& name = "", const Configuration* conf = NULL ) :
 	name(name), conf(conf), num(0),
@@ -39,6 +40,11 @@ public:
 	/* RigidBodyType& operator=(const RigidBodyType& src); */
 	void copyGridsToDevice();
 	
+    void append_attached_particle_file(String s) { attached_particle_files.push_back(s); }
+    void attach_particles();
+    size_t num_attached_particles() const { return attached_particle_types.size() ;}
+    const std::vector<int>& get_attached_particle_types() const { return attached_particle_types; }
+
 	void addPotentialGrid(String s);
 	void addDensityGrid(String s);
 	void addPMF(String s);
@@ -54,6 +60,12 @@ public:
 	String name;
 private:
 	const Configuration* conf;
+	std::vector<String> attached_particle_files;
+	std::vector<int>attached_particle_types;
+private:
+    std::vector<Vector3>attached_particle_positions;
+    Vector3* attached_particle_positions_d;
+
 public:
 	int num; // number of particles of this type
 
diff --git a/src/TabulatedPotential.h b/src/TabulatedPotential.h
index b5a0f1e19e9c3dc43f73bd85c978d2091b27db57..e32068a6fa7dc0b8cbada59cb7f6ea8d40933b22 100644
--- a/src/TabulatedPotential.h
+++ b/src/TabulatedPotential.h
@@ -179,6 +179,6 @@ private:
 };
 #ifndef delgpuErrchk
 #undef  delgpuErrchk
-#undef  gpuErrchk
+#undef  gpuErrchk(code)
 #endif
 #endif