diff --git a/src/BrownParticlesKernel.h b/src/BrownParticlesKernel.h
index 36f7daa3ea95f0e3ddafdabde661b143382f17c7..5475a454d4a7b3ef4c70a904c8ce262a8d71780e 100644
--- a/src/BrownParticlesKernel.h
+++ b/src/BrownParticlesKernel.h
@@ -4,7 +4,7 @@
 
 template<const int BlockSize>
 static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], BrownianParticleType* part[], 
-                                                   float *vec_red, int n)
+                                                   float *vec_red, int num, int num_rb_attached_particles, int num_replicas)
 {
     __shared__ __align__(4) float sdata[BlockSize];
     
@@ -17,9 +17,12 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
 
     sdata[tid] = 0.f; 
 
+    int n = (num*num_replicas);
+
     while (i < n) 
-    { 
-        int t1 = type[i];
+    {
+	const int i1 = (i % num) +  (i/num)*(num+num_rb_attached_particles);
+        const int t1 = type[i1];
         const BrownianParticleType& pt1 = *part[t1];
 
         p1    = P_n[i];
@@ -27,7 +30,8 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
         
         if(i + BlockSize < n)
         {
-            int t2 = type[i+BlockSize];
+	    const int i2 = ((i+BlockSize) % num) +  ((i+BlockSize)/num)*(num+num_rb_attached_particles);
+            const int t2 = type[i2];
             const BrownianParticleType& pt2 = *part[t2];
 
             p2    = P_n[i+BlockSize];
@@ -38,7 +42,7 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
         else
             sdata[tid] += p1.length2() / mass1;
 
-        i += gridSize; 
+        i += gridSize;
     }
 
     sdata[tid] *= 0.50f;
diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 4f39b3fe3a905fca9703f03423b357a401bef198..74710b86fd0fd131f53c22e372c85e341c809450 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -40,14 +40,15 @@ void runSort(int2 *d1, int *d2, float *key,
 				unsigned int count);
 
 ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
-    num(c.num), numParts(c.numParts), sys(c.sys), switchStart(c.switchStart),
+    num(c.num), numParts(c.numParts), num_rb_attached_particles(c.num_rb_attached_particles),
+    sys(c.sys), switchStart(c.switchStart),
     switchLen(c.switchLen), electricConst(c.coulombConst),
     cutoff2((c.switchLen + c.switchStart) * (c.switchLen + c.switchStart)),
     decomp(c.sys->getBox(), c.sys->getOrigin(), c.switchStart + c.switchLen + c.pairlistDistance, numReplicas),
     numBonds(c.numBonds), numTabBondFiles(c.numTabBondFiles),
     numExcludes(c.numExcludes), numAngles(c.numAngles),
     numTabAngleFiles(c.numTabAngleFiles), numDihedrals(c.numDihedrals),
-    numTabDihedralFiles(c.numTabDihedralFiles), numRestraints(c.numRestraints), 
+    numTabDihedralFiles(c.numTabDihedralFiles), numRestraints(c.numRestraints),
     numGroupSites(c.numGroupSites),
     numReplicas(numReplicas) {
 
@@ -234,10 +235,10 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
 	restraintIds_d = NULL;
 
 	//Calculate the number of blocks the grid should contain
-	gridSize =  num / NUM_THREADS + 1;
+	gridSize =  (num+num_rb_attached_particles) / NUM_THREADS + 1;
 
 	// Create and allocate the energy arrays
-	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * (num+numGroupSites) * numReplicas));
+	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * (num+num_rb_attached_particles+numGroupSites) * numReplicas));
 	cudaEventCreate(&start);
 	cudaEventCreate(&stop);
 }
@@ -341,7 +342,7 @@ void ComputeForce::updateNumber(int newNum) {
 
 	// Recalculate the number of blocks in the grid
 	gridSize = 0;
-	while ((int)sqrt(NUM_THREADS) * gridSize < num)
+	while ((int)sqrt(NUM_THREADS) * gridSize < num+num_rb_attached_particles)
 		++gridSize;
 
 	gpuErrchk(cudaFree(energies_d));
@@ -500,7 +501,7 @@ void ComputeForce::decompose() {
             cudaFree(decomp_d);
             decomp_d = NULL;
 	}	
-	decomp.decompose_d(pos_d[0], num);
+	decomp.decompose_d(pos_d[0], num+num_rb_attached_particles);
 	decomp_d = decomp.copyToCUDA();
 
 	// Update pairlists using cell decomposition (not sure this is really needed or good) 
@@ -555,11 +556,11 @@ void ComputeForce::decompose() {
       //Han-Yi Chou 2017 my code
       
       #if __CUDA_ARCH__ >= 520
-      createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
+      createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num+num_rb_attached_particles, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
                                                                              pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d,
 									   excludeMap_d, numExcludes, pairlistdist2, pos_tex[0], neighbors_tex);
       #else //__CUDA_ARCH__ == 300
-      createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
+      createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num+num_rb_attached_particles, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
                                                                            pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, 
                                                                            excludeMap_d, numExcludes, pairlistdist2, pos_tex[0], neighbors_tex);
       #endif
@@ -663,20 +664,20 @@ float ComputeForce::decompCutoff() { return decomp.getCutoff(); }
 
 float ComputeForce::computeFull(bool get_energy) {
 	float energy = 0.0f;
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
 	// Call the kernel to calculate forces
 	computeFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d, tableAlpha_d,
-		tableEps_d, tableRad6_d, num, numParts, sys_d[0], energies_d, gridSize,
+		tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0], energies_d, gridSize,
 		numReplicas, get_energy);
 
 	// Calculate energy based on the array created by the kernel
 	if (get_energy) {
 		gpuErrchk(cudaDeviceSynchronize());
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num + numGroupSites);
+		energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles + numGroupSites);
 	}
 
 	return energy;
@@ -684,20 +685,20 @@ float ComputeForce::computeFull(bool get_energy) {
 
 float ComputeForce::computeSoftcoreFull(bool get_energy) {
 	float energy = 0.0f;
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
 	// Call the kernel to calculate forces
 	computeSoftcoreFullKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d,
-			tableEps_d, tableRad6_d, num, numParts, sys_d[0], energies_d, gridSize,
+			tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0], energies_d, gridSize,
 			numReplicas, get_energy);
 
 	// Calculate energy based on the array created by the kernel
 	if (get_energy) {
 		cudaDeviceSynchronize();
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num);
+		energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles);
 	}
 
 	return energy;
@@ -729,13 +730,13 @@ float ComputeForce::computeElecFull(bool get_energy) {
 float ComputeForce::compute(bool get_energy) {
 	float energy = 0.0f;
 
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
 	// Call the kernel to calculate forces
 	computeKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d,
-			tableAlpha_d, tableEps_d, tableRad6_d, num, numParts, sys_d[0],
+			tableAlpha_d, tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0],
 			decomp_d, energies_d, switchStart, switchLen, gridSize, numReplicas,
 			get_energy);
 
@@ -744,7 +745,7 @@ float ComputeForce::compute(bool get_energy) {
 	if (get_energy) {
 		gpuErrchk(cudaDeviceSynchronize());
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num + numGroupSites);
+		energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles + numGroupSites);
 	}
 
 	return energy;
@@ -757,7 +758,7 @@ float ComputeForce::compute(bool get_energy) {
 float ComputeForce::computeTabulated(bool get_energy) {
 	float energy = 0.0f;
 
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 	
@@ -773,7 +774,7 @@ float ComputeForce::computeTabulated(bool get_energy) {
 	{
 		//clearEnergies<<< nb, numThreads >>>(energies_d,num);
 		//gpuErrchk(cudaDeviceSynchronize());
-	        cudaMemset((void*)energies_d, 0, sizeof(float)*(num+numGroupSites)*numReplicas);
+	        cudaMemset((void*)energies_d, 0, sizeof(float)*(num+num_rb_attached_particles+numGroupSites)*numReplicas);
 		computeTabulatedEnergyKernel<<< nb, numThreads >>>(forceInternal_d[0], pos_d[0], sys_d[0],
 						cutoff2, numPairs_d[0], pairLists_d[0], pairTabPotType_d[0], tablePot_d[0], energies_d);
 	}
@@ -866,27 +867,27 @@ float ComputeForce::computeTabulated(bool get_energy) {
 float ComputeForce::computeTabulatedFull(bool get_energy) {
 	energy = 0.0f;
 
-	gridSize = (num * numReplicas) / NUM_THREADS + 1;
+	gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1;
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
 	// Call the kernel to calculate forces
-	computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d,	tablePot_d[0], tableBond_d, num, numParts, sys_d[0], bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d);
+	computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d,	tablePot_d[0], tableBond_d, num+num_rb_attached_particles, numParts, sys_d[0], bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d);
 	gpuErrchk(cudaDeviceSynchronize());
 
 	computeAngles<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], angles_d, tableAngle_d,
-																						 numAngles, num, sys_d[0], energies_d,
+																						 numAngles, num+num_rb_attached_particles, sys_d[0], energies_d,
 																						 get_energy);
 	gpuErrchk(cudaDeviceSynchronize());
 	computeDihedrals<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], dihedrals_d,
 																							  tableDihedral_d, numDihedrals,
-																								num, sys_d[0], energies_d,
+																								num+num_rb_attached_particles, sys_d[0], energies_d,
 																								get_energy);
 	// Calculate the energy based on the array created by the kernel
 	if (get_energy) {
 		gpuErrchk(cudaDeviceSynchronize());
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num);
+		energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles);
 	}
 
 	return energy;
@@ -894,7 +895,7 @@ float ComputeForce::computeTabulatedFull(bool get_energy) {
 
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 {
-    const size_t tot_num = (num+numGroupSites) * numReplicas;
+    const size_t tot_num = (num+num_rb_attached_particles+numGroupSites) * numReplicas;
 
 	for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
 	    gpuman.use(i);
@@ -935,11 +936,10 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 }
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom)
 {
-    const size_t tot_num = (num+numGroupSites) * numReplicas;
-    const size_t small_tot_num = num * numReplicas;
+    const size_t tot_num = num * numReplicas;
 
         gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
-        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * small_tot_num, cudaMemcpyHostToDevice));
+        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
 
 	copyToCUDA(forceInternal,pos);
         gpuErrchk(cudaDeviceSynchronize());
@@ -965,10 +965,11 @@ void ComputeForce::setForceInternalOnDevice(Vector3* f) {
 void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals, const Restraint* const restraints)
 {
     assert(simNum == numReplicas); // Not sure why we have both of these things
-    int tot_num = num * simNum;
+    int tot_num_with_rb = (num+num_rb_attached_particles) * simNum;
+    int tot_num_with_rb_group = (num+num_rb_attached_particles+numGroupSites) * simNum;
 	// type_d
-	gpuErrchk(cudaMalloc(&type_d, sizeof(int) * tot_num));
-	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * tot_num, cudaMemcpyHostToDevice));
+	gpuErrchk(cudaMalloc(&type_d, sizeof(int) * tot_num_with_rb));
+	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * tot_num_with_rb, cudaMemcpyHostToDevice));
 	
 	if (numBonds > 0)
 	{
@@ -977,8 +978,8 @@ void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap,
 		gpuErrchk(cudaMemcpyAsync(bonds_d, bonds, sizeof(Bond) * numBonds, cudaMemcpyHostToDevice));
 		
 		// bondMap_d
-		gpuErrchk(cudaMalloc(&bondMap_d, sizeof(int2) * num));
-		gpuErrchk(cudaMemcpyAsync(bondMap_d, bondMap, sizeof(int2) * num, cudaMemcpyHostToDevice));
+		gpuErrchk(cudaMalloc(&bondMap_d, sizeof(int2) * tot_num_with_rb_group));
+		gpuErrchk(cudaMemcpyAsync(bondMap_d, bondMap, sizeof(int2) * tot_num_with_rb_group, cudaMemcpyHostToDevice));
 	}
 
 	if (numExcludes > 0) {
@@ -990,8 +991,8 @@ void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap,
 				cudaMemcpyHostToDevice));
 		
 		// excludeMap_d
-		gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * num));
-		gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * num,
+		gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * tot_num_with_rb));
+		gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * tot_num_with_rb,
 				cudaMemcpyHostToDevice));
 	}
 
diff --git a/src/ComputeForce.h b/src/ComputeForce.h
index 5d602e4a609482a73ba7c639f8a575fc3023b2a7..b3bc258541f9603f387221363f3c39fe7fbae881 100644
--- a/src/ComputeForce.h
+++ b/src/ComputeForce.h
@@ -174,6 +174,7 @@ private:
 	int numReplicas;
 	int num;
 	int numParts;
+	int num_rb_attached_particles;
 	int numBonds;
 	int numExcludes;
 	int numTabBondFiles;
diff --git a/src/ComputeGridGrid.cu b/src/ComputeGridGrid.cu
index 1ecfb9fad191cb1e70d787e384198b57b2a87883..f6cb72cc78899b85ca0395e28e9021dedd9c92ed 100644
--- a/src/ComputeGridGrid.cu
+++ b/src/ComputeGridGrid.cu
@@ -171,9 +171,11 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 	}
 }
 
+
 __global__
 void createPartlist(const Vector3* __restrict__ pos,
 				const int numTypeParticles, const int* __restrict__ typeParticles_d,
+		    const int attached_particle_start, const int attached_particle_end,
 				int* numParticles_d, int* particles_d,
 				const Vector3 gridCenter, const float radius2, BaseGrid* sys_d) {
 	const int tid = threadIdx.x;
@@ -182,11 +184,13 @@ void createPartlist(const Vector3* __restrict__ pos,
 	const int i = blockIdx.x * blockDim.x + threadIdx.x;
 	if (i < numTypeParticles) {
 		int aid = typeParticles_d[i];
-		float dist = (sys_d->wrapDiff(pos[aid] - gridCenter)).length2();
-
-		if (dist <= radius2) {
+		if (aid < attached_particle_start || aid >= attached_particle_end) { 
+		    float dist = (sys_d->wrapDiff(pos[aid] - gridCenter)).length2();
+		
+		    if (dist <= radius2) {
 			int tmp = atomicAggInc(numParticles_d, warpLane);
 			particles_d[tmp] = aid;
+		    }
 		}
 	}
 }		
diff --git a/src/ComputeGridGrid.cuh b/src/ComputeGridGrid.cuh
index 739aa622e2514840f8586c47d0a277d306427ff4..27fc32ccb6b3b3fd43dd005fd9eda7eb1689c8f8 100644
--- a/src/ComputeGridGrid.cuh
+++ b/src/ComputeGridGrid.cuh
@@ -29,6 +29,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 extern __global__
 void createPartlist(const Vector3* __restrict__ pos,
 				const int numTypeParticles, const int* __restrict__ typeParticles_d,
+		    const int attached_particle_start, const int attached_particle_end,
 				int* numParticles_d, int* particles_d,
 				const Vector3 gridCenter, const float radius2, BaseGrid* sys_d);
 	
diff --git a/src/Configuration.cpp b/src/Configuration.cpp
index b1a89c49049548ef72024c4d4bfd69357acf60ab..e7bdfd71c57e163646c644d6ef3bb85485f4e7a7 100644
--- a/src/Configuration.cpp
+++ b/src/Configuration.cpp
@@ -97,8 +97,24 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
     }
   } // end result: variable "num" is set
 
+	// Count particles associated with rigid bodies
+	num_rb_attached_particles = 0;
+	if (numRigidTypes > 0) {
+	    // grow list of rbs
+	    for (int i = 0; i < numRigidTypes; i++) {
+		RigidBodyType &rbt = rigidBody[i];
+		rbt.attach_particles();
+		num_rb_attached_particles += rbt.num * rbt.num_attached_particles();
+	    }
+	}
+	assert( num_rb_attached_particles == 0 || simNum == 1 ); // replicas not yet implemented
+	// num = num+num_rb_attached_particles;
+
+
 	// Set the number capacity
 	printf("\n%d particles\n", num);
+	printf("%d particles attached to RBs\n", num_rb_attached_particles);
+
 	if (numCap <= 0) numCap = numCapFactor*num; // max number of particles
 	if (numCap <= 0) numCap = 20;
 
@@ -106,18 +122,35 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 	printf("%d groups\n", numGroupSites);
 
 	// Allocate particle variables.
-	pos = new Vector3[num * simNum];
+	// Each replica works with num+num_rb_attached_particles in array
+	pos = new Vector3[ (num+num_rb_attached_particles) * simNum];
+
         //Han-Yi Chou
         if (ParticleDynamicType == String("Langevin") || ParticleDynamicType == String("NoseHooverLangevin"))
-            momentum = new Vector3[num * simNum];
-
-	type   = new int[num * simNum];
-	serial = new int[num * simNum];
-	posLast = new Vector3[num * simNum];
+            momentum = new Vector3[(num+num_rb_attached_particles) * simNum];
+
+	type   = new int[(num+num_rb_attached_particles) * simNum];
+	serial = new int[(num+num_rb_attached_particles) * simNum];
+	posLast = new Vector3[(num+num_rb_attached_particles) * simNum];
+
+	{
+	    int pidx = 0;
+	    for (int i = 0; i < numRigidTypes; i++) { // Loop over RB types
+		RigidBodyType &rbt = rigidBody[i];
+		for (int j = 0; j < rbt.num; ++j) { // Loop over RBs
+		    for (const int& t: rbt.get_attached_particle_types()) {
+			type[num+pidx] = t;
+			serial[num+pidx] = num+pidx;
+			pidx++;
+		    }
+		}
+	    }
+	}	
+	
         //Han-Yi Chou
         if(ParticleDynamicType == String("Langevin") || ParticleDynamicType == String("NoseHooverLangevin"))
-           momLast = new Vector3[num * simNum];
-	name = new String[num * simNum];
+           momLast = new Vector3[(num+num_rb_attached_particles) * simNum];
+	name = new String[(num+num_rb_attached_particles) * simNum];
 	currSerial = 0;
 
 
@@ -164,7 +197,12 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 				String* tokenList = new String[numTokens];
 				partsFromFile[i].tokenize(tokenList);
 
-				int currType = 0;
+				int currType = find_particle_type(tokenList[2]);
+				if (currType == -1) {
+				    printf("Error: Unable to find particle type %s\n", tokenList[2].val());
+				    exit(1);
+
+				}
 				for (int j = 0; j < numParts; j++)
 					if (tokenList[2] == part[j].name)
 						currType = j;
@@ -216,7 +254,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
                                 printf("done!\n");
                         }
                         else
-                            loadedMomentum = Boltzmann(COM_Velocity, num * simNum);
+                            loadedMomentum = Boltzmann(COM_Velocity, (num * simNum));
                     }
                 }
             }
@@ -420,13 +458,45 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 			excludes = new Exclude[numExcludes];
 			for (int i = 0; i < oldNumExcludes; i++)
 				excludes[i] = tempExcludes[i];
-			delete tempExcludes;
+			delete [] tempExcludes;
 		}
 		for (int i = oldNumExcludes; i < numExcludes; i++)
 			excludes[i] = newExcludes[i - oldNumExcludes];
 		printf("Built %d exclusions.\n",numExcludes);
 	}
 
+	{ // Add exclusions for RB attached particles
+	    std::vector<Exclude> ex;
+	    int start = num;
+	    for (int i = 0; i < numRigidTypes; i++) { // Loop over RB types
+		RigidBodyType &rbt = rigidBody[i];
+		const int nap = rbt.num_attached_particles();
+		for (int j = 0; j < rbt.num; ++j) { // Loop over RBs
+		    for (int ai = 0; ai < nap-1; ++ai) {
+			for (int aj = ai+1; aj < nap; ++aj) {
+			    ex.push_back( Exclude( ai+start, aj+start ) );
+			}
+		    }
+		    start += nap;
+		}
+	    }
+	    // copy
+	    int oldNumExcludes = numExcludes;
+	    numExcludes = numExcludes + ex.size();
+	    if (excludes == NULL) {
+		excludes = new Exclude[numExcludes];
+	    } else if (numExcludes >= excludeCapacity) {
+		Exclude* tempExcludes = excludes;
+		excludes = new Exclude[numExcludes];
+		for (int i = 0; i < oldNumExcludes; i++)
+		    excludes[i] = tempExcludes[i];
+		delete [] tempExcludes;
+	    }
+	    for (int i = oldNumExcludes; i < numExcludes; i++)
+		excludes[i] = ex[i - oldNumExcludes];
+	}
+
+	printf("Built %d exclusions.\n",numExcludes);		
 	buildExcludeMap();
 
 	// Count number of particles of each type
@@ -434,7 +504,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 	for (int i = 0; i < numParts; ++i) {
 		numPartsOfType[i] = 0;
 	}
-	for (int i = 0; i < num; ++i) {
+	for (int i = 0; i < num+num_rb_attached_particles; ++i) {
 		++numPartsOfType[type[i]];
 	}
 
@@ -679,7 +749,7 @@ void Configuration::copyToCUDA() {
 	gpuErrchk(cudaMalloc(&sys_d, sizeof(BaseGrid)));
 	gpuErrchk(cudaMemcpyAsync(sys_d, sys, sizeof(BaseGrid), cudaMemcpyHostToDevice));
 	/*gpuErrchk(cudaMalloc(&type_d, sizeof(int) * num * simNum));
-	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * num * simNum, cudaMemcpyHostToDevice));
+	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int+num_rb_attached_particles) * num * simNum, cudaMemcpyHostToDevice));
 	
 	if (numBonds > 0) {
 		// bonds_d
@@ -698,7 +768,7 @@ void Configuration::copyToCUDA() {
 				cudaMemcpyHostToDevice));
 		
 		// excludeMap_d
-		gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * num));
+		gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * (num));
 		gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * num,
 				cudaMemcpyHostToDevice));
 	}
@@ -1117,6 +1187,8 @@ int Configuration::readParameters(const char * config_file) {
 		else if (param == String("rotDamping"))
 			rigidBody[currRB].rotDamping = stringToVector3( value );
 
+		else if (param == String("attachedParticles"))
+			rigidBody[currRB].append_attached_particle_file(value);
 		else if (param == String("densityGrid"))
 			rigidBody[currRB].addDensityGrid(value);
 		else if (param == String("potentialGrid"))
@@ -1275,6 +1347,7 @@ void Configuration::readAtoms() {
 		for (int i = 0; i < numParts; i++)
 			if (part[i].num == 0)
 				found = false;
+		// assert(false); // TODO probably relax constraint that particle must be found; could just be in RB
 		if (!found) {
 			printf("ERROR: Number of particles not specified in config file.\n");
 			exit(1);
@@ -1454,9 +1527,11 @@ void Configuration::readGroups() {
 		std::vector<int> tmp;
 		for (int i=1; i < numTokens; ++i) {
 		    const int ai = atoi(tokenList[i].val());
-		    if (ai >= num) {
+		    if (ai >= num+num_rb_attached_particles) {
 			printf("Error: Attempted to include invalid particle in group: %s\n", line);
 			exit(-1);
+		    } else if (ai >= num) {
+			printf("WARNING: including RB particles in group with line: %s\n", line);
 		    }
 		    tmp.push_back( ai );
 		}
@@ -1514,7 +1589,8 @@ void Configuration::readBonds() {
 			continue;
 		}
 
-		if (ind1 < 0 || ind1 >= num+numGroupSites || ind2 < 0 || ind2 >= num+numGroupSites) {
+		if (ind1 < 0 || ind1 >= num+num_rb_attached_particles+numGroupSites ||
+		    ind2 < 0 || ind2 >= num+num_rb_attached_particles+numGroupSites) {
 			printf("ERROR: Bond file line '%s' includes invalid index\n", line);
 			exit(1);
 		}
@@ -1562,8 +1638,8 @@ void Configuration::readBonds() {
 	 * bondMap[i].x is the index in the bonds array where the ith particle's bonds begin
 	 * bondMap[i].y is the index in the bonds array where the ith particle's bonds end
 	 */
-	bondMap = new int2[num+numGroupSites];
-	for (int i = 0; i < num+numGroupSites; i++) {	
+	bondMap = new int2[num+num_rb_attached_particles+numGroupSites];
+	for (int i = 0; i < num+num_rb_attached_particles+numGroupSites; i++) {
 		bondMap[i].x = -1;
 		bondMap[i].y = -1;
 	}
@@ -1621,8 +1697,8 @@ void Configuration::readExcludes()
 	}
 }
 void Configuration::addExclusion(int ind1, int ind2) {
-    if (ind1 >= num || ind2 >= num) {
-	printf("WARNING: Attempted to add an exclusion for an out-of-range particle index (%d or %d >= %d).\n", ind1, ind2, num);
+    if (ind1 >= num+num_rb_attached_particles || ind2 >= num+num_rb_attached_particles) {
+	printf("WARNING: Attempted to add an exclusion for an out-of-range particle index (%d or %d >= %d).\n", ind1, ind2, num+num_rb_attached_particles);
 	return;
     }
 		
@@ -1665,8 +1741,8 @@ void Configuration::buildExcludeMap() {
      * excludeMap[i].x is the index in the excludes array where the ith particle's excludes begin
      * excludeMap[i].y is the index in the excludes array where the ith particle's excludes end
      */
-    excludeMap = new int2[num];
-    for (int i = 0; i < num; i++) {	
+    excludeMap = new int2[num+num_rb_attached_particles];
+    for (int i = 0; i < num+num_rb_attached_particles; i++) {
 	excludeMap[i].x = -1;
 	excludeMap[i].y = -1;
     }
@@ -1675,7 +1751,7 @@ void Configuration::buildExcludeMap() {
     for (int i = 0; i < numExcludes; i++) {
 	if (excludes[i].ind1 != currPart) {
 	    currPart = excludes[i].ind1;
-	    assert(currPart < num);
+	    assert(currPart < num+num_rb_attached_particles);
 	    excludeMap[currPart].x = i;
 	    if (lastPart >= 0)
 		excludeMap[lastPart].y = i;
@@ -1724,7 +1800,7 @@ void Configuration::readAngles() {
 		int ind3 = atoi(tokenList[3].val());
 		String file_name = tokenList[4];
 		//printf("file_name %s\n", file_name.val());
-		if (ind1 >= num+numGroupSites or ind2 >= num+numGroupSites or ind3 >= num+numGroupSites)
+		if (ind1 >= num+num_rb_attached_particles+numGroupSites or ind2 >= num+num_rb_attached_particles+numGroupSites or ind3 >= num+num_rb_attached_particles+numGroupSites)
 			continue;
 
 		if (numAngles >= capacity) {
@@ -1785,8 +1861,10 @@ void Configuration::readDihedrals() {
 		int ind4 = atoi(tokenList[4].val());
 		String file_name = tokenList[5];
 		//printf("file_name %s\n", file_name.val());
-		if (ind1 >= num+numGroupSites or ind2 >= num+numGroupSites
-				or ind3 >= num+numGroupSites or ind4 >= num+numGroupSites)
+		if (ind1 >= num+num_rb_attached_particles+numGroupSites or
+		    ind2 >= num+num_rb_attached_particles+numGroupSites or
+		    ind3 >= num+num_rb_attached_particles+numGroupSites or
+		    ind4 >= num+num_rb_attached_particles+numGroupSites)
 			continue;
 
 		if (numDihedrals >= capacity) {
@@ -1845,7 +1923,7 @@ void Configuration::readRestraints() {
 		float y0 = (float) strtod(tokenList[4].val(), NULL);
 		float z0 = (float) strtod(tokenList[5].val(), NULL);
 
-		if (id >= num+numGroupSites) continue;
+		if (id >= num + num_rb_attached_particles + numGroupSites) continue;
 
 		if (numRestraints >= capacity) {
 			Restraint* temp = restraints;
diff --git a/src/Configuration.h b/src/Configuration.h
index fc70e2a1e61eed800c52235e868d8f3afd5bf6cc..ff7879faf686ff4b29e2c63be454d99ca2a96cde 100644
--- a/src/Configuration.h
+++ b/src/Configuration.h
@@ -65,6 +65,7 @@ class Configuration {
 	void readDihedrals();
 	void readRestraints();
 
+
 	bool readTableFile(const String& value, int currTab);
 	bool readBondFile(const String& value, int currBond);
 	bool readAngleFile(const String& value, int currAngle);
@@ -89,6 +90,15 @@ public:
 	Configuration(const char * config_file, int simNum = 0, bool debug=false);
 	~Configuration();
 
+    int find_particle_type(const char* s) const {
+	for (int j = 0; j < numParts; j++) {
+	    printf("Searching particle %d (%s) =? %s\n", j, part[j].name.val(), s);
+	    if (strcmp(s,part[j].name.val()) == 0)
+		return j;
+	}
+	return -1;
+    }
+
 	void copyToCUDA();
 
 	// Output variables
@@ -122,7 +132,8 @@ public:
 	Bond* bonds;
 	int numCap; // max number of particles
 	int num; // current number of particles
-	Vector3* pos; //  position of each particle
+    int num_rb_attached_particles;
+        Vector3* pos; //  position of each particle
         Vector3* momentum; //momentum of each brownian particles Han-Yi Chou
         Vector3  COM_Velocity; //center of mass velocity Han-Yi Chou
 	int* type; // type of each particle
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 1fb98bc084e49aca00279e5c91611bfbe0a9f32e..0510a0e41881166cabb49556a27b9c2f278130e6 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -97,10 +97,11 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	bonds = c.bonds;
 	numCap = c.numCap;                      // max number of particles
 	num = c.num;                            // current number of particles
+	num_rb_attached_particles = c.num_rb_attached_particles;
 	numGroupSites = c.numGroupSites;
 
 	// Allocate arrays of positions, types and serial numbers
-	pos    = new Vector3[(num+numGroupSites) * numReplicas];  // [HOST] array of particles' positions.
+	pos    = new Vector3[(num+num_rb_attached_particles+numGroupSites) * numReplicas];  // [HOST] array of particles' positions.
         // Allocate arrays of momentum Han-Yi Chou
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
         {
@@ -112,8 +113,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
         //for debug
         //force = new Vector3[num * numReplicas];
 
-	type   = new     int[num * numReplicas];  // [HOST] array of particles' types.
-	serial = new     int[num * numReplicas];  // [HOST] array of particles' serial numbers.
+	type   = new     int[(num+num_rb_attached_particles) * numReplicas];  // [HOST] array of particles' types.
+	serial = new     int[(num+num_rb_attached_particles) * numReplicas];  // [HOST] array of particles' serial numbers.
 
 	// Allocate things for rigid body
 	// RBC = RigidBodyController(c);
@@ -127,8 +128,10 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	
 	// Replicate identical initial conditions across all replicas
 	for (int r = 0; r < numReplicas; ++r) {
-	  std::copy(c.type, c.type + num, type + r*num);
-	  std::copy(c.serial, c.serial + num, serial + r*num);
+	    std::copy(c.type, c.type + num+num_rb_attached_particles,
+		      type + r*(num+num_rb_attached_particles));
+	    std::copy(c.serial, c.serial + num + num_rb_attached_particles,
+		      serial + r*(num+num_rb_attached_particles));
 	  if (c.copyReplicaCoordinates > 0)
 	    std::copy(c.pos, c.pos + num, pos + r*num);
 	}
@@ -307,8 +310,12 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 
 	auto _get_index = [this](int idx, int replica) {
 	    // Convenient lambda function to deal with increasingly complicated indexing
-	    auto num = this->num; auto numReplicas = this->numReplicas; auto numGroupSites = this->numGroupSites;
-	    idx = (idx < num) ? idx + replica*num : (idx-num) + numReplicas*num + replica * numGroupSites;
+	    auto num = this->num;
+	    auto numReplicas = this->numReplicas;
+	    auto num_rb_attached_particles = this->num_rb_attached_particles;
+	    auto numGroupSites = this->numGroupSites;
+	    idx = (idx < num+num_rb_attached_particles) ? idx + replica*(num+num_rb_attached_particles)
+		: (idx-num-num_rb_attached_particles) + numReplicas*(num+num_rb_attached_particles) + replica * numGroupSites;
 	    return idx;
 	};
 
@@ -367,7 +374,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	}
 	internal->copyBondedListsToGPU(bondList,angleList,dihedralList,dihedralPotList);
 	
-	forceInternal = new Vector3[(num+numGroupSites)*numReplicas];
+	forceInternal = new Vector3[(num+num_rb_attached_particles+numGroupSites)*numReplicas];
 	if (fullLongRange != 0)
 	    printf("No cell decomposition created.\n");
 
@@ -496,8 +503,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
     // Open the files for recording ionic currents
     for (int repID = 0; repID < numReplicas; ++repID) 
     {
-
-        writers[repID]->newFile(pos + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
+        writers[repID]->newFile(pos + repID*(num+num_rb_attached_particles), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             momentum_writers[repID]->newFile((momentum + repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
         //random_writers[repID]->newFile(random + (repID * num), name, 0.0f, num);
@@ -569,16 +575,16 @@ void GrandBrownTown::RunNoseHooverLangevin()
         #endif
         #pragma omp parallel for
         for(int i = 0; i < numReplicas; ++i)
-            RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+            RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+conf.num_rb_attached_particles), sys_d);
         gpuErrchk(cudaDeviceSynchronize());
     }
 
     float t; // simulation time
 
-    int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
+    int numBlocks = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + (((num+num_rb_attached_particles) * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
     int tl = temperatureGridFile.length();
     Vector3 *force_d;
-    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+numGroupSites) * numReplicas));
+    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+num_rb_attached_particles+numGroupSites) * numReplicas));
 
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
     for (int i=0; i< gpuman.gpus.size(); ++i) {
@@ -599,11 +605,23 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    internal->clear_force();
 	    internal->clear_energy();
 	    const std::vector<Vector3*>& _pos = internal->getPos_d();
-	    if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num, numGroupSites, numReplicas);
+
+	    if (num_rb_attached_particles > 0) {
+		#pragma omp parallel for
+		for(int i = 0; i < numReplicas; ++i) {
+		    RBC[i]->update_attached_particle_positions(
+			internal->getPos_d()[0]+num+i*(num+num_rb_attached_particles),
+			internal->getForceInternal_d()[0]+num+i*(num+num_rb_attached_particles),
+			internal->getEnergy()+num+i*(num+num_rb_attached_particles),
+			sys_d, num, num_rb_attached_particles, numReplicas);
+		}
+	    }
+
+	    if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas);
 
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
-		gpuman.nccl_broadcast(0, _pos, _pos, (num+numGroupSites)*numReplicas, -1);
+		gpuman.nccl_broadcast(0, _pos, _pos, (num+num_rb_attached_particles+numGroupSites)*numReplicas, -1);
 	    }
 	    #endif
 	    gpuman.sync();
@@ -633,7 +651,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 #endif
                                 #pragma omp parallel for
                                 for(int i = 0; i < numReplicas; ++i)
-                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                             }
                             internal -> computeTabulated(get_energy);
                             break;
@@ -657,7 +675,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 #endif
                                 #pragma omp parallel for
                                 for(int i = 0; i < numReplicas; ++i)
-                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                             }
                             internal->compute(get_energy);
                             break;
@@ -676,13 +694,32 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     }
                 }
             }//if inter-particle force
+
+	    if (get_energy) {
+		compute_position_dependent_force_for_rb_attached_particles
+		    <<< numBlocks, NUM_THREADS >>> (
+			internal -> getPos_d()[0], internal -> getForceInternal_d()[0],
+			internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
+	    } else {
+		compute_position_dependent_force_for_rb_attached_particles
+		    <<< numBlocks, NUM_THREADS >>> (
+			internal -> getPos_d()[0],
+			internal -> getForceInternal_d()[0], internal -> getEnergy(),
+			internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
+	    }
+
+
             #ifdef _OPENMP
             omp_set_num_threads(4);
             #endif
             #pragma omp parallel for
             for(int i = 0; i < numReplicas; ++i)
-                RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
-                                       RigidBodyInterpolationType, sys, sys_d);
+                RBC[i]->updateForces(internal->getPos_d()[0]+i*(num+num_rb_attached_particles),
+				     internal->getForceInternal_d()[0]+i*(num+num_rb_attached_particles),
+				     s,
+				     internal->getEnergy()+i*(num+num_rb_attached_particles),
+				     get_energy,
+				     RigidBodyInterpolationType, sys, sys_d, num, num_rb_attached_particles);
             if(rigidbody_dynamic == String("Langevin"))
             {
                 #ifdef _OPENMP
@@ -698,28 +735,29 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-		gpuman.nccl_reduce(0, _f, _f, (num+numGroupSites)*numReplicas, -1);
+		gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles+numGroupSites)*numReplicas, -1);
 	    }
 	    #endif
 
-	    if (numGroupSites > 0) distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    if (numGroupSites > 0) distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 
         }//if step == 1
 
 	internal->clear_energy();
 	gpuman.sync();
 
+
         if(particle_dynamic == String("Langevin"))
-            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
+            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
         else if(particle_dynamic == String("NoseHooverLangevin"))
             //kernel for Nose-Hoover Langevin dynamic
             updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), 
-            internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, 
+            internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d,
             randoGen_d, numReplicas, ParticleInterpolationType);
         ////For Brownian motion
         else
             updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getForceInternal_d()[0], internal -> getType_d(),
-                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, 
+                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas,
                                                        internal->getEnergy(), get_energy, ParticleInterpolationType);
 
         if(rigidbody_dynamic == String("Langevin"))
@@ -748,10 +786,21 @@ void GrandBrownTown::RunNoseHooverLangevin()
             }
         }
 
+	if (num_rb_attached_particles > 0) {
+	    #pragma omp parallel for
+	    for(int i = 0; i < numReplicas; ++i) {
+		RBC[i]->update_attached_particle_positions(
+		    internal->getPos_d()[0]+num+i*(num+num_rb_attached_particles),
+		    internal->getForceInternal_d()[0]+num+i*(num+num_rb_attached_particles),
+		    internal->getEnergy()+num+i*(num+num_rb_attached_particles),
+		    sys_d, num, num_rb_attached_particles, numReplicas);
+	    }
+	}
+	
         if (s % outputPeriod == 0) {
             // Copy particle positions back to CPU
 	    gpuErrchk(cudaDeviceSynchronize());
-            gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+            gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * (num+num_rb_attached_particles) * numReplicas, cudaMemcpyDeviceToHost));
 	}
         if (imd_on && clientsock && s % outputPeriod == 0)
         {
@@ -830,9 +879,10 @@ void GrandBrownTown::RunNoseHooverLangevin()
         for(int i = 0; i < numReplicas; ++i) 
             RBC[i]->clearForceAndTorque();
 
+	
 	if (numGroupSites > 0) {
 	    gpuman.sync();
-	    updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas);
 	    gpuman.sync();
 	}
 
@@ -844,7 +894,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _p = internal->getPos_d();
 		nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream();
-		gpuman.nccl_broadcast(0, _p, _p, (num+numGroupSites)*numReplicas, nccl_broadcast_streams);
+		gpuman.nccl_broadcast(0, _p, _p, (num+num_rb_attached_particles+numGroupSites)*numReplicas, nccl_broadcast_streams);
 	    }
 	    #endif
     	}
@@ -865,13 +915,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
                             #endif
                             #pragma omp parallel for
                             for(int i = 0; i < numReplicas; ++i)
-                                RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                         }
                         internal -> computeTabulated(get_energy);
 			#ifdef USE_NCCL
 			if (gpuman.gpus.size() > 1) {
 			    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-			    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
+			    gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles)*numReplicas, -1);
 			}
 			#endif
                         break;
@@ -912,28 +962,43 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 }
             }
         }
+
+	if (get_energy) {
+	    compute_position_dependent_force_for_rb_attached_particles
+		<<< numBlocks, NUM_THREADS >>> (
+		    internal -> getPos_d()[0], internal -> getForceInternal_d()[0],
+		    internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
+	} else {
+	    compute_position_dependent_force_for_rb_attached_particles
+		<<< numBlocks, NUM_THREADS >>> (
+		    internal -> getPos_d()[0],
+		    internal -> getForceInternal_d()[0], internal -> getEnergy(),
+		    internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
+	}
+
+
         //compute the force for rigid bodies
         #ifdef _OPENMP
         omp_set_num_threads(4);
         #endif
         #pragma omp parallel for
         for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition
-            RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
-                                 RigidBodyInterpolationType, sys, sys_d);
+            RBC[i]->updateForces((internal->getPos_d()[0])+i*(num+num_rb_attached_particles), (internal->getForceInternal_d()[0])+i*(num+num_rb_attached_particles), s, (internal->getEnergy())+i*(num+num_rb_attached_particles), get_energy,
+				 RigidBodyInterpolationType, sys, sys_d, num, num_rb_attached_particles);
 
 	if (numGroupSites > 0) {
 	    gpuman.sync();
 	    // if ((s%100) == 0) {
-	    distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 	// } else {
-	//     distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	//     distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 	// }
 	    gpuman.sync();
 	}
 
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), internal -> getForceInternal_d()[0], 
-            internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, 
+            internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy,
             ParticleInterpolationType);
             //gpuErrchk(cudaDeviceSynchronize());
 
@@ -957,7 +1022,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
         {
             if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             {
-                gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+                gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * (num) * numReplicas, cudaMemcpyDeviceToHost));
             }
             t = s*timestep;
             // Loop over all replicas
@@ -968,12 +1033,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     updateNameList(); // no need for it here if particles stay the same
 
                 // Write the trajectory.
-                writers[repID]->append(pos + (repID*num), name, serial, t, num);
+                writers[repID]->append(pos + repID*(num+num_rb_attached_particles), name, serial, t, num);
 
                 if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
                 {
-                    momentum_writers[repID]->append(momentum + (repID * num), name, serial, t, num);
-                    //force_writers[repID]->append(force + (repID * num), name, serial, t, num);
+                    momentum_writers[repID]->append(momentum + repID * (num+num_rb_attached_particles), name, serial, t, num);
+                    //force_writers[repID]->append(force + repID * (num+num_rb_attached_particles), name, serial, t, num);
                 }
             }
             // TODO: Currently, not compatible with replicas. Needs a fix.
@@ -1005,7 +1070,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 float e = 0.f;
                 float V = 0.f;
                 thrust::device_ptr<float> en_d(internal->getEnergy());
-                V = (thrust::reduce(en_d, en_d+num*numReplicas)) / numReplicas;
+                V = (thrust::reduce(en_d, en_d+(num+num_rb_attached_particles)*numReplicas)) / numReplicas;
                 if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
                 {
                     gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
@@ -1616,7 +1681,7 @@ void GrandBrownTown::run() {
 //
 void GrandBrownTown::populate() {
 	for (int repID = 0; repID < numReplicas; repID++) {
-		const int offset = repID * num;
+	    const int offset = repID * (num+num_rb_attached_particles);
 		int pn = 0;
 		int p = 0;
 		for (int i = 0; i < num; i++) {
@@ -1636,7 +1701,7 @@ void GrandBrownTown::populate() {
 void GrandBrownTown::writeRestart(int repID) const 
 {
     FILE* out   = fopen(restartFiles[repID].c_str(), "w");
-    const int offset = repID * num;
+    const int offset = repID * (num+num_rb_attached_particles);
 
     for (int i = 0; i < num; ++i) 
     {
@@ -1671,7 +1736,7 @@ void GrandBrownTown::initialCondCen() {
 // Set random initial positions for all particles and replicas
 void GrandBrownTown::initialCond() {
 	for (int repID = 0; repID < numReplicas; repID++) {
-		const int offset = repID * num;
+	    const int offset = repID * (num+num_rb_attached_particles);
 		for (int i = 0; i < num; i++) {
 			pos[i + offset] = findPos(type[i + offset]);
 		}
@@ -1715,7 +1780,7 @@ float GrandBrownTown::KineticEnergy()
     gpuErrchk(cudaMalloc((void**)&energy, sizeof(float)));
     gpuErrchk(cudaMemset((void*)energy,0,sizeof(float)));
 
-    BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, numReplicas*num);
+    BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, num, num_rb_attached_particles, numReplicas);
     gpuErrchk(cudaDeviceSynchronize());
 
     Reduction<64><<<dim3(1),dim3(64)>>>(vec_red, energy, 512);
@@ -1763,7 +1828,7 @@ void GrandBrownTown::init_cuda_group_sites()
     }
 
     // Create GPU-friendly data structure
-    // TODO make this work for replicas
+    assert(numReplicas == 1);    // TODO make this work for replicas
     int* tmp = new int[numGroupSites+1+num_particles];
     num_particles = 0;
     int i = 0;
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index c4c059aea09aafe0f9a77c3f0cd6a6a1f71e7073..a152dd1ddef4f945e5f468c124a685b0b2d23a13 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -14,17 +14,66 @@ __device__
 Vector3 step(Vector3& r0, float kTlocal, Vector3 force, float diffusion, Vector3 diffGrad,
 						 float timestep, BaseGrid *sys, Random *randoGen, int num);
 
+inline __device__
+ForceEnergy compute_position_dependent_force(
+    const Vector3* __restrict__ pos, Vector3* __restrict__ forceInternal,
+    const int* __restrict__ type, BrownianParticleType** part,
+    const float electricField, const int scheme, const int idx)
+{
+    int t = type[idx];
+    Vector3 r0 = pos[idx];
+    const BrownianParticleType& pt = *part[t];
+    Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
+
+    ForceEnergy fe(0.f, 0.f);
+    for(int i = 0; i < pt.numPartGridFiles; ++i)
+    {
+	ForceEnergy tmp(0.f, 0.f);
+	if(!scheme) {
+	    BoundaryCondition bc = pt.pmf_boundary_conditions[i];
+	    INTERPOLATE_FORCE(tmp, pt.pmf[i]->interpolateForceDLinearly, bc, r0)
+		} else
+	    tmp = pt.pmf[i]->interpolateForceD(r0);
+	fe.f += tmp.f * pt.pmf_scale[i];
+	fe.e += tmp.e * pt.pmf_scale[i];
+    }
+    // if(get_energy)
+    // 	energy[idx] += fe.e;
+
+#ifndef FORCEGRIDOFF
+    // Add a force defined via 3D FORCE maps (not 3D potential maps)
+    if(!scheme)
+    {
+	if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
+	if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
+	if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
+    }
+    else
+    {
+	if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0);
+	if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0);
+	if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0);
+    }
+#endif
+    fe.f = fe.f + forceExternal;
+    return fe;
+}
+
+
+
 //update both position and momentum for Langevin dynamics
 //Han-Yi Chou
 #if 0
 __global__ void updateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal, 
                              int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField, 
-                             int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas) 
+                             int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (idx < num * numReplicas)
     {
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
         int t = type[idx];
         //Vector3 r0(tex1Dfetch(PosTex, idx));
         //Vector3 p0(tex1Dfetch(MomTex, idx));
@@ -101,48 +150,24 @@ __global__ void
 updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__ momentum, float* random, 
                                Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[], 
                                float kT, BaseGrid* kTGrid, float electricField, int tGridLength, float timestep, 
-                               int num, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme)
+                               int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < num * numReplicas)
     {
-        int t = type[idx];
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
 
+        int t = type[idx];
         Vector3 r0  = pos[idx];
         Vector3 p0  = momentum[idx];
         float   ran = random[idx];
 
         const BrownianParticleType& pt = *part[t];
-        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
-        ForceEnergy fe(0.f, 0.f);
-        for(int i = 0; i < pt.numPartGridFiles; ++i)
-        {
-            ForceEnergy tmp(0.f,0.f);
-            if(!scheme) {
-		const BoundaryCondition& bc = pt.pmf_boundary_conditions[i];
-		INTERPOLATE_FORCE(tmp, pt.pmf[i]->interpolateForceDLinearly, bc, r0)
-	    } else
-                tmp = pt.pmf[i]->interpolateForceD(r0);
-            fe.f += tmp.f * pt.pmf_scale[i];
-        }
-        //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
 
-#ifndef FORCEGRIDOFF
-        // Add a force defined via 3D FORCE maps (not 3D potential maps)
-        if(!scheme)
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
-        }
-        else
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0);
-        }
-#endif
-        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+        Vector3 force = forceInternal[idx] + fe.f;
         #ifdef Debug
         forceInternal[idx] = -force;
         #endif
@@ -214,50 +239,27 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__
 //Han-Yi Chou
 __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
                                   int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
-                                  float electricField,int tGridLength, float timestep, int num, BaseGrid* sys, 
+                                  float electricField,int tGridLength, float timestep,
+				  int num, int num_rb_attached_particles, BaseGrid* sys,
                                   Random* randoGen, int numReplicas, int scheme)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
 
     if (idx < num * numReplicas)
     {
-        int t = type[idx];
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
+	// if (get_energy) energy[idx] += fe.e;
 
+        int t = type[idx];
         Vector3 r0 = pos[idx];
         Vector3 p0 = momentum[idx];
-
         const BrownianParticleType& pt = *part[t];
-        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
 
-        //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
-        ForceEnergy fe(0.f, 0.f);
-        for(int i = 0; i < pt.numPartGridFiles; ++i)
-        {
-            ForceEnergy tmp(0.f, 0.f);
-            if(!scheme) {
-		BoundaryCondition bc = pt.pmf_boundary_conditions[i];
-		INTERPOLATE_FORCE(tmp, pt.pmf[i]->interpolateForceDLinearly, bc, r0)
-	    } else 
-                tmp = pt.pmf[i]->interpolateForceD(r0);
-            fe.f += tmp.f * pt.pmf_scale[i];
-        }
-
-#ifndef FORCEGRIDOFF
-        // Add a force defined via 3D FORCE maps (not 3D potential maps)
-        if(!scheme)
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
-        }
-        else
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0);
-        }
-#endif
-        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+        Vector3 force = forceInternal[idx] + fe.f;
 #ifdef Debug
         forceInternal[idx] = -force;
 #endif
@@ -315,54 +317,23 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
 //update momentum in the last step of BAOAB integrator for the Langevin dynamics. Han-Yi Chou
 __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* __restrict__ forceInternal,
                                       int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
-                                      float electricField, int tGridLength, float timestep, int num, 
+                                      float electricField, int tGridLength, float timestep, int num, int num_rb_attached_particles,
                                       BaseGrid* sys, Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy,int scheme)
 {
-    const int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+    int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
 
     if (idx < num * numReplicas)
     {
-        int t = type[idx];
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
+	if (get_energy) energy[idx] += fe.e;
+
         Vector3 r0 = pos[idx];
         Vector3 p0 = momentum[idx];
 
-        const BrownianParticleType& pt = *part[t];
-        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
-
-        //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
-        ForceEnergy fe(0.f, 0.f);
-        for(int i = 0; i < pt.numPartGridFiles; ++i)
-        {
-            ForceEnergy tmp(0.f, 0.f);
-            if(!scheme) {
-		BoundaryCondition bc = pt.pmf_boundary_conditions[i];
-		INTERPOLATE_FORCE(tmp, pt.pmf[i]->interpolateForceDLinearly, bc, r0)
-	    } else 
-                tmp = pt.pmf[i]->interpolateForceD(r0);
-
-	    tmp.f = tmp.f * pt.pmf_scale[i];
-	    tmp.e = tmp.e * pt.pmf_scale[i];
-            fe += tmp;
-            //fe.e += tmp.e;
-        }
-        if(get_energy)
-            energy[idx] += fe.e;
-#ifndef FORCEGRIDOFF
-        // Add a force defined via 3D FORCE maps (not 3D potential maps)
-        if(!scheme)
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
-        }
-        else
-        {
-            if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0);
-            if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0);
-            if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0);
-        }
-#endif
-        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+        Vector3 force = forceInternal[idx] + fe.f;
 #ifdef Debug
         forceInternal[idx] = -force;
 #endif
@@ -380,12 +351,14 @@ __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* _
 #if 0
 __global__ void LastUpdateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
                                  int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField,
-                                 int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas)
+				      int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas)
 {
-    const int idx  = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx  = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (idx < num * numReplicas)
     {
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
         int t = type[idx];
         Vector3 r0 = pos[idx];
         Vector3 p0 = momentum[idx];
@@ -441,16 +414,17 @@ __global__ void LastUpdateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3*
 __global__
 void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[], 
                   BrownianParticleType* part[],float kT, BaseGrid* kTGrid, float electricField, 
-                  int tGridLength, float timestep, int num, BaseGrid* sys,
+                  int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys,
 		  Random* randoGen, int numReplicas, float* energy, bool get_energy, int scheme) 
 {
 	// Calculate this thread's ID
-	const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+	int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
         
 	// TODO: Make this a grid-stride loop to make efficient reuse of RNG states 
 	// Loop over ALL particles in ALL replicas
 	if (idx < num * numReplicas) 
         {
+	    idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
 		const int t = type[idx];
 		Vector3   p = pos[idx];
 
@@ -458,46 +432,14 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[],
                 
 	 	/* printf("atom %d: forceInternal: %f %f %f\n", idx, forceInternal[idx].x, forceInternal[idx].y, forceInternal[idx].z);  */
 
-		// Compute external force
-		Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
-
-		// Compute PMF
-		// TODO: maybe make periodic and nonPeriodic versions of this kernel
-		//ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(p);
-		ForceEnergy fe(0.f, 0.f);
-                for(int i = 0; i < pt.numPartGridFiles; ++i)
-                {
-                    ForceEnergy tmp(0.f,0.f);
-		    if(!scheme) {
-			BoundaryCondition bc = pt.pmf_boundary_conditions[i];
-			INTERPOLATE_FORCE(tmp, pt.pmf[i]->interpolateForceDLinearly, bc, p)
-		    } else 
-                        tmp = pt.pmf[i]->interpolateForceD(p);
-		    tmp.f *= pt.pmf_scale[i];
-		    tmp.e *= pt.pmf_scale[i];
-                    fe += tmp;
-                }
-#ifndef FORCEGRIDOFF
-		// Add a force defined via 3D FORCE maps (not 3D potential maps)
-		if(!scheme)
-                {
-		    if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(p);
-		    if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(p);
-		    if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(p);
-                }
-                else
-                {
-                    if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(p);
-                    if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(p);
-                    if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(p);
-                }
-#endif
+		ForceEnergy fe = compute_position_dependent_force(
+		    pos, forceInternal, type, part, electricField, scheme, idx );
 
 		// Compute total force:
 		//	  Internal:  interaction between particles
 		//	  External:  electric field (now this is basically a constant vector)
 		//	  forceGrid: ADD force due to PMF or other potentials defined in 3D space
-		Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+		Vector3 force = forceInternal[idx] + fe.f;
                 #ifdef Debug
                 forceInternal[idx] = -force;
                 #endif
@@ -709,3 +651,37 @@ __global__ void devicePrint(RigidBodyType* rb[]) {
 //     cudaMalloc( &totalForce_h, sizeof(Vector3) );
 //     cudaMemcpyToSymbol(totalForce, &totalForce_h, sizeof(totalForce_h));
 // }
+
+__global__ void compute_position_dependent_force_for_rb_attached_particles(
+    const Vector3* __restrict__ pos,
+    Vector3* __restrict__ forceInternal, float* __restrict__ energy,
+    const int* __restrict__ type, BrownianParticleType** __restrict__ part,
+    const float electricField, const int num, const int num_rb_attached_particles,
+    const int numReplicas, const int scheme)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < num_rb_attached_particles * numReplicas)
+    {
+	idx = num + (idx % num_rb_attached_particles) + (idx/num_rb_attached_particles) * (num+num_rb_attached_particles);
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
+	atomicAdd( &forceInternal[idx], fe.f );
+	atomicAdd( &energy[idx], fe.e );
+    }
+}
+__global__ void compute_position_dependent_force_for_rb_attached_particles(
+    const Vector3* __restrict__ pos, Vector3* __restrict__ forceInternal,
+    const int* __restrict__ type, BrownianParticleType** __restrict__ part,
+    const float electricField, const int num, const int num_rb_attached_particles,
+    const int numReplicas, const int scheme)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < num_rb_attached_particles * numReplicas)
+    {
+	idx = num + (idx % num_rb_attached_particles) + (idx/num_rb_attached_particles) * (num+num_rb_attached_particles);
+	ForceEnergy fe = compute_position_dependent_force(
+	    pos, forceInternal, type, part, electricField, scheme, idx );
+	atomicAdd( &forceInternal[idx], fe.f );
+    }
+}
diff --git a/src/GrandBrownTown.h b/src/GrandBrownTown.h
index c6479a081fb2b4461493f49269c1e67b29eb7323..6d8ed7f34c5bcdbed402929d8c5c3b41ffcb5f89 100644
--- a/src/GrandBrownTown.h
+++ b/src/GrandBrownTown.h
@@ -230,6 +230,8 @@ private:
 	int numAngles;
 	int numDihedrals;
 
+    int num_rb_attached_particles;
+
 	int numGroupSites;
 	int* groupSiteData_d;
 
diff --git a/src/Makefile b/src/Makefile
index b0356b86521cc4ca5448f5fdb8b0b35ea106b2dc..68598f462bf925e748a759b91ab9bd5ab09d6afc 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,6 +1,7 @@
 ### Paths, libraries, includes, options
 include ./findcudalib.mk
 
+SUFFIX ?= ""
 
 ifeq ($(dbg),1)
 	NV_FLAGS += -g -G -O0
@@ -99,13 +100,18 @@ ifeq ($(omp),1)
 TARGET := $(TARGET)_omp
 endif
 
-all: $(TARGET)
-	@echo "Done ->" $(TARGET)
+all: $(TARGET)$(SUFFIX)
+	@echo "Done ->" $(TARGET)$(SUFFIX)
 
-$(TARGET): $(CU_OBJ) $(CC_OBJ) arbd.cpp vmdsock.c imd.c imd.h
-	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
+$(TARGET)_link.o: $(CU_OBJ)
+#	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
 	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o
-	$(CC) $(CC_FLAGS) $(EX_FLAGS) arbd.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS)  -o $(TARGET)
+
+$(TARGET)$(SUFFIX): $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) arbd.cpp vmdsock.c imd.c imd.h
+#	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
+#	$(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o
+	$(CC) $(CC_FLAGS) $(EX_FLAGS) arbd.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS)  -o $(TARGET)$(SUFFIX)
+
 
 .SECONDEXPANSION:
 $(CU_OBJ): %.o: %.cu $$(wildcard %.h) $$(wildcard %.cuh)
diff --git a/src/RigidBody.cu b/src/RigidBody.cu
index e224d3dbf42be8bb3afde7dfa1e28f37ff4d8d26..8aeb7aa65711c72fa02149e73cf969a611f4078f 100644
--- a/src/RigidBody.cu
+++ b/src/RigidBody.cu
@@ -18,10 +18,10 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 }
 #endif
 
-RigidBody::RigidBody(String name, const Configuration& cref, const RigidBodyType& tref, RigidBodyController* RBCref) 
-    : name(name), c(&cref), t(&tref), RBC(RBCref), impulse_to_momentum(4.1867999435271e4) /*impulse_to_momentum(4.184e8f)*/ { init(); }
+RigidBody::RigidBody(String name, const Configuration& cref, const RigidBodyType& tref, RigidBodyController* RBCref, int attached_particle_start, int attached_particle_end) 
+    : name(name), c(&cref), t(&tref), RBC(RBCref), attached_particle_start(attached_particle_start), attached_particle_end(attached_particle_end), impulse_to_momentum(4.1867999435271e4) /*impulse_to_momentum(4.184e8f)*/ { init(); }
 RigidBody::RigidBody(const RigidBody& rb)
-    : name(rb.name), c(rb.c), t(rb.t), RBC(rb.RBC), impulse_to_momentum(4.1867999435271e4)/*impulse_to_momentum(4.184e8f)*/ { init(); }
+    : name(rb.name), c(rb.c), t(rb.t), RBC(rb.RBC), attached_particle_start(rb.attached_particle_start), attached_particle_end(rb.attached_particle_end), impulse_to_momentum(4.1867999435271e4)/*impulse_to_momentum(4.184e8f)*/ { init(); }
 void RigidBody::init() {
 	// units "(kcal_mol/AA) * ns" "dalton AA/ns" * 4.184e+08	
 	timestep = c->timestep;
@@ -103,6 +103,23 @@ int RigidBody::appendNumParticleBlocks( std::vector<int>* blocks ) {
     return ret;
 }
 
+__global__
+void update_particle_positions_kernel(Vector3* __restrict__ pos, const int start, const int num,
+			       const Vector3* __restrict__ pos_rb,
+			       const Vector3 center, const Matrix3 orientation) {
+	const int i = blockIdx.x * blockDim.x + threadIdx.x;
+	if (i < num) {
+	    const int aid = i+start;
+	    pos[aid] = orientation*pos_rb[i] + center;
+	}
+}		
+void RigidBody::update_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d) {
+    int num_attached = attached_particle_end - attached_particle_start;
+    int nb = floor(num_attached/NUMTHREADS) + 1;
+    update_particle_positions_kernel<<<nb,NUMTHREADS>>>(pos_d, attached_particle_start, num_attached,
+						 t->attached_particle_positions_d, position, orientation);
+}
+
 void RigidBody::addForce(Force f) { 
 	force += f; 
 } 
@@ -113,6 +130,141 @@ void RigidBody::addEnergy(float e)
 {
     energy += e;
 }
+
+// TODO move code snippet to more general location
+template<class C, class T>
+auto contains(const C& v, const T& x)
+-> decltype(end(v), true)
+{
+    return end(v) != std::find(begin(v), end(v), x);
+}
+/*
+void RigidBody::initialize_possible_particles()
+{
+    std::vector<int> atomic_ids;
+
+    // Loop over potential grids
+    for (int i = 0; i < t->numPotGrids; ++i) {
+	atomic_ids.clear();
+
+	String& gridName = t->potentialGridKeys[i];
+	
+	// Loop over particle types to count the number of particles
+	for (int j = 0; j < conf->numParts; ++j) {
+	    if (contains(conf->partRigidBodyGrid[j], gridName)) {
+		// gridNames contained gridName, so add the particles to atomic_ids
+		for (int aid = 0; aid < conf->num + conf->num_rb_attached_particles; ++aid) {
+		    if (conf->type[aid] == j && (aid < exclude_start || aid > exclude_end)) {
+			atomic_ids.push_back(aid);
+		    }
+		}
+	    }
+	}
+
+	// Initialize device data
+	size_t sz = sizeof(int) * atomic_ids.size();
+	gpuErrchk(cudaMalloc( &(possible_particles_d[i]), sz ));
+	gpuErrchk(cudaMemcpyAsync( possible_particles_d[i], &atomic_ids[0], sz, cudaMemcpyHostToDevice))
+
+	// // Add particles attached to OTHER RBs
+	// int rb_particle_offset = conf->num;
+	// for (const auto& rbs: RBC->rigidBodyByType)
+	// {
+	//     int rb_type
+	//     const RigidBodyType& rb_type = rbs[0].t;
+	//     if (rbs.t != 
+	//     for (int& ptype: rb_type->attached_particle_types) {
+	// 	// This could be made much more efficient
+	// 	if (contains(conf->partRigidBodyGrid[ptype], gridName)) {
+	// 	// Add particles
+	// 	rb
+	// 	rbs[0].sum
+	// 	    }
+	//     rb_particle_offset += rb_type.attached_particle_types
+	    
+	//     for (int k = 0; k < t->gridNames.size(); ++k) {
+	// 	    if (t->gridNames[k] == gridName) {
+	// 		t->numParticles[i] += conf->numPartsOfType[j];			
+	// 	    }
+
+	    
+	// 		// Loop over rigid body grid names associated with particle type
+	// 		const std::vector<String>& gridNames = conf->partRigidBodyGrid[j];
+	// 		for (int k = 0; k < gridNames.size(); ++k) {
+	// 		    if (gridNames[k] == gridName) {
+	// 			// Copy type j particles to particles[i]
+	// 			memcpy( &(particles[i][pid]), tmp, sizeof(int)*currId );
+	// 			assert(currId == conf->numPartsOfType[j]);
+	// 			pid += conf->numPartsOfType[j];
+	// 		    }
+	// 		}
+	// 	    }
+
+	// numParticles[i] = 0;
+
+	    // Count the particles interacting with potential grid i
+	    // Loop over particle types
+	    for (int j = 0; j < conf->numParts; ++j) {
+
+		// Loop over rigid body grid names associated with particle type
+		const std::vector<String>& gridNames = conf->partRigidBodyGrid[j];
+		for (int k = 0; k < t->gridNames.size(); ++k) {
+		    if (t->gridNames[k] == gridName) {
+			numParticles[i] += conf->numPartsOfType[j];			
+		    }
+		}
+	    }
+
+	    // Add RB particles
+	    for (const auto& rbv: RBC->rigidBodyByType)
+	    {
+		int ptype = rbv[0].t->attached_particle_types;
+		const std::vector<String>& gridNames = conf->partRigidBodyGrid[ptype];
+		for (int k = 0; k < t->gridNames.size(); ++k) {
+		    if (t->gridNames[k] == gridName) {
+			t->numParticles[i] += conf->numPartsOfType[j];			
+		    }
+		
+	    attached_particle_
+	    
+	    if (numParticles[i] > 0) {
+
+		    // allocate array of particle ids for the potential grid 
+		    particles[i] = new int[numParticles[i]];
+		    int pid = 0;
+		
+		    // Loop over particle types to count the number of particles
+		    for (int j = 0; j < conf->numParts; ++j) {
+
+			// Build temporary id array of type j particles
+			int tmp[conf->numPartsOfType[j]];
+			int currId = 0;
+			for (int aid = 0; aid < conf->num; ++aid) {
+			    if (conf->type[aid] == j)
+				tmp[currId++] = aid;
+			}
+			if (currId == 0) continue;
+
+			// Loop over rigid body grid names associated with particle type
+			const std::vector<String>& gridNames = conf->partRigidBodyGrid[j];
+			for (int k = 0; k < gridNames.size(); ++k) {
+			    if (gridNames[k] == gridName) {
+				// Copy type j particles to particles[i]
+				memcpy( &(particles[i][pid]), tmp, sizeof(int)*currId );
+				assert(currId == conf->numPartsOfType[j]);
+				pid += conf->numPartsOfType[j];
+			    }
+			}
+		    }
+
+		    // Initialize device data
+		    size_t sz = sizeof(int) * numParticles[i];
+		    gpuErrchk(cudaMalloc( &(particles_d[i]), sz ));
+		    gpuErrchk(cudaMemcpyAsync( particles_d[i], particles[i], sz, cudaMemcpyHostToDevice));
+		}
+	}
+}
+*/
 void RigidBody::updateParticleList(Vector3* pos_d, BaseGrid* sys_d) {
 	for (int i = 0; i < t->numPotGrids; ++i) {
 		numParticles[i] = 0;
@@ -131,10 +283,12 @@ void RigidBody::updateParticleList(Vector3* pos_d, BaseGrid* sys_d) {
 			int nb = floor(tnp/NUMTHREADS) + 1;
 #if __CUDA_ARCH__ >= 300
 			createPartlist<<<nb,NUMTHREADS>>>(pos_d, tnp, t->particles_d[i],
+							  attached_particle_start, attached_particle_end,
 							tmp_d, particles_d[i],
 							gridCenter + position, cutoff*cutoff, sys_d);
 #else
 			createPartlist<<<nb,NUMTHREADS,NUMTHREADS/WARPSIZE>>>(pos_d, tnp, t->particles_d[i],
+							  attached_particle_start, attached_particle_end,
 							tmp_d, particles_d[i],
 							gridCenter + position, cutoff*cutoff, sys_d);
 #endif			
@@ -179,6 +333,20 @@ void RigidBody::callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, in
 	}
 }
 
+void RigidBody::apply_attached_particle_forces(const Vector3* force) {
+    const auto &rb_pos = t->attached_particle_positions;
+    int num = rb_pos.size();
+    Vector3 total_force = Vector3(0.0f);
+    Vector3 torque = Vector3(0.0f);
+    for (int i = 0; i < num; ++i) {
+	const int j = i + attached_particle_start;
+	torque = torque + (orientation*rb_pos[i]).cross(force[j]);
+	total_force = total_force + force[j];
+    }
+    addForce(total_force);
+    addTorque(torque);
+}
+
 void RigidBody::applyGridParticleForces(BaseGrid* sys, ForceEnergy* forcestorques, const std::vector<int>& forcestorques_offset, int& fto_idx) {
 	// loop over potential grids 
 	for (int i = 0; i < t->numPotGrids; ++i) {
diff --git a/src/RigidBody.h b/src/RigidBody.h
index 5854a082a29f975d11281b9ad45ffd5bc8cc0fbf..e3e81737ae1d7aee45f16e915d44f6aa413e1749 100644
--- a/src/RigidBody.h
+++ b/src/RigidBody.h
@@ -31,7 +31,9 @@ class RigidBody { // host side representation of rigid bodies
 	| splitting methods for rigid body molecular dynamics". J Chem Phys. (1997) |
 	`––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––*/
 		public:
-	RigidBody(String name, const Configuration& c, const RigidBodyType& t, RigidBodyController* RBC);
+    RigidBody(String name, const Configuration& c, const RigidBodyType& t, RigidBodyController* RBC,
+	      int attached_particle_start, int attached_particle_end);
+
     RigidBody(const RigidBody& rb);
     // RigidBody(const RigidBody& rb) : RigidBody(rb.name, *rb.c, *rb.t) {};
 	void init();
@@ -40,6 +42,8 @@ class RigidBody { // host side representation of rigid bodies
 
 	int appendNumParticleBlocks( std::vector<int>* blocks );
 
+    void update_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d);
+
 	HOST DEVICE void addForce(Force f); 
 	HOST DEVICE void addTorque(Force t);
         HOST DEVICE void addEnergy(float e);
@@ -76,8 +80,10 @@ class RigidBody { // host side representation of rigid bodies
               return Vector3( angularMomentum.x, angularMomentum.y, angularMomentum.z);
         }
 
+        void initializeParticleLists();
 	void updateParticleList(Vector3* pos_d, BaseGrid* sys_d);
 	void callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, ForceEnergy* forcestorques_d, const std::vector<int>& forcestorques_offset, int& fto_idx);
+	void apply_attached_particle_forces(const Vector3* force);
 	void applyGridParticleForces(BaseGrid* sys, ForceEnergy* forcestorques, const std::vector<int>& forcestorques_offset, int& fto_idx);
 	
 	bool langevin;
@@ -122,9 +128,12 @@ private:
 	bool isFirstStep; 
 	
 	int* numParticles;		  /* particles affected by potential grids */
+	int** possible_particles_d;		 	
 	int** particles_d;		 	
 	const cudaStream_t** particleForceStreams;
-	
+
+    int attached_particle_start, attached_particle_end;
+    
 	/*–––––––––––––––––––––––––––––––––––––––––.
 	| units "kcal_mol/AA * ns" "(AA/ns) * amu" |
 	`–––––––––––––––––––––––––––––––––––––––––*/
diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu
index c5e855ebb34692a168a72198431ff1450d914a53..b7fd3c8964273b336705e4f912e1f7d4df8eedd3 100644
--- a/src/RigidBodyController.cu
+++ b/src/RigidBodyController.cu
@@ -45,15 +45,14 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* pre
 
 	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */
 	construct_grids();
-	for (int i = 0; i < conf.numRigidTypes; i++)
-		conf.rigidBody[i].initializeParticleLists();
 
 	int numRB = 0;
 	// grow list of rbs
 
-
+	int attached_particle_offset = 0;
 	for (int i = 0; i < conf.numRigidTypes; i++) {			
 		numRB += conf.rigidBody[i].num;
+		int attached_particle_in_type = conf.rigidBody[i].num_attached_particles();
 		std::vector<RigidBody> tmp;
 		// RBTODO: change conf.rigidBody to conf.rigidBodyType
 		const int jmax = conf.rigidBody[i].num;
@@ -64,13 +63,17 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* pre
 			    snprintf(stmp, 128, "#%d", j);
 			    name.add( stmp );
 			}
-			RigidBody r(name, conf, conf.rigidBody[i], this);
+			RigidBody r(name, conf, conf.rigidBody[i], this,
+				    attached_particle_offset, attached_particle_offset+attached_particle_in_type);
+			attached_particle_offset += attached_particle_in_type;
+
 			int nb = r.appendNumParticleBlocks( &particleForceNumBlocks );
 			tmp.push_back( r );
 		}
 		rigidBodyByType.push_back(tmp);
 	}
-	
+	attached_particle_forces = new Vector3[attached_particle_offset];
+
 	totalParticleForceNumBlocks = 0;
 	for (int i=0; i < particleForceNumBlocks.size(); ++i) {
 	    particleForce_offset.push_back(2*totalParticleForceNumBlocks);
@@ -96,6 +99,8 @@ RigidBodyController::~RigidBodyController() {
 	for (int i = 0; i < rigidBodyByType.size(); i++)
 		rigidBodyByType[i].clear();
 	rigidBodyByType.clear();
+
+	delete [] attached_particle_forces;
 	delete random;
 }
 
@@ -525,6 +530,17 @@ void RigidBodyController::initializeForcePairs() {
 			
 }
 
+void RigidBodyController::update_attached_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d, BaseGrid* sys_d, int num, int num_rb_attached_particles, int numReplicas) {
+    for (int i = 0; i < rigidBodyByType.size(); i++) {
+	for (int j = 0; j < rigidBodyByType[i].size(); j++) {
+	    rigidBodyByType[i][j].update_particle_positions(pos_d, force_d, energy_d);
+	}
+    }
+    gpuErrchk(cudaMemset((void*) force_d, 0, num_rb_attached_particles*sizeof(Vector3)));
+    gpuErrchk(cudaMemset((void*) energy_d, 0, num_rb_attached_particles*sizeof(float)));
+}
+
+
 void RigidBodyController::updateParticleLists(Vector3* pos_d, BaseGrid* sys_d) {
 	for (int i = 0; i < rigidBodyByType.size(); i++) {
 		for (int j = 0; j < rigidBodyByType[i].size(); j++) {
@@ -547,7 +563,7 @@ void RigidBodyController::clearForceAndTorque()
     }
 }
 
-void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d) 
+void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, int num, int num_rb_attached_particles)
 {
 	//if (s <= 1)
 		//gpuErrchk( cudaProfilerStart() );
@@ -564,6 +580,7 @@ void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s,
 	// RBTODO: launch kernels ahead of time and sync using event and memcpyAsync 
 	gpuErrchk( cudaDeviceSynchronize() );
 	cudaMemcpy(particleForces, particleForces_d, sizeof(ForceEnergy)*2*totalParticleForceNumBlocks, cudaMemcpyDeviceToHost);
+	cudaMemcpyAsync(attached_particle_forces, &force_d[num], sizeof(Vector3)*(num_rb_attached_particles), cudaMemcpyDeviceToHost);
 
 	pfo_idx=0;
 	for (int i = 0; i < rigidBodyByType.size(); i++) {
@@ -574,6 +591,16 @@ void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s,
 		}
 	}
 
+	{
+	    gpuErrchk( cudaDeviceSynchronize() );
+	    for (auto &rbv: rigidBodyByType) {
+		for (auto &rb: rbv) {
+		    rb.apply_attached_particle_forces( attached_particle_forces );
+		}
+	    }
+	}
+
+
 	// Grid–Grid forces
 	if ( ((s % conf.rigidBodyGridGridPeriod) == 0 || s == 1 ) && forcePairs.size() > 0) {
 		for (int i=0; i < forcePairs.size(); i++) {
diff --git a/src/RigidBodyController.h b/src/RigidBodyController.h
index 0e2dbadbf1e5eb35f9ad4fd007fe1eeb145e8f81..fa2fa4e5d1efd6b8bf43a83e9e34b456d5f67245 100644
--- a/src/RigidBodyController.h
+++ b/src/RigidBodyController.h
@@ -108,8 +108,9 @@ public:
         void SetRandomTorques();
 	void integrate(BaseGrid* sys, int step);
         void integrateDLM(BaseGrid* sys, int step);
-	void updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d);
+	void updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, int num, int num_rb_attached_particles);
 	void updateParticleLists(Vector3* pos_d, BaseGrid* sys_d);
+    void update_attached_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d, BaseGrid* sys_d, int num, int num_rb_attached_particles, int numReplicas);
         void clearForceAndTorque(); 
         void KineticEnergy();
         void print(int step);
@@ -151,7 +152,8 @@ private:
 	void construct_grids();
 	void destruct_grids();
 
-        //float* rb_energy;	
+        //float* rb_energy;
+	Vector3* attached_particle_forces;
 	ForceEnergy* particleForces;
 	ForceEnergy* particleForces_d;
 	std::vector<int> particleForceNumBlocks;
diff --git a/src/RigidBodyType.cu b/src/RigidBodyType.cu
index 969a16ad7d6c3183d9a0f69ee0c4bffd142faf0f..2137160943aaf63fa2345f48e0b115373a242704 100644
--- a/src/RigidBodyType.cu
+++ b/src/RigidBodyType.cu
@@ -81,6 +81,58 @@ void RigidBodyType::setDampingCoeffs(float timestep) { /* MUST ONLY BE CALLED ON
 
 }
 	
+void RigidBodyType::attach_particles() {
+    for (const auto& filename: attached_particle_files) {
+	const size_t line_char = 256;
+	FILE* inp = fopen(filename.val(), "r");
+	char line[line_char];
+
+	// If the particle file cannot be found, exit the program
+	if (inp == NULL) {
+	    printf("ERROR: Could not open `%s'.\n", filename.val());
+	    fclose(inp);
+	    exit(1);
+	}
+
+	// Get and process all lines of input
+	while (fgets(line, line_char, inp) != NULL) {
+		// Lines in the particle file that begin with # are comments
+		if (line[0] == '#') continue;
+
+		String s(line);
+		int numTokens = s.tokenCount();
+
+		// Break the line down into pieces (tokens) so we can process them individually
+		String* tokenList = new String[numTokens];
+		s.tokenize(tokenList);
+
+		// Legitimate GROUP input lines have at least 3 tokens:
+		// Particle_type | x | y | z
+		// A line without exactly six tokens should be discarded.
+		if (numTokens != 4) {
+		    printf("Error: Invalid attached particle file line: %s\n", tokenList[0].val());
+		    fclose(inp);
+		    exit(-1);
+		}
+
+		// Make sure the index of this particle is unique.
+		// NOTE: The particle list is sorted by index.
+		int type_idx = conf->find_particle_type( tokenList[0].val() );
+		if (type_idx < 0) {
+		    printf("Error: Unrecognized particle type: %s\n", line);
+		    fclose(inp);
+		    exit(-1);
+		}
+		attached_particle_types.push_back( type_idx );
+		attached_particle_positions.push_back( Vector3(atof(tokenList[1].val()), atof(tokenList[2].val()), atof(tokenList[3].val())) );
+	}
+	fclose(inp);
+    }
+    size_t sz = sizeof(Vector3)*attached_particle_positions.size();
+    gpuErrchk(cudaMalloc( &(attached_particle_positions_d), sz ));
+    gpuErrchk(cudaMemcpyAsync( attached_particle_positions_d, &attached_particle_positions[0], sz, cudaMemcpyHostToDevice));
+}
+
 void RigidBodyType::addGrid(String s, std::vector<String> &keys, std::vector<String> &files) {
 	// tokenize and return
 	int numTokens = s.tokenCount();
diff --git a/src/RigidBodyType.h b/src/RigidBodyType.h
index 575a31348eb24dd305e913bbf46bf0b095bb8649..5cc1442052a102da7198e1e6eeda806c4cdaa8b7 100644
--- a/src/RigidBodyType.h
+++ b/src/RigidBodyType.h
@@ -17,9 +17,10 @@ class BaseGrid;
 class RigidBodyGrid;
 class Configuration;
 class RigidBodyController;
+class RigidBody;
 
 class RigidBodyType {
-	
+    friend class RigidBody;
 public:
 RigidBodyType(const String& name = "", const Configuration* conf = NULL ) :
 	name(name), conf(conf), num(0),
@@ -39,6 +40,11 @@ public:
 	/* RigidBodyType& operator=(const RigidBodyType& src); */
 	void copyGridsToDevice();
 	
+    void append_attached_particle_file(String s) { attached_particle_files.push_back(s); }
+    void attach_particles();
+    size_t num_attached_particles() const { return attached_particle_types.size() ;}
+    const std::vector<int>& get_attached_particle_types() const { return attached_particle_types; }
+
 	void addPotentialGrid(String s);
 	void addDensityGrid(String s);
 	void addPMF(String s);
@@ -54,6 +60,12 @@ public:
 	String name;
 private:
 	const Configuration* conf;
+	std::vector<String> attached_particle_files;
+	std::vector<int>attached_particle_types;
+private:
+    std::vector<Vector3>attached_particle_positions;
+    Vector3* attached_particle_positions_d;
+
 public:
 	int num; // number of particles of this type