diff --git a/src/BrownParticlesKernel.h b/src/BrownParticlesKernel.h
index 36f7daa3ea95f0e3ddafdabde661b143382f17c7..5475a454d4a7b3ef4c70a904c8ce262a8d71780e 100644
--- a/src/BrownParticlesKernel.h
+++ b/src/BrownParticlesKernel.h
@@ -4,7 +4,7 @@
 
 template<const int BlockSize>
 static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], BrownianParticleType* part[], 
-                                                   float *vec_red, int n)
+                                                   float *vec_red, int num, int num_rb_attached_particles, int num_replicas)
 {
     __shared__ __align__(4) float sdata[BlockSize];
     
@@ -17,9 +17,12 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
 
     sdata[tid] = 0.f; 
 
+    int n = (num*num_replicas);
+
     while (i < n) 
-    { 
-        int t1 = type[i];
+    {
+	const int i1 = (i % num) +  (i/num)*(num+num_rb_attached_particles);
+        const int t1 = type[i1];
         const BrownianParticleType& pt1 = *part[t1];
 
         p1    = P_n[i];
@@ -27,7 +30,8 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
         
         if(i + BlockSize < n)
         {
-            int t2 = type[i+BlockSize];
+	    const int i2 = ((i+BlockSize) % num) +  ((i+BlockSize)/num)*(num+num_rb_attached_particles);
+            const int t2 = type[i2];
             const BrownianParticleType& pt2 = *part[t2];
 
             p2    = P_n[i+BlockSize];
@@ -38,7 +42,7 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro
         else
             sdata[tid] += p1.length2() / mass1;
 
-        i += gridSize; 
+        i += gridSize;
     }
 
     sdata[tid] *= 0.50f;
diff --git a/src/Configuration.cpp b/src/Configuration.cpp
index 04f2a94653b3942ff55be51c85fd6b4c9f9de3ec..679daac15e9d595c81ec74848126c6a58aad79bf 100644
--- a/src/Configuration.cpp
+++ b/src/Configuration.cpp
@@ -122,9 +122,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 	printf("%d groups\n", numGroupSites);
 
 	// Allocate particle variables.
-	// First num*simNum entries are for point particles, next num_rb_attached_particles*simNum are for RB particles
-
-
+	// Each replica works with num+num_rb_attached_particles in array
 	pos = new Vector3[ (num+num_rb_attached_particles) * simNum];
 
         //Han-Yi Chou
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 1fb98bc084e49aca00279e5c91611bfbe0a9f32e..f6c9d380421fbd63a22ccfe65d9dbcb234ed636d 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -97,10 +97,11 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	bonds = c.bonds;
 	numCap = c.numCap;                      // max number of particles
 	num = c.num;                            // current number of particles
+	num_rb_attached_particles = c.num_rb_attached_particles;
 	numGroupSites = c.numGroupSites;
 
 	// Allocate arrays of positions, types and serial numbers
-	pos    = new Vector3[(num+numGroupSites) * numReplicas];  // [HOST] array of particles' positions.
+	pos    = new Vector3[(num+num_rb_attached_particles+numGroupSites) * numReplicas];  // [HOST] array of particles' positions.
         // Allocate arrays of momentum Han-Yi Chou
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
         {
@@ -112,8 +113,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
         //for debug
         //force = new Vector3[num * numReplicas];
 
-	type   = new     int[num * numReplicas];  // [HOST] array of particles' types.
-	serial = new     int[num * numReplicas];  // [HOST] array of particles' serial numbers.
+	type   = new     int[(num+num_rb_attached_particles) * numReplicas];  // [HOST] array of particles' types.
+	serial = new     int[(num+num_rb_attached_particles) * numReplicas];  // [HOST] array of particles' serial numbers.
 
 	// Allocate things for rigid body
 	// RBC = RigidBodyController(c);
@@ -127,8 +128,10 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	
 	// Replicate identical initial conditions across all replicas
 	for (int r = 0; r < numReplicas; ++r) {
-	  std::copy(c.type, c.type + num, type + r*num);
-	  std::copy(c.serial, c.serial + num, serial + r*num);
+	    std::copy(c.type, c.type + num+num_rb_attached_particles,
+		      type + r*(num+num_rb_attached_particles));
+	    std::copy(c.serial, c.serial + num + num_rb_attached_particles,
+		      serial + r*(num+num_rb_attached_particles));
 	  if (c.copyReplicaCoordinates > 0)
 	    std::copy(c.pos, c.pos + num, pos + r*num);
 	}
@@ -307,8 +310,12 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 
 	auto _get_index = [this](int idx, int replica) {
 	    // Convenient lambda function to deal with increasingly complicated indexing
-	    auto num = this->num; auto numReplicas = this->numReplicas; auto numGroupSites = this->numGroupSites;
-	    idx = (idx < num) ? idx + replica*num : (idx-num) + numReplicas*num + replica * numGroupSites;
+	    auto num = this->num;
+	    auto numReplicas = this->numReplicas;
+	    auto num_rb_attached_particles = this->num_rb_attached_particles;
+	    auto numGroupSites = this->numGroupSites;
+	    idx = (idx < num+num_rb_attached_particles) ? idx + replica*(num+num_rb_attached_particles)
+		: (idx-num-num_rb_attached_particles) + numReplicas*(num+num_rb_attached_particles) + replica * numGroupSites;
 	    return idx;
 	};
 
@@ -367,7 +374,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	}
 	internal->copyBondedListsToGPU(bondList,angleList,dihedralList,dihedralPotList);
 	
-	forceInternal = new Vector3[(num+numGroupSites)*numReplicas];
+	forceInternal = new Vector3[(num+num_rb_attached_particles+numGroupSites)*numReplicas];
 	if (fullLongRange != 0)
 	    printf("No cell decomposition created.\n");
 
@@ -569,16 +576,16 @@ void GrandBrownTown::RunNoseHooverLangevin()
         #endif
         #pragma omp parallel for
         for(int i = 0; i < numReplicas; ++i)
-            RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+            RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+conf.num_rb_attached_particles), sys_d);
         gpuErrchk(cudaDeviceSynchronize());
     }
 
     float t; // simulation time
 
-    int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
+    int numBlocks = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + (((num+num_rb_attached_particles) * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
     int tl = temperatureGridFile.length();
     Vector3 *force_d;
-    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+numGroupSites) * numReplicas));
+    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+num_rb_attached_particles+numGroupSites) * numReplicas));
 
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
     for (int i=0; i< gpuman.gpus.size(); ++i) {
@@ -599,11 +606,11 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    internal->clear_force();
 	    internal->clear_energy();
 	    const std::vector<Vector3*>& _pos = internal->getPos_d();
-	    if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas);
 
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
-		gpuman.nccl_broadcast(0, _pos, _pos, (num+numGroupSites)*numReplicas, -1);
+		gpuman.nccl_broadcast(0, _pos, _pos, (num+num_rb_attached_particles+numGroupSites)*numReplicas, -1);
 	    }
 	    #endif
 	    gpuman.sync();
@@ -633,7 +640,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 #endif
                                 #pragma omp parallel for
                                 for(int i = 0; i < numReplicas; ++i)
-                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                             }
                             internal -> computeTabulated(get_energy);
                             break;
@@ -657,7 +664,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 #endif
                                 #pragma omp parallel for
                                 for(int i = 0; i < numReplicas; ++i)
-                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                    RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                             }
                             internal->compute(get_energy);
                             break;
@@ -681,8 +688,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
             #endif
             #pragma omp parallel for
             for(int i = 0; i < numReplicas; ++i)
-                RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
-                                       RigidBodyInterpolationType, sys, sys_d);
+                RBC[i]->updateForces(internal->getPos_d()[0]+i*(num+num_rb_attached_particles),
+				     internal->getForceInternal_d()[0]+i*(num+num_rb_attached_particles),
+				     s,
+				     internal->getEnergy()+i*(num+num_rb_attached_particles),
+				     get_energy,
+				     RigidBodyInterpolationType, sys, sys_d);
             if(rigidbody_dynamic == String("Langevin"))
             {
                 #ifdef _OPENMP
@@ -698,28 +709,29 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-		gpuman.nccl_reduce(0, _f, _f, (num+numGroupSites)*numReplicas, -1);
+		gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles+numGroupSites)*numReplicas, -1);
 	    }
 	    #endif
 
-	    if (numGroupSites > 0) distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    if (numGroupSites > 0) distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 
         }//if step == 1
 
 	internal->clear_energy();
 	gpuman.sync();
 
+	assert(false); // TODO update RB particels
         if(particle_dynamic == String("Langevin"))
-            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
+            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
         else if(particle_dynamic == String("NoseHooverLangevin"))
             //kernel for Nose-Hoover Langevin dynamic
             updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), 
-            internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, 
+            internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d,
             randoGen_d, numReplicas, ParticleInterpolationType);
         ////For Brownian motion
         else
             updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getForceInternal_d()[0], internal -> getType_d(),
-                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, 
+                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas,
                                                        internal->getEnergy(), get_energy, ParticleInterpolationType);
 
         if(rigidbody_dynamic == String("Langevin"))
@@ -751,7 +763,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
         if (s % outputPeriod == 0) {
             // Copy particle positions back to CPU
 	    gpuErrchk(cudaDeviceSynchronize());
-            gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+            gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * (num+num_rb_attached_particles) * numReplicas, cudaMemcpyDeviceToHost));
 	}
         if (imd_on && clientsock && s % outputPeriod == 0)
         {
@@ -832,7 +844,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
 
 	if (numGroupSites > 0) {
 	    gpuman.sync();
-	    updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas);
 	    gpuman.sync();
 	}
 
@@ -844,7 +856,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _p = internal->getPos_d();
 		nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream();
-		gpuman.nccl_broadcast(0, _p, _p, (num+numGroupSites)*numReplicas, nccl_broadcast_streams);
+		gpuman.nccl_broadcast(0, _p, _p, (num+num_rb_attached_particles+numGroupSites)*numReplicas, nccl_broadcast_streams);
 	    }
 	    #endif
     	}
@@ -865,13 +877,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
                             #endif
                             #pragma omp parallel for
                             for(int i = 0; i < numReplicas; ++i)
-                                RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
+                                RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
                         }
                         internal -> computeTabulated(get_energy);
 			#ifdef USE_NCCL
 			if (gpuman.gpus.size() > 1) {
 			    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-			    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
+			    gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles)*numReplicas, -1);
 			}
 			#endif
                         break;
@@ -918,22 +930,22 @@ void GrandBrownTown::RunNoseHooverLangevin()
         #endif
         #pragma omp parallel for
         for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition
-            RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
+            RBC[i]->updateForces((internal->getPos_d()[0])+i*(num+num_rb_attached_particles), (internal->getForceInternal_d()[0])+i*(num+num_rb_attached_particles), s, (internal->getEnergy())+i*(num+num_rb_attached_particles), get_energy,
                                  RigidBodyInterpolationType, sys, sys_d);
 
 	if (numGroupSites > 0) {
 	    gpuman.sync();
 	    // if ((s%100) == 0) {
-	    distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	    distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 	// } else {
-	//     distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+	//     distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 	// }
 	    gpuman.sync();
 	}
 
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), internal -> getForceInternal_d()[0], 
-            internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, 
+            internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy,
             ParticleInterpolationType);
             //gpuErrchk(cudaDeviceSynchronize());
 
@@ -957,7 +969,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
         {
             if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             {
-                gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+                gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * (num+num_rb_attached_particles) * numReplicas, cudaMemcpyDeviceToHost));
             }
             t = s*timestep;
             // Loop over all replicas
@@ -968,12 +980,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     updateNameList(); // no need for it here if particles stay the same
 
                 // Write the trajectory.
-                writers[repID]->append(pos + (repID*num), name, serial, t, num);
+                writers[repID]->append(pos + repID*(num+num_rb_attached_particles), name, serial, t, num);
 
                 if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
                 {
-                    momentum_writers[repID]->append(momentum + (repID * num), name, serial, t, num);
-                    //force_writers[repID]->append(force + (repID * num), name, serial, t, num);
+                    momentum_writers[repID]->append(momentum + repID * (num+num_rb_attached_particles), name, serial, t, num);
+                    //force_writers[repID]->append(force + repID * (num+num_rb_attached_particles), name, serial, t, num);
                 }
             }
             // TODO: Currently, not compatible with replicas. Needs a fix.
@@ -1005,7 +1017,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 float e = 0.f;
                 float V = 0.f;
                 thrust::device_ptr<float> en_d(internal->getEnergy());
-                V = (thrust::reduce(en_d, en_d+num*numReplicas)) / numReplicas;
+                V = (thrust::reduce(en_d, en_d+(num+num_rb_attached_particles)*numReplicas)) / numReplicas;
                 if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
                 {
                     gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
@@ -1616,7 +1628,7 @@ void GrandBrownTown::run() {
 //
 void GrandBrownTown::populate() {
 	for (int repID = 0; repID < numReplicas; repID++) {
-		const int offset = repID * num;
+	    const int offset = repID * (num+num_rb_attached_particles);
 		int pn = 0;
 		int p = 0;
 		for (int i = 0; i < num; i++) {
@@ -1636,7 +1648,7 @@ void GrandBrownTown::populate() {
 void GrandBrownTown::writeRestart(int repID) const 
 {
     FILE* out   = fopen(restartFiles[repID].c_str(), "w");
-    const int offset = repID * num;
+    const int offset = repID * (num+num_rb_attached_particles);
 
     for (int i = 0; i < num; ++i) 
     {
@@ -1671,7 +1683,7 @@ void GrandBrownTown::initialCondCen() {
 // Set random initial positions for all particles and replicas
 void GrandBrownTown::initialCond() {
 	for (int repID = 0; repID < numReplicas; repID++) {
-		const int offset = repID * num;
+	    const int offset = repID * (num+num_rb_attached_particles);
 		for (int i = 0; i < num; i++) {
 			pos[i + offset] = findPos(type[i + offset]);
 		}
@@ -1715,7 +1727,7 @@ float GrandBrownTown::KineticEnergy()
     gpuErrchk(cudaMalloc((void**)&energy, sizeof(float)));
     gpuErrchk(cudaMemset((void*)energy,0,sizeof(float)));
 
-    BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, numReplicas*num);
+    BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, num, num_rb_attached_particles, numReplicas);
     gpuErrchk(cudaDeviceSynchronize());
 
     Reduction<64><<<dim3(1),dim3(64)>>>(vec_red, energy, 512);
@@ -1763,7 +1775,7 @@ void GrandBrownTown::init_cuda_group_sites()
     }
 
     // Create GPU-friendly data structure
-    // TODO make this work for replicas
+    assert(numReplicas == 1);    // TODO make this work for replicas
     int* tmp = new int[numGroupSites+1+num_particles];
     num_particles = 0;
     int i = 0;
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index c4c059aea09aafe0f9a77c3f0cd6a6a1f71e7073..6857d21bd853d10f8d54629c1cb0ce120e01a93a 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -19,12 +19,14 @@ Vector3 step(Vector3& r0, float kTlocal, Vector3 force, float diffusion, Vector3
 #if 0
 __global__ void updateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal, 
                              int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField, 
-                             int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas) 
+                             int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (idx < num * numReplicas)
     {
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
         int t = type[idx];
         //Vector3 r0(tex1Dfetch(PosTex, idx));
         //Vector3 p0(tex1Dfetch(MomTex, idx));
@@ -101,11 +103,13 @@ __global__ void
 updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__ momentum, float* random, 
                                Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[], 
                                float kT, BaseGrid* kTGrid, float electricField, int tGridLength, float timestep, 
-                               int num, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme)
+                               int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < num * numReplicas)
     {
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
         int t = type[idx];
 
         Vector3 r0  = pos[idx];
@@ -214,13 +218,17 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__
 //Han-Yi Chou
 __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
                                   int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
-                                  float electricField,int tGridLength, float timestep, int num, BaseGrid* sys, 
+                                  float electricField,int tGridLength, float timestep,
+				  int num, int num_rb_attached_particles, BaseGrid* sys,
                                   Random* randoGen, int numReplicas, int scheme)
 {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
 
     if (idx < num * numReplicas)
     {
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
         int t = type[idx];
 
         Vector3 r0 = pos[idx];
@@ -315,13 +323,15 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
 //update momentum in the last step of BAOAB integrator for the Langevin dynamics. Han-Yi Chou
 __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* __restrict__ forceInternal,
                                       int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
-                                      float electricField, int tGridLength, float timestep, int num, 
+                                      float electricField, int tGridLength, float timestep, int num, int num_rb_attached_particles,
                                       BaseGrid* sys, Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy,int scheme)
 {
-    const int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+    int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
 
     if (idx < num * numReplicas)
     {
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
         int t = type[idx];
         Vector3 r0 = pos[idx];
         Vector3 p0 = momentum[idx];
@@ -380,12 +390,14 @@ __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* _
 #if 0
 __global__ void LastUpdateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
                                  int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField,
-                                 int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas)
+				      int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas)
 {
-    const int idx  = blockIdx.x * blockDim.x + threadIdx.x;
+    int idx  = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (idx < num * numReplicas)
     {
+	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
+
         int t = type[idx];
         Vector3 r0 = pos[idx];
         Vector3 p0 = momentum[idx];
@@ -441,16 +453,17 @@ __global__ void LastUpdateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3*
 __global__
 void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[], 
                   BrownianParticleType* part[],float kT, BaseGrid* kTGrid, float electricField, 
-                  int tGridLength, float timestep, int num, BaseGrid* sys,
+                  int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys,
 		  Random* randoGen, int numReplicas, float* energy, bool get_energy, int scheme) 
 {
 	// Calculate this thread's ID
-	const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+	int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
         
 	// TODO: Make this a grid-stride loop to make efficient reuse of RNG states 
 	// Loop over ALL particles in ALL replicas
 	if (idx < num * numReplicas) 
         {
+	    idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
 		const int t = type[idx];
 		Vector3   p = pos[idx];
 
diff --git a/src/GrandBrownTown.h b/src/GrandBrownTown.h
index c6479a081fb2b4461493f49269c1e67b29eb7323..6d8ed7f34c5bcdbed402929d8c5c3b41ffcb5f89 100644
--- a/src/GrandBrownTown.h
+++ b/src/GrandBrownTown.h
@@ -230,6 +230,8 @@ private:
 	int numAngles;
 	int numDihedrals;
 
+    int num_rb_attached_particles;
+
 	int numGroupSites;
 	int* groupSiteData_d;
 
diff --git a/src/RigidBody.cu b/src/RigidBody.cu
index e224d3dbf42be8bb3afde7dfa1e28f37ff4d8d26..f65fcc75298816e893a6052045e4815c0ef08527 100644
--- a/src/RigidBody.cu
+++ b/src/RigidBody.cu
@@ -114,6 +114,7 @@ void RigidBody::addEnergy(float e)
     energy += e;
 }
 void RigidBody::updateParticleList(Vector3* pos_d, BaseGrid* sys_d) {
+    assert(false); // TODO update particles, excluding own particles
 	for (int i = 0; i < t->numPotGrids; ++i) {
 		numParticles[i] = 0;
 		int& tnp = t->numParticles[i];