diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 5144a55a20abcb1108bdc63caaf65e58f8437fcd..4f39b3fe3a905fca9703f03423b357a401bef198 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -47,7 +47,9 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
     numBonds(c.numBonds), numTabBondFiles(c.numTabBondFiles),
     numExcludes(c.numExcludes), numAngles(c.numAngles),
     numTabAngleFiles(c.numTabAngleFiles), numDihedrals(c.numDihedrals),
-    numTabDihedralFiles(c.numTabDihedralFiles), numRestraints(c.numRestraints), numReplicas(numReplicas) {
+    numTabDihedralFiles(c.numTabDihedralFiles), numRestraints(c.numRestraints), 
+    numGroupSites(c.numGroupSites),
+    numReplicas(numReplicas) {
 
 	// Grow vectors for per-gpu device pointers
 	for (int i = 0; i < gpuman.gpus.size(); ++i) {
@@ -235,7 +237,7 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
 	gridSize =  num / NUM_THREADS + 1;
 
 	// Create and allocate the energy arrays
-	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * num * numReplicas));
+	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * (num+numGroupSites) * numReplicas));
 	cudaEventCreate(&start);
 	cudaEventCreate(&stop);
 }
@@ -674,7 +676,7 @@ float ComputeForce::computeFull(bool get_energy) {
 	if (get_energy) {
 		gpuErrchk(cudaDeviceSynchronize());
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num);
+		energy = thrust::reduce(en_d, en_d + num + numGroupSites);
 	}
 
 	return energy;
@@ -742,7 +744,7 @@ float ComputeForce::compute(bool get_energy) {
 	if (get_energy) {
 		gpuErrchk(cudaDeviceSynchronize());
 		thrust::device_ptr<float> en_d(energies_d);
-		energy = thrust::reduce(en_d, en_d + num);
+		energy = thrust::reduce(en_d, en_d + num + numGroupSites);
 	}
 
 	return energy;
@@ -771,7 +773,7 @@ float ComputeForce::computeTabulated(bool get_energy) {
 	{
 		//clearEnergies<<< nb, numThreads >>>(energies_d,num);
 		//gpuErrchk(cudaDeviceSynchronize());
-		cudaMemset((void*)energies_d, 0, sizeof(float)*num*numReplicas);
+	        cudaMemset((void*)energies_d, 0, sizeof(float)*(num+numGroupSites)*numReplicas);
 		computeTabulatedEnergyKernel<<< nb, numThreads >>>(forceInternal_d[0], pos_d[0], sys_d[0],
 						cutoff2, numPairs_d[0], pairLists_d[0], pairTabPotType_d[0], tablePot_d[0], energies_d);
 	}
@@ -892,7 +894,7 @@ float ComputeForce::computeTabulatedFull(bool get_energy) {
 
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 {
-	const size_t tot_num = num * numReplicas;
+    const size_t tot_num = (num+numGroupSites) * numReplicas;
 
 	for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
 	    gpuman.use(i);
@@ -924,7 +926,7 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 
 	for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
 	    gpuman.use(i);
-	    gpuErrchk(cudaMalloc(&forceInternal_d[i], sizeof(Vector3) * num * numReplicas));
+	    gpuErrchk(cudaMalloc(&forceInternal_d[i], sizeof(Vector3) * tot_num));
 	}
 	gpuman.use(0);
 	gpuErrchk(cudaMemcpyAsync(forceInternal_d[0], forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
@@ -933,17 +935,18 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 }
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom)
 {
-        const size_t tot_num = num * numReplicas;
+    const size_t tot_num = (num+numGroupSites) * numReplicas;
+    const size_t small_tot_num = num * numReplicas;
 
         gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
-        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
+        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * small_tot_num, cudaMemcpyHostToDevice));
 
 	copyToCUDA(forceInternal,pos);
         gpuErrchk(cudaDeviceSynchronize());
 }
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom, float* random)
 {
-        const size_t tot_num = num * numReplicas;
+    const size_t tot_num = num * numReplicas;
 
         gpuErrchk(cudaMalloc(&ran_d, sizeof(float) * tot_num));
         gpuErrchk(cudaMemcpyAsync(ran_d, random, sizeof(float) * tot_num, cudaMemcpyHostToDevice));
@@ -953,16 +956,19 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom
 }
 
 void ComputeForce::setForceInternalOnDevice(Vector3* f) {
-	const size_t tot_num = num * numReplicas;
+    // const size_t tot_num = (num+numGroupSites) * numReplicas;
+    assert(numGroupSites == 0); // IMD, the only feature using this function, is currently incompatible with group sites
+    const size_t tot_num = num * numReplicas;
 	gpuErrchk(cudaMemcpy(forceInternal_d[0], f, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
 }
 
-
 void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals, const Restraint* const restraints)
 {
+    assert(simNum == numReplicas); // Not sure why we have both of these things
+    int tot_num = num * simNum;
 	// type_d
-	gpuErrchk(cudaMalloc(&type_d, sizeof(int) * num * simNum));
-	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * num * simNum, cudaMemcpyHostToDevice));
+	gpuErrchk(cudaMalloc(&type_d, sizeof(int) * tot_num));
+	gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * tot_num, cudaMemcpyHostToDevice));
 	
 	if (numBonds > 0)
 	{
diff --git a/src/ComputeForce.cuh b/src/ComputeForce.cuh
index 9109ab48c94f6bacb7d121e32990c67fa8bf79f9..8735799be098867866ddc865fa39579e3ff77bc5 100644
--- a/src/ComputeForce.cuh
+++ b/src/ComputeForce.cuh
@@ -116,6 +116,7 @@ void computeFullKernel(Vector3 force[], Vector3 pos[], int type[],
 	}
 }
 
+
 __global__
 void computeSoftcoreFullKernel(Vector3 force[], Vector3 pos[], int type[],
 															 float tableEps[], float tableRad6[],
diff --git a/src/ComputeForce.h b/src/ComputeForce.h
index fc111068841ff730fc03a043e4320cad161c7d34..5d602e4a609482a73ba7c639f8a575fc3023b2a7 100644
--- a/src/ComputeForce.h
+++ b/src/ComputeForce.h
@@ -150,12 +150,12 @@ public:
     void clear_force() { 
 	for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) {
 	    gpuman.use(i);
-	    gpuErrchk(cudaMemsetAsync((void*)(forceInternal_d[i]),0,num*numReplicas*sizeof(Vector3)));
+	    gpuErrchk(cudaMemsetAsync((void*)(forceInternal_d[i]),0,(num+numGroupSites)*numReplicas*sizeof(Vector3)));
 	}
 	gpuman.use(0);		// TODO move to a paradigm where gpu0 is not preferentially treated 
     }
     void clear_energy() { 
-	gpuErrchk(cudaMemsetAsync((void*)(energies_d), 0, sizeof(float)*num*numReplicas)); // TODO make async
+	gpuErrchk(cudaMemsetAsync((void*)(energies_d), 0, sizeof(float)*(num+numGroupSites)*numReplicas)); // TODO make async
     }
 
 	HOST DEVICE
@@ -181,6 +181,11 @@ private:
 	int numTabAngleFiles;
 	int numDihedrals;
 	int numTabDihedralFiles;
+
+	int numGroupSites;
+	int* comSiteParticles;
+	int* comSiteParticles_d;
+
 	float *tableEps, *tableRad6, *tableAlpha;
 	TabulatedPotential **tablePot; // 100% on Host 
 	TabulatedPotential **tableBond;
diff --git a/src/Configuration.cpp b/src/Configuration.cpp
index 66c420671f8ffdc09eadc97fa5d0e4ed1721597b..41b4cf3a1dd9c97bd1bf9b28b2591058627b0287 100644
--- a/src/Configuration.cpp
+++ b/src/Configuration.cpp
@@ -96,12 +96,14 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
     }
   } // end result: variable "num" is set
 
-
 	// Set the number capacity
 	printf("\n%d particles\n", num);
 	if (numCap <= 0) numCap = numCapFactor*num; // max number of particles
 	if (numCap <= 0) numCap = 20;
 
+	if (readGroupSitesFromFile) readGroups();
+	printf("%d groups\n", numGroupSites);
+
 	// Allocate particle variables.
 	pos = new Vector3[num * simNum];
         //Han-Yi Chou
@@ -764,6 +766,10 @@ void Configuration::setDefaults() {
 	numPartsFromFile = 0;
 	partsFromFile = NULL;
 	readBondsFromFile = false;
+	numGroupSites = 0;
+	readGroupSitesFromFile = false;
+	
+
 	numBonds = 0;
 	bonds = NULL;
 	bondMap = NULL;
@@ -1000,6 +1006,13 @@ int Configuration::readParameters(const char * config_file) {
 				readPartsFromFile = true;
 				loadedCoordinates = true;
 			}
+		} else if (param == String("inputGroups")) {
+			if (readGroupSitesFromFile) {
+				printf("WARNING: More than one group file specified. Ignoring new file.\n");
+			} else {
+				groupSiteFile = value;
+				readGroupSitesFromFile = true;
+			}
 		} else if (param == String("inputBonds")) {
 			if (readBondsFromFile) {
 				printf("WARNING: More than one bond file specified. Ignoring new bond file.\n");
@@ -1391,6 +1404,62 @@ void Configuration::readAtoms() {
 		}
 	}
 }
+void Configuration::readGroups() {
+	// Open the file
+    const size_t line_char = 16384;
+	FILE* inp = fopen(groupSiteFile.val(), "r");
+	char line[line_char];
+
+	// If the particle file cannot be found, exit the program
+	if (inp == NULL) {
+		printf("ERROR: Could not open `%s'.\n", partFile.val());
+		exit(1);
+	}
+
+	// Our particle array has a starting capacity of 256
+	// We will expand this later if we need to.
+	// int capacity = 256;
+	numGroupSites = 0;
+
+	// partsFromFile = new String[capacity];
+	// indices = new int[capacity];
+	// indices[0] = 0;
+
+	// Get and process all lines of input
+	while (fgets(line, line_char, inp) != NULL) {
+		// Lines in the particle file that begin with # are comments
+		if (line[0] == '#') continue;
+		      
+		String s(line);
+		int numTokens = s.tokenCount();
+		      
+		// Break the line down into pieces (tokens) so we can process them individually
+		String* tokenList = new String[numTokens];
+		s.tokenize(tokenList);
+
+		// Legitimate GROUP input lines have at least 3 tokens: 
+		// GROUP | Atom_1_idx | Atom_2_idx | ...
+		// A line without exactly six tokens should be discarded.
+		if (numTokens < 3) {
+		    printf("Error: Invalid group file line: %s\n", line);
+		    exit(-1);
+		}
+
+		// Make sure the index of this particle is unique.
+		// NOTE: The particle list is sorted by index. 
+		std::vector<int> tmp;
+		for (int i=1; i < numTokens; ++i) {
+		    const int ai = atoi(tokenList[i].val());
+		    if (ai >= num) {
+			printf("Error: Attempted to include invalid particle in group: %s\n", line);
+			exit(-1);
+		    }
+		    tmp.push_back( ai );
+		}
+		groupSiteData.push_back(tmp);
+		numGroupSites++;
+	}
+}
 
 void Configuration::readBonds() {
 	// Open the file
@@ -1441,7 +1510,7 @@ void Configuration::readBonds() {
 			continue;
 		}
 
-		if (ind1 < 0 || ind1 >= num || ind2 < 0 || ind2 >=num) {
+		if (ind1 < 0 || ind1 >= num+numGroupSites || ind2 < 0 || ind2 >= num+numGroupSites) {
 			printf("ERROR: Bond file line '%s' includes invalid index\n", line);
 			exit(1);
 		}
@@ -1489,8 +1558,8 @@ void Configuration::readBonds() {
 	 * bondMap[i].x is the index in the bonds array where the ith particle's bonds begin
 	 * bondMap[i].y is the index in the bonds array where the ith particle's bonds end
 	 */
-	bondMap = new int2[num];
-	for (int i = 0; i < num; i++) {	
+	bondMap = new int2[num+numGroupSites];
+	for (int i = 0; i < num+numGroupSites; i++) {	
 		bondMap[i].x = -1;
 		bondMap[i].y = -1;
 	}
@@ -1651,7 +1720,7 @@ void Configuration::readAngles() {
 		int ind3 = atoi(tokenList[3].val());
 		String file_name = tokenList[4];
 		//printf("file_name %s\n", file_name.val());
-		if (ind1 >= num or ind2 >= num or ind3 >= num)
+		if (ind1 >= num+numGroupSites or ind2 >= num+numGroupSites or ind3 >= num+numGroupSites)
 			continue;
 
 		if (numAngles >= capacity) {
@@ -1712,8 +1781,8 @@ void Configuration::readDihedrals() {
 		int ind4 = atoi(tokenList[4].val());
 		String file_name = tokenList[5];
 		//printf("file_name %s\n", file_name.val());
-		if (ind1 >= num or ind2 >= num
-				or ind3 >= num or ind4 >= num)
+		if (ind1 >= num+numGroupSites or ind2 >= num+numGroupSites
+				or ind3 >= num+numGroupSites or ind4 >= num+numGroupSites)
 			continue;
 
 		if (numDihedrals >= capacity) {
@@ -1772,7 +1841,7 @@ void Configuration::readRestraints() {
 		float y0 = (float) strtod(tokenList[4].val(), NULL);
 		float z0 = (float) strtod(tokenList[5].val(), NULL);
 
-		if (id >= num) continue;
+		if (id >= num+numGroupSites) continue;
 
 		if (numRestraints >= capacity) {
 			Restraint* temp = restraints;
diff --git a/src/Configuration.h b/src/Configuration.h
index 0895235c7929bddd22f0598ccb807cce10426740..debf4b9f6ea4d0ce8eaa9c52d70d6c57f1efdcbb 100644
--- a/src/Configuration.h
+++ b/src/Configuration.h
@@ -56,6 +56,7 @@ class Configuration {
 	int readParameters(const char* config_file);
 	void readAngles();
 	void readAtoms();
+	void readGroups();
 	void readBonds();
 	void readExcludes();
 	void addExclusion(int ind1, int ind2);
@@ -196,6 +197,7 @@ public:
 	String dihedralFile;
 	String restraintFile;
 	bool readPartsFromFile;
+	bool readGroupSitesFromFile;
 	bool readBondsFromFile;
 	bool readExcludesFromFile;
 	bool readAnglesFromFile;
@@ -216,6 +218,10 @@ public:
 	int* partTableIndex0;
 	int* partTableIndex1;
 
+	String groupSiteFile;
+	int numGroupSites;
+	std::vector<std::vector<int>> groupSiteData;
+
 	String* bondTableFile;
 	int numTabBondFiles;
 	int2* bondMap;
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 59668438f50436ed25ef0ae401423a134d84b6a7..f1e3b73c071bc3b460f40fb1b7e9bbad78582654 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -97,9 +97,10 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	bonds = c.bonds;
 	numCap = c.numCap;                      // max number of particles
 	num = c.num;                            // current number of particles
+	numGroupSites = c.numGroupSites;
 
 	// Allocate arrays of positions, types and serial numbers
-	pos    = new Vector3[num * numReplicas];  // [HOST] array of particles' positions.
+	pos    = new Vector3[(num+numGroupSites) * numReplicas];  // [HOST] array of particles' positions.
         // Allocate arrays of momentum Han-Yi Chou
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
         {
@@ -254,6 +255,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	
 	//MLog: I did the other halve of the copyToCUDA function from the Configuration class here, keep an eye on any mistakes that may occur due to the location.
 	internal -> copyToCUDA(c.simNum, c.type, c.bonds, c.bondMap, c.excludes, c.excludeMap, c.angles, c.dihedrals, c.restraints);
+	if (numGroupSites > 0) init_cuda_group_sites();
 
 	// TODO: check for duplicate potentials 
 	if (c.tabulatedPotential) {
@@ -303,6 +305,13 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 				internal->addDihedralPotential(dihedralTableFile[p].val(), p, dihedrals);
 	}
 
+	auto _get_index = [this](int idx, int replica) {
+	    // Convenient lambda function to deal with increasingly complicated indexing
+	    auto num = this->num; auto numReplicas = this->numReplicas; auto numGroupSites = this->numGroupSites;
+	    idx = (idx < num) ? idx + replica*num : (idx-num) + numReplicas*num + replica * numGroupSites;
+	    return idx;
+	};
+
 	//Mlog: this is where we create the bondList.
 	if (numBonds > 0) {
 		bondList = new int3[ (numBonds / 2) * numReplicas ];
@@ -318,8 +327,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 						fprintf(stderr,"Error: bondfile '%s' was not read with tabulatedBondFile command.\n", bonds[i].fileName.val());
 						exit(1);
 					}
-						
-					bondList[j] = make_int3( (bonds[i].ind1 + k * num), (bonds[i].ind2 + k * num), bonds[i].tabFileIndex );
+					bondList[j] = make_int3( _get_index(bonds[i].ind1, k), _get_index(bonds[i].ind2, k), bonds[i].tabFileIndex );
 					// cout << "Displaying: bondList["<< j <<"].x = " << bondList[j].x << ".\n"
 					// << "Displaying: bondList["<< j <<"].y = " << bondList[j].y << ".\n"
 					// << "Displaying: bondList["<< j <<"].z = " << bondList[j].z << ".\n";
@@ -338,7 +346,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 				fprintf(stderr,"Error: anglefile '%s' was not read with tabulatedAngleFile command.\n", angles[i].fileName.val());
 				exit(1);
 			}
-		angleList[i+k*numAngles] = make_int4( angles[i].ind1+k*num, angles[i].ind2+k*num, angles[i].ind3+k*num, angles[i].tabFileIndex );
+			angleList[i+k*numAngles] = make_int4( _get_index(angles[i].ind1,k), _get_index(angles[i].ind2,k), _get_index(angles[i].ind3,k), angles[i].tabFileIndex );
 	    }
 	}
 	}
@@ -352,14 +360,14 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 				fprintf(stderr,"Error: dihedralfile '%s' was not read with tabulatedDihedralFile command.\n", dihedrals[i].fileName.val());
 				exit(1);
 			}
-		dihedralList[i+k*numDihedrals] = make_int4( dihedrals[i].ind1+k*num, dihedrals[i].ind2+k*num, dihedrals[i].ind3+k*num, dihedrals[i].ind4+k*num);
+			dihedralList[i+k*numDihedrals] = make_int4( _get_index(dihedrals[i].ind1,k), _get_index(dihedrals[i].ind2,k), _get_index(dihedrals[i].ind3,k), _get_index(dihedrals[i].ind4,k) );
 		dihedralPotList[i+k*numDihedrals] = dihedrals[i].tabFileIndex;
 	    }
 	}
 	}
 	internal->copyBondedListsToGPU(bondList,angleList,dihedralList,dihedralPotList);
 	
-	forceInternal = new Vector3[num * numReplicas];
+	forceInternal = new Vector3[(num+numGroupSites)*numReplicas];
 	if (fullLongRange != 0)
 	    printf("No cell decomposition created.\n");
 
@@ -570,7 +578,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
     int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
     int tl = temperatureGridFile.length();
     Vector3 *force_d;
-    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
+    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+numGroupSites) * numReplicas));
 
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
     for (int i=0; i< gpuman.gpus.size(); ++i) {
@@ -593,11 +601,14 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    const std::vector<Vector3*>& _pos = internal->getPos_d();
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
-		gpuman.nccl_broadcast(0, _pos, _pos, num*numReplicas, -1);
+		gpuman.nccl_broadcast(0, _pos, _pos, (num+numGroupSites)*numReplicas, -1);
 	    }
 	    #endif
 	    gpuman.sync();
 
+	    if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num, numGroupSites, numReplicas);
+
+
             #ifdef _OPENMP
             omp_set_num_threads(4);
             #endif
@@ -686,14 +697,16 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-		gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
+		gpuman.nccl_reduce(0, _f, _f, (num+numGroupSites)*numReplicas, -1);
 	    }
 	    #endif
 
+
         }//if step == 1
 
 	internal->clear_energy();
 	gpuman.sync();
+	if (numGroupSites > 0) distributeGroupSiteForces<<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], internal->getPos_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
 
         if(particle_dynamic == String("Langevin"))
             updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
@@ -823,11 +836,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _p = internal->getPos_d();
 		nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream();
-		gpuman.nccl_broadcast(0, _p, _p, num*numReplicas, nccl_broadcast_streams);
+		gpuman.nccl_broadcast(0, _p, _p, (num+numGroupSites)*numReplicas, nccl_broadcast_streams);
 	    }
 	    #endif
     	}
 
+	if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+
         if (interparticleForce)
         {
             // 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas
@@ -900,6 +915,8 @@ void GrandBrownTown::RunNoseHooverLangevin()
             RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
                                  RigidBodyInterpolationType, sys, sys_d);
 
+	if (numGroupSites > 0) distributeGroupSiteForces<<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], internal->getPos_d()[0], groupSiteData_d, num, numGroupSites, numReplicas);
+
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), internal -> getForceInternal_d()[0], 
             internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, 
@@ -1722,6 +1739,40 @@ void GrandBrownTown::InitNoseHooverBath(int N)
     }
     printf("Done in nose-hoover bath\n");
 }
+
+void GrandBrownTown::init_cuda_group_sites()
+{
+    
+    // Count the number of particles that form groups
+    int num_particles = 0;
+    for (auto it = conf.groupSiteData.begin(); it != conf.groupSiteData.end(); ++it) {
+	num_particles += it->size();
+    }
+
+    // Create GPU-friendly data structure
+    // TODO make this work for replicas
+    int* tmp = new int[numGroupSites+1+num_particles];
+    num_particles = 0;
+    int i = 0;
+    for (auto it = conf.groupSiteData.begin(); it != conf.groupSiteData.end(); ++it) {
+	tmp[i] = num_particles+numGroupSites;
+	for (auto it2 = it->begin(); it2 != it->end(); ++it2) {
+	    tmp[num_particles+numGroupSites+1] = *it2;
+	    num_particles++;
+	}
+	i++;
+    }
+    assert(i == numGroupSites);
+    tmp[i] = num_particles;
+
+    // Copy data structure to GPU
+    gpuErrchk(cudaMalloc((void**) &groupSiteData_d, sizeof(int)*(numGroupSites+1+num_particles)));
+    gpuErrchk(cudaMemcpy(groupSiteData_d, tmp, sizeof(int)*(numGroupSites+1+num_particles), cudaMemcpyHostToDevice));
+    // TODO deallocate CUDA
+    delete[] tmp;
+
+}
+
 // -----------------------------------------------------------------------------
 // Initialize file for recording ionic current
 void GrandBrownTown::newCurrent(int repID) const {
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index ba31f468596616e431b35c7bc82998ef69659c92..589fc2d09300b54f8760260eb4237b994e96eaa5 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -607,6 +607,55 @@ inline Vector3 step(Vector3& r0, float kTlocal, Vector3 force, float diffusion,
 	return sys->wrap(r);
 }
 
+__global__
+void updateGroupSites(Vector3 pos[], int* groupSiteData, int num, int numGroupSites, int numReplicas) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // For all threads representing a valid pair of particles
+    if (i < numGroupSites*numReplicas) {
+	pos[num*numReplicas + i] = Vector3(0.0f); 
+    }
+
+    // For all threads representing a valid pair of particles
+    if (i < numGroupSites*numReplicas) {
+	const int imod = i % numReplicas;
+	const int rep = i/numReplicas;
+	const int start  = groupSiteData[imod];
+	const int finish = groupSiteData[imod+1];
+	float weight = 1.0 / (finish-start);
+	    
+	for (int j = start; j < finish; j++) {
+	    const int aj = groupSiteData[j] + num*rep;
+	    pos[num*numReplicas + i] += weight * pos[aj];
+	}
+    }
+}
+
+__global__
+void distributeGroupSiteForces(Vector3 force[], Vector3 pos[], int* groupSiteData, int num, int numGroupSites, int numReplicas) {
+    // TODO, handle groupsite energies
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // For all threads representing a valid pair of particles
+    if (i < numGroupSites*numReplicas) {
+	pos[num*numReplicas + i] = Vector3(0.0f); 
+    }
+
+    // For all threads representing a valid pair of particles
+    if (i < numGroupSites*numReplicas) {
+	const int imod = i % numReplicas;
+	const int rep = i/numReplicas;
+	const int start  = groupSiteData[imod];
+	const int finish = groupSiteData[imod+1];
+	float weight = 1.0 / (finish-start);
+	    
+	for (int j = start; j < finish; j++) {
+	    const int aj = groupSiteData[j] + num*rep;
+	    atomicAdd( force+aj, weight * force[num*numReplicas+i] );
+	}
+    }
+}
+
 __global__ void devicePrint(RigidBodyType* rb[]) {
 	// printf("Device printing\n");
 	int i = 0;
diff --git a/src/GrandBrownTown.h b/src/GrandBrownTown.h
index bb63a5c580de4a141321c63285003481dfc15c40..c6479a081fb2b4461493f49269c1e67b29eb7323 100644
--- a/src/GrandBrownTown.h
+++ b/src/GrandBrownTown.h
@@ -102,6 +102,9 @@ private:
         //Initialize the Nose-Hoover auxilliary variables
         void InitNoseHooverBath(int N);
         //curandState_t *randoDevice;
+
+	void init_cuda_group_sites();
+
 public:
 	// Compute the current in nanoamperes.
 	float current(float t) const;
@@ -226,6 +229,10 @@ private:
 	int numExcludes;
 	int numAngles;
 	int numDihedrals;
+
+	int numGroupSites;
+	int* groupSiteData_d;
+
 	String partFile;
 	String bondFile;
 	String excludeFile;