From 7d2e8acc6621fc199e2a59fe861456f4588ac1ab Mon Sep 17 00:00:00 2001
From: Han-Yi Chou <hchou10@illinois.edu>
Date: Sun, 11 Feb 2018 18:40:37 -0600
Subject: [PATCH] implement openMP option for rigid body simulation and fix bug
 in rigid body replicas, need to be reviewed and tested

Conflicts:
	src/GrandBrownTown.cu
	src/Makefile
---
 src/GrandBrownTown.cu      | 155 +++++++++++++++++++++++++++----------
 src/Makefile               |  24 ++++--
 src/RigidBodyController.cu |   2 +-
 3 files changed, 132 insertions(+), 49 deletions(-)

diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index e3f95ef..3e2baca 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -8,7 +8,14 @@
 #include <thrust/device_ptr.h>
 #include <fstream>
 
-#ifndef gpuErrchk
+#ifdef _OPENMP
+#include <omp.h>
+//#else
+//typedef int omp_int_t;
+//inline omp_int_t omp_get_thread_num() { return 0; }
+//inline omp_int_t omp_get_max_threads() { return 1; }
+#endif
+
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
 	if (code != cudaSuccess) {
@@ -16,7 +23,6 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 	if (abort) exit(code);
 	}
 }
-#endif
 
 static void checkEnergyFiles()
 {
@@ -443,7 +449,7 @@ GrandBrownTown::~GrandBrownTown() {
             gpuErrchk(cudaFree(randoGen_d));
             for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
             {
-                (*iter)->~RigidBodyController();
+                //(*iter)->~RigidBodyController();
                 delete *iter;
             }
             RBC.clear();
@@ -548,8 +554,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
         // cudaSetDevice(0);
         internal->decompose();
         gpuErrchk(cudaDeviceSynchronize());
-        for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-            (*iter)->updateParticleLists( internal->getPos_d(), sys_d);
+        #ifdef _OPENMP
+        omp_set_num_threads(4);
+        #endif
+        #pragma omp parallel for
+        for(int i = 0; i < numReplicas; ++i)
+            RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d);
+        gpuErrchk(cudaDeviceSynchronize());
     }
 
     float t; // simulation time
@@ -573,8 +584,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
             // 'interparticleForce' - determines whether particles interact with each other
             gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
             gpuErrchk(cudaMemset((void*)(internal->getEnergy()), 0, sizeof(float)*num*numReplicas));
-            for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-                (*iter)->clearForceAndTorque(); //Han-Yi Chou
+            #ifdef _OPENMP
+            omp_set_num_threads(4);
+            #endif
+            #pragma omp parallel for
+            for(int i = 0; i < numReplicas; ++i)
+                RBC[i]->clearForceAndTorque(); //Han-Yi Chou
             
             if (interparticleForce)
             {
@@ -587,8 +602,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                             {
                                 // cudaSetDevice(0);
                                  internal -> decompose();
-                                for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-                                    (*iter)->updateParticleLists( internal->getPos_d(), sys_d);
+                                #ifdef _OPENMP
+                                omp_set_num_threads(4);
+                                #endif
+                                #pragma omp parallel for
+                                for(int i = 0; i < numReplicas; ++i)
+                                    RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d);
                             }
                             internal -> computeTabulated(get_energy);
                             break;
@@ -607,8 +626,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                             {
                                 // cudaSetDevice(0);
                                 internal->decompose();
-                                for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter !=  RBC.end(); ++iter)
-                                    (*iter)->updateParticleLists( internal->getPos_d(), sys_d);
+                                #ifdef _OPENMP
+                                omp_set_num_threads(4);
+                                #endif
+                                #pragma omp parallel for
+                                for(int i = 0; i < numReplicas; ++i)
+                                    RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d);
                             }
                             internal->compute(get_energy);
                             break;
@@ -628,14 +651,25 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 }
             }//if inter-particle force
             gpuErrchk(cudaDeviceSynchronize());
-            for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-                (*iter)->updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, internal->getEnergy(), get_energy, RigidBodyInterpolationType, sys, sys_d);
+            #ifdef _OPENMP
+            omp_set_num_threads(4);
+            #endif
+            #pragma omp parallel for
+            for(int i = 0; i < numReplicas; ++i)
+                RBC[i]->updateForces((internal->getPos_d())+i*num, (internal->getForceInternal_d())+i*num, s, (internal->getEnergy())+i*num, get_energy, 
+                                       RigidBodyInterpolationType, sys, sys_d);
+            gpuErrchk(cudaDeviceSynchronize());
+
             if(rigidbody_dynamic == String("Langevin"))
             {
-                for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
+                #ifdef _OPENMP
+                omp_set_num_threads(4);
+                #endif
+                #pragma omp parallel for
+                for(int i = 0; i < numReplicas; ++i)
                 {
-                    (*iter)->SetRandomTorques();
-                    (*iter)->AddLangevin();
+                    RBC[i]->SetRandomTorques();
+                    RBC[i]->AddLangevin();
                 }
             }
         }//if step == 1
@@ -657,17 +691,28 @@ void GrandBrownTown::RunNoseHooverLangevin()
 
         if(rigidbody_dynamic == String("Langevin"))
         {
-            for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
+            #ifdef _OPENMP
+            omp_set_num_threads(4);
+            #endif
+            #pragma omp parallel for
+            for(int i = 0; i < numReplicas; ++i)
             {
-                (*iter)->integrateDLM(0);
-                (*iter)->integrateDLM(1);
+                RBC[i]->integrateDLM(0);
+                RBC[i]->integrateDLM(1);
             }
         }
         else
         {
-            for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-                (*iter)->integrate(s);
-            //RBC.print(s);
+            #ifdef _OPENMP
+            omp_set_num_threads(4);
+            #endif
+            #pragma omp parallel for ordered
+            for(int i = 0; i < numReplicas; ++i)
+            {
+                RBC[i]->integrate(s);
+                #pragma omp ordered
+                RBC[i]->print(s);
+            }
         }
         gpuErrchk(cudaDeviceSynchronize());
 
@@ -743,8 +788,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
         }
         if (imd_on && clientsock)
             internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD
-        for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter) 
-            (*iter)->clearForceAndTorque();
+        #ifdef _OPENMP
+        omp_set_num_threads(4);
+        #endif
+        #pragma omp parallel for
+        for(int i = 0; i < numReplicas; ++i) 
+            RBC[i]->clearForceAndTorque();
         gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
         if (interparticleForce)
         {
@@ -757,8 +806,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                         if (s % decompPeriod == 0)
                         {
                             internal -> decompose();
-                            for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-                                (*iter)->updateParticleLists( internal->getPos_d(), sys_d);
+                            #ifdef _OPENMP
+                            omp_set_num_threads(4);
+                            #endif
+                            #pragma omp parallel for
+                            for(int i = 0; i < numReplicas; ++i)
+                                RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d);
                         }
                         internal -> computeTabulated(get_energy);
                         break;
@@ -776,8 +829,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                             if (s % decompPeriod == 0)
                             {
                                internal->decompose();
-                               for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-                                   (*iter)->updateParticleLists( internal->getPos_d(), sys_d);
+                               #ifdef _OPENMP
+                               omp_set_num_threads(4);
+                               #endif
+                               #pragma omp parallel for
+                               for(int i = 0; i < numReplicas; ++i)
+                                   RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d);
                             }
                             internal->compute(get_energy);
                             break;
@@ -797,8 +854,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
         }
         gpuErrchk(cudaDeviceSynchronize());
         //compute the force for rigid bodies
-        for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-            (*iter)->updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, internal->getEnergy(), get_energy, RigidBodyInterpolationType, sys, sys_d);
+        #ifdef _OPENMP
+        omp_set_num_threads(4);
+        #endif
+        #pragma omp parallel for
+        for(int i = 0; i < numReplicas; ++i)
+            RBC[i]->updateForces((internal->getPos_d())+i*num, (internal->getForceInternal_d())+i*num, s, (internal->getEnergy())+i*num, get_energy, 
+                                 RigidBodyInterpolationType, sys, sys_d);
         gpuErrchk(cudaDeviceSynchronize());
 
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
@@ -809,12 +871,17 @@ void GrandBrownTown::RunNoseHooverLangevin()
 
         if(rigidbody_dynamic == String("Langevin"))
         {
-            for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
+            #ifdef _OPENMP
+            omp_set_num_threads(4);
+            #endif
+            #pragma omp parallel for ordered
+            for(int i = 0; i < numReplicas; ++i)
             {
-                (*iter)->SetRandomTorques();
-                (*iter)->AddLangevin();
-                (*iter)->integrateDLM(2);
-                (*iter)->print(s);
+                RBC[i]->SetRandomTorques();
+                RBC[i]->AddLangevin();
+                RBC[i]->integrateDLM(2);
+                #pragma omp ordered
+                RBC[i]->print(s);
             }
         }
 
@@ -896,8 +963,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 
                 if(rigidbody_dynamic == String("Langevin"))
                 {
-                    for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
-                        (*iter)->KineticEnergy();
+                    #ifdef _OPENMP
+                    omp_set_num_threads(4);
+                    #endif
+                    #pragma omp parallel for
+                    for(int i = 0; i < numReplicas; ++i)
+                        RBC[i]->KineticEnergy();
                 }
                 std::fstream rb_energy_file;
                 rb_energy_file.open("rb_energy_config.txt", std::fstream::out | std::fstream::app);
@@ -906,12 +977,16 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     float k_tol = 0.f;
                     float v_tol = 0.f;
                     float (RigidBody::*func_ptr)();
-                    for(std::vector<RigidBodyController*>::iterator iter = RBC.begin(); iter != RBC.end(); ++iter)
+                    #ifdef _OPENMP
+                    omp_set_num_threads(4);
+                    #endif
+                    #pragma omp parallel for private(func_ptr) reduction(+:k_tol,v_tol)
+                    for(int i = 0; i < numReplicas; ++i)
                     {
                         func_ptr = &RigidBody::getKinetic;
-                        k_tol += (*iter)->getEnergy(func_ptr);
+                        k_tol += RBC[i]->getEnergy(func_ptr);
                         func_ptr = &RigidBody::getEnergy;
-                        v_tol += (*iter)->getEnergy(func_ptr);
+                        v_tol += RBC[i]->getEnergy(func_ptr);
                     }
                     rb_energy_file << "Kinetic Energy "   << k_tol/numReplicas << " (kT)" << std::endl;
                     rb_energy_file << "Potential Energy " << v_tol/numReplicas << " (kcal/mol)" << std::endl;
diff --git a/src/Makefile b/src/Makefile
index 02f2afc..759562e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -4,12 +4,18 @@ INCLUDE = $(CUDA_PATH)/include
 
 
 ifeq ($(dbg),1)
-	NV_FLAGS += -g -G
-	EX_FLAGS = -g -O0 -m$(OS_SIZE)
+	NV_FLAGS += -g -G 
+	EX_FLAGS = -g -m$(OS_SIZE)
 else
+	NV_FLAGS = -Xptxas -O3
 	EX_FLAGS = -O3 -m$(OS_SIZE)
 endif
 
+ifeq ($(omp),1)
+	NV_FLAGS += -Xcompiler -fopenmp
+	CC_FLAGS += -fopenmp
+endif
+
 ## Determine the version; TODO add tags for versions
 ifndef (VERSION)
 ifeq ($(shell git status >& /dev/null && echo true),true)
@@ -25,9 +31,9 @@ CC_FLAGS += -DVERSION="\"$(VERSION)\""
 
 
 CC_FLAGS += -I$(CUDA_PATH)/include
-CC_FLAGS += -Wall -Wno-write-strings -std=c++0x -pedantic # TODO: test on Mac OSX and other architectures
+CC_FLAGS += -Wall -Wno-write-strings -std=c++0x -pedantic# TODO: test on Mac OSX and other architectures
 ifeq ($(dbg),1)
-NV_FLAGS += -Xcompiler -rdynamic -lineinfo
+NV_FLAGS += -lineinfo
 else
 NV_FLAGS += -lineinfo
 endif
@@ -69,12 +75,14 @@ CC_OBJ := $(patsubst %.cpp, %.o, $(CC_SRC))
 CU_OBJ := $(patsubst %.cu, %.o, $(CU_SRC))
 
 ### Targets
-ifeq ($(dbg),1)
-TARGET = arbd_dbg
-else
-# TARGET = arbd_lin_cubic
 TARGET = arbd
+ifeq ($(dbg),1)
+TARGET = $(TARGET)_dbg
 endif
+ifeq ($(omp),1)
+TARGET = $(TARGET)_omp
+endif
+
 all: $(TARGET)
 	@echo "Done ->" $(TARGET)
 
diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu
index 09178c6..3cab4c6 100644
--- a/src/RigidBodyController.cu
+++ b/src/RigidBodyController.cu
@@ -401,7 +401,7 @@ void RigidBodyController::integrate(int step)
 					rb.integrate(1);	
 				}
 			}
-			print(step);
+			//print(step);
 
 			// start this cycle
 			/*for (int i = 0; i < rigidBodyByType.size(); i++) {
-- 
GitLab