From 90d19a953063bd66aa8e44f0fc93ea6af5ea29e9 Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <aejjeh@tyler.cs.illinois.edu>
Date: Fri, 17 Jan 2020 19:03:24 -0600
Subject: [PATCH] DFG2LLVM_NVPTX: lower memcpy in kernels and rewrite float-as-int load/store patterns

---
 .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         | 1481 ++++++++++-------
 1 file changed, 925 insertions(+), 556 deletions(-)

diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 0ee18394ba..c85a8a4f2d 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -1130,7 +1130,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // constant memory, subject to size of course
   std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
 
-  F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, CONSTANT_ADDRSPACE);
+  F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE);
 
@@ -1416,350 +1416,720 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       default:
         llvm_unreachable("Unknown VISC Intrinsic!");
         break;
-      }
+			}
+
+		}
+		else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
+			IRBuilder<> Builder(I);
+			Value *Source = MemCpyI->getSource();
+			Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
+			Value *Length = MemCpyI->getOperand(2);
+			DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
+			DEBUG(errs() << "Source: " << *Source << "\n"); 
+			DEBUG(errs() << "Destination: " << *Destination << "\n"); 
+			DEBUG(errs() << "Length: " << *Length << "\n");
+
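+			// Lower the memcpy into explicit loads and stores so the kernel code does
+			// not rely on a memcpy call: a constant-length copy is expanded into one
+			// load/store pair per element of the source pointer's element type.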
+			size_t memcpy_length;
+			unsigned int memcpy_count;
+			if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) {
+				if (CI->getBitWidth() <= 64) {
+					memcpy_length = CI->getSExtValue();
+					DEBUG(errs() << "Memcpy length = " << memcpy_length << "\n");
+					Type *Source_Type = Source->getType()->getPointerElementType();
+					DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
+					memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
+					DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
+					if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) {
+						if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) {
+							Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
+							Value *DestPtrOperand = destGEPI->getPointerOperand();
+							for(int i = 0; i < memcpy_count; ++i) {
+								Constant *increment;
+								LoadInst *newLoadI;
+								StoreInst *newStoreI;
+								// First, need to increment the correct index for both source and dest 
+								// This involves checking how many indices the GEP has
+								// Assume for now only 1 or 2 are the viable options.
+
+								std::vector<Value*> GEPlIndex;
+								if (sourceGEPI->getNumIndices() == 1) {
+									Value *Index = sourceGEPI->getOperand(1);      
+									increment = ConstantInt::get(Index->getType(), i, false);
+									Value *incAdd = Builder.CreateAdd(Index, increment);
+									DEBUG(errs() << "Add: " << *incAdd << "\n");
+									GEPlIndex.push_back(incAdd);
+									Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex));
+									DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
+									newLoadI = Builder.CreateLoad(newGEPIl);
+									DEBUG(errs() << "Load: " << *newLoadI << "\n");
+								} else { 
+									llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n");
+								}
+
+
+								std::vector<Value*> GEPsIndex;
+								if (destGEPI->getNumIndices() == 1) {
+
+								} else if (destGEPI->getNumIndices() == 2) {
+									Value *Index0 = destGEPI->getOperand(1);      
+									GEPsIndex.push_back(Index0);
+									Value *Index1 = destGEPI->getOperand(2);      
+									increment = ConstantInt::get(Index1->getType(), i, false);
+									Value *incAdd = Builder.CreateAdd(Index1, increment);
+									DEBUG(errs() << "Add: " << *incAdd << "\n");
+									GEPsIndex.push_back(incAdd);
+									Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex));
+									DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
+									newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile());
+									DEBUG(errs() << "Store: " << *newStoreI << "\n");
+								} else {
+									llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n");
+								}
+							}
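+							// The original GEPs, bitcasts, and the memcpy itself are now dead;
+							// queue them for removal once the expansion is in place.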
+							IItoRemove.push_back(sourceGEPI);
+							IItoRemove.push_back(destGEPI);
+							Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
+							Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
+							IItoRemove.push_back(destBitcastI);
+							IItoRemove.push_back(sourceBitcastI);
+							IItoRemove.push_back(MemCpyI);
+						}
+					}
+
+				}
+			} else {
+				llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
+			}
+			//      llvm_unreachable("HERE!");
+		}
+
+		else if(CallInst* CI = dyn_cast<CallInst>(I)) {
+			DEBUG(errs() << "Found a call: " << *CI << "\n");
+			Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
+			if(calleeF->isDeclaration()) {
+				// Add the declaration to kernel module
+				DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
+				KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
+				if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
+					// Now handle a few specific intrinsics
+					// For now, sin and cos are translated to their libclc equivalent
+					switch(II->getIntrinsicID()) {
+						case Intrinsic::sin:
+						case Intrinsic::cos:
+							{
+								DEBUG(errs() << "Found sincos: " << *II << "\n");
+								// Get the libclc function
+								// libclc uses mangled name for sin cos
+								assert(II->getType()->isFloatTy()
+										&& "Only handling sin(float) and cos(float)!");
+								std::string name;
+								if(II->getIntrinsicID() == Intrinsic::sin)
+									name = "sin";
+								else
+									name = "cos";
+
+								FunctionType* SinCosFT = FunctionType::get(II->getType(),
+										Type::getFloatTy(KernelM->getContext()),
+										false);
+								FunctionCallee LibclcFunction = KernelM->getOrInsertFunction(name, SinCosFT);
+								CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II);
+
+								II->replaceAllUsesWith(CI);
+								IItoRemove.push_back(II);
+								break;
+							}
+						case Intrinsic::floor:
+							{
+								DEBUG(errs() << "Found floor intrinsic\n");
+								F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f);
+								FunctionType* FTy = F->getFunctionType();
+								DEBUG(errs() << *F << "\n");
+
+								// Create argument list
+								std::vector<Value*> args;
+								assert(CI->getNumArgOperands() == FTy->getNumParams()
+										&& "Number of arguments of call do not match with Intrinsic");
+								for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+									Value* V = CI->getArgOperand(i);
+									// Either the type should match or both should be of pointer type
+									assert((V->getType() == FTy->getParamType(i) ||
+												(V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
+											&& "Dummy function call argument does not match with Intrinsic argument!");
+									// If the types do not match, then both must be pointer type and pointer
+									// cast needs to be performed
+									if(V->getType() != FTy->getParamType(i)) {
+										V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
+									}
+									args.push_back(V);
+								}
+								// Insert call instruction
+								CallInst* Inst = CallInst::Create(F, args,
+										F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
+								DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
+								CI->replaceAllUsesWith(Inst);
+								IItoRemove.push_back(II);
+								break;
+							}
+						default:
+							errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ;
+					}
+				}
+
+			}
+			else {
+				// Check if the called function has already been cloned before.
+				Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
+				// Iterate over the new function to see if it calls any other functions
+				// in the module.
+				for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
+					if(auto *Call = dyn_cast<CallInst>(&*i)) {
+						Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
+						CloneAndReplaceCall(Call, CalledFunc);
+					}
+				}
+			}
+			//TODO: how to handle address space qualifiers in load/store
+		}
+
+	}
+  // Search for the pattern where a float pointer is cast to an int pointer and loaded/stored, and rewrite it.
+  DEBUG(errs() << "finding pattern for replacement!\n");
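+  // The pattern is: GEP on a float pointer, bitcast to i32*, i32 load, then an
+  // i32 store through a bitcast of another float GEP. It is rewritten as two
+  // float GEPs with a direct float load/store, so the kernel no longer
+  // reinterprets float data as integers.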
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
+    bool cont = false;
+    bool keepGEPI = false;
+    bool keepGEPI2= false;
+    Instruction *I = &(*i);
+    GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I);
 
+    if (!GEPI) {
+      // did not find pattern start, continue
+      continue;
+    }
+    // may have found pattern, check
+    DEBUG(errs() << "GEPI " << *GEPI << "\n");
+    // print whatever we want for debug
+    Value* PtrOp = GEPI->getPointerOperand();
+    Type *SrcTy = GEPI->getSourceElementType();
+    unsigned GEPIaddrspace = GEPI->getAddressSpace();
+
+    if (SrcTy->isArrayTy()) 
+      DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n");
+    else
+      DEBUG(errs() << *SrcTy << " is not an array type!\n");
+    // check that source element type is float
+    if (SrcTy->isArrayTy()) {
+      if (!(SrcTy->getArrayElementType()->isFloatTy())) {
+        DEBUG(errs() << "GEPI type is array but not float!\n");
+        continue;
+      }
+    }
+    else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) {
+      DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
+      // does not fit this pattern - no float GEP instruction
+      continue;
+    }
+    // check that addressspace is 1
+    //	  if (GEPIaddrspace != 1) {
+    //			// does not fit this pattern - addrspace of pointer argument is not global
+    //			continue;
+    //		}
+    if (!(GEPI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      //continue;
+      // Keep GEPI around if it has other uses
+      keepGEPI = true;
     }
-    else if(CallInst* CI = dyn_cast<CallInst>(I)) {
-      DEBUG(errs() << "Found a call: " << *CI << "\n");
-      Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
-      if(calleeF->isDeclaration()) {
-        // Add the declaration to kernel module
-        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
-        KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
-        if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
-          // Now handle a few specific intrinsics
-          // For now, sin and cos are translated to their libclc equivalent
-          switch(II->getIntrinsicID()) {
-          case Intrinsic::sin:
-          case Intrinsic::cos:
-          {
-            DEBUG(errs() << "Found sincos: " << *II << "\n");
-            // Get the libclc function
-            // libclc uses mangled name for sin cos
-            assert(II->getType()->isFloatTy()
-                   && "Only handling sin(float) and cos(float)!");
-            std::string name;
-            if(II->getIntrinsicID() == Intrinsic::sin)
-              name = "sin";
-            else
-              name = "cos";
-
-            FunctionType* SinCosFT = FunctionType::get(II->getType(),
-                                     Type::getFloatTy(KernelM->getContext()),
-                                     false);
-            FunctionCallee LibclcFunction = KernelM->getOrInsertFunction(name, SinCosFT);
-            CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II);
-
-            II->replaceAllUsesWith(CI);
-            IItoRemove.push_back(II);
-            break;
-          }
-          case Intrinsic::floor:
-          {
-            DEBUG(errs() << "Found floor intrinsic\n");
-            F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f);
-            FunctionType* FTy = F->getFunctionType();
-            DEBUG(errs() << *F << "\n");
-
-            // Create argument list
-            std::vector<Value*> args;
-            assert(CI->getNumArgOperands() == FTy->getNumParams()
-                   && "Number of arguments of call do not match with Intrinsic");
-            for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
-              Value* V = CI->getArgOperand(i);
-              // Either the type should match or both should be of pointer type
-              assert((V->getType() == FTy->getParamType(i) ||
-                     (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
-                     && "Dummy function call argument does not match with Intrinsic argument!");
-              // If the types do not match, then both must be pointer type and pointer
-              // cast needs to be performed
-              if(V->getType() != FTy->getParamType(i)) {
-                V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
-              }
-              args.push_back(V);
-            }
-            // Insert call instruction
-            CallInst* Inst = CallInst::Create(F, args,
-                  F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
-            DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
-            CI->replaceAllUsesWith(Inst);
-            IItoRemove.push_back(II);
-            break;
-          }
-          default:
-            errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ;
-          }
+    DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
+
+    // 1st GEPI it has one use
+    //		assert(GEPI->hasOneUse() && "GEPI has a single use");
+
+    // See if it is a bitcast
+    BitCastInst *BitCastI;
+    for (User * U : GEPI->users()) {
+      if(Instruction *ui = dyn_cast<Instruction> (U)) { 
+        DEBUG(errs() << "--" << *ui << "\n");
+        if (isa<BitCastInst>(ui)) {
+          BitCastI = dyn_cast<BitCastInst>(ui);
+          DEBUG(errs() << "---Found bitcast as only use of GEP\n");
+          break;
         }
-
       }
-      else {
-      // Check if the called function has already been cloned before.
-        Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
-        // Iterate over the new function to see if it calls any other functions
-        // in the module.
-        for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
-          if(auto *Call = dyn_cast<CallInst>(&*i)) {
-            Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
-            CloneAndReplaceCall(Call, CalledFunc);
-          }
+      DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
+      cont = true;
+    }
+    //		for (Value::user_iterator ui = GEPI->user_begin(),
+    //				ue = GEPI->user_end(); ui!=ue; ++ui) {
+    //        DEBUG(errs() << "--" << *ui << "\n");
+    //			if (isa<BitCastInst>(*ui)) {
+    //				BitCastI = dyn_cast<BitCastInst>(*ui);
+    //        DEBUG(errs() << "Found bitcast as only use of GEP\n");
+    //			}
+    //		}
+
+    if (cont/*!BitCastI*/) {
+      continue; // not in pattern
+    }
+
+    //    DEBUG(errs() << *BitCastI << "\n");
+    // Otherwise, check that the bitcast's destination type is i32*; its operand is necessarily the GEP, since this instruction is a use of the GEP.
+    Value *Op2 = BitCastI->getOperand(0);
+    DEBUG(errs() << "----" << *Op2 << "\n");
+    //		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
+    //		Type *OpTy = cast<Type>(Op2);
+    Type *OpTy = BitCastI->getDestTy();
+    DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
+    //    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n");
+    if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
+      // maybe right syntax is (Type::getInt32Ty)->getPointerTo()
+      continue; // not in pattern
+    }
+
+    DEBUG(errs() << "----Here!\n");
+    // We are in GEP, bitcast.
+
+    // user_iterator, to find the load.
+
+    if (!(BitCastI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      continue;
+    }
+    DEBUG(errs() << "----Bitcast has one use!\n");
+    // it has one use
+    assert(BitCastI->hasOneUse() && "BitCastI has a single use");
+    LoadInst *LoadI;
+    for (User * U : BitCastI->users()) { 
+      if (Instruction *ui = dyn_cast<Instruction> (U)) {
+        DEBUG(errs() << "-----" << *ui << "\n");
+        if (isa<LoadInst>(ui)) {
+          LoadI = dyn_cast<LoadInst>(ui);
+          DEBUG(errs() << "-----Found load as only use of bitcast\n");
+          break;
         }
       }
-      //TODO: how to handle address space qualifiers in load/store
+      DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
+      cont = true;
+    }
+    //		for (Value::user_iterator ui = BitCastI->user_begin(),
+    //				ue = BitCastI->user_end(); ui!=ue; ++ui) {
+    //			if (isa<LoadInst>(*ui)) {
+    //				LoadI = dyn_cast<LoadInst>(*ui);
+    //        errs() << "Found load as only use of bitcast\n";
+    //			}
+    //		}
+
+    if (cont) {
+      continue; // not in pattern
     }
 
-  }
+    DEBUG("HERE!\n");
+    // Check that we load from the pointer we got from the bitcast: the load's pointer operand must be the bitcast we found above.
+    assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n");
 
-  // We need to do this explicitly: DCE pass will not remove them because we
-  // have assumed theworst memory behaviour for these function calls
-  // Traverse the vector backwards, otherwise definitions are deleted while
-  // their subsequent uses are still around
-  for (auto *I : reverse(IItoRemove)) {
-    DEBUG(errs() << "Erasing: " << *I << "\n");
-    I->eraseFromParent();
-  }
+    // Follow the load's users to find the store.
 
- // Removed the cloned functions from the parent module into the new module 
-  for(auto *F : FuncToBeRemoved) {
-    F->removeFromParent(); //TODO: MARIA check
-    KernelM->getFunctionList().push_back(F);
-  }
+    if (!(LoadI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      continue;
+      // TODO: generalize: one load can have more than one store users
+    }
+
+    // it has one use
+    assert(LoadI->hasOneUse() && "LoadI has a single use");
+    Value::user_iterator ui = LoadI->user_begin();
+    // skipped the loop, because it has a single use
+    StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
+    if (!StoreI) {
+      continue; // not in pattern
+    }
+
+    // Also check that the store uses the loaded value as the value operand
+    if (StoreI->getValueOperand() != LoadI) {
+      continue;
+    }
+
+    DEBUG(errs() << "-------Found store instruction\n");
+
+    // Look for its bitcast, which is its pointer operand
+    Value *StPtrOp = StoreI->getPointerOperand();
+    DEBUG(errs() << "-------" << *StPtrOp << "\n");
+    BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
+    DEBUG(errs() << "-------" << *BitCastI2 << "\n");
+    if (!BitCastI2) {
+      continue; //not in pattern
+    }
+
+    DEBUG(errs() << "-------- Found Bit Cast of store!\n" );
+    // found bitcast. Look for the second GEP, its from operand.
+    Value *BCFromOp = BitCastI2->getOperand(0);
+    GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
+    DEBUG(errs() << "---------- " << *GEPI2 << "\n");
+    if (!GEPI2) {
+      continue; //not in pattern
+    }
+
+    if (!(GEPI2->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      //continue;
+      // Keep GEPI around if it has other uses
+      keepGEPI2 = true;
+    }
+    DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); 
 
-  addCLMetadata(F_nvptx);
-  kernel->KernelFunction = F_nvptx;
-  errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
-  DEBUG(errs() << *KernelM);
+    Value *PtrOp2 = GEPI2->getPointerOperand();
+
+    // Found GEPI2. TODO: determine what additional checks are needed here; the int-float type checks done above may need to be repeated for this GEP.
+
+    // Assume we found pattern
+    if (!keepGEPI) {  
+      IItoRemove.push_back(GEPI);
+      DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
+    } else {
+      DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
+    }
+    IItoRemove.push_back(BitCastI);
+    DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
+    IItoRemove.push_back(LoadI);
+    DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
+    IItoRemove.push_back(GEPI2);
+    DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
+    IItoRemove.push_back(BitCastI2);
+    DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
+    if (!keepGEPI2) {
+      IItoRemove.push_back(StoreI);
+      DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
+    } else {
+
+      DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n");
+    }
+
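+    // Collect the indices of both original GEPs so the replacement float GEPs
+    // can reuse them unchanged.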
+    std::vector<Value*> GEPlIndex;
+    if (GEPI->hasIndices()) {
+      for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
+        Value *Index = dyn_cast<Value>(&*ii);
+        DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
+        GEPlIndex.push_back(Index);
+      }
+    }
+    //    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
+
+    std::vector<Value*> GEPsIndex;
+    if (GEPI2->hasIndices()) {
+      for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
+        Value *Index = dyn_cast<Value>(&*ii);
+        DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
+        GEPsIndex.push_back(Index);
+      }
+    }
+    //    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
+
+
+
+    //    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
+    GetElementPtrInst* newlGEP =
+      GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()),
+          PtrOp, // operand from 1st GEP
+          ArrayRef<Value*>(GEPlIndex),
+          Twine(),
+          StoreI);
+    DEBUG(errs() << "Adding: " << *newlGEP << "\n");
+    // insert the new load before the original store
+    LoadInst *newLoadI =
+      new LoadInst(Type::getFloatTy(M.getContext()),
+          newlGEP, // new GEP
+          Twine(),
+          LoadI->isVolatile(),
+          LoadI->getAlignment(),
+          LoadI->getOrdering(),
+          LoadI->getSyncScopeID(),
+          StoreI);
+    DEBUG(errs() << "Adding: " << *newLoadI << "\n");
+    // Same for the GEP that feeds the store's pointer operand
+    GetElementPtrInst* newsGEP =
+      GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+          PtrOp2, // operand from 2nd GEP
+          ArrayRef<Value*>(GEPsIndex),
+          Twine(),
+          StoreI);
+    DEBUG(errs() << "Adding: " << *newsGEP << "\n");
+    // insert the new store before the original store
+    StoreInst *newStoreI =
+      new StoreInst(newLoadI,
+          newsGEP, // new GEP
+          StoreI->isVolatile(),
+          StoreI->getAlignment(),
+          StoreI->getOrdering(),
+          StoreI->getSyncScopeID(),
+          StoreI);
+    DEBUG(errs() << "Adding: " << *newStoreI << "\n");
 
-  return;
+  }
+
+	// We need to do this explicitly: DCE pass will not remove them because we
+	// have assumed the worst memory behaviour for these function calls
+	// Traverse the vector backwards, otherwise definitions are deleted while
+	// their subsequent uses are still around
+	for (auto *I : reverse(IItoRemove)) {
+		DEBUG(errs() << "Erasing: " << *I << "\n");
+		I->eraseFromParent();
+	}
+
+	// Move the cloned functions from the parent module into the kernel module
+	for(auto *F : FuncToBeRemoved) {
+		F->removeFromParent(); //TODO: MARIA check
+		KernelM->getFunctionList().push_back(F);
+	}
+
+	addCLMetadata(F_nvptx);
+	kernel->KernelFunction = F_nvptx;
+	errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
+	DEBUG(errs() << *KernelM);
+
+	return;
 }
 
 bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
-  errs() << "\nDFG2LLVM_NVPTX PASS\n";
+	errs() << "\nDFG2LLVM_NVPTX PASS\n";
 
-  // Get the BuildDFG Analysis Results:
-  // - Dataflow graph
-  // - Maps from i8* hansles to DFNode and DFEdge
-  BuildDFG &DFG = getAnalysis<BuildDFG>();
+	// Get the BuildDFG Analysis Results:
+	// - Dataflow graph
+	// - Maps from i8* handles to DFNode and DFEdge
+	BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-  // DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode*> Roots = DFG.getRoots();
-  //    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-  //    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+	// DFInternalNode *Root = DFG.getRoot();
+	std::vector<DFInternalNode*> Roots = DFG.getRoots();
+	//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+	//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
-  // Visitor for Code Generation Graph Traversal
-  CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+	// Visitor for Code Generation Graph Traversal
+	CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
 
-  // Iterate over all the DFGs and produce code for each one of them
-  for (auto rootNode: Roots) {
-    // Initiate code generation for root DFNode
-    CGTVisitor->visit(rootNode);
-  }
+	// Iterate over all the DFGs and produce code for each one of them
+	for (auto rootNode: Roots) {
+		// Initiate code generation for root DFNode
+		CGTVisitor->visit(rootNode);
+	}
 
-  CGTVisitor->writeKernelsModule();
+	CGTVisitor->writeKernelsModule();
 
-  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
-  delete CGTVisitor;
+	//TODO: Edit module epilogue to remove the VISC intrinsic declarations
+	delete CGTVisitor;
 
-  return true;
+	return true;
 }
 
 std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
-  /*SmallString<128> currentDir;
-  llvm::sys::fs::current_path(currentDir);
-  std::string fileName = getFilenameFromModule(M);
-  Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-  return output.str().append(".kernels.ll");*/
-  std::string mid = M.getModuleIdentifier();
-  return mid.append(".kernels.ll");
+	/*SmallString<128> currentDir;
+		llvm::sys::fs::current_path(currentDir);
+		std::string fileName = getFilenameFromModule(M);
+		Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+		return output.str().append(".kernels.ll");*/
+	std::string mid = M.getModuleIdentifier();
+	return mid.append(".kernels.ll");
 }
 
 void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) {
-  assert(isa<PointerType>(V->getType())
-         && "Value should be of Pointer Type!");
-  PointerType* OldTy = cast<PointerType>(V->getType());
-  PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-  V->mutateType(NewTy);
-  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
-    // Change all uses producing pointer type in same address space to new
-    // addressspace.
-    if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
-      if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-        fixValueAddrspace(*ui, addrspace);
-      }
-    }
-  }
+	assert(isa<PointerType>(V->getType())
+			&& "Value should be of Pointer Type!");
+	PointerType* OldTy = cast<PointerType>(V->getType());
+	PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+	V->mutateType(NewTy);
+	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
+		// Change all uses producing pointer type in same address space to new
+		// addressspace.
+		if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
+			if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+				fixValueAddrspace(*ui, addrspace);
+			}
+		}
+	}
 }
 
 
 std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) {
-  std::vector<unsigned> ConstantMemArgs;
-   for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-     Argument* arg = &*ai; 
-    std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
-        GlobalMemArgs->end(), arg->getArgNo());
-    // It has to be a global memory argument to be promotable
-    if(pos == GlobalMemArgs->end())
-      continue;
-
-    // Check if it can/should be promoted
-    if(canBePromoted(arg, F)) {
-      errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n";
-      ConstantMemArgs.push_back(arg->getArgNo());
-      GlobalMemArgs->erase(pos);
-    }
-  }
-  return ConstantMemArgs;
+	std::vector<unsigned> ConstantMemArgs;
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Argument* arg = &*ai; 
+		std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
+				GlobalMemArgs->end(), arg->getArgNo());
+		// It has to be a global memory argument to be promotable
+		if(pos == GlobalMemArgs->end())
+			continue;
+
+		// Check if it can/should be promoted
+		if(canBePromoted(arg, F)) {
+			errs() << "Promoting " << arg->getName() << " to constant memory." << "\n";
+			ConstantMemArgs.push_back(arg->getArgNo());
+			GlobalMemArgs->erase(pos);
+		}
+	}
+	return ConstantMemArgs;
 }
 
 Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
-  unsigned idx = 0;
-  std::vector<Type*> ArgTypes;
-  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    Argument *arg = &*ai;
-    DEBUG(errs() << *arg << "\n");
-    unsigned argno = arg->getArgNo();
-    if ((idx < Args.size()) && (argno == Args[idx])) {
-      fixValueAddrspace(arg, addrspace);
-      idx++;
-    }
-    ArgTypes.push_back(arg->getType());
-  }
-  FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
-
-  //F->mutateType(PTy);
-  Function* newF = cloneFunction(F, newFT, false);
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-
-  DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
-  return newF;
+	unsigned idx = 0;
+	std::vector<Type*> ArgTypes;
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Argument *arg = &*ai;
+		DEBUG(errs() << *arg << "\n");
+		unsigned argno = arg->getArgNo();
+		if ((idx < Args.size()) && (argno == Args[idx])) {
+			fixValueAddrspace(arg, addrspace);
+			idx++;
+		}
+		ArgTypes.push_back(arg->getType());
+	}
+	FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
+
+	//F->mutateType(PTy);
+	Function* newF = cloneFunction(F, newFT, false);
+	replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+	DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
+	return newF;
 }
 
 /* Add metadata to module KernelM, for OpenCL kernels */
 void CGT_NVPTX::addCLMetadata(Function *F) {
 
-  IRBuilder<> Builder(&*F->begin());
+	IRBuilder<> Builder(&*F->begin());
 
-  SmallVector<Metadata*,8> KernelMD;
-  KernelMD.push_back(ValueAsMetadata::get(F));
+	SmallVector<Metadata*,8> KernelMD;
+	KernelMD.push_back(ValueAsMetadata::get(F));
 
-  // TODO: There is additional metadata used by kernel files but we skip them as
-  // they are not mandatory. In future they might be useful to enable
-  // optimizations
+	// TODO: There is additional metadata used by kernel files but we skip them as
+	// they are not mandatory. In future they might be useful to enable
+	// optimizations
 
-  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
-  NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
-  MDN_kernels->addOperand(MDKernelNode);
+	MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+	NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
+	MDN_kernels->addOperand(MDKernelNode);
 
-  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
-  // TODO: Replace 1 with the number of the kernel.
-  // Add when support for multiple launces is added
-  KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
-  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
-  NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
-  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+	KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+	// TODO: Replace 1 with the number of the kernel.
+	// Add when support for multiple launches is added
+	KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
+	MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+	NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+	MDN_annotations->addOperand(MDNvvmAnnotationsNode);
 
 }
 
 void CGT_NVPTX::writeKernelsModule() {
 
-  // In addition to deleting all other functions, we also want to spiff it
-  // up a little bit.  Do this now.
-  legacy::PassManager Passes;
+	// In addition to deleting all other functions, we also want to spiff it
+	// up a little bit.  Do this now.
+	legacy::PassManager Passes;
 
-  errs() << "Writing to File --- ";
-  errs() << getKernelsModuleName(M).c_str() << "\n";
-  std::error_code EC;
-  ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
-  if (EC) {
-    errs() << EC.message() << '\n';
-  }
+	errs() << "Writing to File --- ";
+	errs() << getKernelsModuleName(M).c_str() << "\n";
+	std::error_code EC;
+	ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
+	if (EC) {
+		errs() << EC.message() << '\n';
+	}
 
-  Passes.add(
-      createPrintModulePass(Out.os()));
+	Passes.add(
+			createPrintModulePass(Out.os()));
 
-  Passes.run(*KernelM);
+	Passes.run(*KernelM);
 
-  // Declare success.
-  Out.keep();
+	// Declare success.
+	Out.keep();
 }
 
 Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 
-  DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
-  // FIXME: Maybe do that using the Node?
-  StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
-  assert(FRetTy && "Return Type must always be a struct");
-
-  // Keeps return statements, because we will need to replace them
-  std::vector<ReturnInst *> RItoRemove;
-  findReturnInst(F, RItoRemove);
-
-
-  // Check for { } return struct, which means that the function returns void
-  if (FRetTy->isEmptyTy()) {
-
-    DEBUG(errs() << "\tFunction output struct is void\n");
-    DEBUG(errs() << "\tNo parameters added\n");
-
-    // Replacing return statements with others returning void
-    for (auto *RI : RItoRemove) {
-      ReturnInst::Create((F->getContext()), 0, RI);
-      RI->eraseFromParent();
-    }
-    DEBUG(errs() << "\tChanged return statements to return void\n");
-  }
-  else {
-    // The struct has return values, thus needs to be converted to parameter
-
-    // Iterate over all element types of return struct and add arguments to the
-    // function
-    std::vector<Argument*> Args;
-    for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
-      Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
-      Args.push_back(RetArg);
-      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
-    }
-
-    DEBUG(errs() << "\tReplacing Return statements\n");
-    // Replace return statements with extractValue and store instructions
-    for (auto *RI : RItoRemove) {
-      Value* RetVal = RI->getReturnValue();
-      for(unsigned i = 0; i < Args.size(); i++) {
-        ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
-                               Args[i]->getName()+".val", RI);
-        new StoreInst(EI, Args[i], RI);
-      }
-      // assert(RetVal && "Return value should not be null at this point");
-      // StructType* RetType = cast<StructType>(RetVal->getType());
-      // assert(RetType && "Return type is not a struct");
-
-      ReturnInst::Create((F->getContext()), 0, RI);
-      RI->eraseFromParent();
-
-    }
-  }
-  DEBUG(errs() << "\tReplaced return statements\n");
-
-  // Create the argument type list with the added argument's type
-  std::vector<Type*> ArgTypes;
-  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    ArgTypes.push_back(ai->getType());
-  }
-
-  // Adding new arguments to the function argument list, would not change the
-  // function type. We need to change the type of this function to reflect the
-  // added arguments
-  Type* VoidRetType = Type::getVoidTy(F->getContext());
-  FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-
-  // Change the function type
-  //F->mutateType(PTy);
-  Function* newF = cloneFunction(F, newFT, false);
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-  //F->eraseFromParent();
-  return newF;
+	DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
+	// FIXME: Maybe do that using the Node?
+	StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
+	assert(FRetTy && "Return Type must always be a struct");
+
+	// Keeps return statements, because we will need to replace them
+	std::vector<ReturnInst *> RItoRemove;
+	findReturnInst(F, RItoRemove);
+
+	std::vector<Type *> RetArgTypes;
+	std::vector<Argument*> RetArgs;
+	std::vector<Argument*> Args;
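+	// Each element of the return struct becomes an extra pointer parameter;
+	// RetArgs/RetArgTypes collect them so they can be appended to the original
+	// arguments when the function is cloned below.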
+	// Check for { } return struct, which means that the function returns void
+	if (FRetTy->isEmptyTy()) {
+
+		DEBUG(errs() << "\tFunction output struct is void\n");
+		DEBUG(errs() << "\tNo parameters added\n");
+
+		// Replacing return statements with others returning void
+		for (auto *RI : RItoRemove) {
+			ReturnInst::Create((F->getContext()), 0, RI);
+			RI->eraseFromParent();
+		}
+		DEBUG(errs() << "\tChanged return statements to return void\n");
+	}
+	else {
+		// The struct has return values, thus needs to be converted to parameter
+
+		// Iterate over all element types of return struct and add arguments to the
+		// function
+		for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
+			Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+			RetArgs.push_back(RetArg);
+			RetArgTypes.push_back(RetArg->getType());
+			DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+		}
+
+		DEBUG(errs() << "\tReplacing Return statements\n");
+		// Replace return statements with extractValue and store instructions
+		for (auto *RI : RItoRemove) {
+			Value* RetVal = RI->getReturnValue();
+			for(unsigned i = 0; i < RetArgs.size(); i++) {
+				ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
+						RetArgs[i]->getName()+".val", RI);
+				new StoreInst(EI, RetArgs[i], RI);
+			}
+			// assert(RetVal && "Return value should not be null at this point");
+			// StructType* RetType = cast<StructType>(RetVal->getType());
+			// assert(RetType && "Return type is not a struct");
+
+			ReturnInst::Create((F->getContext()), 0, RI);
+			RI->eraseFromParent();
+
+		}
+	}
+	DEBUG(errs() << "\tReplaced return statements\n");
+
+	// Create the argument type list with the added argument's type
+	std::vector<Type*> ArgTypes;
+	for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		ArgTypes.push_back(ai->getType());
+	}
+	for(auto *RATy: RetArgTypes) {
+		ArgTypes.push_back(RATy);
+	}
+
+	// Creating Args vector to use in cloning!
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Args.push_back(&*ai);
+	}
+	for(auto *ai : RetArgs) {
+		Args.push_back(ai);
+	}
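+	// Args now holds the original parameters followed by the new return-pointer
+	// parameters; it is passed to cloneFunction below.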
+
+	// Adding new arguments to the function argument list, would not change the
+	// function type. We need to change the type of this function to reflect the
+	// added arguments
+	Type* VoidRetType = Type::getVoidTy(F->getContext());
+	FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+	// Change the function type
+	//F->mutateType(PTy);
+	Function* newF = cloneFunction(F, newFT, false, NULL, &Args);
+	replaceNodeFunctionInIR(*F->getParent(), F, newF);
+	//F->eraseFromParent();
+	return newF;
 }
 
 /******************************************************************************
@@ -1771,334 +2141,333 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 // 2. Loads not dependent on getNodeInstanceID itrinsic
 
 static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) {
-  if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
-    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-    return false;
-  }
-  VisitedList->push_back(V);
-  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
-      ui != ue; ++ui) {
-    Instruction* I = dyn_cast<Instruction>(*ui);
-    if(!I) {
-      // if use is not an instruction, then skip it
-      continue;
-    }
-    DEBUG(errs() << "\t" << *I << "\n");
-    if(isa<LoadInst>(I)) {
-      DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
-      DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
-      UseList->push_back(V);
-    }
-    else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
-      // found a store in use chain
-      DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
-      return true;
-    }
-    else if(BuildDFG::isViscIntrinsic(I)) {
-      // If it is an atomic intrinsic, we found a store
-      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-      assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
-          && "Only visc atomic intrinsics can have an argument as input");
-      return true;
-    }
-    else {
-      DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
-      if(findLoadStoreUses(I, UseList, VisitedList))
-        return true;
-    }
-  }
- return false;
+	if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
+		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+		return false;
+	}
+	VisitedList->push_back(V);
+	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
+			ui != ue; ++ui) {
+		Instruction* I = dyn_cast<Instruction>(*ui);
+		if(!I) {
+			// if use is not an instruction, then skip it
+			continue;
+		}
+		DEBUG(errs() << "\t" << *I << "\n");
+		if(isa<LoadInst>(I)) {
+			DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
+			DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
+			UseList->push_back(V);
+		}
+		else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
+			// found a store in use chain
+			DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
+			return true;
+		}
+		else if(BuildDFG::isViscIntrinsic(I)) {
+			// If it is an atomic intrinsic, we found a store
+			IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+			assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
+					&& "Only visc atomic intrinsics can have an argument as input");
+			return true;
+		}
+		else {
+			DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
+			if(findLoadStoreUses(I, UseList, VisitedList))
+				return true;
+		}
+	}
+	return false;
 }
 
 static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) {
-  if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
-    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-    return false;
-  }
-  DependenceList->push_back(V);
-  // If not an instruction, then not dependent on node instance id
-  if(!isa<Instruction>(V) || isa<Constant>(V)) {
-    DEBUG(errs() << "\tStop\n");
-    return false;
-  }
-
-  Instruction* I = cast<Instruction>(V);
-  for(unsigned i = 0; i < I->getNumOperands(); i++) {
-    Value* operand = I->getOperand(i);
-    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
-      if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
-          || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
-            || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
-        Value* Node = II->getArgOperand(0);
-        IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
-        assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
-        if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
-          DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
-          return true;
-        }
-      }
-    }
-    if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
-      DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
-      continue;
-    }
-    DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
-    if(isDependentOnNodeInstanceID(operand, DependenceList)) {
-      return true;
-    }
-  }
-  return false;
+	if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
+		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+		return false;
+	}
+	DependenceList->push_back(V);
+	// If not an instruction, then not dependent on node instance id
+	if(!isa<Instruction>(V) || isa<Constant>(V)) {
+		DEBUG(errs() << "\tStop\n");
+		return false;
+	}
+
+	Instruction* I = cast<Instruction>(V);
+	for(unsigned i = 0; i < I->getNumOperands(); i++) {
+		Value* operand = I->getOperand(i);
+		if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
+			if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
+						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
+						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
+				Value* Node = II->getArgOperand(0);
+				IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
+				assert(GN && "NodeInstanceID operand should be node/parent node intrinsic\n");
+				if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
+					DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
+					return true;
+				}
+			}
+		}
+		if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
+			DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
+			continue;
+		}
+		DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
+		if(isDependentOnNodeInstanceID(operand, DependenceList)) {
+			return true;
+		}
+	}
+	return false;
 }
 
 // Function to check if argument arg can be changed to a constant memory pointer
 static bool canBePromoted(Argument* arg, Function* F) {
-  DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
-  std::vector<Value*> UseList;
-  std::vector<Value*> VisitedList;
-  // recursively traverse use chain
-  // if find a store instruction return false, everything fails, cannot be
-  // promoted
-  // if find a load instruction as use, add the GEP instruction to list
-  bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
-  if(foundStore == true)
-    return false;
-  // See that the GEP instructions are not dependent on getNodeInstanceID
-  // intrinsic
-  DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
-  std::vector<Value*>DependenceList;
-  for(auto U: UseList) {
-    if(isDependentOnNodeInstanceID(U, &DependenceList))
-      return false;
-  }
-  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
-  return true;
+	DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
+	std::vector<Value*> UseList;
+	std::vector<Value*> VisitedList;
+	// recursively traverse use chain
+	// if find a store instruction return false, everything fails, cannot be
+	// promoted
+	// if find a load instruction as use, add the GEP instruction to list
+	bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
+	if(foundStore == true)
+		return false;
+	// See that the GEP instructions are not dependent on getNodeInstanceID
+	// intrinsic
+	DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
+	std::vector<Value*>DependenceList;
+	for(auto U: UseList) {
+		if(isDependentOnNodeInstanceID(U, &DependenceList))
+			return false;
+	}
+	DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
+	return true;
 }
 
 
 // Calculate execute node parameters which include, number of diemnsions for
 // dynamic instances of the kernel, local and global work group sizes.
 static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
-                                 &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
-
-  // Assign number of dimenstions a constant value
-  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
-
-  // If local work group size if null
-  if(!kernel->hasLocalWG()) {
-    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
-  }
-  else {
-    for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-      if(isa<Argument>(kernel->localWGSize[i]))
-        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
-    }
-    LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
-  }
-
-  for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
-    if(isa<Argument>(kernel->globalWGSize[i]))
-      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
-  }
-
-  // For OpenCL, global work group size is the total bumber of instances in each
-  // dimension. So, multiply local and global dim limits.
-  std::vector<Value*> globalWGSizeInsts;
-  if(kernel->hasLocalWG()) {
-    for (unsigned i = 0; i < kernel->gridDim; i++) {
-      BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
-      globalWGSizeInsts.push_back(MulInst);
-    }
-  }
-  else {
-    globalWGSizeInsts = kernel->globalWGSize;
-  }
-  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
-  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+		&GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
+
+	// Assign the number of dimensions a constant value
+	workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+	// If local work group size is null
+	if(!kernel->hasLocalWG()) {
+		LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+	}
+	else {
+		for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+			if(isa<Argument>(kernel->localWGSize[i]))
+				kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+		}
+		LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+	}
+
+	for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+		if(isa<Argument>(kernel->globalWGSize[i]))
+			kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+	}
+
+	// For OpenCL, global work group size is the total number of instances in each
+	// dimension. So, multiply local and global dim limits.
+	std::vector<Value*> globalWGSizeInsts;
+	if(kernel->hasLocalWG()) {
+		for (unsigned i = 0; i < kernel->gridDim; i++) {
+			BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
+			globalWGSizeInsts.push_back(MulInst);
+		}
+	}
+	else {
+		globalWGSizeInsts = kernel->globalWGSize;
+	}
+	GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+	DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
 // CodeGen for allocating space for Work Group on stack and returning a pointer
 // to its address
 static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
-  Value* WGPtr;
-  // Get int64_t and or ease of use
-  Type* Int64Ty = Type::getInt64Ty(M.getContext());
-
-  // Work Group type is [#dim x i64]
-  Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
-  // Allocate space of Global work group data on stack and get pointer to
-  // first element.
-  AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
-  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
-  Value* nextDim = WGPtr;
-  DEBUG(errs() << *WGPtr << "\n");
-
-  // Iterate over the number of dimensions and store the global work group
-  // size in that dimension
-  for(unsigned i=0; i < WGSize.size(); i++) {
-    DEBUG(errs() << *WGSize[i] << "\n");
-    assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-
-    if(WGSize[i]->getType() != Int64Ty) {
-      // If number of dimensions are mentioned in any other integer format,
-      // generate code to extend it to i64. We need to use the mapped value in
-      // the new generated function, hence the use of VMap
-      // FIXME: Why are we changing the kernel WGSize vector here?
-      DEBUG(errs() << "Not i64. Zero extend required.\n");
-      DEBUG(errs() << *WGSize[i] << "\n");
-      CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
-      DEBUG(errs() << "Bitcast done.\n");
-      StoreInst* SI = new StoreInst(CI, nextDim, IB);
-      DEBUG(errs() << "Zero extend done.\n");
-      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
-    } else {
-      // Store the value representing work group size in ith dimension on
-      // stack
-      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
-
-      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
-    }
-    if(i+1 < WGSize.size()) {
-      // Move to next dimension
-      GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
-                               ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
-                               WG->getName()+"."+Twine(i+1),
-                               IB);
-      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
-      nextDim = GEP;
-    }
-  }
-  return WGPtr;
+	Value* WGPtr;
+	// Get int64_t type for ease of use
+	Type* Int64Ty = Type::getInt64Ty(M.getContext());
+
+	// Work Group type is [#dim x i64]
+	Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
+	// Allocate space for the work group data on the stack and get a pointer to
+	// its first element.
+	AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
+	WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
+	Value* nextDim = WGPtr;
+	DEBUG(errs() << *WGPtr << "\n");
+
+	// Iterate over the number of dimensions and store the global work group
+	// size in that dimension
+	for(unsigned i=0; i < WGSize.size(); i++) {
+		DEBUG(errs() << *WGSize[i] << "\n");
+		assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+
+		if(WGSize[i]->getType() != Int64Ty) {
+			// If a dimension size is given in any other integer format,
+			// generate code to extend it to i64. We need to use the mapped value in
+			// the new generated function, hence the use of VMap
+			// FIXME: Why are we changing the kernel WGSize vector here?
+			DEBUG(errs() << "Not i64. Zero extend required.\n");
+			DEBUG(errs() << *WGSize[i] << "\n");
+			CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+			DEBUG(errs() << "Bitcast done.\n");
+			StoreInst* SI = new StoreInst(CI, nextDim, IB);
+			DEBUG(errs() << "Zero extend done.\n");
+			DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+		} else {
+			// Store the value representing work group size in ith dimension on
+			// stack
+			StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
+
+			DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+		}
+		if(i+1 < WGSize.size()) {
+			// Move to next dimension
+			GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
+					ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
+					WG->getName()+"."+Twine(i+1),
+					IB);
+			DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+			nextDim = GEP;
+		}
+	}
+	return WGPtr;
 
 }
 
 // Get generated PTX binary name
 static std::string getPTXFilename(const Module& M) {
-  std::string moduleID = M.getModuleIdentifier();
-  moduleID.append(".kernels.cl");
-  return moduleID;
+	std::string moduleID = M.getModuleIdentifier();
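+	// e.g. a module identified as "foo.ll" yields the kernel file "foo.ll.kernels.cl"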
+	moduleID.append(".kernels.cl");
+	return moduleID;
 }
 
 // Get the name of the input file from module ID
 static std::string getFilenameFromModule(const Module& M) {
-  std::string moduleID = M.getModuleIdentifier();
-  return moduleID.substr(moduleID.find_last_of("/")+1);
+	std::string moduleID = M.getModuleIdentifier();
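+	// Strip any leading directory components, e.g. "/path/to/foo.ll" -> "foo.ll"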
+	return moduleID.substr(moduleID.find_last_of("/")+1);
 }
 
 // Changes the data layout of the Module to be compiled with NVPTX backend
 // TODO: Figure out when to call it, probably after duplicating the modules
 static void changeDataLayout(Module &M) {
-  std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
-  std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
+	std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+	std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
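+	// The two strings differ only in pointer width: the nvptx32 layout pins
+	// pointers to 32 bits, while the nvptx64 layout keeps the default 64-bit
+	// pointers.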
 
-  if (TARGET_PTX == 32)
-    M.setDataLayout(StringRef(nvptx32_layoutStr));
-  else if (TARGET_PTX == 64)
-    M.setDataLayout(StringRef(nvptx64_layoutStr));
-  else assert(false && "Invalid PTX target");
+	if (TARGET_PTX == 32)
+		M.setDataLayout(StringRef(nvptx32_layoutStr));
+	else if (TARGET_PTX == 64)
+		M.setDataLayout(StringRef(nvptx64_layoutStr));
+	else assert(false && "Invalid PTX target");
 
-  return;
+	return;
 }
 
 static void changeTargetTriple(Module &M) {
-  std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
-  std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+	std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+	std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
-  if (TARGET_PTX == 32)
-    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
-  else if (TARGET_PTX == 64)
-    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
-  else assert(false && "Invalid PTX target");
+	if (TARGET_PTX == 32)
+		M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+	else if (TARGET_PTX == 64)
+		M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+	else assert(false && "Invalid PTX target");
 
-  return;
+	return;
 }
 
 // Helper function, populate a vector with all return statements in a function
 static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
-  for (auto &BB : *F) {
-    if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
-	ReturnInstVec.push_back(RI);
-  }	
+	for (auto &BB : *F) {
+		if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+			ReturnInstVec.push_back(RI);
+	}
 }
 
 // Helper function, populate a vector with all IntrinsicID intrinsics in a function
 static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
-  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-    Instruction *I = &(*i);
-    IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-    if (II && II->getIntrinsicID() == IntrinsicID) {
-      IntrinsicInstVec.push_back(II);
-    }
-  }
+	for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+		Instruction *I = &(*i);
+		IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+		if (II && II->getIntrinsicID() == IntrinsicID) {
+			IntrinsicInstVec.push_back(II);
+		}
+	}
 }
 
 // Helper function, returns the atomicrmw op corresponding to the intrinsic atomic op
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
-  switch(ID) {
-  case Intrinsic::visc_atomic_add:
-    return AtomicRMWInst::Add;
-  case Intrinsic::visc_atomic_sub:
-    return AtomicRMWInst::Sub;
-  case Intrinsic::visc_atomic_min:
-    return AtomicRMWInst::Min;
-  case Intrinsic::visc_atomic_umin:
-    return AtomicRMWInst::UMin;
-  case Intrinsic::visc_atomic_max:
-    return AtomicRMWInst::Max;
-  case Intrinsic::visc_atomic_umax:
-    return AtomicRMWInst::UMax;
-  //case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
-  //case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
-  case Intrinsic::visc_atomic_xchg:
-    return AtomicRMWInst::Xchg;
-  case Intrinsic::visc_atomic_and:
-    return AtomicRMWInst::And;
-  case Intrinsic::visc_atomic_or:
-    return AtomicRMWInst::Or;
-  case Intrinsic::visc_atomic_xor:
-    return AtomicRMWInst::Xor;
-  default:
-    llvm_unreachable("Unsupported atomic intrinsic!");
-  };
+	switch(ID) {
+		case Intrinsic::visc_atomic_add:
+			return AtomicRMWInst::Add;
+		case Intrinsic::visc_atomic_sub:
+			return AtomicRMWInst::Sub;
+		case Intrinsic::visc_atomic_min:
+			return AtomicRMWInst::Min;
+		case Intrinsic::visc_atomic_umin:
+			return AtomicRMWInst::UMin;
+		case Intrinsic::visc_atomic_max:
+			return AtomicRMWInst::Max;
+		case Intrinsic::visc_atomic_umax:
+			return AtomicRMWInst::UMax;
+			//case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
+			//case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
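+			// Note: there is no AtomicRMWInst::BinOp for visc_atomic_inc/dec;
+			// getAtomicOpName() below maps them to the OpenCL atom_inc/atom_dec
+			// builtins instead.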
+		case Intrinsic::visc_atomic_xchg:
+			return AtomicRMWInst::Xchg;
+		case Intrinsic::visc_atomic_and:
+			return AtomicRMWInst::And;
+		case Intrinsic::visc_atomic_or:
+			return AtomicRMWInst::Or;
+		case Intrinsic::visc_atomic_xor:
+			return AtomicRMWInst::Xor;
+		default:
+			llvm_unreachable("Unsupported atomic intrinsic!");
+	};
 }
 
 
 // Helper function, returns the OpenCL function name corresponding to the atomic op
 static std::string getAtomicOpName(Intrinsic::ID ID) {
-  switch(ID) {
-  case Intrinsic::visc_atomic_cmpxchg:
-    return "atom_cmpxchg";
-  case Intrinsic::visc_atomic_add:
-    return "atom_add";
-  case Intrinsic::visc_atomic_sub:
-    return "atom_sub";
-  case Intrinsic::visc_atomic_min:
-    return "atom_min";
-  case Intrinsic::visc_atomic_max:
-    return "atom_max";
-  case Intrinsic::visc_atomic_inc:
-    return "atom_inc";
-  case Intrinsic::visc_atomic_dec:
-    return "atom_dec";
-  case Intrinsic::visc_atomic_xchg:
-    return "atom_xchg";
-  case Intrinsic::visc_atomic_and:
-    return "atom_and";
-  case Intrinsic::visc_atomic_or:
-    return "atom_or";
-  case Intrinsic::visc_atomic_xor:
-    return "atom_xor";
-  default:
-    llvm_unreachable("Unsupported atomic intrinsic!");
-  };
+	switch(ID) {
+		case Intrinsic::visc_atomic_cmpxchg:
+			return "atom_cmpxchg";
+		case Intrinsic::visc_atomic_add:
+			return "atom_add";
+		case Intrinsic::visc_atomic_sub:
+			return "atom_sub";
+		case Intrinsic::visc_atomic_min:
+			return "atom_min";
+		case Intrinsic::visc_atomic_max:
+			return "atom_max";
+		case Intrinsic::visc_atomic_inc:
+			return "atom_inc";
+		case Intrinsic::visc_atomic_dec:
+			return "atom_dec";
+		case Intrinsic::visc_atomic_xchg:
+			return "atom_xchg";
+		case Intrinsic::visc_atomic_and:
+			return "atom_and";
+		case Intrinsic::visc_atomic_or:
+			return "atom_or";
+		case Intrinsic::visc_atomic_xor:
+			return "atom_xor";
+		default:
+			llvm_unreachable("Unsupported atomic intrinsic!");
+	};
 }
 
 } // End of namespace
 
 char DFG2LLVM_NVPTX::ID = 0;
 static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx",
-                                      "Dataflow Graph to LLVM for NVPTX Pass",
-                                      false /* does not modify the CFG */,
-                                      true /* transformation,   *
-                                            * not just analysis */);
-
+		"Dataflow Graph to LLVM for NVPTX Pass",
+		false /* does not modify the CFG */,
+		true  /* transformation, not just analysis */);
 
-- 
GitLab