diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index cea4ff3eb355686dd28c9b2011d5347f2faa67c5..02b45860469d49d4b419d0a8329e889d917af19a 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -1400,715 +1400,700 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         assert(Ptr->getType()->isPointerTy()
                && "First argument of supported atomics is expected to be a pointer");
         PointerType* PtrTy = cast<PointerType>(Ptr->getType());
-				std::string name;
-        if(PtrTy == Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) {
-          if(II->getIntrinsicID() == Intrinsic::visc_atomic_add)
-            name = "atomic_add";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
-            name = "atomic_sub";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
-            name = "atomic_xchg";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min)
-            name = "atomic_min";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max)
-            name = "atomic_max";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and)
-            name = "atomic_and";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or)
-            name = "atomic_or";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
-            name = "atomic_xor";
-        } else {
-          assert(PtrTy == Type::getInt64PtrTy(II->getContext(), PtrTy->getAddressSpace()) && "Invalid pointer type");
-          if(II->getIntrinsicID() == Intrinsic::visc_atomic_add)
-            name = "atom_add";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
-            name = "atom_sub";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
-            name = "atom_xchg";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min)
-            name = "atom_min";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max)
-            name = "atom_max";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and)
-            name = "atom_and";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or)
-            name = "atom_or";
-          else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
-            name = "atom_xor";
-        }
-        Type* paramTypes[] = {PtrTy, Val->getType()};
-				FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false);	
-			  FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT);				
-			  
-        Value* Params[] = {Ptr, Val};
-        CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II);
-        DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
-        II->replaceAllUsesWith(AtomCI);
-        IItoRemove.push_back(II);
-      }
-      break;
-      default:
-      llvm_unreachable("Unknown VISC Intrinsic!");
-      break;
-      }
-
-    }
-    else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
-      IRBuilder<> Builder(I);
-      Value *Source = MemCpyI->getSource();
-      Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
-      Value *Length = MemCpyI->getOperand(2);
-      DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
-      DEBUG(errs() << "Source: " << *Source << "\n"); 
-      DEBUG(errs() << "Destination: " << *Destination << "\n"); 
-      DEBUG(errs() << "Length: " << *Length << "\n");
-
-      size_t memcpy_length;
-      unsigned int memcpy_count;
-      if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) {
-        if (CI->getBitWidth() <= 64) {
-          memcpy_length = CI->getSExtValue();
-          DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
-          Type *Source_Type = Source->getType()->getPointerElementType();
-          DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
-          memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
-          DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
-          if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) {
-            if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) {
-              Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
-              Value *DestPtrOperand = destGEPI->getPointerOperand();
-              for(int i = 0; i < memcpy_count; ++i) {
-                Constant *increment;
-                LoadInst *newLoadI;
-                StoreInst *newStoreI;
-                // First, need to increment the correct index for both source and dest 
-                // This invluves checking to see how many indeces the GEP has
-                // Assume for now only 1 or 2 are the viable options.
-
-                std::vector<Value*> GEPlIndex;
-                if (sourceGEPI->getNumIndices() == 1) {
-                  Value *Index = sourceGEPI->getOperand(1);      
-                  increment = ConstantInt::get(Index->getType(), i, false);
-                  Value *incAdd = Builder.CreateAdd(Index, increment);
-                  DEBUG(errs() << "Add: " << *incAdd << "\n");
-                  GEPlIndex.push_back(incAdd);
-                  Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex));
-                  DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
-                  newLoadI = Builder.CreateLoad(newGEPIl);
-                  DEBUG(errs() << "Load: " << *newLoadI << "\n");
-                } else { 
-                  llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n");
-                }
-
-
-                std::vector<Value*> GEPsIndex;
-                if (destGEPI->getNumIndices() == 1) {
-
-                } else if (destGEPI->getNumIndices() == 2) {
-                  Value *Index0 = destGEPI->getOperand(1);      
-                  GEPsIndex.push_back(Index0);
-                  Value *Index1 = destGEPI->getOperand(2);      
-                  increment = ConstantInt::get(Index1->getType(), i, false);
-                  Value *incAdd = Builder.CreateAdd(Index1, increment);
-                  DEBUG(errs() << "Add: " << *incAdd << "\n");
-                  GEPsIndex.push_back(incAdd);
-                  Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex));
-                  DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
-                  newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile());
-                  DEBUG(errs() << "Store: " << *newStoreI << "\n");
-                } else {
-                  llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n");
-                }
-              }
-              IItoRemove.push_back(sourceGEPI);
-              IItoRemove.push_back(destGEPI);
-              Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
-              Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
-              IItoRemove.push_back(destBitcastI);
-              IItoRemove.push_back(sourceBitcastI);
-              IItoRemove.push_back(MemCpyI);
-            }
-          }
-
-        }
-      } else {
-        llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
-      }
-      //      llvm_unreachable("HERE!");
-    }
-
-    else if(CallInst* CI = dyn_cast<CallInst>(I)) {
-      DEBUG(errs() << "Found a call: " << *CI << "\n");
-      Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
-      if(calleeF->isDeclaration()) {
-        // Add the declaration to kernel module
-        if (calleeF->getName() == "sqrtf") {
-          calleeF->setName(Twine("sqrt"));
-          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-          DEBUG(errs() << "CI: " << *CI << "\n");
-        } else if (calleeF->getName() == "rsqrtf") {
-          calleeF->setName(Twine("rsqrt"));
-          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-          DEBUG(errs() << "CI: " << *CI << "\n");
-        }  
-        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
-        KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
-      }
-      else {
-        // Check if the called function has already been cloned before.
-        Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
-        // Iterate over the new function to see if it calls any other functions
-        // in the module.
-        for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
-          if(auto *Call = dyn_cast<CallInst>(&*i)) {
-            Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
-            CloneAndReplaceCall(Call, CalledFunc);
-          }
-        }
-      }
-      //TODO: how to handle address space qualifiers in load/store
-    }
-
-  }
-  // search for pattern where float is being casted to int and loaded/stored and change it.	
-  DEBUG(errs() << "finding pattern for replacement!\n");
-  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
-    bool cont = false;
-    bool keepGEPI = false;
-    bool keepGEPI2= false;
-    Instruction *I = &(*i);
-    GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I);
-
-    if (!GEPI) {
-      // did nod find pattern start, continue
-      continue;
-    }
-    // may have found pattern, check
-    DEBUG(errs() << "GEPI " << *GEPI << "\n");
-    // print whatever we want for debug
-    Value* PtrOp = GEPI->getPointerOperand();
-    Type *SrcTy = GEPI->getSourceElementType();
-    unsigned GEPIaddrspace = GEPI->getAddressSpace();
-
-    if (SrcTy->isArrayTy()) 
-      DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n");
-    else
-      DEBUG(errs() << *SrcTy << " is not an array type!\n");
-    // check that source element type is float
-    if (SrcTy->isArrayTy()) {
-      if (!(SrcTy->getArrayElementType()->isFloatTy())) {
-        DEBUG(errs() << "GEPI type is array but not float!\n");
-        continue;
-      }
-    }
-    else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) {
-      DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
-      // does not fit this pattern - no float GEP instruction
-      continue;
-    }
-    // check that addressspace is 1
-    //	  if (GEPIaddrspace != 1) {
-    //			// does not fit this pattern - addrspace of pointer argument is not global
-    //			continue;
-    //		}
-    if (!(GEPI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      //continue;
-      // Keep GEPI around if it has other uses
-      keepGEPI = true;
-    }
-    DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
-
-    // 1st GEPI it has one use
-    //		assert(GEPI->hasOneUse() && "GEPI has a single use");
-
-    // See if it is a bitcast
-    BitCastInst *BitCastI;
-    for (User * U : GEPI->users()) {
-      if(Instruction *ui = dyn_cast<Instruction> (U)) { 
-        DEBUG(errs() << "--" << *ui << "\n");
-        if (isa<BitCastInst>(ui)) {
-          BitCastI = dyn_cast<BitCastInst>(ui);
-          DEBUG(errs() << "---Found bitcast as only use of GEP\n");
-          break;
-        }
-      }
-      DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
-      cont = true;
-    }
-    //		for (Value::user_iterator ui = GEPI->user_begin(),
-    //				ue = GEPI->user_end(); ui!=ue; ++ui) {
-    //        DEBUG(errs() << "--" << *ui << "\n");
-    //			if (isa<BitCastInst>(*ui)) {
-    //				BitCastI = dyn_cast<BitCastInst>(*ui);
-    //        DEBUG(errs() << "Found bitcast as only use of GEP\n");
-    //			}
-    //		}
-
-    if (cont/*!BitCastI*/) {
-      continue; // not in pattern
-    }
-
-    //    DEBUG(errs() << *BitCastI << "\n");
-    // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP.
-    Value *Op2 = BitCastI->getOperand(0);
-    DEBUG(errs() << "----" << *Op2 << "\n");
-    //		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
-    //		Type *OpTy = cast<Type>(Op2);
-    Type *OpTy = BitCastI->getDestTy();
-    DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
-    //    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n");
-    if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
-      // maybe right syntax is (Type::getInt32Ty)->getPointerTo()
-      continue; // not in pattern
-    }
-
-    DEBUG(errs() << "----Here!\n");
-    // We are in GEP, bitcast.
-
-    // user_iterator, to find the load.
-
-    if (!(BitCastI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      continue;
-    }
-    DEBUG(errs() << "----Bitcast has one use!\n");
-    // it has one use
-    assert(BitCastI->hasOneUse() && "BitCastI has a single use");
-    LoadInst *LoadI;
-    for (User * U : BitCastI->users()) { 
-      if (Instruction *ui = dyn_cast<Instruction> (U)) {
-        DEBUG(errs() << "-----" << *ui << "\n");
-        if (isa<LoadInst>(ui)) {
-          LoadI = dyn_cast<LoadInst>(ui);
-          DEBUG(errs() << "-----Found load as only use of bitcast\n");
-          break;
+ 
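+       // The helper declarations below are all named "atomic_*" and take an i32
+       // pointer, so any other pointer type (e.g. i64*) is first cast to i32* in
+       // the same address space.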
+       if(PtrTy != Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) {
+          Ptr = CastInst::CreatePointerCast(Ptr, Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()), "", II);
         }
-      }
-      DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
-      cont = true;
-    }
-    //		for (Value::user_iterator ui = BitCastI->user_begin(),
-    //				ue = BitCastI->user_end(); ui!=ue; ++ui) {
-    //			if (isa<LoadInst>(*ui)) {
-    //				LoadI = dyn_cast<LoadInst>(*ui);
-    //        errs() << "Found load as only use of bitcast\n";
-    //			}
-    //		}
-
-    if (cont) {
-      continue; // not in pattern
-    }
-
-    DEBUG("HERE!\n");
-    // check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from
-    assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n");
-
-    // Copy user_iterator, to find the store.
-
-    if (!(LoadI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      continue;
-      // TODO: generalize: one load can have more than one store users
-    }
-
-    // it has one use
-    assert(LoadI->hasOneUse() && "LoadI has a single use");
-    Value::user_iterator ui = LoadI->user_begin();
-    // skipped loop, because is has a single use
-    StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
-    if (!StoreI) {
-      continue; // not in pattern
-    }
-
-    // Also check that the store uses the loaded value as the value operand
-    if (StoreI->getValueOperand() != LoadI) {
-      continue;
-    }
-
-    DEBUG(errs() << "-------Found store instruction\n");
-
-    // Look for its bitcast, which is its pointer operand
-    Value *StPtrOp = StoreI->getPointerOperand();
-    DEBUG(errs() << "-------" << *StPtrOp << "\n");
-    BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
-    DEBUG(errs() << "-------" << *BitCastI2 << "\n");
-    if (!BitCastI2) {
-      continue; //not in pattern
-    }
-
-    DEBUG(errs() << "-------- Found Bit Cast of store!\n" );
-    // found bitcast. Look for the second GEP, its from operand.
-    Value *BCFromOp = BitCastI2->getOperand(0);
-    GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
-    DEBUG(errs() << "---------- " << *GEPI2 << "\n");
-    if (!GEPI2) {
-      continue; //not in pattern
-    }
-
-    if (!(GEPI2->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      //continue;
-      // Keep GEPI around if it has other uses
-      keepGEPI2 = true;
-    }
-    DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); 
-
-    Value *PtrOp2 = GEPI2->getPointerOperand();
-
-    // Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above.
 
-    // Assume we found pattern
-    if (!keepGEPI) {  
-      IItoRemove.push_back(GEPI);
-      DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
-    } else {
-      DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
-    }
-    IItoRemove.push_back(BitCastI);
-    DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
-    IItoRemove.push_back(LoadI);
-    DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
-    IItoRemove.push_back(GEPI2);
-    DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
-    IItoRemove.push_back(BitCastI2);
-    DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
-    if (!keepGEPI2) {
-      IItoRemove.push_back(StoreI);
-      DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
-    } else {
-
-      DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n");
-    }
-
-    std::vector<Value*> GEPlIndex;
-    if (GEPI->hasIndices()) {
-      for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
-        Value *Index = dyn_cast<Value>(&*ii);
-        DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
-        GEPlIndex.push_back(Index);
-      }
-    }
-    //    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
-
-    std::vector<Value*> GEPsIndex;
-    if (GEPI2->hasIndices()) {
-      for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
-        Value *Index = dyn_cast<Value>(&*ii);
-        DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
-        GEPsIndex.push_back(Index);
-      }
-    }
-    //    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
-
-
-
-    //    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
-    GetElementPtrInst* newlGEP =
-      GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()),
-          PtrOp, // operand from 1st GEP
-          ArrayRef<Value*>(GEPlIndex),
-          Twine(),
-          StoreI);
-    DEBUG(errs() << "Adding: " << *newlGEP << "\n");
-    // insert load before GEPI
-    LoadInst *newLoadI =
-      new LoadInst(Type::getFloatTy(M.getContext()),
-          newlGEP, // new GEP
-          Twine(),
-          LoadI->isVolatile(),
-          LoadI->getAlignment(),
-          LoadI->getOrdering(),
-          LoadI->getSyncScopeID(),
-          StoreI);
-    DEBUG(errs() << "Adding: " << *newLoadI << "\n");
-    // same for GEP for store, for store operand
-    GetElementPtrInst* newsGEP =
-      GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
-          PtrOp2, // operand from 2nd GEP
-          ArrayRef<Value*>(GEPsIndex),
-          Twine(),
-          StoreI);
-    DEBUG(errs() << "Adding: " << *newsGEP << "\n");
-    // insert store before GEPI
-    StoreInst *newStoreI =
-      new StoreInst(newLoadI,
-          newsGEP, // new GEP
-          StoreI->isVolatile(),
-          StoreI->getAlignment(),
-          StoreI->getOrdering(),
-          StoreI->getSyncScopeID(),
-          StoreI);
-    DEBUG(errs() << "Adding: " << *newStoreI << "\n");
-
-  }
-
-  // We need to do this explicitly: DCE pass will not remove them because we
-  // have assumed theworst memory behaviour for these function calls
-  // Traverse the vector backwards, otherwise definitions are deleted while
-  // their subsequent uses are still around
-  for (auto *I : reverse(IItoRemove)) {
-    DEBUG(errs() << "Erasing: " << *I << "\n");
-    I->eraseFromParent();
-  }
-
-  // Removed the cloned functions from the parent module into the new module 
-  for(auto *F : FuncToBeRemoved) {
-    F->removeFromParent(); //TODO: MARIA check
-    KernelM->getFunctionList().push_back(F);
-  }
-
-  addCLMetadata(F_nvptx);
-  kernel->KernelFunction = F_nvptx;
-  errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
-  DEBUG(errs() << *KernelM);
-
-  return;
+			 std::string name;
+			 if(II->getIntrinsicID() == Intrinsic::visc_atomic_add)
+				 name = "atomic_add";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
+				 name = "atomic_sub";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
+				 name = "atomic_xchg";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min)
+				 name = "atomic_min";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max)
+				 name = "atomic_max";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and)
+				 name = "atomic_and";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or)
+				 name = "atomic_or";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
+				 name = "atomic_xor";
+			 Type* paramTypes[] = {Ptr->getType(), Val->getType()};
+			 FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false);	
+			 FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT);				
+
+			 Value* Params[] = {Ptr, Val};
+			 CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II);
+			 DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
+			 II->replaceAllUsesWith(AtomCI);
+			 IItoRemove.push_back(II);
+			}
+			break;
+			default:
+			llvm_unreachable("Unknown VISC Intrinsic!");
+			break;
+			}
+
+		}
+		else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
+			IRBuilder<> Builder(I);
+			Value *Source = MemCpyI->getSource();
+			Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
+			Value *Length = MemCpyI->getOperand(2);
+			DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
+			DEBUG(errs() << "Source: " << *Source << "\n"); 
+			DEBUG(errs() << "Destination: " << *Destination << "\n"); 
+			DEBUG(errs() << "Length: " << *Length << "\n");
+
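+			// A memcpy with a constant length is unrolled below into per-element
+			// load/store pairs through explicit GEPs; non-constant lengths are not
+			// handled.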
+			size_t memcpy_length;
+			unsigned int memcpy_count;
+			if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) {
+				if (CI->getBitWidth() <= 64) {
+					memcpy_length = CI->getSExtValue();
+					DEBUG(errs() << "Memcpy length = " << memcpy_length << "\n");
+					Type *Source_Type = Source->getType()->getPointerElementType();
+					DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
+					memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
+					DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
+					if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) {
+						if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) {
+							Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
+							Value *DestPtrOperand = destGEPI->getPointerOperand();
+							for(int i = 0; i < memcpy_count; ++i) {
+								Constant *increment;
+								LoadInst *newLoadI;
+								StoreInst *newStoreI;
+								// First, we need to increment the correct index for both source and dest.
+								// This involves checking how many indices the GEP has.
+								// Assume for now that only 1 or 2 are the viable options.
+
+								std::vector<Value*> GEPlIndex;
+								if (sourceGEPI->getNumIndices() == 1) {
+									Value *Index = sourceGEPI->getOperand(1);      
+									increment = ConstantInt::get(Index->getType(), i, false);
+									Value *incAdd = Builder.CreateAdd(Index, increment);
+									DEBUG(errs() << "Add: " << *incAdd << "\n");
+									GEPlIndex.push_back(incAdd);
+									Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex));
+									DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
+									newLoadI = Builder.CreateLoad(newGEPIl);
+									DEBUG(errs() << "Load: " << *newLoadI << "\n");
+								} else { 
+									llvm_unreachable("Unhandled case where source GEPI has more than 1 index!\n");
+								}
+
+
+								std::vector<Value*> GEPsIndex;
+								if (destGEPI->getNumIndices() == 1) {
+
+								} else if (destGEPI->getNumIndices() == 2) {
+									Value *Index0 = destGEPI->getOperand(1);      
+									GEPsIndex.push_back(Index0);
+									Value *Index1 = destGEPI->getOperand(2);      
+									increment = ConstantInt::get(Index1->getType(), i, false);
+									Value *incAdd = Builder.CreateAdd(Index1, increment);
+									DEBUG(errs() << "Add: " << *incAdd << "\n");
+									GEPsIndex.push_back(incAdd);
+									Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex));
+									DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
+									newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile());
+									DEBUG(errs() << "Store: " << *newStoreI << "\n");
+								} else {
+									llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n");
+								}
+							}
+							IItoRemove.push_back(sourceGEPI);
+							IItoRemove.push_back(destGEPI);
+							Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
+							Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
+							IItoRemove.push_back(destBitcastI);
+							IItoRemove.push_back(sourceBitcastI);
+							IItoRemove.push_back(MemCpyI);
+						}
+					}
+
+				}
+			} else {
+				llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
+			}
+			//      llvm_unreachable("HERE!");
+		}
+
+		else if(CallInst* CI = dyn_cast<CallInst>(I)) {
+			DEBUG(errs() << "Found a call: " << *CI << "\n");
+			Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
+			if(calleeF->isDeclaration()) {
+				// Add the declaration to kernel module
+				if (calleeF->getName() == "sqrtf") {
+					calleeF->setName(Twine("sqrt"));
+					DEBUG(errs() << "CalleeF: " << *calleeF << "\n");
+					DEBUG(errs() << "CI: " << *CI << "\n");
+				} else if (calleeF->getName() == "rsqrtf") {
+					calleeF->setName(Twine("rsqrt"));
+					DEBUG(errs() << "CalleeF: " << *calleeF << "\n");
+					DEBUG(errs() << "CI: " << *CI << "\n");
+				}  
+				DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
+				KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
+			}
+			else {
+				// Check if the called function has already been cloned before.
+				Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
+				// Iterate over the new function to see if it calls any other functions
+				// in the module.
+				for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
+					if(auto *Call = dyn_cast<CallInst>(&*i)) {
+						Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
+						CloneAndReplaceCall(Call, CalledFunc);
+					}
+				}
+			}
+			//TODO: how to handle address space qualifiers in load/store
+		}
+
+	}
+	// Search for the pattern where a float is bitcast to int before being loaded/stored, and rewrite it.
+	DEBUG(errs() << "finding pattern for replacement!\n");
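+	// The pattern matched below is:
+	//   gep (float elements) -> bitcast to i32* -> load i32 -> store i32 to (bitcast (gep))
+	// and it is rewritten as a direct float gep/load/store chain so the values
+	// are no longer accessed through integer pointers.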
+	for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
+		bool cont = false;
+		bool keepGEPI = false;
+		bool keepGEPI2= false;
+		Instruction *I = &(*i);
+		GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I);
+
+		if (!GEPI) {
+			// did not find pattern start, continue
+			continue;
+		}
+		// may have found pattern, check
+		DEBUG(errs() << "GEPI " << *GEPI << "\n");
+		// print whatever we want for debug
+		Value* PtrOp = GEPI->getPointerOperand();
+		Type *SrcTy = GEPI->getSourceElementType();
+		unsigned GEPIaddrspace = GEPI->getAddressSpace();
+
+		if (SrcTy->isArrayTy()) 
+			DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n");
+		else
+			DEBUG(errs() << *SrcTy << " is not an array type!\n");
+		// check that source element type is float
+		if (SrcTy->isArrayTy()) {
+			if (!(SrcTy->getArrayElementType()->isFloatTy())) {
+				DEBUG(errs() << "GEPI type is array but not float!\n");
+				continue;
+			}
+		}
+		else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) {
+			DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
+			// does not fit this pattern - no float GEP instruction
+			continue;
+		}
+		// check that the address space is 1
+		//	  if (GEPIaddrspace != 1) {
+		//			// does not fit this pattern - addrspace of pointer argument is not global
+		//			continue;
+		//		}
+		if (!(GEPI->hasOneUse())) {
+			// does not fit this pattern - more than one use
+			//continue;
+			// Keep GEPI around if it has other uses
+			keepGEPI = true;
+		}
+		DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
+
+		// 1st GEPI: it has one use
+		//		assert(GEPI->hasOneUse() && "GEPI has a single use");
+
+		// See if it is a bitcast
+		BitCastInst *BitCastI;
+		for (User * U : GEPI->users()) {
+			if(Instruction *ui = dyn_cast<Instruction> (U)) { 
+				DEBUG(errs() << "--" << *ui << "\n");
+				if (isa<BitCastInst>(ui)) {
+					BitCastI = dyn_cast<BitCastInst>(ui);
+					DEBUG(errs() << "---Found bitcast as only use of GEP\n");
+					break;
+				}
+			}
+			DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
+			cont = true;
+		}
+		//		for (Value::user_iterator ui = GEPI->user_begin(),
+		//				ue = GEPI->user_end(); ui!=ue; ++ui) {
+		//        DEBUG(errs() << "--" << *ui << "\n");
+		//			if (isa<BitCastInst>(*ui)) {
+		//				BitCastI = dyn_cast<BitCastInst>(*ui);
+		//        DEBUG(errs() << "Found bitcast as only use of GEP\n");
+		//			}
+		//		}
+
+		if (cont/*!BitCastI*/) {
+			continue; // not in pattern
+		}
+
+		//    DEBUG(errs() << *BitCastI << "\n");
+		// Check that the bitcast destination type is i32*. Its source operand is necessarily the GEP, since we reached the bitcast through a use of the GEP.
+		Value *Op2 = BitCastI->getOperand(0);
+		DEBUG(errs() << "----" << *Op2 << "\n");
+		//		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
+		//		Type *OpTy = cast<Type>(Op2);
+		Type *OpTy = BitCastI->getDestTy();
+		DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
+		//    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n");
+		if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
+			// maybe right syntax is (Type::getInt32Ty)->getPointerTo()
+			continue; // not in pattern
+		}
+
+		DEBUG(errs() << "----Here!\n");
+		// We are in GEP, bitcast.
+
+		// user_iterator, to find the load.
+
+		if (!(BitCastI->hasOneUse())) {
+			// does not fit this pattern - more than one use
+			continue;
+		}
+		DEBUG(errs() << "----Bitcast has one use!\n");
+		// it has one use
+		assert(BitCastI->hasOneUse() && "BitCastI has a single use");
+		LoadInst *LoadI;
+		for (User * U : BitCastI->users()) { 
+			if (Instruction *ui = dyn_cast<Instruction> (U)) {
+				DEBUG(errs() << "-----" << *ui << "\n");
+				if (isa<LoadInst>(ui)) {
+					LoadI = dyn_cast<LoadInst>(ui);
+					DEBUG(errs() << "-----Found load as only use of bitcast\n");
+					break;
+				}
+			}
+			DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
+			cont = true;
+		}
+		//		for (Value::user_iterator ui = BitCastI->user_begin(),
+		//				ue = BitCastI->user_end(); ui!=ue; ++ui) {
+		//			if (isa<LoadInst>(*ui)) {
+		//				LoadI = dyn_cast<LoadInst>(*ui);
+		//        errs() << "Found load as only use of bitcast\n";
+		//			}
+		//		}
+
+		if (cont) {
+			continue; // not in pattern
+		}
+
+		DEBUG(errs() << "HERE!\n");
+		// Check that we load from the pointer we got from the bitcast: the load's pointer operand must be the bitcast we found it through.
+		assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n");
+
+		// Copy user_iterator, to find the store.
+
+		if (!(LoadI->hasOneUse())) {
+			// does not fit this pattern - more than one use
+			continue;
+			// TODO: generalize: one load can have more than one store users
+		}
+
+		// it has one use
+		assert(LoadI->hasOneUse() && "LoadI has a single use");
+		Value::user_iterator ui = LoadI->user_begin();
+		// skipped the loop, because it has a single use
+		StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
+		if (!StoreI) {
+			continue; // not in pattern
+		}
+
+		// Also check that the store uses the loaded value as the value operand
+		if (StoreI->getValueOperand() != LoadI) {
+			continue;
+		}
+
+		DEBUG(errs() << "-------Found store instruction\n");
+
+		// Look for its bitcast, which is its pointer operand
+		Value *StPtrOp = StoreI->getPointerOperand();
+		DEBUG(errs() << "-------" << *StPtrOp << "\n");
+		BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
+		DEBUG(errs() << "-------" << *BitCastI2 << "\n");
+		if (!BitCastI2) {
+			continue; //not in pattern
+		}
+
+		DEBUG(errs() << "-------- Found Bit Cast of store!\n" );
+		// Found the bitcast. Look for the second GEP, which is its source operand.
+		Value *BCFromOp = BitCastI2->getOperand(0);
+		GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
+		DEBUG(errs() << "---------- " << *GEPI2 << "\n");
+		if (!GEPI2) {
+			continue; //not in pattern
+		}
+
+		if (!(GEPI2->hasOneUse())) {
+			// does not fit this pattern - more than one use
+			//continue;
+			// Keep GEPI around if it has other uses
+			keepGEPI2 = true;
+		}
+		DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); 
+
+		Value *PtrOp2 = GEPI2->getPointerOperand();
+
+		// Found GEPI2. TODO: unclear exactly which checks are needed here - the int-float type checks above may already cover them.
+
+		// Assume we found pattern
+		if (!keepGEPI) {  
+			IItoRemove.push_back(GEPI);
+			DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
+		} else {
+			DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
+		}
+		IItoRemove.push_back(BitCastI);
+		DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
+		IItoRemove.push_back(LoadI);
+		DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
+		IItoRemove.push_back(GEPI2);
+		DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
+		IItoRemove.push_back(BitCastI2);
+		DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
+		if (!keepGEPI2) {
+			IItoRemove.push_back(StoreI);
+			DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
+		} else {
+
+			DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n");
+		}
+
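+		// Collect the indices of both GEPs so equivalent GEPs with the original
+		// float element types can be recreated right before the store.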
+		std::vector<Value*> GEPlIndex;
+		if (GEPI->hasIndices()) {
+			for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
+				Value *Index = dyn_cast<Value>(&*ii);
+				DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
+				GEPlIndex.push_back(Index);
+			}
+		}
+		//    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
+
+		std::vector<Value*> GEPsIndex;
+		if (GEPI2->hasIndices()) {
+			for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
+				Value *Index = dyn_cast<Value>(&*ii);
+				DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
+				GEPsIndex.push_back(Index);
+			}
+		}
+		//    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
+
+
+
+		//    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
+		GetElementPtrInst* newlGEP =
+			GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()),
+					PtrOp, // operand from 1st GEP
+					ArrayRef<Value*>(GEPlIndex),
+					Twine(),
+					StoreI);
+		DEBUG(errs() << "Adding: " << *newlGEP << "\n");
+		// insert the new load right before the original store
+		LoadInst *newLoadI =
+			new LoadInst(Type::getFloatTy(M.getContext()),
+					newlGEP, // new GEP
+					Twine(),
+					LoadI->isVolatile(),
+					LoadI->getAlignment(),
+					LoadI->getOrdering(),
+					LoadI->getSyncScopeID(),
+					StoreI);
+		DEBUG(errs() << "Adding: " << *newLoadI << "\n");
+		// same for the store: create a GEP for its pointer operand
+		GetElementPtrInst* newsGEP =
+			GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+					PtrOp2, // operand from 2nd GEP
+					ArrayRef<Value*>(GEPsIndex),
+					Twine(),
+					StoreI);
+		DEBUG(errs() << "Adding: " << *newsGEP << "\n");
+		// insert the new store right before the original store
+		StoreInst *newStoreI =
+			new StoreInst(newLoadI,
+					newsGEP, // new GEP
+					StoreI->isVolatile(),
+					StoreI->getAlignment(),
+					StoreI->getOrdering(),
+					StoreI->getSyncScopeID(),
+					StoreI);
+		DEBUG(errs() << "Adding: " << *newStoreI << "\n");
+
+	}
+
+	// We need to do this explicitly: the DCE pass will not remove them because we
+	// have assumed the worst memory behaviour for these function calls.
+	// Traverse the vector backwards, otherwise definitions are deleted while
+	// their subsequent uses are still around
+	for (auto *I : reverse(IItoRemove)) {
+		DEBUG(errs() << "Erasing: " << *I << "\n");
+		I->eraseFromParent();
+	}
+
+	// Move the cloned functions from the parent module into the new kernel module
+	for(auto *F : FuncToBeRemoved) {
+		F->removeFromParent(); //TODO: MARIA check
+		KernelM->getFunctionList().push_back(F);
+	}
+
+	addCLMetadata(F_nvptx);
+	kernel->KernelFunction = F_nvptx;
+	errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
+	DEBUG(errs() << *KernelM);
+
+	return;
 }
 
 bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
-  errs() << "\nDFG2LLVM_NVPTX PASS\n";
+	errs() << "\nDFG2LLVM_NVPTX PASS\n";
 
-  // Get the BuildDFG Analysis Results:
-  // - Dataflow graph
-  // - Maps from i8* hansles to DFNode and DFEdge
-  BuildDFG &DFG = getAnalysis<BuildDFG>();
+	// Get the BuildDFG Analysis Results:
+	// - Dataflow graph
+	// - Maps from i8* handles to DFNode and DFEdge
+	BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-  // DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode*> Roots = DFG.getRoots();
-  //    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-  //    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+	// DFInternalNode *Root = DFG.getRoot();
+	std::vector<DFInternalNode*> Roots = DFG.getRoots();
+	//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+	//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
-  // Visitor for Code Generation Graph Traversal
-  CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+	// Visitor for Code Generation Graph Traversal
+	CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
 
-  // Iterate over all the DFGs and produce code for each one of them
-  for (auto rootNode: Roots) {
-    // Initiate code generation for root DFNode
-    CGTVisitor->visit(rootNode);
-  }
+	// Iterate over all the DFGs and produce code for each one of them
+	for (auto rootNode: Roots) {
+		// Initiate code generation for root DFNode
+		CGTVisitor->visit(rootNode);
+	}
 
-  CGTVisitor->writeKernelsModule();
+	CGTVisitor->writeKernelsModule();
 
-  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
-  delete CGTVisitor;
+	//TODO: Edit module epilogue to remove the VISC intrinsic declarations
+	delete CGTVisitor;
 
-  return true;
+	return true;
 }
 
 std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
-  /*SmallString<128> currentDir;
-    llvm::sys::fs::current_path(currentDir);
-    std::string fileName = getFilenameFromModule(M);
-    Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-    return output.str().append(".kernels.ll");*/
-  std::string mid = M.getModuleIdentifier();
-  return mid.append(".kernels.ll");
+	/*SmallString<128> currentDir;
+		llvm::sys::fs::current_path(currentDir);
+		std::string fileName = getFilenameFromModule(M);
+		Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+		return output.str().append(".kernels.ll");*/
+	std::string mid = M.getModuleIdentifier();
+	return mid.append(".kernels.ll");
 }
 
 void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) {
-  assert(isa<PointerType>(V->getType())
-      && "Value should be of Pointer Type!");
-  PointerType* OldTy = cast<PointerType>(V->getType());
-  PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-  V->mutateType(NewTy);
-  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
-    // Change all uses producing pointer type in same address space to new
-    // addressspace.
-    if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
-      if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-        fixValueAddrspace(*ui, addrspace);
-      }
-    }
-  }
+	assert(isa<PointerType>(V->getType())
+			&& "Value should be of Pointer Type!");
+	PointerType* OldTy = cast<PointerType>(V->getType());
+	PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+	V->mutateType(NewTy);
+	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
+		// Change all uses producing a pointer type in the same address space to the
+		// new address space.
+		if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
+			if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+				fixValueAddrspace(*ui, addrspace);
+			}
+		}
+	}
 }
 
 
 std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) {
-  std::vector<unsigned> ConstantMemArgs;
-  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    Argument* arg = &*ai; 
-    std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
-        GlobalMemArgs->end(), arg->getArgNo());
-    // It has to be a global memory argument to be promotable
-    if(pos == GlobalMemArgs->end())
-      continue;
-
-    // Check if it can/should be promoted
-    if(canBePromoted(arg, F)) {
-      errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n";
-      ConstantMemArgs.push_back(arg->getArgNo());
-      GlobalMemArgs->erase(pos);
-    }
-  }
-  return ConstantMemArgs;
+	std::vector<unsigned> ConstantMemArgs;
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Argument* arg = &*ai; 
+		std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
+				GlobalMemArgs->end(), arg->getArgNo());
+		// It has to be a global memory argument to be promotable
+		if(pos == GlobalMemArgs->end())
+			continue;
+
+		// Check if it can/should be promoted
+		if(canBePromoted(arg, F)) {
+			errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n";
+			ConstantMemArgs.push_back(arg->getArgNo());
+			GlobalMemArgs->erase(pos);
+		}
+	}
+	return ConstantMemArgs;
 }
 
 Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
-  unsigned idx = 0;
-  std::vector<Type*> ArgTypes;
-  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    Argument *arg = &*ai;
-    DEBUG(errs() << *arg << "\n");
-    unsigned argno = arg->getArgNo();
-    if ((idx < Args.size()) && (argno == Args[idx])) {
-      fixValueAddrspace(arg, addrspace);
-      idx++;
-    }
-    ArgTypes.push_back(arg->getType());
-  }
-  FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
-
-  //F->mutateType(PTy);
-  Function* newF = cloneFunction(F, newFT, false);
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-
-  DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
-  return newF;
+	unsigned idx = 0;
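+	// Args holds the argument numbers whose address space must change (assumed to
+	// be in ascending order); idx tracks our position in that list as we walk the
+	// function arguments.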
+	std::vector<Type*> ArgTypes;
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Argument *arg = &*ai;
+		DEBUG(errs() << *arg << "\n");
+		unsigned argno = arg->getArgNo();
+		if ((idx < Args.size()) && (argno == Args[idx])) {
+			fixValueAddrspace(arg, addrspace);
+			idx++;
+		}
+		ArgTypes.push_back(arg->getType());
+	}
+	FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
+
+	//F->mutateType(PTy);
+	Function* newF = cloneFunction(F, newFT, false);
+	replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+	DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
+	return newF;
 }
 
 /* Add metadata to module KernelM, for OpenCL kernels */
 void CGT_NVPTX::addCLMetadata(Function *F) {
 
-  IRBuilder<> Builder(&*F->begin());
+	IRBuilder<> Builder(&*F->begin());
 
-  SmallVector<Metadata*,8> KernelMD;
-  KernelMD.push_back(ValueAsMetadata::get(F));
+	SmallVector<Metadata*,8> KernelMD;
+	KernelMD.push_back(ValueAsMetadata::get(F));
 
-  // TODO: There is additional metadata used by kernel files but we skip them as
-  // they are not mandatory. In future they might be useful to enable
-  // optimizations
+	// TODO: There is additional metadata used by kernel files, but we skip it as
+	// it is not mandatory. In the future it might be useful for enabling
+	// optimizations.
 
-  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
-  NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
-  MDN_kernels->addOperand(MDKernelNode);
+	MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+	NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
+	MDN_kernels->addOperand(MDKernelNode);
 
-  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
-  // TODO: Replace 1 with the number of the kernel.
-  // Add when support for multiple launces is added
-  KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
-  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
-  NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
-  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+	KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+	// TODO: Replace 1 with the number of the kernel.
+	// Add when support for multiple launches is added
+	KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
+	MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+	NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+	MDN_annotations->addOperand(MDNvvmAnnotationsNode);
 
 }
 
 void CGT_NVPTX::writeKernelsModule() {
 
-  // In addition to deleting all other functions, we also want to spiff it
-  // up a little bit.  Do this now.
-  legacy::PassManager Passes;
+	// In addition to deleting all other functions, we also want to spiff it
+	// up a little bit.  Do this now.
+	legacy::PassManager Passes;
 
-  errs() << "Writing to File --- ";
-  errs() << getKernelsModuleName(M).c_str() << "\n";
-  std::error_code EC;
-  ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
-  if (EC) {
-    errs() << EC.message() << '\n';
-  }
+	errs() << "Writing to File --- ";
+	errs() << getKernelsModuleName(M).c_str() << "\n";
+	std::error_code EC;
+	ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
+	if (EC) {
+		errs() << EC.message() << '\n';
+	}
 
-  Passes.add(
-      createPrintModulePass(Out.os()));
+	Passes.add(
+			createPrintModulePass(Out.os()));
 
-  Passes.run(*KernelM);
+	Passes.run(*KernelM);
 
-  // Declare success.
-  Out.keep();
+	// Declare success.
+	Out.keep();
 }
 
 Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 
-  DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
-  // FIXME: Maybe do that using the Node?
-  StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
-  assert(FRetTy && "Return Type must always be a struct");
-
-  // Keeps return statements, because we will need to replace them
-  std::vector<ReturnInst *> RItoRemove;
-  findReturnInst(F, RItoRemove);
-
-  std::vector<Type *> RetArgTypes;
-  std::vector<Argument*> RetArgs;
-  std::vector<Argument*> Args;
-  // Check for { } return struct, which means that the function returns void
-  if (FRetTy->isEmptyTy()) {
-
-    DEBUG(errs() << "\tFunction output struct is void\n");
-    DEBUG(errs() << "\tNo parameters added\n");
-
-    // Replacing return statements with others returning void
-    for (auto *RI : RItoRemove) {
-      ReturnInst::Create((F->getContext()), 0, RI);
-      RI->eraseFromParent();
-    }
-    DEBUG(errs() << "\tChanged return statements to return void\n");
-  }
-  else {
-    // The struct has return values, thus needs to be converted to parameter
-
-    // Iterate over all element types of return struct and add arguments to the
-    // function
-    for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
-      Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
-      RetArgs.push_back(RetArg);
-      RetArgTypes.push_back(RetArg->getType());
-      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
-    }
-
-    DEBUG(errs() << "\tReplacing Return statements\n");
-    // Replace return statements with extractValue and store instructions
-    for (auto *RI : RItoRemove) {
-      Value* RetVal = RI->getReturnValue();
-      for(unsigned i = 0; i < RetArgs.size(); i++) {
-        ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
-            RetArgs[i]->getName()+".val", RI);
-        new StoreInst(EI, RetArgs[i], RI);
-      }
-      // assert(RetVal && "Return value should not be null at this point");
-      // StructType* RetType = cast<StructType>(RetVal->getType());
-      // assert(RetType && "Return type is not a struct");
-
-      ReturnInst::Create((F->getContext()), 0, RI);
-      RI->eraseFromParent();
-
-    }
-  }
-  DEBUG(errs() << "\tReplaced return statements\n");
-
-  // Create the argument type list with the added argument's type
-  std::vector<Type*> ArgTypes;
-  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    ArgTypes.push_back(ai->getType());
-  }
-  for(auto *RATy: RetArgTypes) {
-    ArgTypes.push_back(RATy);
-  }
-
-  // Creating Args vector to use in cloning!
-  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    Args.push_back(&*ai);
-  }
-  for(auto *ai : RetArgs) {
-    Args.push_back(ai);
-  }
-
-  // Adding new arguments to the function argument list, would not change the
-  // function type. We need to change the type of this function to reflect the
-  // added arguments
-  Type* VoidRetType = Type::getVoidTy(F->getContext());
-  FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-
-  // Change the function type
-  //F->mutateType(PTy);
-  Function* newF = cloneFunction(F, newFT, false, NULL, &Args);
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-  //F->eraseFromParent();
-  return newF;
+	DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
+	// FIXME: Maybe do that using the Node?
+	StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
+	assert(FRetTy && "Return Type must always be a struct");
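+	// Each element of the return struct becomes an extra pointer ("ret_arg")
+	// parameter; returns are rewritten to store through those parameters and then
+	// return void.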
+
+	// Keep the return statements, because we will need to replace them
+	std::vector<ReturnInst *> RItoRemove;
+	findReturnInst(F, RItoRemove);
+
+	std::vector<Type *> RetArgTypes;
+	std::vector<Argument*> RetArgs;
+	std::vector<Argument*> Args;
+	// Check for { } return struct, which means that the function returns void
+	if (FRetTy->isEmptyTy()) {
+
+		DEBUG(errs() << "\tFunction output struct is void\n");
+		DEBUG(errs() << "\tNo parameters added\n");
+
+		// Replacing return statements with others returning void
+		for (auto *RI : RItoRemove) {
+			ReturnInst::Create((F->getContext()), 0, RI);
+			RI->eraseFromParent();
+		}
+		DEBUG(errs() << "\tChanged return statements to return void\n");
+	}
+	else {
+		// The struct has return values, so they need to be converted to parameters
+
+		// Iterate over all element types of return struct and add arguments to the
+		// function
+		for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
+			Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+			RetArgs.push_back(RetArg);
+			RetArgTypes.push_back(RetArg->getType());
+			DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+		}
+
+		DEBUG(errs() << "\tReplacing Return statements\n");
+		// Replace return statements with extractValue and store instructions
+		for (auto *RI : RItoRemove) {
+			Value* RetVal = RI->getReturnValue();
+			for(unsigned i = 0; i < RetArgs.size(); i++) {
+				ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
+						RetArgs[i]->getName()+".val", RI);
+				new StoreInst(EI, RetArgs[i], RI);
+			}
+			// assert(RetVal && "Return value should not be null at this point");
+			// StructType* RetType = cast<StructType>(RetVal->getType());
+			// assert(RetType && "Return type is not a struct");
+
+			ReturnInst::Create((F->getContext()), 0, RI);
+			RI->eraseFromParent();
+
+		}
+	}
+	DEBUG(errs() << "\tReplaced return statements\n");
+
+	// Create the argument type list with the added argument's type
+	std::vector<Type*> ArgTypes;
+	for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		ArgTypes.push_back(ai->getType());
+	}
+	for(auto *RATy: RetArgTypes) {
+		ArgTypes.push_back(RATy);
+	}
+
+	// Creating Args vector to use in cloning!
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Args.push_back(&*ai);
+	}
+	for(auto *ai : RetArgs) {
+		Args.push_back(ai);
+	}
+
+	// Adding new arguments to the function argument list does not change the
+	// function type. We need to change the type of this function to reflect the
+	// added arguments.
+	Type* VoidRetType = Type::getVoidTy(F->getContext());
+	FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+	// Change the function type
+	//F->mutateType(PTy);
+	Function* newF = cloneFunction(F, newFT, false, NULL, &Args);
+	replaceNodeFunctionInIR(*F->getParent(), F, newF);
+	//F->eraseFromParent();
+	return newF;
 }
 
 /******************************************************************************
@@ -2120,333 +2105,333 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 // 2. Loads not dependent on getNodeInstanceID itrinsic
 
 static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) {
-  if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
-    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-    return false;
-  }
-  VisitedList->push_back(V);
-  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
-      ui != ue; ++ui) {
-    Instruction* I = dyn_cast<Instruction>(*ui);
-    if(!I) {
-      // if use is not an instruction, then skip it
-      continue;
-    }
-    DEBUG(errs() << "\t" << *I << "\n");
-    if(isa<LoadInst>(I)) {
-      DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
-      DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
-      UseList->push_back(V);
-    }
-    else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
-      // found a store in use chain
-      DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
-      return true;
-    }
-    else if(BuildDFG::isViscIntrinsic(I)) {
-      // If it is an atomic intrinsic, we found a store
-      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-      assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
-          && "Only visc atomic intrinsics can have an argument as input");
-      return true;
-    }
-    else {
-      DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
-      if(findLoadStoreUses(I, UseList, VisitedList))
-        return true;
-    }
-  }
-  return false;
+	if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
+		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+		return false;
+	}
+	VisitedList->push_back(V);
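+	// Walk the users of V recursively: a load records V as a candidate use, while
+	// a store, atomicrmw, or visc atomic intrinsic reports that V may be written.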
+	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
+			ui != ue; ++ui) {
+		Instruction* I = dyn_cast<Instruction>(*ui);
+		if(!I) {
+			// if use is not an instruction, then skip it
+			continue;
+		}
+		DEBUG(errs() << "\t" << *I << "\n");
+		if(isa<LoadInst>(I)) {
+			DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
+			DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
+			UseList->push_back(V);
+		}
+		else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
+			// found a store in use chain
+			DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
+			return true;
+		}
+		else if(BuildDFG::isViscIntrinsic(I)) {
+			// If it is an atomic intrinsic, we found a store
+			IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+			assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
+					&& "Only visc atomic intrinsics can have an argument as input");
+			return true;
+		}
+		else {
+			DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
+			if(findLoadStoreUses(I, UseList, VisitedList))
+				return true;
+		}
+	}
+	return false;
 }
 
 static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) {
-  if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
-    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-    return false;
-  }
-  DependenceList->push_back(V);
-  // If not an instruction, then not dependent on node instance id
-  if(!isa<Instruction>(V) || isa<Constant>(V)) {
-    DEBUG(errs() << "\tStop\n");
-    return false;
-  }
-
-  Instruction* I = cast<Instruction>(V);
-  for(unsigned i = 0; i < I->getNumOperands(); i++) {
-    Value* operand = I->getOperand(i);
-    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
-      if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
-            || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
-            || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
-        Value* Node = II->getArgOperand(0);
-        IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
-        assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
-        if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
-          DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
-          return true;
-        }
-      }
-    }
-    if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
-      DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
-      continue;
-    }
-    DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
-    if(isDependentOnNodeInstanceID(operand, DependenceList)) {
-      return true;
-    }
-  }
-  return false;
+	if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
+		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+		return false;
+	}
+	DependenceList->push_back(V);
+	// If not an instruction, then not dependent on node instance id
+	if(!isa<Instruction>(V) || isa<Constant>(V)) {
+		DEBUG(errs() << "\tStop\n");
+		return false;
+	}
+
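+	// Illustrative example: if an operand is (transitively) computed from
+	//   call @llvm.visc.getNodeInstanceID.x(node), with node = call @llvm.visc.getNode(...)
+	// then V depends on the node instance ID and we return true.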
+	Instruction* I = cast<Instruction>(V);
+	for(unsigned i = 0; i < I->getNumOperands(); i++) {
+		Value* operand = I->getOperand(i);
+		if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
+			if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
+						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
+						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
+				Value* Node = II->getArgOperand(0);
+				IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
+				assert(GN && "NodeInstanceID operand should be a node/parent node intrinsic\n");
+				if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
+					DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
+					return true;
+				}
+			}
+		}
+		if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
+			DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
+			continue;
+		}
+		DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
+		if(isDependentOnNodeInstanceID(operand, DependenceList)) {
+			return true;
+		}
+	}
+	return false;
 }
 
 // Function to check if argument arg can be changed to a constant memory pointer
 static bool canBePromoted(Argument* arg, Function* F) {
-  DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
-  std::vector<Value*> UseList;
-  std::vector<Value*> VisitedList;
-  // recursively traverse use chain
-  // if find a store instruction return false, everything fails, cannot be
-  // promoted
-  // if find a load instruction as use, add the GEP instruction to list
-  bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
-  if(foundStore == true)
-    return false;
-  // See that the GEP instructions are not dependent on getNodeInstanceID
-  // intrinsic
-  DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
-  std::vector<Value*>DependenceList;
-  for(auto U: UseList) {
-    if(isDependentOnNodeInstanceID(U, &DependenceList))
-      return false;
-  }
-  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
-  return true;
+	DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
+	std::vector<Value*> UseList;
+	std::vector<Value*> VisitedList;
+	// Recursively traverse the use chain: if a store (or atomic) is found, the
+	// argument cannot be promoted; if a load is found, the value feeding it
+	// (e.g. a GEP) is added to the use list for the dependence check below.
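+	// A typical promotable pattern (illustrative): the argument is only read,
+	//   %p = getelementptr float, float* %arg, i64 7
+	//   %v = load float, float* %p
+	// and no address feeding a load is derived from getNodeInstanceID.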
+	bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
+	if(foundStore == true)
+		return false;
+	// Check that the address computations feeding the loads do not depend on
+	// the getNodeInstanceID intrinsic
+	DEBUG(errs() << "\tNo store instruction found. Checking dependence on node instance ID\n");
+	std::vector<Value*>DependenceList;
+	for(auto U: UseList) {
+		if(isDependentOnNodeInstanceID(U, &DependenceList))
+			return false;
+	}
+	DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
+	return true;
 }
 
 
 // Calculate execute node parameters, which include the number of dimensions
 // for dynamic instances of the kernel and the local and global work group sizes.
 static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
-    &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
-
-  // Assign number of dimenstions a constant value
-  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
-
-  // If local work group size if null
-  if(!kernel->hasLocalWG()) {
-    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
-  }
-  else {
-    for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-      if(isa<Argument>(kernel->localWGSize[i]))
-        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
-    }
-    LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
-  }
-
-  for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
-    if(isa<Argument>(kernel->globalWGSize[i]))
-      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
-  }
-
-  // For OpenCL, global work group size is the total bumber of instances in each
-  // dimension. So, multiply local and global dim limits.
-  std::vector<Value*> globalWGSizeInsts;
-  if(kernel->hasLocalWG()) {
-    for (unsigned i = 0; i < kernel->gridDim; i++) {
-      BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
-      globalWGSizeInsts.push_back(MulInst);
-    }
-  }
-  else {
-    globalWGSizeInsts = kernel->globalWGSize;
-  }
-  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
-  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+		&GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
+
+	// Assign the number of dimensions a constant value
+	workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+	// If the local work group size is not specified, pass a null pointer
+	if(!kernel->hasLocalWG()) {
+		LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+	}
+	else {
+		for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+			if(isa<Argument>(kernel->localWGSize[i]))
+				kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+		}
+		LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+	}
+
+	for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+		if(isa<Argument>(kernel->globalWGSize[i]))
+			kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+	}
+
+	// For OpenCL, the global work size is the total number of instances in each
+	// dimension, so multiply the local and global dimension limits.
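+	// Illustrative example (hypothetical sizes): globalWGSize = {8, 8} and
+	// localWGSize = {16, 16} yield an OpenCL global work size of {128, 128}.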
+	std::vector<Value*> globalWGSizeInsts;
+	if(kernel->hasLocalWG()) {
+		for (unsigned i = 0; i < kernel->gridDim; i++) {
+			BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
+			globalWGSizeInsts.push_back(MulInst);
+		}
+	}
+	else {
+		globalWGSizeInsts = kernel->globalWGSize;
+	}
+	GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+	DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
 // CodeGen for allocating space for Work Group on stack and returning a pointer
 // to its address
 static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
-  Value* WGPtr;
-  // Get int64_t and or ease of use
-  Type* Int64Ty = Type::getInt64Ty(M.getContext());
-
-  // Work Group type is [#dim x i64]
-  Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
-  // Allocate space of Global work group data on stack and get pointer to
-  // first element.
-  AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
-  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
-  Value* nextDim = WGPtr;
-  DEBUG(errs() << *WGPtr << "\n");
-
-  // Iterate over the number of dimensions and store the global work group
-  // size in that dimension
-  for(unsigned i=0; i < WGSize.size(); i++) {
-    DEBUG(errs() << *WGSize[i] << "\n");
-    assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-
-    if(WGSize[i]->getType() != Int64Ty) {
-      // If number of dimensions are mentioned in any other integer format,
-      // generate code to extend it to i64. We need to use the mapped value in
-      // the new generated function, hence the use of VMap
-      // FIXME: Why are we changing the kernel WGSize vector here?
-      DEBUG(errs() << "Not i64. Zero extend required.\n");
-      DEBUG(errs() << *WGSize[i] << "\n");
-      CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
-      DEBUG(errs() << "Bitcast done.\n");
-      StoreInst* SI = new StoreInst(CI, nextDim, IB);
-      DEBUG(errs() << "Zero extend done.\n");
-      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
-    } else {
-      // Store the value representing work group size in ith dimension on
-      // stack
-      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
-
-      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
-    }
-    if(i+1 < WGSize.size()) {
-      // Move to next dimension
-      GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
-          ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
-          WG->getName()+"."+Twine(i+1),
-          IB);
-      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
-      nextDim = GEP;
-    }
-  }
-  return WGPtr;
+	Value* WGPtr;
+	// Get the i64 type for ease of use
+	Type* Int64Ty = Type::getInt64Ty(M.getContext());
+
+	// Work Group type is [#dim x i64]
+	Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
+	// Allocate space for the work group data on the stack and get a pointer to
+	// its first element.
+	AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
+	WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
+	Value* nextDim = WGPtr;
+	DEBUG(errs() << *WGPtr << "\n");
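+	// For a hypothetical 2-D work group named "GlobalWGSize", this emits roughly:
+	//   %GlobalWGSize   = alloca [2 x i64]
+	//   %GlobalWGSize.0 = bitcast [2 x i64]* %GlobalWGSize to i64*
+	// followed by one store (and a GEP to the next slot) per dimension below.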
+
+	// Iterate over the number of dimensions and store the global work group
+	// size in that dimension
+	for(unsigned i=0; i < WGSize.size(); i++) {
+		DEBUG(errs() << *WGSize[i] << "\n");
+		assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+
+		if(WGSize[i]->getType() != Int64Ty) {
+			// If the work group size is given in any other integer width, generate
+			// code to sign extend it to i64. We need to use the mapped value in
+			// the newly generated function, hence the use of VMap.
+			// FIXME: Why are we changing the kernel WGSize vector here?
+			DEBUG(errs() << "Not i64. Sign extension required.\n");
+			DEBUG(errs() << *WGSize[i] << "\n");
+			CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+			DEBUG(errs() << "Integer cast done.\n");
+			StoreInst* SI = new StoreInst(CI, nextDim, IB);
+			DEBUG(errs() << "Sign extension done.\n");
+			DEBUG(errs() << "\tSign extended work group size: " << *SI << "\n");
+		} else {
+			// Store the value representing work group size in ith dimension on
+			// stack
+			StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
+
+			DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+		}
+		if(i+1 < WGSize.size()) {
+			// Move to next dimension
+			GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
+					ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
+					WG->getName()+"."+Twine(i+1),
+					IB);
+			DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+			nextDim = GEP;
+		}
+	}
+	return WGPtr;
 
 }
 
 // Get generated PTX binary name
 static std::string getPTXFilename(const Module& M) {
-  std::string moduleID = M.getModuleIdentifier();
-  moduleID.append(".kernels.cl");
-  return moduleID;
+	std::string moduleID = M.getModuleIdentifier();
+	moduleID.append(".kernels.cl");
+	return moduleID;
 }
 
 // Get the name of the input file from module ID
 static std::string getFilenameFromModule(const Module& M) {
-  std::string moduleID = M.getModuleIdentifier();
-  return moduleID.substr(moduleID.find_last_of("/")+1);
+	std::string moduleID = M.getModuleIdentifier();
+	return moduleID.substr(moduleID.find_last_of("/")+1);
 }
 
 // Changes the data layout of the Module to be compiled with NVPTX backend
 // TODO: Figure out when to call it, probably after duplicating the modules
 static void changeDataLayout(Module &M) {
-  std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
-  std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
+	std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+	std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
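+	// Reading the layout strings: "e" = little endian, "p:32:32" = 32-bit
+	// pointers (omitted in the 64-bit layout, where pointers default to 64 bits),
+	// "i64:64" = i64 aligned to 64 bits, "n16:32:64" = native integer widths.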
 
-  if (TARGET_PTX == 32)
-    M.setDataLayout(StringRef(nvptx32_layoutStr));
-  else if (TARGET_PTX == 64)
-    M.setDataLayout(StringRef(nvptx64_layoutStr));
-  else assert(false && "Invalid PTX target");
+	if (TARGET_PTX == 32)
+		M.setDataLayout(StringRef(nvptx32_layoutStr));
+	else if (TARGET_PTX == 64)
+		M.setDataLayout(StringRef(nvptx64_layoutStr));
+	else assert(false && "Invalid PTX target");
 
-  return;
+	return;
 }
 
 static void changeTargetTriple(Module &M) {
-  std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
-  std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+	std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+	std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
-  if (TARGET_PTX == 32)
-    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
-  else if (TARGET_PTX == 64)
-    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
-  else assert(false && "Invalid PTX target");
+	if (TARGET_PTX == 32)
+		M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+	else if (TARGET_PTX == 64)
+		M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+	else assert(false && "Invalid PTX target");
 
-  return;
+	return;
 }
 
 // Helper function, populate a vector with all return statements in a function
 static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
-  for (auto &BB : *F) {
-    if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
-      ReturnInstVec.push_back(RI);
-  }	
+	for (auto &BB : *F) {
+		if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+			ReturnInstVec.push_back(RI);
+	}	
 }
 
 // Helper function, populate a vector with all IntrinsicID intrinsics in a function
 static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
-  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-    Instruction *I = &(*i);
-    IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-    if (II && II->getIntrinsicID() == IntrinsicID) {
-      IntrinsicInstVec.push_back(II);
-    }
-  }
+	for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+		Instruction *I = &(*i);
+		IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+		if (II && II->getIntrinsicID() == IntrinsicID) {
+			IntrinsicInstVec.push_back(II);
+		}
+	}
 }
 
 // Helper function, returns the atomicrmw op corresponding to the intrinsic atomic op
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
-  switch(ID) {
-    case Intrinsic::visc_atomic_add:
-      return AtomicRMWInst::Add;
-    case Intrinsic::visc_atomic_sub:
-      return AtomicRMWInst::Sub;
-    case Intrinsic::visc_atomic_min:
-      return AtomicRMWInst::Min;
-    case Intrinsic::visc_atomic_umin:
-      return AtomicRMWInst::UMin;
-    case Intrinsic::visc_atomic_max:
-      return AtomicRMWInst::Max;
-    case Intrinsic::visc_atomic_umax:
-      return AtomicRMWInst::UMax;
-      //case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
-      //case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
-    case Intrinsic::visc_atomic_xchg:
-      return AtomicRMWInst::Xchg;
-    case Intrinsic::visc_atomic_and:
-      return AtomicRMWInst::And;
-    case Intrinsic::visc_atomic_or:
-      return AtomicRMWInst::Or;
-    case Intrinsic::visc_atomic_xor:
-      return AtomicRMWInst::Xor;
-    default:
-      llvm_unreachable("Unsupported atomic intrinsic!");
-  };
+	switch(ID) {
+		case Intrinsic::visc_atomic_add:
+			return AtomicRMWInst::Add;
+		case Intrinsic::visc_atomic_sub:
+			return AtomicRMWInst::Sub;
+		case Intrinsic::visc_atomic_min:
+			return AtomicRMWInst::Min;
+		case Intrinsic::visc_atomic_umin:
+			return AtomicRMWInst::UMin;
+		case Intrinsic::visc_atomic_max:
+			return AtomicRMWInst::Max;
+		case Intrinsic::visc_atomic_umax:
+			return AtomicRMWInst::UMax;
+			//case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
+			//case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
+		case Intrinsic::visc_atomic_xchg:
+			return AtomicRMWInst::Xchg;
+		case Intrinsic::visc_atomic_and:
+			return AtomicRMWInst::And;
+		case Intrinsic::visc_atomic_or:
+			return AtomicRMWInst::Or;
+		case Intrinsic::visc_atomic_xor:
+			return AtomicRMWInst::Xor;
+		default:
+			llvm_unreachable("Unsupported atomic intrinsic!");
+	};
 }
 
 
 // Helper function, returns the OpenCL function name corresponding to the atomic op
 static std::string getAtomicOpName(Intrinsic::ID ID) {
-  switch(ID) {
-    case Intrinsic::visc_atomic_cmpxchg:
-      return "atom_cmpxchg";
-    case Intrinsic::visc_atomic_add:
-      return "atom_add";
-    case Intrinsic::visc_atomic_sub:
-      return "atom_sub";
-    case Intrinsic::visc_atomic_min:
-      return "atom_min";
-    case Intrinsic::visc_atomic_max:
-      return "atom_max";
-    case Intrinsic::visc_atomic_inc:
-      return "atom_inc";
-    case Intrinsic::visc_atomic_dec:
-      return "atom_dec";
-    case Intrinsic::visc_atomic_xchg:
-      return "atom_xchg";
-    case Intrinsic::visc_atomic_and:
-      return "atom_and";
-    case Intrinsic::visc_atomic_or:
-      return "atom_or";
-    case Intrinsic::visc_atomic_xor:
-      return "atom_xor";
-    default:
-      llvm_unreachable("Unsupported atomic intrinsic!");
-  };
+	switch(ID) {
+		case Intrinsic::visc_atomic_cmpxchg:
+			return "atom_cmpxchg";
+		case Intrinsic::visc_atomic_add:
+			return "atom_add";
+		case Intrinsic::visc_atomic_sub:
+			return "atom_sub";
+		case Intrinsic::visc_atomic_min:
+			return "atom_min";
+		case Intrinsic::visc_atomic_max:
+			return "atom_max";
+		case Intrinsic::visc_atomic_inc:
+			return "atom_inc";
+		case Intrinsic::visc_atomic_dec:
+			return "atom_dec";
+		case Intrinsic::visc_atomic_xchg:
+			return "atom_xchg";
+		case Intrinsic::visc_atomic_and:
+			return "atom_and";
+		case Intrinsic::visc_atomic_or:
+			return "atom_or";
+		case Intrinsic::visc_atomic_xor:
+			return "atom_xor";
+		default:
+			llvm_unreachable("Unsupported atomic intrinsic!");
+	};
 }
 
 } // End of namespace
 
 char DFG2LLVM_NVPTX::ID = 0;
 static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx",
-    "Dataflow Graph to LLVM for NVPTX Pass",
-    false /* does not modify the CFG */,
-    true /* transformation,   *
-          * not just analysis */);
+		"Dataflow Graph to LLVM for NVPTX Pass",
+		false /* does not modify the CFG */,
+		true /* transformation,   *
+					* not just analysis */);