diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index c85a8a4f2dcf6dbf94285b0e22f8c4bf89fada4d..0ee18394ba0146b1f5f548f70c491369a4fcb04d 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -1130,7 +1130,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // constant memory, subject to size of course
   std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
 
-  F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE);
+  F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, CONSTANT_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE);
 
@@ -1416,720 +1416,350 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       default:
         llvm_unreachable("Unknown VISC Intrinsic!");
         break;
-			}
-
-		}
-		else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
-			IRBuilder<> Builder(I);
-			Value *Source = MemCpyI->getSource();
-			Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
-			Value *Length = MemCpyI->getOperand(2);
-			DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
-			DEBUG(errs() << "Source: " << *Source << "\n"); 
-			DEBUG(errs() << "Destination: " << *Destination << "\n"); 
-			DEBUG(errs() << "Length: " << *Length << "\n");
-
-			size_t memcpy_length;
-			unsigned int memcpy_count;
-			if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) {
-				if (CI->getBitWidth() <= 64) {
-					memcpy_length = CI->getSExtValue();
-					DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
-					Type *Source_Type = Source->getType()->getPointerElementType();
-					DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
-					memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
-					DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
-					if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) {
-						if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) {
-							Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
-							Value *DestPtrOperand = destGEPI->getPointerOperand();
-							for(int i = 0; i < memcpy_count; ++i) {
-								Constant *increment;
-								LoadInst *newLoadI;
-								StoreInst *newStoreI;
-								// First, need to increment the correct index for both source and dest 
-								// This invluves checking to see how many indeces the GEP has
-								// Assume for now only 1 or 2 are the viable options.
-
-								std::vector<Value*> GEPlIndex;
-								if (sourceGEPI->getNumIndices() == 1) {
-									Value *Index = sourceGEPI->getOperand(1);      
-									increment = ConstantInt::get(Index->getType(), i, false);
-									Value *incAdd = Builder.CreateAdd(Index, increment);
-									DEBUG(errs() << "Add: " << *incAdd << "\n");
-									GEPlIndex.push_back(incAdd);
-									Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex));
-									DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
-									newLoadI = Builder.CreateLoad(newGEPIl);
-									DEBUG(errs() << "Load: " << *newLoadI << "\n");
-								} else { 
-									llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n");
-								}
-
-
-								std::vector<Value*> GEPsIndex;
-								if (destGEPI->getNumIndices() == 1) {
-
-								} else if (destGEPI->getNumIndices() == 2) {
-									Value *Index0 = destGEPI->getOperand(1);      
-									GEPsIndex.push_back(Index0);
-									Value *Index1 = destGEPI->getOperand(2);      
-									increment = ConstantInt::get(Index1->getType(), i, false);
-									Value *incAdd = Builder.CreateAdd(Index1, increment);
-									DEBUG(errs() << "Add: " << *incAdd << "\n");
-									GEPsIndex.push_back(incAdd);
-									Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex));
-									DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
-									newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile());
-									DEBUG(errs() << "Store: " << *newStoreI << "\n");
-								} else {
-									llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n");
-								}
-							}
-							IItoRemove.push_back(sourceGEPI);
-							IItoRemove.push_back(destGEPI);
-							Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
-							Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
-							IItoRemove.push_back(destBitcastI);
-							IItoRemove.push_back(sourceBitcastI);
-							IItoRemove.push_back(MemCpyI);
-						}
-					}
-
-				}
-			} else {
-				llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
-			}
-			//      llvm_unreachable("HERE!");
-		}
-
-		else if(CallInst* CI = dyn_cast<CallInst>(I)) {
-			DEBUG(errs() << "Found a call: " << *CI << "\n");
-			Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
-			if(calleeF->isDeclaration()) {
-				// Add the declaration to kernel module
-				DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
-				KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
-				if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
-					// Now handle a few specific intrinsics
-					// For now, sin and cos are translated to their libclc equivalent
-					switch(II->getIntrinsicID()) {
-						case Intrinsic::sin:
-						case Intrinsic::cos:
-							{
-								DEBUG(errs() << "Found sincos: " << *II << "\n");
-								// Get the libclc function
-								// libclc uses mangled name for sin cos
-								assert(II->getType()->isFloatTy()
-										&& "Only handling sin(float) and cos(float)!");
-								std::string name;
-								if(II->getIntrinsicID() == Intrinsic::sin)
-									name = "sin";
-								else
-									name = "cos";
-
-								FunctionType* SinCosFT = FunctionType::get(II->getType(),
-										Type::getFloatTy(KernelM->getContext()),
-										false);
-								FunctionCallee LibclcFunction = KernelM->getOrInsertFunction(name, SinCosFT);
-								CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II);
-
-								II->replaceAllUsesWith(CI);
-								IItoRemove.push_back(II);
-								break;
-							}
-						case Intrinsic::floor:
-							{
-								DEBUG(errs() << "Found floor intrinsic\n");
-								F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f);
-								FunctionType* FTy = F->getFunctionType();
-								DEBUG(errs() << *F << "\n");
-
-								// Create argument list
-								std::vector<Value*> args;
-								assert(CI->getNumArgOperands() == FTy->getNumParams()
-										&& "Number of arguments of call do not match with Intrinsic");
-								for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
-									Value* V = CI->getArgOperand(i);
-									// Either the type should match or both should be of pointer type
-									assert((V->getType() == FTy->getParamType(i) ||
-												(V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
-											&& "Dummy function call argument does not match with Intrinsic argument!");
-									// If the types do not match, then both must be pointer type and pointer
-									// cast needs to be performed
-									if(V->getType() != FTy->getParamType(i)) {
-										V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
-									}
-									args.push_back(V);
-								}
-								// Insert call instruction
-								CallInst* Inst = CallInst::Create(F, args,
-										F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
-								DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
-								CI->replaceAllUsesWith(Inst);
-								IItoRemove.push_back(II);
-								break;
-							}
-						default:
-							errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ;
-					}
-				}
-
-			}
-			else {
-				// Check if the called function has already been cloned before.
-				Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
-				// Iterate over the new function to see if it calls any other functions
-				// in the module.
-				for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
-					if(auto *Call = dyn_cast<CallInst>(&*i)) {
-						Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
-						CloneAndReplaceCall(Call, CalledFunc);
-					}
-				}
-			}
-			//TODO: how to handle address space qualifiers in load/store
-		}
-
-	}
-  // search for pattern where float is being casted to int and loaded/stored and change it.	
-  DEBUG(errs() << "finding pattern for replacement!\n");
-  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
-    bool cont = false;
-    bool keepGEPI = false;
-    bool keepGEPI2= false;
-    Instruction *I = &(*i);
-    GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I);
-
-    if (!GEPI) {
-      // did nod find pattern start, continue
-      continue;
-    }
-    // may have found pattern, check
-    DEBUG(errs() << "GEPI " << *GEPI << "\n");
-    // print whatever we want for debug
-    Value* PtrOp = GEPI->getPointerOperand();
-    Type *SrcTy = GEPI->getSourceElementType();
-    unsigned GEPIaddrspace = GEPI->getAddressSpace();
-
-    if (SrcTy->isArrayTy()) 
-      DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n");
-    else
-      DEBUG(errs() << *SrcTy << " is not an array type!\n");
-    // check that source element type is float
-    if (SrcTy->isArrayTy()) {
-      if (!(SrcTy->getArrayElementType()->isFloatTy())) {
-        DEBUG(errs() << "GEPI type is array but not float!\n");
-        continue;
       }
-    }
-    else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) {
-      DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
-      // does not fit this pattern - no float GEP instruction
-      continue;
-    }
-    // check that addressspace is 1
-    //	  if (GEPIaddrspace != 1) {
-    //			// does not fit this pattern - addrspace of pointer argument is not global
-    //			continue;
-    //		}
-    if (!(GEPI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      //continue;
-      // Keep GEPI around if it has other uses
-      keepGEPI = true;
-    }
-    DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
-
-    // 1st GEPI it has one use
-    //		assert(GEPI->hasOneUse() && "GEPI has a single use");
-
-    // See if it is a bitcast
-    BitCastInst *BitCastI;
-    for (User * U : GEPI->users()) {
-      if(Instruction *ui = dyn_cast<Instruction> (U)) { 
-        DEBUG(errs() << "--" << *ui << "\n");
-        if (isa<BitCastInst>(ui)) {
-          BitCastI = dyn_cast<BitCastInst>(ui);
-          DEBUG(errs() << "---Found bitcast as only use of GEP\n");
-          break;
-        }
-      }
-      DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
-      cont = true;
-    }
-    //		for (Value::user_iterator ui = GEPI->user_begin(),
-    //				ue = GEPI->user_end(); ui!=ue; ++ui) {
-    //        DEBUG(errs() << "--" << *ui << "\n");
-    //			if (isa<BitCastInst>(*ui)) {
-    //				BitCastI = dyn_cast<BitCastInst>(*ui);
-    //        DEBUG(errs() << "Found bitcast as only use of GEP\n");
-    //			}
-    //		}
-
-    if (cont/*!BitCastI*/) {
-      continue; // not in pattern
-    }
 
-    //    DEBUG(errs() << *BitCastI << "\n");
-    // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP.
-    Value *Op2 = BitCastI->getOperand(0);
-    DEBUG(errs() << "----" << *Op2 << "\n");
-    //		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
-    //		Type *OpTy = cast<Type>(Op2);
-    Type *OpTy = BitCastI->getDestTy();
-    DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
-    //    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n");
-    if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
-      // maybe right syntax is (Type::getInt32Ty)->getPointerTo()
-      continue; // not in pattern
     }
+    else if(CallInst* CI = dyn_cast<CallInst>(I)) {
+      DEBUG(errs() << "Found a call: " << *CI << "\n");
+      Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
+      if(calleeF->isDeclaration()) {
+        // Add the declaration to kernel module
+        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
+        KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
+        if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
+          // Now handle a few specific intrinsics
+          // For now, sin and cos are translated to their libclc equivalent
+          switch(II->getIntrinsicID()) {
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+          {
+            DEBUG(errs() << "Found sincos: " << *II << "\n");
+            // Get the libclc function
+            // libclc uses mangled name for sin cos
+            assert(II->getType()->isFloatTy()
+                   && "Only handling sin(float) and cos(float)!");
+            std::string name;
+            if(II->getIntrinsicID() == Intrinsic::sin)
+              name = "sin";
+            else
+              name = "cos";
+
+            FunctionType* SinCosFT = FunctionType::get(II->getType(),
+                                     Type::getFloatTy(KernelM->getContext()),
+                                     false);
+            FunctionCallee LibclcFunction = KernelM->getOrInsertFunction(name, SinCosFT);
+            CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II);
+
+            II->replaceAllUsesWith(CI);
+            IItoRemove.push_back(II);
+            break;
+          }
+          case Intrinsic::floor:
+          {
+            DEBUG(errs() << "Found floor intrinsic\n");
+            F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f);
+            FunctionType* FTy = F->getFunctionType();
+            DEBUG(errs() << *F << "\n");
+
+            // Create argument list
+            std::vector<Value*> args;
+            assert(CI->getNumArgOperands() == FTy->getNumParams()
+                   && "Number of arguments of call do not match with Intrinsic");
+            for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+              Value* V = CI->getArgOperand(i);
+              // Either the type should match or both should be of pointer type
+              assert((V->getType() == FTy->getParamType(i) ||
+                     (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
+                     && "Dummy function call argument does not match with Intrinsic argument!");
+              // If the types do not match, then both must be pointer type and pointer
+              // cast needs to be performed
+              if(V->getType() != FTy->getParamType(i)) {
+                V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
+              }
+              args.push_back(V);
+            }
+            // Insert call instruction
+            CallInst* Inst = CallInst::Create(F, args,
+                  F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
+            DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
+            CI->replaceAllUsesWith(Inst);
+            IItoRemove.push_back(II);
+            break;
+          }
+          default:
+            errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ;
+          }
+        }
 
-    DEBUG(errs() << "----Here!\n");
-    // We are in GEP, bitcast.
-
-    // user_iterator, to find the load.
-
-    if (!(BitCastI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      continue;
-    }
-    DEBUG(errs() << "----Bitcast has one use!\n");
-    // it has one use
-    assert(BitCastI->hasOneUse() && "BitCastI has a single use");
-    LoadInst *LoadI;
-    for (User * U : BitCastI->users()) { 
-      if (Instruction *ui = dyn_cast<Instruction> (U)) {
-        DEBUG(errs() << "-----" << *ui << "\n");
-        if (isa<LoadInst>(ui)) {
-          LoadI = dyn_cast<LoadInst>(ui);
-          DEBUG(errs() << "-----Found load as only use of bitcast\n");
-          break;
+      }
+      else {
+        // Check if the called function has already been cloned before.
+        Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
+        // Iterate over the new function to see if it calls any other functions
+        // in the module.
+        for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
+          if(auto *Call = dyn_cast<CallInst>(&*i)) {
+            Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
+            CloneAndReplaceCall(Call, CalledFunc);
+          }
         }
       }
-      DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
-      cont = true;
-    }
-    //		for (Value::user_iterator ui = BitCastI->user_begin(),
-    //				ue = BitCastI->user_end(); ui!=ue; ++ui) {
-    //			if (isa<LoadInst>(*ui)) {
-    //				LoadI = dyn_cast<LoadInst>(*ui);
-    //        errs() << "Found load as only use of bitcast\n";
-    //			}
-    //		}
-
-    if (cont) {
-      continue; // not in pattern
-    }
-
-    DEBUG("HERE!\n");
-    // check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from
-    assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n");
-
-    // Copy user_iterator, to find the store.
-
-    if (!(LoadI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      continue;
-      // TODO: generalize: one load can have more than one store users
-    }
-
-    // it has one use
-    assert(LoadI->hasOneUse() && "LoadI has a single use");
-    Value::user_iterator ui = LoadI->user_begin();
-    // skipped loop, because is has a single use
-    StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
-    if (!StoreI) {
-      continue; // not in pattern
-    }
-
-    // Also check that the store uses the loaded value as the value operand
-    if (StoreI->getValueOperand() != LoadI) {
-      continue;
-    }
-
-    DEBUG(errs() << "-------Found store instruction\n");
-
-    // Look for its bitcast, which is its pointer operand
-    Value *StPtrOp = StoreI->getPointerOperand();
-    DEBUG(errs() << "-------" << *StPtrOp << "\n");
-    BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
-    DEBUG(errs() << "-------" << *BitCastI2 << "\n");
-    if (!BitCastI2) {
-      continue; //not in pattern
-    }
-
-    DEBUG(errs() << "-------- Found Bit Cast of store!\n" );
-    // found bitcast. Look for the second GEP, its from operand.
-    Value *BCFromOp = BitCastI2->getOperand(0);
-    GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
-    DEBUG(errs() << "---------- " << *GEPI2 << "\n");
-    if (!GEPI2) {
-      continue; //not in pattern
-    }
-
-    if (!(GEPI2->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      //continue;
-      // Keep GEPI around if it has other uses
-      keepGEPI2 = true;
+      //TODO: how to handle address space qualifiers in load/store
     }
-    DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); 
 
-    Value *PtrOp2 = GEPI2->getPointerOperand();
-
-    // Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above.
-
-    // Assume we found pattern
-    if (!keepGEPI) {  
-      IItoRemove.push_back(GEPI);
-      DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
-    } else {
-      DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
-    }
-    IItoRemove.push_back(BitCastI);
-    DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
-    IItoRemove.push_back(LoadI);
-    DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
-    IItoRemove.push_back(GEPI2);
-    DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
-    IItoRemove.push_back(BitCastI2);
-    DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
-    if (!keepGEPI2) {
-      IItoRemove.push_back(StoreI);
-      DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
-    } else {
-
-      DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n");
-    }
+  }
 
-    std::vector<Value*> GEPlIndex;
-    if (GEPI->hasIndices()) {
-      for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
-        Value *Index = dyn_cast<Value>(&*ii);
-        DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
-        GEPlIndex.push_back(Index);
-      }
-    }
-    //    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
-
-    std::vector<Value*> GEPsIndex;
-    if (GEPI2->hasIndices()) {
-      for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
-        Value *Index = dyn_cast<Value>(&*ii);
-        DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
-        GEPsIndex.push_back(Index);
-      }
-    }
-    //    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
-
-
-
-    //    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
-    GetElementPtrInst* newlGEP =
-      GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()),
-          PtrOp, // operand from 1st GEP
-          ArrayRef<Value*>(GEPlIndex),
-          Twine(),
-          StoreI);
-    DEBUG(errs() << "Adding: " << *newlGEP << "\n");
-    // insert load before GEPI
-    LoadInst *newLoadI =
-      new LoadInst(Type::getFloatTy(M.getContext()),
-          newlGEP, // new GEP
-          Twine(),
-          LoadI->isVolatile(),
-          LoadI->getAlignment(),
-          LoadI->getOrdering(),
-          LoadI->getSyncScopeID(),
-          StoreI);
-    DEBUG(errs() << "Adding: " << *newLoadI << "\n");
-    // same for GEP for store, for store operand
-    GetElementPtrInst* newsGEP =
-      GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
-          PtrOp2, // operand from 2nd GEP
-          ArrayRef<Value*>(GEPsIndex),
-          Twine(),
-          StoreI);
-    DEBUG(errs() << "Adding: " << *newsGEP << "\n");
-    // insert store before GEPI
-    StoreInst *newStoreI =
-      new StoreInst(newLoadI,
-          newsGEP, // new GEP
-          StoreI->isVolatile(),
-          StoreI->getAlignment(),
-          StoreI->getOrdering(),
-          StoreI->getSyncScopeID(),
-          StoreI);
-    DEBUG(errs() << "Adding: " << *newStoreI << "\n");
+  // We need to do this explicitly: DCE pass will not remove them because we
+  // have assumed the worst memory behaviour for these function calls
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around
+  for (auto *I : reverse(IItoRemove)) {
+    DEBUG(errs() << "Erasing: " << *I << "\n");
+    I->eraseFromParent();
+  }
 
+  // Move the cloned functions from the parent module into the new module
+  for(auto *F : FuncToBeRemoved) {
+    F->removeFromParent(); //TODO: MARIA check
+    KernelM->getFunctionList().push_back(F);
   }
 
-	// We need to do this explicitly: DCE pass will not remove them because we
-	// have assumed theworst memory behaviour for these function calls
-	// Traverse the vector backwards, otherwise definitions are deleted while
-	// their subsequent uses are still around
-	for (auto *I : reverse(IItoRemove)) {
-		DEBUG(errs() << "Erasing: " << *I << "\n");
-		I->eraseFromParent();
-	}
-
-	// Removed the cloned functions from the parent module into the new module 
-	for(auto *F : FuncToBeRemoved) {
-		F->removeFromParent(); //TODO: MARIA check
-		KernelM->getFunctionList().push_back(F);
-	}
-
-	addCLMetadata(F_nvptx);
-	kernel->KernelFunction = F_nvptx;
-	errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
-	DEBUG(errs() << *KernelM);
-
-	return;
+  addCLMetadata(F_nvptx);
+  kernel->KernelFunction = F_nvptx;
+  errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
+  DEBUG(errs() << *KernelM);
+
+  return;
 }
 
 bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
-	errs() << "\nDFG2LLVM_NVPTX PASS\n";
+  errs() << "\nDFG2LLVM_NVPTX PASS\n";
 
-	// Get the BuildDFG Analysis Results:
-	// - Dataflow graph
-	// - Maps from i8* hansles to DFNode and DFEdge
-	BuildDFG &DFG = getAnalysis<BuildDFG>();
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-	// DFInternalNode *Root = DFG.getRoot();
-	std::vector<DFInternalNode*> Roots = DFG.getRoots();
-	//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-	//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  //    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+  //    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
-	// Visitor for Code Generation Graph Traversal
-	CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+  // Visitor for Code Generation Graph Traversal
+  CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
 
-	// Iterate over all the DFGs and produce code for each one of them
-	for (auto rootNode: Roots) {
-		// Initiate code generation for root DFNode
-		CGTVisitor->visit(rootNode);
-	}
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+  }
 
-	CGTVisitor->writeKernelsModule();
+  CGTVisitor->writeKernelsModule();
 
-	//TODO: Edit module epilogue to remove the VISC intrinsic declarations
-	delete CGTVisitor;
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+  delete CGTVisitor;
 
-	return true;
+  return true;
 }
 
 std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
-	/*SmallString<128> currentDir;
-		llvm::sys::fs::current_path(currentDir);
-		std::string fileName = getFilenameFromModule(M);
-		Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-		return output.str().append(".kernels.ll");*/
-	std::string mid = M.getModuleIdentifier();
-	return mid.append(".kernels.ll");
+  /*SmallString<128> currentDir;
+  llvm::sys::fs::current_path(currentDir);
+  std::string fileName = getFilenameFromModule(M);
+  Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+  return output.str().append(".kernels.ll");*/
+  std::string mid = M.getModuleIdentifier();
+  return mid.append(".kernels.ll");
 }
 
 void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) {
-	assert(isa<PointerType>(V->getType())
-			&& "Value should be of Pointer Type!");
-	PointerType* OldTy = cast<PointerType>(V->getType());
-	PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-	V->mutateType(NewTy);
-	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
-		// Change all uses producing pointer type in same address space to new
-		// addressspace.
-		if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
-			if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-				fixValueAddrspace(*ui, addrspace);
-			}
-		}
-	}
+  assert(isa<PointerType>(V->getType())
+         && "Value should be of Pointer Type!");
+  PointerType* OldTy = cast<PointerType>(V->getType());
+  PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+  V->mutateType(NewTy);
+  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
+    // Change all users producing a pointer type in the same address space to
+    // the new address space.
+    if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
+      if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+        fixValueAddrspace(*ui, addrspace);
+      }
+    }
+  }
 }
 
 
 std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) {
-	std::vector<unsigned> ConstantMemArgs;
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Argument* arg = &*ai; 
-		std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
-				GlobalMemArgs->end(), arg->getArgNo());
-		// It has to be a global memory argument to be promotable
-		if(pos == GlobalMemArgs->end())
-			continue;
-
-		// Check if it can/should be promoted
-		if(canBePromoted(arg, F)) {
-			errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n";
-			ConstantMemArgs.push_back(arg->getArgNo());
-			GlobalMemArgs->erase(pos);
-		}
-	}
-	return ConstantMemArgs;
+  std::vector<unsigned> ConstantMemArgs;
+   for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+     Argument* arg = &*ai; 
+    std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
+        GlobalMemArgs->end(), arg->getArgNo());
+    // It has to be a global memory argument to be promotable
+    if(pos == GlobalMemArgs->end())
+      continue;
+
+    // Check if it can/should be promoted
+    if(canBePromoted(arg, F)) {
+      errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n";
+      ConstantMemArgs.push_back(arg->getArgNo());
+      GlobalMemArgs->erase(pos);
+    }
+  }
+  return ConstantMemArgs;
 }
 
 Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
-	unsigned idx = 0;
-	std::vector<Type*> ArgTypes;
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Argument *arg = &*ai;
-		DEBUG(errs() << *arg << "\n");
-		unsigned argno = arg->getArgNo();
-		if ((idx < Args.size()) && (argno == Args[idx])) {
-			fixValueAddrspace(arg, addrspace);
-			idx++;
-		}
-		ArgTypes.push_back(arg->getType());
-	}
-	FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
-
-	//F->mutateType(PTy);
-	Function* newF = cloneFunction(F, newFT, false);
-	replaceNodeFunctionInIR(*F->getParent(), F, newF);
-
-	DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
-	return newF;
+  unsigned idx = 0;
+  std::vector<Type*> ArgTypes;
+  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    Argument *arg = &*ai;
+    DEBUG(errs() << *arg << "\n");
+    unsigned argno = arg->getArgNo();
+    if ((idx < Args.size()) && (argno == Args[idx])) {
+      fixValueAddrspace(arg, addrspace);
+      idx++;
+    }
+    ArgTypes.push_back(arg->getType());
+  }
+  FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
+
+  //F->mutateType(PTy);
+  Function* newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+  DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
+  return newF;
 }
 
 /* Add metadata to module KernelM, for OpenCL kernels */
 void CGT_NVPTX::addCLMetadata(Function *F) {
 
-	IRBuilder<> Builder(&*F->begin());
+  IRBuilder<> Builder(&*F->begin());
 
-	SmallVector<Metadata*,8> KernelMD;
-	KernelMD.push_back(ValueAsMetadata::get(F));
+  SmallVector<Metadata*,8> KernelMD;
+  KernelMD.push_back(ValueAsMetadata::get(F));
 
-	// TODO: There is additional metadata used by kernel files but we skip them as
-	// they are not mandatory. In future they might be useful to enable
-	// optimizations
+  // TODO: There is additional metadata used by kernel files but we skip them as
+  // they are not mandatory. In future they might be useful to enable
+  // optimizations
 
-	MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
-	NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
-	MDN_kernels->addOperand(MDKernelNode);
+  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
+  MDN_kernels->addOperand(MDKernelNode);
 
-	KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
-	// TODO: Replace 1 with the number of the kernel.
-	// Add when support for multiple launces is added
-	KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
-	MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
-	NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
-	MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+  // TODO: Replace 1 with the number of the kernel.
+  // Add when support for multiple launches is added
+  KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
+  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
 
 }
 
 void CGT_NVPTX::writeKernelsModule() {
 
-	// In addition to deleting all other functions, we also want to spiff it
-	// up a little bit.  Do this now.
-	legacy::PassManager Passes;
+  // In addition to deleting all other functions, we also want to spiff it
+  // up a little bit.  Do this now.
+  legacy::PassManager Passes;
 
-	errs() << "Writing to File --- ";
-	errs() << getKernelsModuleName(M).c_str() << "\n";
-	std::error_code EC;
-	ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
-	if (EC) {
-		errs() << EC.message() << '\n';
-	}
+  errs() << "Writing to File --- ";
+  errs() << getKernelsModuleName(M).c_str() << "\n";
+  std::error_code EC;
+  ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
+  if (EC) {
+    errs() << EC.message() << '\n';
+  }
 
-	Passes.add(
-			createPrintModulePass(Out.os()));
+  Passes.add(
+      createPrintModulePass(Out.os()));
 
-	Passes.run(*KernelM);
+  Passes.run(*KernelM);
 
-	// Declare success.
-	Out.keep();
+  // Declare success.
+  Out.keep();
 }
 
 Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 
-	DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
-	// FIXME: Maybe do that using the Node?
-	StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
-	assert(FRetTy && "Return Type must always be a struct");
-
-	// Keeps return statements, because we will need to replace them
-	std::vector<ReturnInst *> RItoRemove;
-	findReturnInst(F, RItoRemove);
-
-	std::vector<Type *> RetArgTypes;
-	std::vector<Argument*> RetArgs;
-	std::vector<Argument*> Args;
-	// Check for { } return struct, which means that the function returns void
-	if (FRetTy->isEmptyTy()) {
-
-		DEBUG(errs() << "\tFunction output struct is void\n");
-		DEBUG(errs() << "\tNo parameters added\n");
-
-		// Replacing return statements with others returning void
-		for (auto *RI : RItoRemove) {
-			ReturnInst::Create((F->getContext()), 0, RI);
-			RI->eraseFromParent();
-		}
-		DEBUG(errs() << "\tChanged return statements to return void\n");
-	}
-	else {
-		// The struct has return values, thus needs to be converted to parameter
-
-		// Iterate over all element types of return struct and add arguments to the
-		// function
-		for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
-			Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
-			RetArgs.push_back(RetArg);
-			RetArgTypes.push_back(RetArg->getType());
-			DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
-		}
-
-		DEBUG(errs() << "\tReplacing Return statements\n");
-		// Replace return statements with extractValue and store instructions
-		for (auto *RI : RItoRemove) {
-			Value* RetVal = RI->getReturnValue();
-			for(unsigned i = 0; i < RetArgs.size(); i++) {
-				ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
-						RetArgs[i]->getName()+".val", RI);
-				new StoreInst(EI, RetArgs[i], RI);
-			}
-			// assert(RetVal && "Return value should not be null at this point");
-			// StructType* RetType = cast<StructType>(RetVal->getType());
-			// assert(RetType && "Return type is not a struct");
-
-			ReturnInst::Create((F->getContext()), 0, RI);
-			RI->eraseFromParent();
-
-		}
-	}
-	DEBUG(errs() << "\tReplaced return statements\n");
-
-	// Create the argument type list with the added argument's type
-	std::vector<Type*> ArgTypes;
-	for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		ArgTypes.push_back(ai->getType());
-	}
-	for(auto *RATy: RetArgTypes) {
-		ArgTypes.push_back(RATy);
-	}
-
-	// Creating Args vector to use in cloning!
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Args.push_back(&*ai);
-	}
-	for(auto *ai : RetArgs) {
-		Args.push_back(ai);
-	}
-
-	// Adding new arguments to the function argument list, would not change the
-	// function type. We need to change the type of this function to reflect the
-	// added arguments
-	Type* VoidRetType = Type::getVoidTy(F->getContext());
-	FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-
-	// Change the function type
-	//F->mutateType(PTy);
-	Function* newF = cloneFunction(F, newFT, false, NULL, &Args);
-	replaceNodeFunctionInIR(*F->getParent(), F, newF);
-	//F->eraseFromParent();
-	return newF;
+  DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
+  // FIXME: Maybe do that using the Node?
+  StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
+  assert(FRetTy && "Return Type must always be a struct");
+
+  // Keeps return statements, because we will need to replace them
+  std::vector<ReturnInst *> RItoRemove;
+  findReturnInst(F, RItoRemove);
+
+
+  // Check for { } return struct, which means that the function returns void
+  if (FRetTy->isEmptyTy()) {
+
+    DEBUG(errs() << "\tFunction output struct is void\n");
+    DEBUG(errs() << "\tNo parameters added\n");
+
+    // Replacing return statements with others returning void
+    for (auto *RI : RItoRemove) {
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+    }
+    DEBUG(errs() << "\tChanged return statements to return void\n");
+  }
+  else {
+    // The struct has return values, thus needs to be converted to parameter
+
+    // Iterate over all element types of return struct and add arguments to the
+    // function
+    std::vector<Argument*> Args;
+    for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
+      Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+      Args.push_back(RetArg);
+      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+    }
+
+    DEBUG(errs() << "\tReplacing Return statements\n");
+    // Replace return statements with extractValue and store instructions
+    for (auto *RI : RItoRemove) {
+      Value* RetVal = RI->getReturnValue();
+      for(unsigned i = 0; i < Args.size(); i++) {
+        ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
+                               Args[i]->getName()+".val", RI);
+        new StoreInst(EI, Args[i], RI);
+      }
+      // assert(RetVal && "Return value should not be null at this point");
+      // StructType* RetType = cast<StructType>(RetVal->getType());
+      // assert(RetType && "Return type is not a struct");
+
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+
+    }
+  }
+  DEBUG(errs() << "\tReplaced return statements\n");
+
+  // Create the argument type list with the added argument's type
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+
+  // Adding new arguments to the function argument list, would not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments
+  Type* VoidRetType = Type::getVoidTy(F->getContext());
+  FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+  // Change the function type
+  //F->mutateType(PTy);
+  Function* newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+  //F->eraseFromParent();
+  return newF;
 }
 
 /******************************************************************************
@@ -2141,333 +1771,334 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 // 2. Loads not dependent on getNodeInstanceID itrinsic
 
 static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) {
-	if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
-		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-		return false;
-	}
-	VisitedList->push_back(V);
-	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
-			ui != ue; ++ui) {
-		Instruction* I = dyn_cast<Instruction>(*ui);
-		if(!I) {
-			// if use is not an instruction, then skip it
-			continue;
-		}
-		DEBUG(errs() << "\t" << *I << "\n");
-		if(isa<LoadInst>(I)) {
-			DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
-			DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
-			UseList->push_back(V);
-		}
-		else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
-			// found a store in use chain
-			DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
-			return true;
-		}
-		else if(BuildDFG::isViscIntrinsic(I)) {
-			// If it is an atomic intrinsic, we found a store
-			IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-			assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
-					&& "Only visc atomic intrinsics can have an argument as input");
-			return true;
-		}
-		else {
-			DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
-			if(findLoadStoreUses(I, UseList, VisitedList))
-				return true;
-		}
-	}
-	return false;
+  if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  VisitedList->push_back(V);
+  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
+      ui != ue; ++ui) {
+    Instruction* I = dyn_cast<Instruction>(*ui);
+    if(!I) {
+      // if use is not an instruction, then skip it
+      continue;
+    }
+    DEBUG(errs() << "\t" << *I << "\n");
+    if(isa<LoadInst>(I)) {
+      DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
+      DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
+      UseList->push_back(V);
+    }
+    else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
+      // found a store in use chain
+      DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
+      return true;
+    }
+    else if(BuildDFG::isViscIntrinsic(I)) {
+      // If it is an atomic intrinsic, we found a store
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
+          && "Only visc atomic intrinsics can have an argument as input");
+      return true;
+    }
+    else {
+      DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
+      if(findLoadStoreUses(I, UseList, VisitedList))
+        return true;
+    }
+  }
+ return false;
 }
 
 static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) {
-	if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
-		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-		return false;
-	}
-	DependenceList->push_back(V);
-	// If not an instruction, then not dependent on node instance id
-	if(!isa<Instruction>(V) || isa<Constant>(V)) {
-		DEBUG(errs() << "\tStop\n");
-		return false;
-	}
-
-	Instruction* I = cast<Instruction>(V);
-	for(unsigned i = 0; i < I->getNumOperands(); i++) {
-		Value* operand = I->getOperand(i);
-		if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
-			if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
-						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
-						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
-				Value* Node = II->getArgOperand(0);
-				IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
-				assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
-				if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
-					DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
-					return true;
-				}
-			}
-		}
-		if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
-			DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
-			continue;
-		}
-		DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
-		if(isDependentOnNodeInstanceID(operand, DependenceList)) {
-			return true;
-		}
-	}
-	return false;
+  if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  DependenceList->push_back(V);
+  // If not an instruction, then not dependent on node instance id
+  if(!isa<Instruction>(V) || isa<Constant>(V)) {
+    DEBUG(errs() << "\tStop\n");
+    return false;
+  }
+
+  Instruction* I = cast<Instruction>(V);
+  for(unsigned i = 0; i < I->getNumOperands(); i++) {
+    Value* operand = I->getOperand(i);
+    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
+      if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
+          || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
+            || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
+        Value* Node = II->getArgOperand(0);
+        IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
+        assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
+        if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
+          DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
+          return true;
+        }
+      }
+    }
+    if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
+      DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
+      continue;
+    }
+    DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
+    if(isDependentOnNodeInstanceID(operand, DependenceList)) {
+      return true;
+    }
+  }
+  return false;
 }
 
 // Function to check if argument arg can be changed to a constant memory pointer
 static bool canBePromoted(Argument* arg, Function* F) {
-	DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
-	std::vector<Value*> UseList;
-	std::vector<Value*> VisitedList;
-	// recursively traverse use chain
-	// if find a store instruction return false, everything fails, cannot be
-	// promoted
-	// if find a load instruction as use, add the GEP instruction to list
-	bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
-	if(foundStore == true)
-		return false;
-	// See that the GEP instructions are not dependent on getNodeInstanceID
-	// intrinsic
-	DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
-	std::vector<Value*>DependenceList;
-	for(auto U: UseList) {
-		if(isDependentOnNodeInstanceID(U, &DependenceList))
-			return false;
-	}
-	DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
-	return true;
+  DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
+  std::vector<Value*> UseList;
+  std::vector<Value*> VisitedList;
+  // recursively traverse use chain
+  // if find a store instruction return false, everything fails, cannot be
+  // promoted
+  // if find a load instruction as use, add the GEP instruction to list
+  bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
+  if(foundStore == true)
+    return false;
+  // See that the GEP instructions are not dependent on getNodeInstanceID
+  // intrinsic
+  DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
+  std::vector<Value*>DependenceList;
+  for(auto U: UseList) {
+    if(isDependentOnNodeInstanceID(U, &DependenceList))
+      return false;
+  }
+  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
+  return true;
 }
 
 
 // Calculate execute node parameters which include, number of diemnsions for
 // dynamic instances of the kernel, local and global work group sizes.
 static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
-		&GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
-
-	// Assign number of dimenstions a constant value
-	workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
-
-	// If local work group size if null
-	if(!kernel->hasLocalWG()) {
-		LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
-	}
-	else {
-		for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-			if(isa<Argument>(kernel->localWGSize[i]))
-				kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
-		}
-		LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
-	}
-
-	for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
-		if(isa<Argument>(kernel->globalWGSize[i]))
-			kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
-	}
-
-	// For OpenCL, global work group size is the total bumber of instances in each
-	// dimension. So, multiply local and global dim limits.
-	std::vector<Value*> globalWGSizeInsts;
-	if(kernel->hasLocalWG()) {
-		for (unsigned i = 0; i < kernel->gridDim; i++) {
-			BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
-			globalWGSizeInsts.push_back(MulInst);
-		}
-	}
-	else {
-		globalWGSizeInsts = kernel->globalWGSize;
-	}
-	GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
-	DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+                                 &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
+
+  // Assign number of dimensions a constant value
+  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+  // If local work group size is null
+  if(!kernel->hasLocalWG()) {
+    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+  }
+  else {
+    for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if(isa<Argument>(kernel->localWGSize[i]))
+        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+    }
+    LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+  }
+
+  for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+    if(isa<Argument>(kernel->globalWGSize[i]))
+      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+  }
+
+  // For OpenCL, global work group size is the total number of instances in each
+  // dimension. So, multiply local and global dim limits.
+  std::vector<Value*> globalWGSizeInsts;
+  if(kernel->hasLocalWG()) {
+    for (unsigned i = 0; i < kernel->gridDim; i++) {
+      BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
+      globalWGSizeInsts.push_back(MulInst);
+    }
+  }
+  else {
+    globalWGSizeInsts = kernel->globalWGSize;
+  }
+  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
 // CodeGen for allocating space for Work Group on stack and returning a pointer
 // to its address
 static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
-	Value* WGPtr;
-	// Get int64_t and or ease of use
-	Type* Int64Ty = Type::getInt64Ty(M.getContext());
-
-	// Work Group type is [#dim x i64]
-	Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
-	// Allocate space of Global work group data on stack and get pointer to
-	// first element.
-	AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
-	WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
-	Value* nextDim = WGPtr;
-	DEBUG(errs() << *WGPtr << "\n");
-
-	// Iterate over the number of dimensions and store the global work group
-	// size in that dimension
-	for(unsigned i=0; i < WGSize.size(); i++) {
-		DEBUG(errs() << *WGSize[i] << "\n");
-		assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-
-		if(WGSize[i]->getType() != Int64Ty) {
-			// If number of dimensions are mentioned in any other integer format,
-			// generate code to extend it to i64. We need to use the mapped value in
-			// the new generated function, hence the use of VMap
-			// FIXME: Why are we changing the kernel WGSize vector here?
-			DEBUG(errs() << "Not i64. Zero extend required.\n");
-			DEBUG(errs() << *WGSize[i] << "\n");
-			CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
-			DEBUG(errs() << "Bitcast done.\n");
-			StoreInst* SI = new StoreInst(CI, nextDim, IB);
-			DEBUG(errs() << "Zero extend done.\n");
-			DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
-		} else {
-			// Store the value representing work group size in ith dimension on
-			// stack
-			StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
-
-			DEBUG(errs() << "\t Work group size: " << *SI << "\n");
-		}
-		if(i+1 < WGSize.size()) {
-			// Move to next dimension
-			GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
-					ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
-					WG->getName()+"."+Twine(i+1),
-					IB);
-			DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
-			nextDim = GEP;
-		}
-	}
-	return WGPtr;
+  Value* WGPtr;
+  // Get int64_t type for ease of use
+  Type* Int64Ty = Type::getInt64Ty(M.getContext());
+
+  // Work Group type is [#dim x i64]
+  Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
+  // Allocate space of Global work group data on stack and get pointer to
+  // first element.
+  AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
+  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
+  Value* nextDim = WGPtr;
+  DEBUG(errs() << *WGPtr << "\n");
+
+  // Iterate over the number of dimensions and store the global work group
+  // size in that dimension
+  for(unsigned i=0; i < WGSize.size(); i++) {
+    DEBUG(errs() << *WGSize[i] << "\n");
+    assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+
+    if(WGSize[i]->getType() != Int64Ty) {
+      // If number of dimensions are mentioned in any other integer format,
+      // generate code to extend it to i64. We need to use the mapped value in
+      // the new generated function, hence the use of VMap
+      // FIXME: Why are we changing the kernel WGSize vector here?
+      DEBUG(errs() << "Not i64. Zero extend required.\n");
+      DEBUG(errs() << *WGSize[i] << "\n");
+      CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+      DEBUG(errs() << "Bitcast done.\n");
+      StoreInst* SI = new StoreInst(CI, nextDim, IB);
+      DEBUG(errs() << "Zero extend done.\n");
+      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+    } else {
+      // Store the value representing work group size in ith dimension on
+      // stack
+      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
+
+      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+    }
+    if(i+1 < WGSize.size()) {
+      // Move to next dimension
+      GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
+                               ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
+                               WG->getName()+"."+Twine(i+1),
+                               IB);
+      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+      nextDim = GEP;
+    }
+  }
+  return WGPtr;
 
 }
 
 // Get generated PTX binary name
 static std::string getPTXFilename(const Module& M) {
-	std::string moduleID = M.getModuleIdentifier();
-	moduleID.append(".kernels.cl");
-	return moduleID;
+  std::string moduleID = M.getModuleIdentifier();
+  moduleID.append(".kernels.cl");
+  return moduleID;
 }
 
 // Get the name of the input file from module ID
 static std::string getFilenameFromModule(const Module& M) {
-	std::string moduleID = M.getModuleIdentifier();
-	return moduleID.substr(moduleID.find_last_of("/")+1);
+  std::string moduleID = M.getModuleIdentifier();
+  return moduleID.substr(moduleID.find_last_of("/")+1);
 }
 
 // Changes the data layout of the Module to be compiled with NVPTX backend
 // TODO: Figure out when to call it, probably after duplicating the modules
 static void changeDataLayout(Module &M) {
-	std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
-	std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
+  std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+  std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
 
-	if (TARGET_PTX == 32)
-		M.setDataLayout(StringRef(nvptx32_layoutStr));
-	else if (TARGET_PTX == 64)
-		M.setDataLayout(StringRef(nvptx64_layoutStr));
-	else assert(false && "Invalid PTX target");
+  if (TARGET_PTX == 32)
+    M.setDataLayout(StringRef(nvptx32_layoutStr));
+  else if (TARGET_PTX == 64)
+    M.setDataLayout(StringRef(nvptx64_layoutStr));
+  else assert(false && "Invalid PTX target");
 
-	return;
+  return;
 }
 
 static void changeTargetTriple(Module &M) {
-	std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
-	std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+  std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+  std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
-	if (TARGET_PTX == 32)
-		M.setTargetTriple(StringRef(nvptx32_TargetTriple));
-	else if (TARGET_PTX == 64)
-		M.setTargetTriple(StringRef(nvptx64_TargetTriple));
-	else assert(false && "Invalid PTX target");
+  if (TARGET_PTX == 32)
+    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+  else if (TARGET_PTX == 64)
+    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+  else assert(false && "Invalid PTX target");
 
-	return;
+  return;
 }
 
 // Helper function, populate a vector with all return statements in a function
 static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
-	for (auto &BB : *F) {
-		if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
-			ReturnInstVec.push_back(RI);
-	}	
+  for (auto &BB : *F) {
+    if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+	ReturnInstVec.push_back(RI);
+  }	
 }
 
 // Helper function, populate a vector with all IntrinsicID intrinsics in a function
 static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
-	for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-		Instruction *I = &(*i);
-		IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-		if (II && II->getIntrinsicID() == IntrinsicID) {
-			IntrinsicInstVec.push_back(II);
-		}
-	}
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+    if (II && II->getIntrinsicID() == IntrinsicID) {
+      IntrinsicInstVec.push_back(II);
+    }
+  }
 }
 
 // Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
-	switch(ID) {
-		case Intrinsic::visc_atomic_add:
-			return AtomicRMWInst::Add;
-		case Intrinsic::visc_atomic_sub:
-			return AtomicRMWInst::Sub;
-		case Intrinsic::visc_atomic_min:
-			return AtomicRMWInst::Min;
-		case Intrinsic::visc_atomic_umin:
-			return AtomicRMWInst::UMin;
-		case Intrinsic::visc_atomic_max:
-			return AtomicRMWInst::Max;
-		case Intrinsic::visc_atomic_umax:
-			return AtomicRMWInst::UMax;
-			//case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
-			//case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
-		case Intrinsic::visc_atomic_xchg:
-			return AtomicRMWInst::Xchg;
-		case Intrinsic::visc_atomic_and:
-			return AtomicRMWInst::And;
-		case Intrinsic::visc_atomic_or:
-			return AtomicRMWInst::Or;
-		case Intrinsic::visc_atomic_xor:
-			return AtomicRMWInst::Xor;
-		default:
-			llvm_unreachable("Unsupported atomic intrinsic!");
-	};
+  switch(ID) {
+  case Intrinsic::visc_atomic_add:
+    return AtomicRMWInst::Add;
+  case Intrinsic::visc_atomic_sub:
+    return AtomicRMWInst::Sub;
+  case Intrinsic::visc_atomic_min:
+    return AtomicRMWInst::Min;
+  case Intrinsic::visc_atomic_umin:
+    return AtomicRMWInst::UMin;
+  case Intrinsic::visc_atomic_max:
+    return AtomicRMWInst::Max;
+  case Intrinsic::visc_atomic_umax:
+    return AtomicRMWInst::UMax;
+  //case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
+  //case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
+  case Intrinsic::visc_atomic_xchg:
+    return AtomicRMWInst::Xchg;
+  case Intrinsic::visc_atomic_and:
+    return AtomicRMWInst::And;
+  case Intrinsic::visc_atomic_or:
+    return AtomicRMWInst::Or;
+  case Intrinsic::visc_atomic_xor:
+    return AtomicRMWInst::Xor;
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  };
 }
 
 
 // Helper funtion, returns the OpenCL function name, corresponding to atomic op
 static std::string getAtomicOpName(Intrinsic::ID ID) {
-	switch(ID) {
-		case Intrinsic::visc_atomic_cmpxchg:
-			return "atom_cmpxchg";
-		case Intrinsic::visc_atomic_add:
-			return "atom_add";
-		case Intrinsic::visc_atomic_sub:
-			return "atom_sub";
-		case Intrinsic::visc_atomic_min:
-			return "atom_min";
-		case Intrinsic::visc_atomic_max:
-			return "atom_max";
-		case Intrinsic::visc_atomic_inc:
-			return "atom_inc";
-		case Intrinsic::visc_atomic_dec:
-			return "atom_dec";
-		case Intrinsic::visc_atomic_xchg:
-			return "atom_xchg";
-		case Intrinsic::visc_atomic_and:
-			return "atom_and";
-		case Intrinsic::visc_atomic_or:
-			return "atom_or";
-		case Intrinsic::visc_atomic_xor:
-			return "atom_xor";
-		default:
-			llvm_unreachable("Unsupported atomic intrinsic!");
-	};
+  switch(ID) {
+  case Intrinsic::visc_atomic_cmpxchg:
+    return "atom_cmpxchg";
+  case Intrinsic::visc_atomic_add:
+    return "atom_add";
+  case Intrinsic::visc_atomic_sub:
+    return "atom_sub";
+  case Intrinsic::visc_atomic_min:
+    return "atom_min";
+  case Intrinsic::visc_atomic_max:
+    return "atom_max";
+  case Intrinsic::visc_atomic_inc:
+    return "atom_inc";
+  case Intrinsic::visc_atomic_dec:
+    return "atom_dec";
+  case Intrinsic::visc_atomic_xchg:
+    return "atom_xchg";
+  case Intrinsic::visc_atomic_and:
+    return "atom_and";
+  case Intrinsic::visc_atomic_or:
+    return "atom_or";
+  case Intrinsic::visc_atomic_xor:
+    return "atom_xor";
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  };
 }
 
 } // End of namespace
 
 char DFG2LLVM_NVPTX::ID = 0;
 static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx",
-		"Dataflow Graph to LLVM for NVPTX Pass",
-		false /* does not modify the CFG */,
-		true /* transformation,   *
-					* not just analysis */);
+                                      "Dataflow Graph to LLVM for NVPTX Pass",
+                                      false /* does not modify the CFG */,
+                                      true /* transformation,   *
+                                            * not just analysis */);
+
 
diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index fb16c28c13ebe7141160d5990cc40105cfd94d32..06e4e79183d726bb80113264f3cc7da0a4701ecf 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -827,12 +827,60 @@ void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
   switchToTimer(visc_TimerID_COMPUTATION, CI);
   switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
 
-  // Code for returning the output
-  CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
-                             CI->getType()->getPointerTo(),
-                             CI->getName()+".addr",
-                             RI);
-  new StoreInst(CI, OutputAddrCast, RI);
+  StructType *RootRetTy = cast<StructType>(RootF_X86->getFunctionType()->getReturnType());
+
+  // If Root has a non-empty return struct
+  if (RootRetTy->getNumElements()) {
+    // We can't access the type of the arg struct - build it
+    std::vector<Type*> TyList;
+    for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
+        ai != ae; ++ai) {
+      TyList.push_back(ai->getType());
+    }
+    TyList.push_back(CI->getType());
+
+    StructType* ArgStructTy = StructType::create(M.getContext(),
+                                                 ArrayRef<Type*>(TyList),
+                                 (RootF_X86->getName()+".arg.struct.ty").str(), true);
+
+    // Cast the data pointer to the type of the arg struct
+    CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
+                                 ArgStructTy->getPointerTo(),
+                                 "argStructCast.addr",
+                                 RI);
+
+    // Result struct is the last element of the packed struct passed to launch
+    unsigned outStructIdx = ArgStructTy->getNumElements() - 1;
+
+    ConstantInt *IntZero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+    ConstantInt *IntIdx = ConstantInt::get(Type::getInt32Ty(M.getContext()),
+                                          outStructIdx);
+
+    Value* GEPIIdxList[] = { IntZero,
+                             IntIdx
+                           };
+    // Get data pointer to the last element of struct - result field
+    GetElementPtrInst *OutGEPI =
+      GetElementPtrInst::Create(ArgStructTy,
+                                OutputAddrCast,
+                                ArrayRef<Value*>(GEPIIdxList, 2),
+                                CI->getName()+".addr",
+                                RI);
+    // Store result there
+    new StoreInst(CI, OutGEPI, RI);
+  } else {
+    // Root returns nothing, so no code generation is strictly needed here;
+    // to keep changes minimal we preserve what the code was already doing:
+    // cast the data pointer to the result type of Root and store the
+    // result there. That works at the LLVM level but not at the C level,
+    // which is why the non-empty case above was rewritten.
+    CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
+                               CI->getType()->getPointerTo(),
+                               CI->getName()+".addr",
+                               RI);
+    new StoreInst(CI, OutputAddrCast, RI);
+  }
+
   switchToTimer(visc_TimerID_NONE, RI);
 
   DEBUG(errs() << "Application specific function:\n");