From 9b7e9cf320731dea06919412a3120bae6d7fe09b Mon Sep 17 00:00:00 2001
From: Akash Kothari <akashk4@tyler.cs.illinois.edu>
Date: Wed, 22 Jan 2020 11:59:53 -0600
Subject: [PATCH] Removed support for unused intrinsics

---
 .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         | 2906 ++++++++---------
 hpvm/lib/Transforms/GenVISC/GenVISC.cpp       |  818 +++--
 2 files changed, 1771 insertions(+), 1953 deletions(-)

diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 5f861eaf1e..84d17b2657 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -15,28 +15,29 @@
 #define SHARED_ADDRSPACE 3
 
 #define DEBUG_TYPE "DFG2LLVM_NVPTX"
-#include "SupportVISC/DFG2LLVM.h"
-#include "SupportVISC/VISCTimer.h"
-#include "SupportVISC/VISCUtils.h"
-#include "llvm-c/Core.h"
-#include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Linker/Linker.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm-c/Core.h"
+#include "SupportVISC/VISCTimer.h"
+#include "SupportVISC/DFG2LLVM.h"
+#include "SupportVISC/VISCUtils.h"
 
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/UseListOrder.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/IR/UseListOrder.h"
+
 
 #include <sstream>
 
@@ -46,8 +47,8 @@ using namespace dfg2llvm;
 using namespace viscUtils;
 
 // VISC Command line option to use timer or not
-static cl::opt<bool> VISCTimer_NVPTX("visc-timers-ptx",
-                                     cl::desc("Enable visc timers"));
+static cl::opt<bool>
+VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers"));
 
 namespace {
 // Helper class declarations
@@ -56,88 +57,94 @@ namespace {
 // in bytes. Would have preferred to use tuple but support not yet available
 class OutputPtr {
 public:
-  OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes)
-      : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
+  OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
+    : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
 
-  Value *h_ptr;
-  Value *d_ptr;
-  Value *bytes;
+  Value* h_ptr;
+  Value* d_ptr;
+  Value* bytes;
 };
 
 // Class to maintain important kernel info required for generating runtime
 // calls
 class Kernel {
 public:
-  Kernel(
-      Function *_KF, DFLeafNode *_KLeafNode,
-      std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(),
-      std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap =
-          std::map<unsigned, std::pair<Value *, unsigned>>(),
-      std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
-      unsigned _gridDim = 0,
-      std::vector<Value *> _globalWGSize = std::vector<Value *>(),
-      unsigned _blockDim = 0,
-      std::vector<Value *> _localWGSize = std::vector<Value *>())
-      : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
-        sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap),
-        gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
-        localWGSize(_localWGSize) {
-
-    assert(gridDim == globalWGSize.size() &&
-           "gridDim should be same as the size of vector globalWGSize");
-    assert(blockDim == localWGSize.size() &&
-           "blockDim should be same as the size of vector localWGSize");
+  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
+           std::map<unsigned, unsigned>(),
+         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
+           std::map<unsigned, std::pair<Value*, unsigned> >(),
+         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
+         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
+         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
+    : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
+      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
+      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
+
+    assert(gridDim == globalWGSize.size()
+           && "gridDim should be same as the size of vector globalWGSize");
+    assert(blockDim == localWGSize.size()
+           && "blockDim should be same as the size of vector localWGSize");
   }
 
-  Function *KernelFunction;
-  DFLeafNode *KernelLeafNode;
+  Function* KernelFunction;
+  DFLeafNode* KernelLeafNode;
   std::map<unsigned, unsigned> inArgMap;
   // Map for shared memory arguments
-  std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap;
+  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
   // Fields for (potential) allocation node
-  DFLeafNode *AllocationNode;
-  Function *AllocationFunction;
+  DFLeafNode* AllocationNode;
+  Function* AllocationFunction;
   std::map<unsigned, unsigned> allocInArgMap;
 
   std::vector<unsigned> outArgMap;
   unsigned gridDim;
-  std::vector<Value *> globalWGSize;
+  std::vector<Value*> globalWGSize;
   unsigned blockDim;
-  std::vector<Value *> localWGSize;
+  std::vector<Value*> localWGSize;
   std::vector<int> localDimMap;
 
-  std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; }
-  void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; }
+  std::map<unsigned, unsigned> &getInArgMap() {
+    return inArgMap;
+  }
+  void setInArgMap(std::map<unsigned, unsigned> map) {
+    inArgMap = map;
+  }
 
-  std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() {
+  std::map<unsigned, std::pair<Value*, unsigned> > &getSharedInArgMap() {
     return sharedInArgMap;
   }
-  void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) {
+  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
     sharedInArgMap = map;
   }
 
-  std::vector<unsigned> &getOutArgMap() { return outArgMap; }
-  void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; }
+  std::vector<unsigned> &getOutArgMap() {
+    return outArgMap;
+  }
+  void setOutArgMap(std::vector<unsigned> map) {
+    outArgMap = map;
+  }
 
-  void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; }
+  void setLocalWGSize(std::vector<Value*> V) {
+    localWGSize = V;
+  }
 
-  bool hasLocalWG() const { return blockDim != 0; }
+  bool hasLocalWG() const {
+    return blockDim != 0;
+  }
 };
 
 // Helper function declarations
-static bool canBePromoted(Argument *arg, Function *F);
-static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&,
-                                 Kernel *, ValueToValueMapTy &, Instruction *);
-static Value *genWorkGroupPtr(Module &M, std::vector<Value *>,
-                              ValueToValueMapTy &, Instruction *,
-                              const Twine &WGName = "WGSize");
-static std::string getPTXFilename(const Module &);
-static std::string getFilenameFromModule(const Module &M);
+static bool canBePromoted(Argument* arg, Function* F);
+static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*,
+                                 ValueToValueMapTy&, Instruction*);
+static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&,
+                              Instruction*, const Twine& WGName = "WGSize");
+static std::string getPTXFilename(const Module&);
+static std::string getFilenameFromModule(const Module& M);
 static void changeDataLayout(Module &);
 static void changeTargetTriple(Module &);
 static void findReturnInst(Function *, std::vector<ReturnInst *> &);
-static void findIntrinsicInst(Function *, Intrinsic::ID,
-                              std::vector<IntrinsicInst *> &);
+static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
 static std::string getAtomicOpName(Intrinsic::ID);
 
@@ -147,6 +154,7 @@ struct DFG2LLVM_NVPTX : public DFG2LLVM {
   DFG2LLVM_NVPTX() : DFG2LLVM(ID) {}
 
 private:
+
 public:
   bool runOnModule(Module &M);
 };
@@ -155,10 +163,10 @@ public:
 class CGT_NVPTX : public CodeGenTraversal {
 
 private:
-  // Member variables
+  //Member variables
   std::unique_ptr<Module> KernelM;
-  DFNode *KernelLaunchNode = NULL;
-  Kernel *kernel;
+  DFNode* KernelLaunchNode = NULL;
+  Kernel* kernel;
 
   // VISC Runtime API
   FunctionCallee llvm_visc_ocl_launch;
@@ -173,16 +181,14 @@ private:
   FunctionCallee llvm_visc_ocl_getOutput;
   FunctionCallee llvm_visc_ocl_executeNode;
 
-  // Functions
+  //Functions
   std::string getKernelsModuleName(Module &M);
-  void fixValueAddrspace(Value *V, unsigned addrspace);
-  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *,
-                                                  Function *);
-  Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags,
-                               unsigned i);
-  void addCLMetadata(Function *F);
-  Function *transformFunctionToVoid(Function *F);
-  void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName);
+  void fixValueAddrspace(Value* V, unsigned addrspace);
+  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*);
+  Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i);
+  void addCLMetadata(Function* F);
+  Function* transformFunctionToVoid(Function* F);
+  void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
 
   // Virtual Functions
   void init() {
@@ -190,25 +196,24 @@ private:
     TargetName = "NVPTX";
   }
   void initRuntimeAPI();
-  void codeGen(DFInternalNode *N);
-  void codeGen(DFLeafNode *N);
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
 
 public:
+
   // Constructor
-  CGT_NVPTX(Module &_M, BuildDFG &_DFG)
-      : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
+  CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
     init();
     initRuntimeAPI();
-    DEBUG(errs() << "Old module pointer: " << &_M << "\n");
-    DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n");
+    errs() << "Old module pointer: " << &_M << "\n";
+    errs() << "New module pointer: " <<  KernelM.get() << "\n";
 
-    // Copying instead of creating new, in order to preserve required info
-    // (metadata) Remove functions, global variables and aliases
-    std::vector<GlobalVariable *> GVVect;
+    // Copying instead of creating new, in order to preserve required info (metadata)
+    // Remove functions, global variables and aliases
+    std::vector<GlobalVariable*> GVVect;
     for (Module::global_iterator mi = KernelM->global_begin(),
-                                 me = KernelM->global_end();
-         (mi != me); ++mi) {
-      GlobalVariable *GV = &*mi;
+         me = KernelM->global_end(); (mi != me); ++mi) {
+      GlobalVariable* GV = &*mi;
       GVVect.push_back(GV);
     }
     for (auto *GV : GVVect) {
@@ -216,10 +221,10 @@ public:
       GV->eraseFromParent();
     }
 
-    std::vector<Function *> FuncVect;
-    for (Module::iterator mi = KernelM->begin(), me = KernelM->end();
-         (mi != me); ++mi) {
-      Function *F = &*mi;
+    std::vector<Function*> FuncVect;
+    for (Module::iterator mi = KernelM->begin(),
+         me = KernelM->end(); (mi != me); ++mi) {
+      Function* F = &*mi;
       FuncVect.push_back(F);
     }
     for (auto *F : FuncVect) {
@@ -227,11 +232,10 @@ public:
       F->eraseFromParent();
     }
 
-    std::vector<GlobalAlias *> GAVect;
+    std::vector<GlobalAlias*> GAVect;
     for (Module::alias_iterator mi = KernelM->alias_begin(),
-                                me = KernelM->alias_end();
-         (mi != me); ++mi) {
-      GlobalAlias *GA = &*mi;
+         me = KernelM->alias_end(); (mi != me); ++mi) {
+      GlobalAlias* GA = &*mi;
       GAVect.push_back(GA);
     }
     for (auto *GA : GAVect) {
@@ -242,7 +246,9 @@ public:
     changeDataLayout(*KernelM);
     changeTargetTriple(*KernelM);
 
+
     DEBUG(errs() << *KernelM);
+
   }
 
   void writeKernelsModule();
@@ -254,14 +260,14 @@ void CGT_NVPTX::initRuntimeAPI() {
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
   assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
   Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll";
 
   runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-  if (runtimeModule == nullptr)
+  if(runtimeModule == nullptr)
     DEBUG(errs() << Err.getMessage());
   else
     DEBUG(errs() << "Successfully loaded visc-rt API module\n");
@@ -284,25 +290,27 @@ void CGT_NVPTX::initRuntimeAPI() {
 
   // Insert init context in main
   DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n");
-  Function *VI = M.getFunction("llvm.visc.init");
+  Function* VI = M.getFunction("llvm.visc.init");
   assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
 
   InitCall = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(InitCall);
   switchToTimer(visc_TimerID_INIT_CTX, InitCall);
   CallInst::Create(llvm_visc_ocl_initContext,
-                   ArrayRef<Value *>(getTargetID(M, visc::GPU_TARGET)), "",
-                   InitCall);
+                   ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)),
+                   "", InitCall);
   switchToTimer(visc_TimerID_NONE, InitCall);
 
   // Insert print instruction at visc exit
   DEBUG(errs() << "Gen Code to print NVPTX Timer\n");
-  Function *VC = M.getFunction("llvm.visc.cleanup");
+  Function* VC = M.getFunction("llvm.visc.cleanup");
   DEBUG(errs() << *VC << "\n");
   assert(VC->getNumUses() == 1 && "__visc__clear should only be used once");
 
   CleanupCall = cast<Instruction>(*VC->user_begin());
   printTimerSet(CleanupCall);
+
+
 }
 
 // Generate Code to call the kernel
@@ -310,37 +318,36 @@ void CGT_NVPTX::initRuntimeAPI() {
 // used to generate a function to associate with this leaf node. The function
 // is responsible for all the memory allocation/transfer and invoking the
 // kernel call on the device
-void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
-                                   const Twine &FileName) {
+void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
   // Check if clone already exists. If it does, it means we have visited this
   // function before.
-  //  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+//  assert(N->getGenFunc() == NULL && "Code already generated for this node");
 
   assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL &&
          "Code already generated for this node");
 
   // Useful values
-  Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
-  Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
+  Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
+  Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
 
   // If kernel struct has not been initialized with kernel function, then fail
   assert(K != NULL && "No kernel found!!");
 
   DEBUG(errs() << "Generating kernel call code\n");
 
-  Function *F = N->getFuncPointer();
+  Function* F = N->getFuncPointer();
+
 
   // Create of clone of F with no instructions. Only the type is the same as F
   // without the extra arguments.
-  Function *F_X86;
+  Function* F_X86;
 
   // Clone the function, if we are seeing this function for the first time. We
   // only need a clone in terms of type.
   ValueToValueMapTy VMap;
 
   // Create new function with the same type
-  F_X86 =
-      Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
 
   // Loop over the arguments, copying the names of arguments over.
   Function::arg_iterator dest_iterator = F_X86->arg_begin();
@@ -353,25 +360,26 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
 
   // Add a basic block to this empty function
   BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
-  ReturnInst *RI = ReturnInst::Create(
-      M.getContext(), UndefValue::get(F_X86->getReturnType()), BB);
+  ReturnInst* RI = ReturnInst::Create(M.getContext(),
+                                      UndefValue::get(F_X86->getReturnType()), BB);
 
   // FIXME: Adding Index and Dim arguments are probably not required except
   // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
   // have those arguments)
 
   // Add Index and Dim arguments except for the root node
-  if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
     F_X86 = addIdxDimArgs(F_X86);
 
   BB = &*F_X86->begin();
   RI = cast<ReturnInst>(BB->getTerminator());
 
-  // Add the generated function info to DFNode
-  //  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  //Add the generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
   N->addGenFunc(F_X86, visc::GPU_TARGET, true);
-  DEBUG(errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node "
-               << N->getFuncPointer()->getName() << "\n");
+  errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node "
+         << N->getFuncPointer()->getName() << "\n";
+
 
   // Loop over the arguments, to create the VMap
   dest_iterator = F_X86->arg_begin();
@@ -404,53 +412,51 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
       break;
   }
 
-  assert(C->isDummyNode() == false && "Internal Node only contains dummy
-  nodes!");
+  assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
 
   Function* CF = C->getFuncPointer();
   */
-  Function *KF = K->KernelLeafNode->getFuncPointer();
+  Function* KF = K->KernelLeafNode->getFuncPointer();
   // Initialize context
-  // DEBUG(errs() << "Initializing context" << "\n");
-  // CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
+  //DEBUG(errs() << "Initializing context" << "\n");
+  //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
 
-  DEBUG(errs() << "Initializing commandQ"
-               << "\n");
+  DEBUG(errs() << "Initializing commandQ" << "\n");
   // Initialize command queue
   switchToTimer(visc_TimerID_SETUP, InitCall);
-  Value *fileStr = getStringPointer(FileName, InitCall, "Filename");
+  Value* fileStr = getStringPointer(FileName, InitCall, "Filename");
   DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
-  DEBUG(errs() << "Generating code for kernel - "
-               << K->KernelFunction->getName() << "\n");
-  Value *kernelStr =
-      getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName");
-
-  Value *LaunchInstArgs[] = {fileStr, kernelStr};
-
-  DEBUG(errs() << "Inserting launch call"
-               << "\n");
-  CallInst *NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch,
-                                         ArrayRef<Value *>(LaunchInstArgs, 2),
-                                         "graph" + KF->getName(), InitCall);
+  DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n");
+  Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName");
+
+  Value* LaunchInstArgs[] = {fileStr, kernelStr};
+
+  DEBUG(errs() << "Inserting launch call" << "\n");
+  CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch,
+                                         ArrayRef<Value*>(LaunchInstArgs, 2),
+                                         "graph"+KF->getName(),
+                                         InitCall);
   DEBUG(errs() << *NVPTX_Ctx << "\n");
-  GraphIDAddr = new GlobalVariable(M, NVPTX_Ctx->getType(), false,
+  GraphIDAddr = new GlobalVariable(M,
+                                   NVPTX_Ctx->getType(),
+                                   false,
                                    GlobalValue::CommonLinkage,
                                    Constant::getNullValue(NVPTX_Ctx->getType()),
-                                   "graph" + KF->getName() + ".addr");
+                                   "graph"+KF->getName()+".addr");
   DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
-  StoreInst *SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
+  StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
   DEBUG(errs() << *SI << "\n");
   switchToTimer(visc_TimerID_NONE, InitCall);
   switchToTimer(visc_TimerID_SETUP, RI);
-  Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI);
+  Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI);
 
   // Iterate over the required input edges of the node and use the visc-rt API
   // to set inputs
   DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
   std::vector<OutputPtr> OutputPointers;
-  // Vector to hold the device memory object that need to be cleared before we
-  // release context
-  std::vector<Value *> DevicePointers;
+  // Vector to hold the device memory object that need to be cleared before we release
+  // context
+  std::vector<Value*> DevicePointers;
 
   std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap();
   /*
@@ -462,134 +468,133 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
 
   */
 
-  for (auto &InArgMapPair : kernelInArgMap) {
+  for(auto &InArgMapPair  : kernelInArgMap) {
     unsigned i = InArgMapPair.first;
-    Value *inputVal = getArgumentAt(F_X86, InArgMapPair.second);
-    DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n");
+    Value* inputVal = getArgumentAt(F_X86, InArgMapPair.second);
+    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
 
     // input value has been obtained.
     // Check if input is a scalar value or a pointer operand
     // For scalar values such as int, float, etc. the size is simply the size of
     // type on target machine, but for pointers, the size of data would be the
     // next integer argument
-    if (inputVal->getType()->isPointerTy()) {
+    if(inputVal->getType()->isPointerTy()) {
 
       switchToTimer(visc_TimerID_COPY_PTR, RI);
       // Pointer Input
       // CheckAttribute
-      Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False;
-      Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) &&
-                        !(hasAttribute(KF, i, Attribute::In)))
-                           ? False
-                           : True;
-
-      Argument *A = getArgumentAt(KF, i);
-      if (isOutput == True) {
+      Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False;
+      Value* isInput = ((hasAttribute(KF, i, Attribute::Out))
+                        && !(hasAttribute(KF, i, Attribute::In)))? False : True;
+
+      Argument* A = getArgumentAt(KF, i);
+      if(isOutput == True) {
         DEBUG(errs() << *A << " is an OUTPUT argument\n");
       }
-      if (isInput == True) {
+      if(isInput == True) {
         DEBUG(errs() << *A << " is an INPUT argument\n");
       }
 
-      Value *inputValI8Ptr = CastInst::CreatePointerCast(
-          inputVal, Type::getInt8PtrTy(M.getContext()),
-          inputVal->getName() + ".i8ptr", RI);
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
+                             Type::getInt8PtrTy(M.getContext()),
+                             inputVal->getName()+".i8ptr",
+                             RI);
 
       // Assert that the pointer argument size (next argument) is in the map
-      assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end());
-
-      Value *inputSize = getArgumentAt(F_X86, kernelInArgMap[i + 1]);
-      assert(
-          inputSize->getType() == Type::getInt64Ty(M.getContext()) &&
-          "Pointer type input must always be followed by size (integer type)");
-      Value *setInputArgs[] = {
-          GraphID,
-          inputValI8Ptr,
-          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
-          inputSize,
-          isInput,
-          isOutput};
-      Value *d_ptr =
-          CallInst::Create(llvm_visc_ocl_argument_ptr,
-                           ArrayRef<Value *>(setInputArgs, 6), "", RI);
+      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
+
+      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
+      assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
+             && "Pointer type input must always be followed by size (integer type)");
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               inputSize,
+                               isInput,
+                               isOutput
+                              };
+      Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr,
+                                      ArrayRef<Value*>(setInputArgs, 6), "", RI);
       DevicePointers.push_back(d_ptr);
       // If this has out attribute, store the returned device pointer in
       // memory to read device memory later
-      if (isOutput == True)
-        OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
-    } else {
+      if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
+    }
+    else {
       switchToTimer(visc_TimerID_COPY_SCALAR, RI);
       // Scalar Input
       // Store the scalar value on stack and then pass the pointer to its
       // location
-      AllocaInst *inputValPtr = new AllocaInst(
-          inputVal->getType(), 0, inputVal->getName() + ".ptr", RI);
-      StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI);
-
-      Value *inputValI8Ptr = CastInst::CreatePointerCast(
-          inputValPtr, Type::getInt8PtrTy(M.getContext()),
-          inputVal->getName() + ".i8ptr", RI);
-
-      Value *setInputArgs[] = {
-          GraphID, inputValI8Ptr,
-          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
-          ConstantExpr::getSizeOf(inputVal->getType())};
+      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), 0, inputVal->getName()+".ptr", RI);
+      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+                             Type::getInt8PtrTy(M.getContext()),
+                             inputVal->getName()+".i8ptr",
+                             RI);
+
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               ConstantExpr::getSizeOf(inputVal->getType())
+                              };
       CallInst::Create(llvm_visc_ocl_argument_scalar,
-                       ArrayRef<Value *>(setInputArgs, 4), "", RI);
+                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
     }
   }
 
-  DEBUG(
-      errs() << "Setup shared memory arguments of node and insert visc api\n");
+  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
 
   // Check to see if all the allocation sizes are constant (determined
   // statically)
   bool constSizes = true;
-  for (auto &e : K->getSharedInArgMap()) {
+  for (auto& e: K->getSharedInArgMap()) {
     constSizes &= isa<Constant>(e.second.first);
   }
 
   // If the sizes are all constant
   if (constSizes) {
-    for (auto &e : K->getSharedInArgMap()) {
+    for (auto& e: K->getSharedInArgMap()) {
       unsigned argNum = e.first;
-      Value *allocSize = e.second.first;
+      Value* allocSize = e.second.first;
 
-      DEBUG(errs() << "\tLocal Memory at " << argNum
-                   << ", size = " << *allocSize << "\n");
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
 
       if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
         // Shared memory ptr argument - scalar at size position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
 
-        assert(isa<Constant>(allocSize) &&
-               "Constant shared memory size is expected");
+        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
 
-        Value *setInputArgs[] = {
-            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
-            allocSize};
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
+                                };
         CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
-      } else {
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
         // Sharem memory size argument - scalar at address position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst *allocSizePtr =
-            new AllocaInst(allocSize->getType(), 0,
-                           allocSize->getName() + ".sharedMem.ptr", RI);
-        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
-            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
-            allocSize->getName() + ".sharedMem.i8ptr", RI);
-
-        Value *setInputArgs[] = {
-            GraphID, allocSizeI8Ptr,
-            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
-            ConstantExpr::getSizeOf(allocSize->getType())};
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0,
+            allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
+                                Type::getInt8PtrTy(M.getContext()),
+                                allocSize->getName()+".sharedMem.i8ptr",
+                                RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
+                                };
         CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
       }
     }
   } else {
@@ -610,64 +615,68 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
       ExtractValueInstVec.push_back(EI);
     }
 
-    for (auto &e : K->getSharedInArgMap()) {
+    for (auto& e: K->getSharedInArgMap()) {
       unsigned argNum = e.first;
-      Value *allocSize = ExtractValueInstVec[e.second.second / 2];
+      Value* allocSize = ExtractValueInstVec[e.second.second/2];
 
-      DEBUG(errs() << "\tLocal Memory at " << argNum
-                   << ", size = " << *allocSize << "\n");
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
 
       if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
         // Shared memory ptr argument - scalar at size position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
 
-        Value *setInputArgs[] = {
-            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
-            allocSize};
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
+                                };
         CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
-      } else {
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
         // Sharem memory size argument - scalar at address position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst *allocSizePtr =
-            new AllocaInst(allocSize->getType(), 0,
-                           allocSize->getName() + ".sharedMem.ptr", RI);
-        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
-            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
-            allocSize->getName() + ".sharedMem.i8ptr", RI);
-
-        Value *setInputArgs[] = {
-            GraphID, allocSizeI8Ptr,
-            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
-            ConstantExpr::getSizeOf(allocSize->getType())};
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, 
+            allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
+                                Type::getInt8PtrTy(M.getContext()),
+                                allocSize->getName()+".sharedMem.i8ptr",
+                                RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
+                                };
         CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
       }
     }
   }
 
+
   DEBUG(errs() << "Setup output edges of node and insert visc api\n");
   // Set output if struct is not an empty struct
-  StructType *OutputTy = K->KernelLeafNode->getOutputType();
-  std::vector<Value *> d_Outputs;
-  if (!OutputTy->isEmptyTy()) {
+  StructType* OutputTy = K->KernelLeafNode->getOutputType();
+  std::vector<Value*> d_Outputs;
+  if(!OutputTy->isEmptyTy()) {
     switchToTimer(visc_TimerID_COPY_PTR, RI);
     // Not an empty struct
     // Iterate over all elements of the struct and put them in
-    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
-      unsigned outputIndex = KF->getFunctionType()->getNumParams() + i;
-      Value *setOutputArgs[] = {
-          GraphID,
-          ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex),
-          ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
-
-      CallInst *d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
-                                            ArrayRef<Value *>(setOutputArgs, 3),
-                                            "d_output." + KF->getName(), RI);
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
+      Value* setOutputArgs[] = {GraphID,
+                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
+                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
+                               };
+
+      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
+                                            ArrayRef<Value*>(setOutputArgs, 3),
+                                            "d_output."+KF->getName(),
+                                            RI);
       d_Outputs.push_back(d_Output);
     }
   }
@@ -681,37 +690,46 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
   Value *workDim, *LocalWGPtr, *GlobalWGPtr;
   getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
   switchToTimer(visc_TimerID_KERNEL, RI);
-  Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr};
-  CallInst *Event = CallInst::Create(llvm_visc_ocl_executeNode,
-                                     ArrayRef<Value *>(ExecNodeArgs, 4),
-                                     "event." + KF->getName(), RI);
+  Value* ExecNodeArgs[] = {GraphID,
+                           workDim,
+                           LocalWGPtr,
+                           GlobalWGPtr
+                          };
+  CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode,
+                                     ArrayRef<Value*>(ExecNodeArgs, 4),
+                                     "event."+KF->getName(),
+                                     RI);
   DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
 
   // Wait for Kernel to Finish
-  CallInst::Create(llvm_visc_ocl_wait, ArrayRef<Value *>(GraphID), "", RI);
+  CallInst::Create(llvm_visc_ocl_wait,
+                   ArrayRef<Value*>(GraphID),
+                   "",
+                   RI);
 
   switchToTimer(visc_TimerID_READ_OUTPUT, RI);
   // Read Output Struct if not empty
-  if (!OutputTy->isEmptyTy()) {
-    std::vector<Value *> h_Outputs;
-    Value *KernelOutput = UndefValue::get(OutputTy);
-    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
-      Value *GetOutputArgs[] = {
-          GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-          d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
-      CallInst *h_Output = CallInst::Create(
-          llvm_visc_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4),
-          "h_output." + KF->getName() + ".addr", RI);
+  if(!OutputTy->isEmptyTy()) {
+    std::vector<Value*>h_Outputs;
+    Value* KernelOutput = UndefValue::get(OutputTy);
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      Value* GetOutputArgs[] = {GraphID,
+                                Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+                                d_Outputs[i],
+                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
+                               };
+      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
+                                            ArrayRef<Value*>(GetOutputArgs, 4),
+                                            "h_output."+KF->getName()+".addr",
+                                            RI);
       // Read each device pointer listed in output struct
       // Load the output struct
-      CastInst *BI = BitCastInst::CreatePointerCast(
-          h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr",
-          RI);
-
-      Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI);
-      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement,
-                                             ArrayRef<unsigned>(i),
-                                             KF->getName() + "output", RI);
+      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
+                     OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
+
+      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
+                                             KF->getName()+"output", RI);
     }
     OutputMap[K->KernelLeafNode] = KernelOutput;
   }
@@ -726,76 +744,75 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
     DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
     DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
 
-    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr,
-  output.bytes}; CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
+    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
+    CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
                                     ArrayRef<Value*>(GetOutputArgs, 4),
                                     "", RI);
   }*/
   switchToTimer(visc_TimerID_MEM_FREE, RI);
   // Clear Context and free device memory
-  DEBUG(errs() << "Clearing context"
-               << "\n");
+  DEBUG(errs() << "Clearing context" << "\n");
   // Free Device Memory
-  for (auto d_ptr : DevicePointers) {
-    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value *>(d_ptr), "", RI);
+  for(auto d_ptr: DevicePointers) {
+    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI);
   }
   switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall);
   // Clear Context
-  LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall);
-  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value *>(LI), "",
-                   CleanupCall);
+  LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall);
+  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall);
   switchToTimer(visc_TimerID_NONE, CleanupCall);
 
   switchToTimer(visc_TimerID_MISC, RI);
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
   // Generate code for output bindings
   // Get Exit node
-  DFNode *C = N->getChildGraph()->getExit();
+  DFNode* C = N->getChildGraph()->getExit();
   // Get OutputType of this node
-  StructType *OutTy = N->getOutputType();
+  StructType* OutTy = N->getOutputType();
   Value *retVal = UndefValue::get(F_X86->getReturnType());
   // Find the kernel's output arg map, to use instead of the bindings
   std::vector<unsigned> outArgMap = kernel->getOutArgMap();
   // Find all the input edges to exit node
-  for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
+  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
     DEBUG(errs() << "Output Edge " << i << "\n");
     // Find the incoming edge at the requested input port
-    DFEdge *E = C->getInDFEdgeAt(i);
+    DFEdge* E = C->getInDFEdgeAt(i);
 
     assert(E && "No Binding for output element!");
     // Find the Source DFNode associated with the incoming edge
-    DFNode *SrcDF = E->getSourceDF();
+    DFNode* SrcDF = E->getSourceDF();
 
-    DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
-                 << "\n");
+    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
 
     // If Source DFNode is a dummyNode, edge is from parent. Get the
     // argument from argument list of this internal node
-    Value *inputVal;
-    if (SrcDF->isEntryNode()) {
+    Value* inputVal;
+    if(SrcDF->isEntryNode()) {
       inputVal = getArgumentAt(F_X86, i);
-      DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
-    } else {
+      DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+    }
+    else {
       // edge is from a internal node
       // Check - code should already be generated for this source dfnode
       // FIXME: Since the 2-level kernel code gen has aspecific structure, we
       // can assume the SrcDF is same as Kernel Leaf node.
       // Use outArgMap to get correct mapping
       SrcDF = K->KernelLeafNode;
-      assert(OutputMap.count(SrcDF) &&
-             "Source node call not found. Dependency violation!");
+      assert(OutputMap.count(SrcDF)
+             && "Source node call not found. Dependency violation!");
 
       // Find Output Value associated with the Source DFNode using OutputMap
-      Value *CI = OutputMap[SrcDF];
+      Value* CI = OutputMap[SrcDF];
 
       // Extract element at source position from this call instruction
       std::vector<unsigned> IndexList;
       // i is the destination of DFEdge E
       // Use the mapping instead of the bindings
-      //      IndexList.push_back(E->getSourcePosition());
+//      IndexList.push_back(E->getSourcePosition());
       IndexList.push_back(outArgMap[i]);
-      DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
-      ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
+      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                             "",RI);
       inputVal = EI;
     }
     std::vector<unsigned> IdxList;
@@ -806,31 +823,29 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
   DEBUG(errs() << "Extracted all\n");
   switchToTimer(visc_TimerID_NONE, RI);
   retVal->setName("output");
-  ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
   ReplaceInstWithInst(RI, newRI);
 }
 
+
 // Right now, only targeting the one level case. In general, device functions
 // can return values so we don't need to change them
-void CGT_NVPTX::codeGen(DFInternalNode *N) {
-  DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName()
-               << "\n");
-  if (KernelLaunchNode == NULL)
-    DEBUG(errs() << "No kernel launch node\n");
+void CGT_NVPTX::codeGen(DFInternalNode* N) {
+  errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n";
+  if(KernelLaunchNode == NULL)
+    errs () << "No kernel launch node\n";
   else {
-    DEBUG(errs() << "KernelLaunchNode: "
-                 << KernelLaunchNode->getFuncPointer()->getName() << "\n");
+    errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
   }
 
   if (!KernelLaunchNode) {
-    DEBUG(errs()
-          << "No code generated (host code for kernel launch complete).\n");
+    DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
     return;
   }
 
   if (N == KernelLaunchNode) {
     DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
-    // TODO
+    //TODO
 
     // Now the remaining nodes to be visited should be ignored
     KernelLaunchNode = NULL;
@@ -845,8 +860,7 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) {
     // TODO: Structure assumed: one thread node, one allocation node (at most),
     // TB node
     std::map<unsigned, unsigned> inmapFinal;
-    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(),
-                                                ie = inmap2.end();
+    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
          ib != ie; ++ib) {
       inmapFinal[ib->first] = inmap1[ib->second];
     }
@@ -863,9 +877,8 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) {
     // 0 ... outmap2.size()-1
     // The limit is the size of outmap2, because this is the number of kernel
     // output arguments for which the mapping matters
-    // For now, it reasonable to assume that all the kernel arguments are
-    // returned, maybe plys some others from other nodes, thus outmap2.size() <=
-    // outmap1.size()
+    // For now, it reasonable to assume that all the kernel arguments are returned,
+    // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size()
     for (unsigned i = 0; i < outmap2.size(); i++) {
       outmap1[i] = outmap2[outmap1[i]];
     }
@@ -873,14 +886,15 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) {
 
     // Track the source of local dimlimits for the kernel
     // Dimension limit can either be a constant or an argument of parent
-    // function. Since Internal node would no longer exist, we need to insert
-    // the localWGSize with values from the parent of N.
-    std::vector<Value *> localWGSizeMapped;
+    // function. Since Internal node would no longer exist, we need to insert the
+    // localWGSize with values from the parent of N.
+    std::vector<Value*> localWGSizeMapped;
     for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
       if (isa<Constant>(kernel->localWGSize[i])) {
         // if constant, use as it is
         localWGSizeMapped.push_back(kernel->localWGSize[i]);
-      } else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
+      }
+      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
         // if argument, find the argument location in N. Use InArgMap of N to
         // find the source location in Parent of N. Retrieve the argument from
         // parent to insert in the vector.
@@ -890,49 +904,46 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) {
         assert(N->getInArgMap().find(argNum) != N->getInArgMap().end());
 
         unsigned parentArgNum = N->getInArgMap()[argNum];
-        Argument *A =
-            getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
+        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
         localWGSizeMapped.push_back(A);
-      } else {
-        assert(
-            false &&
-            "LocalWGsize using value which is neither argument nor constant!");
+      }
+      else {
+        assert(false && "LocalWGsize using value which is neither argument nor constant!");
       }
     }
     // Update localWGSize vector of kernel
     kernel->setLocalWGSize(localWGSizeMapped);
   }
+
 }
 
-void CGT_NVPTX::codeGen(DFLeafNode *N) {
-  DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName()
-               << "\n");
+void CGT_NVPTX::codeGen(DFLeafNode* N) {
+  errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n";
 
   // Skip code generation if it is a dummy node
-  if (N->isDummyNode()) {
+  if(N->isDummyNode()) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
 
   // Skip code generation if it is an allocation node
-  if (N->isAllocationNode()) {
+  if(N->isAllocationNode()) {
     DEBUG(errs() << "Skipping allocation node\n");
     return;
   }
 
   // Generate code only if it has the right hint
-  //  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
-  //    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
-  //    return;
-  //  }
-  if (!preferredTargetIncludes(N, visc::GPU_TARGET)) {
-    DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName()
-                 << "\n");
+//  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
+//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+//    return;
+//  }
+  if(!preferredTargetIncludes(N, visc::GPU_TARGET)) {
+    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
     return;
   }
 
   // Checking which node is the kernel launch
-  DFNode *PNode = N->getParent();
+  DFNode* PNode = N->getParent();
   int pLevel = PNode->getLevel();
   int pReplFactor = PNode->getNumOfDim();
 
@@ -940,40 +951,42 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
   // (1) Parent is the top level node i.e., Root of DFG
   //                    OR
   // (2) Parent does not have multiple instances
-  DEBUG(errs() << "pLevel = " << pLevel << "\n");
-  DEBUG(errs() << "pReplFactor = " << pReplFactor << "\n");
+  errs() << "pLevel = " << pLevel << "\n";
+  errs() << "pReplFactor = " << pReplFactor << "\n";
   assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node.");
 
   // Only these options are supported
-  enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy;
-  if (pLevel == 1 || !pReplFactor) {
-    DEBUG(errs()
-          << "*************** Kernel Gen: 1-Level Hierarchy **************\n");
+  enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy;
+  if(pLevel == 1 || !pReplFactor) {
+    errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n";
     SelectedHierarchy = ONE_LEVEL;
     KernelLaunchNode = PNode;
-    kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(),
-                        N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits());
-  } else {
+    kernel = new Kernel(NULL,
+                        N,
+                        N->getInArgMap(),
+                        N->getSharedInArgMap(),
+                        N->getOutArgMap(),
+                        N->getNumOfDim(),
+                        N->getDimLimits());
+  }
+  else {
     // Converting a 2-level DFG to opencl kernel
-    DEBUG(errs()
-          << "*************** Kernel Gen: 2-Level Hierarchy **************\n");
-    assert((pLevel >= 2) &&
-           "Selected node not nested deep enough to be Kernel Node.");
+    errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n";
+    assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node.");
     SelectedHierarchy = TWO_LEVEL;
     KernelLaunchNode = PNode->getParent();
-    assert((PNode->getNumOfDim() == N->getNumOfDim()) &&
-           "Dimension number must match");
+    assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
     // Contains the instructions generating the kernel configuration parameters
-    kernel = new Kernel(NULL,             // kernel function
-                        N,                // kernel leaf node
-                        N->getInArgMap(), // kenel argument mapping
+    kernel = new Kernel(NULL,                 // kernel function
+                        N,                    // kernel leaf node
+                        N->getInArgMap(),     // kenel argument mapping
                         N->getSharedInArgMap(),
-                        N->getOutArgMap(),     // kernel output mapping from the
-                                               // leaf to the interemediate node
-                        PNode->getNumOfDim(),  // gridDim
-                        PNode->getDimLimits(), // grid size
-                        N->getNumOfDim(),      // blockDim
-                        N->getDimLimits());    // block size
+                        N->getOutArgMap(),    // kernel output mapping from the leaf to the interemediate node
+                        PNode->getNumOfDim(), // gridDim
+                        PNode->getDimLimits(),// grid size
+                        N->getNumOfDim(),     // blockDim
+                        N->getDimLimits());   // block size
+
   }
 
   std::vector<Instruction *> IItoRemove;
@@ -985,62 +998,58 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
   // Look up if we have visited this function before. If we have, then just
   // get the cloned function pointer from DFNode. Otherwise, create the cloned
   // function and add it to the DFNode GenFunc.
-  //  Function *F_nvptx = N->getGenFunc();
+//  Function *F_nvptx = N->getGenFunc();
   Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET);
 
-  assert(F_nvptx == NULL &&
-         "Error: Visiting a node for which code already generated");
+  assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated");
   // Clone the function
   ValueToValueMapTy VMap;
 
-  // F_nvptx->setName(FName+"_nvptx");
+  //F_nvptx->setName(FName+"_nvptx");
 
   Twine FName = F->getName();
   StringRef fStr = FName.getSingleStringRef();
-  Twine newFName = Twine(fStr, "_nvptx");
+  Twine newFName = Twine(fStr, "_nvptx"); 
   F_nvptx = CloneFunction(F, VMap);
   F_nvptx->setName(newFName);
 
+  
   //  errs() << "Old Function Name: " << F->getName() << "\n";
   //  errs() << "New Function Name: " << F_nvptx->getName() << "\n";
 
   F_nvptx->removeFromParent();
 
+
   // Insert the cloned function into the kernels module
   KernelM->getFunctionList().push_back(F_nvptx);
 
-  // TODO: Iterate over all the instructions of F_nvptx and identify the
-  // callees and clone them into this module.
+
+  //TODO: Iterate over all the instructions of F_nvptx and identify the
+  //callees and clone them into this module.
   DEBUG(errs() << *F_nvptx->getType());
   DEBUG(errs() << *F_nvptx);
 
   // Transform  the function to void and remove all target dependent attributes
   // from the function
   F_nvptx = transformFunctionToVoid(F_nvptx);
-
-  // Add generated function info to DFNode
-  //  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
+  
+  //Add generated function info to DFNode
+//  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
   N->addGenFunc(F_nvptx, visc::GPU_TARGET, false);
 
-  DEBUG(
-      errs()
-      << "Removing all attributes from Kernel Function and adding nounwind\n");
-  F_nvptx->removeAttributes(AttributeList::FunctionIndex,
-                            F_nvptx->getAttributes().getFnAttributes());
+  DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n");
+  F_nvptx->removeAttributes(AttributeList::FunctionIndex, F_nvptx->getAttributes().getFnAttributes());
   F_nvptx->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
 
-  // FIXME: For now, assume only one allocation node
+  //FIXME: For now, assume only one allocation node
   kernel->AllocationNode = NULL;
 
-  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(),
-                                       iee = N->indfedge_end();
+  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
        ieb != iee; ++ieb) {
     DFNode *SrcDFNode = (*ieb)->getSourceDF();
-    DEBUG(errs() << "Found edge from node: "
-                 << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
     DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
-    DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode()
-                 << "\n");
+    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
     if (!SrcDFNode->isDummyNode()) {
       assert(SrcDFNode->isAllocationNode());
       kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
@@ -1055,11 +1064,10 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
   // If no allocation node was found, SharedMemArgs is empty
   if (kernel->AllocationNode) {
     ValueToValueMapTy VMap;
-    Function *F_alloc =
-        CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
-    // F_alloc->removeFromParent();
+    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
+    //F_alloc->removeFromParent();
     // Insert the cloned function into the kernels module
-    // M.getFunctionList().push_back(F_alloc);
+    //M.getFunctionList().push_back(F_alloc);
 
     std::vector<IntrinsicInst *> ViscMallocInstVec;
     findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
@@ -1067,8 +1075,7 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
     for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
       IntrinsicInst *II = ViscMallocInstVec[i];
       assert(II->hasOneUse() && "visc_malloc result is used more than once");
-      II->replaceAllUsesWith(
-          ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
+      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
       II->eraseFromParent();
     }
     kernel->AllocationFunction = F_alloc;
@@ -1083,19 +1090,15 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
         assert(RetStructTy && "Allocation node does not return a struct type");
         unsigned numFields = RetStructTy->getNumElements();
     */
-    std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap =
-        kernel->getSharedInArgMap();
-    AllocationNodeProperty *APN =
-        (AllocationNodeProperty *)kernel->AllocationNode->getProperty(
-            DFNode::Allocation);
-    for (auto &AllocPair : APN->getAllocationList()) {
+    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
+    AllocationNodeProperty* APN =
+      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
+    for (auto& AllocPair: APN->getAllocationList()) {
       unsigned destPos = AllocPair.first->getDestPosition();
       unsigned srcPos = AllocPair.first->getSourcePosition();
       SharedMemArgs.push_back(destPos);
-      sharedInMap[destPos] =
-          std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
-      sharedInMap[destPos + 1] =
-          std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
+      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
     }
     kernel->setSharedInArgMap(sharedInMap);
   }
@@ -1105,14 +1108,12 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
   // global address space
   unsigned argIndex = 0;
   std::vector<unsigned> GlobalMemArgs;
-  for (Function::arg_iterator ai = F_nvptx->arg_begin(),
-                              ae = F_nvptx->arg_end();
-       ai != ae; ++ai) {
-    if (ai->getType()->isPointerTy()) {
-      // If the arguement is already chosen for shared memory arguemnt list,
-      // skip. Else put it in Global memory arguement list
-      if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) ==
-          0) {
+  for(Function::arg_iterator ai = F_nvptx->arg_begin(), ae = F_nvptx->arg_end();
+      ai != ae; ++ai) {
+    if (ai->getType()->isPointerTy()) {    
+      // If the arguement is already chosen for shared memory arguemnt list, skip.
+      // Else put it in Global memory arguement list
+      if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) {
         GlobalMemArgs.push_back(argIndex);
       }
     }
@@ -1126,21 +1127,20 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
   // Optimization: Gloabl memory arguments, which are not modified and whose
   // loads are not dependent on node id of current node, should be moved to
   // constant memory, subject to size of course
-  std::vector<unsigned> ConstantMemArgs =
-      globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
+  std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
 
   F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE);
 
-  // Function to replace call instructions to functions in the kernel
+// Function to replace call instructions to functions in the kernel
   std::map<Function *, Function *> OrgToClonedFuncMap;
   std::vector<Function *> FuncToBeRemoved;
-  auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) {
-    Function *NewFunc;
+  auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) {
+    Function* NewFunc;
     // Check if the called function has already been cloned before.
     auto It = OrgToClonedFuncMap.find(OrgFunc);
-    if (It == OrgToClonedFuncMap.end()) {
+    if(It == OrgToClonedFuncMap.end()) {
       ValueToValueMapTy VMap;
       NewFunc = CloneFunction(OrgFunc, VMap);
       OrgToClonedFuncMap[OrgFunc] = NewFunc;
@@ -1149,47 +1149,42 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
       NewFunc = (*It).second;
     }
     // Replace the calls to this function
-    std::vector<Value *> args;
-    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+    std::vector<Value*> args;
+    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
       args.push_back(CI->getArgOperand(i));
     }
-    CallInst *Inst = CallInst::Create(
-        NewFunc, args,
-        OrgFunc->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
+    CallInst* Inst = CallInst::Create(NewFunc, args,
+        OrgFunc->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
     CI->replaceAllUsesWith(Inst);
     IItoRemove.push_back(CI);
     return NewFunc;
   };
 
+
   // Go through all the instructions
-  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e;
-       ++i) {
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
     Instruction *I = &(*i);
     // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) &&
-           "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) &&
-           "VISC graph intrinsic within a leaf dataflow node!");
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
 
     if (BuildDFG::isViscIntrinsic(I)) {
-      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
-      IntrinsicInst *ArgII;
-      DFNode *ArgDFNode;
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      IntrinsicInst* ArgII;
+      DFNode* ArgDFNode;
 
-      /************************ Handle VISC Query intrinsics
-       * ************************/
+      /************************ Handle VISC Query intrinsics ************************/
 
       switch (II->getIntrinsicID()) {
-      /**************************** llvm.visc.getNode()
-       * *****************************/
+      /**************************** llvm.visc.getNode() *****************************/
       case Intrinsic::visc_getNode: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n");
         // add mapping <intrinsic, this node> to the node-specific map
         Leaf_HandleToDFNodeMap[II] = N;
         IItoRemove.push_back(II);
-      } break;
-      /************************* llvm.visc.getParentNode()
-       * **************************/
+      }
+      break;
+      /************************* llvm.visc.getParentNode() **************************/
       case Intrinsic::visc_getParentNode: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n");
         // get the parent node of the arg node
@@ -1203,9 +1198,9 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
         Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
 
         IItoRemove.push_back(II);
-      } break;
-      /*************************** llvm.visc.getNumDims()
-       * ***************************/
+      }
+      break;
+      /*************************** llvm.visc.getNumDims() ***************************/
       case Intrinsic::visc_getNumDims: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n");
         // get node from map
@@ -1214,48 +1209,47 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         int numOfDim = ArgDFNode->getNumOfDim();
         DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
-        IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext());
-        ConstantInt *numOfDimConstant =
-            ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
+        IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext());
+        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
 
         // Replace the result of the intrinsic with the computed value
         II->replaceAllUsesWith(numOfDimConstant);
 
         IItoRemove.push_back(II);
-      } break;
-      /*********************** llvm.visc.getNodeInstanceID()
-       * ************************/
+      }
+      break;
+      /*********************** llvm.visc.getNodeInstanceID() ************************/
       case Intrinsic::visc_getNodeInstanceID_x:
       case Intrinsic::visc_getNodeInstanceID_y:
       case Intrinsic::visc_getNodeInstanceID_z: {
-        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n"
-                     << "\t: " << *II << "\n");
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n");
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         assert(ArgDFNode && "Arg node is NULL");
         // A leaf node always has a parent
-        DFNode *ParentDFNode = ArgDFNode->getParent();
+        DFNode* ParentDFNode = ArgDFNode->getParent();
         assert(ParentDFNode && "Parent node of a leaf is NULL");
 
         // Get the number associated with the required dimension
         // FIXME: The order is important!
         // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim =
-            II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x;
+        uint64_t dim = II->getIntrinsicID() -
+                       Intrinsic::visc_getNodeInstanceID_x;
         assert((dim < 3) && "Invalid dimension argument");
         DEBUG(errs() << "\t  dimension = " << dim << "\n");
 
         // Argument of the function to be called
-        ConstantInt *DimConstant =
-            ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        // ArrayRef<Value *> Args(DimConstant);
+        ConstantInt * DimConstant =
+          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        //ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
-        Function *OpenCLFunction;
+        Function * OpenCLFunction;
 
-        FunctionType *FT =
-            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                              Type::getInt32Ty(KernelM->getContext()), false);
+        FunctionType* FT =
+          FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                            Type::getInt32Ty(KernelM->getContext()),
+                            false);
         if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
@@ -1264,46 +1258,43 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
           // itself
           DEBUG(errs() << "Substitute with get_global_id()\n");
           DEBUG(errs() << *II << "\n");
-          OpenCLFunction = cast<Function>(
-              (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT))
-                  .getCallee());
+          OpenCLFunction = cast<Function>
+                           ((KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)).getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
-          // DEBUG(errs() << "Here inside cond 2\n");
+          //DEBUG(errs() << "Here inside cond 2\n");
           // We are asking for this node's id with respect to its parent
           // this is a local id call
-          OpenCLFunction = cast<Function>(
-              (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT))
-                  .getCallee());
-          // DEBUG(errs() << "exiting condition 2\n");
+          OpenCLFunction = cast<Function>
+                           ((KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)).getCallee());
+          //DEBUG(errs() << "exiting condition 2\n");
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's id with respect to its
           // parent: this is a group id call
-          OpenCLFunction = cast<Function>(
-              (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT))
-                  .getCallee());
+          OpenCLFunction = cast<Function>
+                           ((KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)).getCallee());
         } else {
-          DEBUG(errs() << N->getFuncPointer()->getName() << "\n");
-          DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n");
-          DEBUG(errs() << *II << "\n");
+          errs() << N->getFuncPointer()->getName() << "\n";
+          errs() << N->getParent()->getFuncPointer()->getName() << "\n";
+          errs() << *II << "\n";
 
           assert(false && "Unable to translate getNodeInstanceID intrinsic");
         }
 
-        // DEBUG(errs() << "Create call instruction, insert it before the
-        // instrinsic\n"); DEBUG(errs() << "Function: " << *OpenCLFunction <<
-        // "\n"); DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
-        // DEBUG(errs() << "Argument: " << Args[0] << "\n");
-        // DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
+        //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n");
+        //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n");
+        //DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
+        //DEBUG(errs() << "Argument: " << Args[0] << "\n");
+        //DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
         // Create call instruction, insert it before the intrinsic and
         // replace the uses of the previous instruction with the new one
-        CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
-        // DEBUG(errs() << "Replace uses\n");
+        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        //DEBUG(errs() << "Replace uses\n");
         II->replaceAllUsesWith(CI);
 
         IItoRemove.push_back(II);
-      } break;
-      /********************** llvm.visc.getNumNodeInstances()
-       * ***********************/
+      }
+      break;
+      /********************** llvm.visc.getNumNodeInstances() ***********************/
       case Intrinsic::visc_getNumNodeInstances_x:
       case Intrinsic::visc_getNumNodeInstances_y:
       case Intrinsic::visc_getNumNodeInstances_z: {
@@ -1312,89 +1303,78 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
         // then, why do we need to keep that info in the graph?  (only for the
         // kernel configuration during the call)
 
-        DEBUG(errs() << F_nvptx->getName()
-                     << "\t: Handling getNumNodeInstances\n");
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n");
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         // A leaf node always has a parent
-        DFNode *ParentDFNode = ArgDFNode->getParent();
+        DFNode* ParentDFNode = ArgDFNode->getParent();
         assert(ParentDFNode && "Parent node of a leaf is NULL");
 
         // Get the number associated with the required dimension
         // FIXME: The order is important!
         // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim =
-            II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x;
+        uint64_t dim = II->getIntrinsicID() -
+                       Intrinsic::visc_getNumNodeInstances_x;
         assert((dim < 3) && "Invalid dimension argument");
         DEBUG(errs() << "\t  dimension = " << dim << "\n");
 
         // Argument of the function to be called
-        ConstantInt *DimConstant =
-            ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        // ArrayRef<Value *> Args(DimConstant);
+        ConstantInt * DimConstant =
+          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        //ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
-        Function *OpenCLFunction;
-        FunctionType *FT =
+        Function * OpenCLFunction;
+        FunctionType* FT =
             FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                              Type::getInt32Ty(KernelM->getContext()), false);
+                              Type::getInt32Ty(KernelM->getContext()),
+                              false);
 
         if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
           // launch, so the instances are global_size (gridDim x blockDim)
-          OpenCLFunction = cast<Function>(
-              (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT))
-                  .getCallee());
+          OpenCLFunction = cast<Function>
+                           ((KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)).getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's instances
           // this is a local size (block dim) call
-          OpenCLFunction = cast<Function>(
-              (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT))
-                  .getCallee());
+          OpenCLFunction = cast<Function>
+                           ((KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)).getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's instances
           // this is a (global_size/local_size) (grid dim) call
-          OpenCLFunction = cast<Function>(
-              (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT))
-                  .getCallee());
+          OpenCLFunction = cast<Function>
+                           ((KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)).getCallee());
         } else {
           assert(false && "Unable to translate getNumNodeInstances intrinsic");
         }
 
         // Create call instruction, insert it before the intrinsic and
         // replace the uses of the previous instruction with the new one
-        CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
         II->replaceAllUsesWith(CI);
 
         IItoRemove.push_back(II);
-      } break;
-      case Intrinsic::visc_barrier: {
+      }
+      break;
+      case Intrinsic::visc_barrier:
+      {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n");
         DEBUG(errs() << "Substitute with barrier()\n");
         DEBUG(errs() << *II << "\n");
-        FunctionType *FT = FunctionType::get(
-            Type::getVoidTy(KernelM->getContext()),
-            std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())),
-            false);
-        Function *OpenCLFunction = cast<Function>(
-            (KernelM->getOrInsertFunction(StringRef("barrier"), FT))
-                .getCallee());
-        CallInst *CI =
-            CallInst::Create(OpenCLFunction,
-                             ArrayRef<Value *>(ConstantInt::get(
-                                 Type::getInt32Ty(KernelM->getContext()), 1)),
-                             "", II);
+        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()),
+                                             std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())),
+                                             false);
+        Function* OpenCLFunction = cast<Function>
+                                   ((KernelM->getOrInsertFunction(StringRef("barrier"), FT)).getCallee());
+        CallInst* CI = CallInst::Create(OpenCLFunction,
+                                        ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)),
+                                        "", II);
         II->replaceAllUsesWith(CI);
         IItoRemove.push_back(II);
-<<<<<<< HEAD
       }
       break;
-=======
-      } break;
-      case Intrinsic::visc_atomic_cmpxchg:
-        break;
->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f
       case Intrinsic::visc_atomic_add:
       case Intrinsic::visc_atomic_sub:
       case Intrinsic::visc_atomic_xchg:
@@ -1403,7 +1383,6 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
       case Intrinsic::visc_atomic_and:
       case Intrinsic::visc_atomic_or:
       case Intrinsic::visc_atomic_xor:
-<<<<<<< HEAD
       {
         DEBUG(errs() << *II << "\n");
         // Only have support for i32 atomic intrinsics
@@ -1420,735 +1399,697 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) {
         if (PtrTy != TargetTy) {
           Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II);
           PtrTy = TargetTy;
-=======
-        // case Intrinsic::visc_atomic_inc:
-        // case Intrinsic::visc_atomic_dec:
-        {
-          DEBUG(errs() << *II << "\n");
-          // Only have support for i32 atomic intrinsics
-          assert(II->getType() == Type::getInt32Ty(II->getContext()) &&
-                 "Only support i32 atomic intrinsics for now");
-          // Substitute with atomicrmw instruction
-          assert(II->getNumArgOperands() == 2 &&
-                 "Expecting 2 operands for these atomics");
-          Value *Ptr = II->getArgOperand(0);
-          Value *Val = II->getArgOperand(1);
-          assert(Ptr->getType()->isPointerTy() &&
-                 "First argument of supported atomics is expected to be a "
-                 "pointer");
-          PointerType *PtrTy = cast<PointerType>(Ptr->getType());
-          PointerType *TargetTy =
-              Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace());
-          if (PtrTy != TargetTy) {
-            Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II);
-            PtrTy = TargetTy;
-          }
-
-          std::string name;
-          if (II->getIntrinsicID() == Intrinsic::visc_atomic_add)
-            name = "atomic_add";
-          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
-            name = "atomic_sub";
-          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
-            name = "atomic_xchg";
-          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_min)
-            name = "atomic_min";
-          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_max)
-            name = "atomic_max";
-          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_and)
-            name = "atomic_and";
-          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_or)
-            name = "atomic_or";
-          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
-            name = "atomic_xor";
-          Type *paramTypes[] = {PtrTy, Val->getType()};
-          FunctionType *AtomFuncT = FunctionType::get(
-              II->getType(), ArrayRef<Type *>(paramTypes, 2), false);
-          FunctionCallee AtomFunc =
-              KernelM->getOrInsertFunction(name, AtomFuncT);
-
-          Value *Params[] = {Ptr, Val};
-          CallInst *AtomCI = CallInst::Create(
-              AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II);
-          DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
-          II->replaceAllUsesWith(AtomCI);
-          IItoRemove.push_back(II);
->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f
-        }
-        break;
-      default:
-        llvm_unreachable("Unknown VISC Intrinsic!");
-        break;
-      }
-
-    } else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
-      IRBuilder<> Builder(I);
-      Value *Source = MemCpyI->getSource();
-      Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
-      Value *Length = MemCpyI->getOperand(2);
-      DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
-      DEBUG(errs() << "Source: " << *Source << "\n");
-      DEBUG(errs() << "Destination: " << *Destination << "\n");
-      DEBUG(errs() << "Length: " << *Length << "\n");
-
-      size_t memcpy_length;
-      unsigned int memcpy_count;
-      if (ConstantInt *CI = dyn_cast<ConstantInt>(Length)) {
-        if (CI->getBitWidth() <= 64) {
-          memcpy_length = CI->getSExtValue();
-          DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
-          Type *Source_Type = Source->getType()->getPointerElementType();
-          DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
-          memcpy_count =
-              memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
-          DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
-          if (GetElementPtrInst *sourceGEPI =
-                  dyn_cast<GetElementPtrInst>(Source)) {
-            if (GetElementPtrInst *destGEPI =
-                    dyn_cast<GetElementPtrInst>(Destination)) {
-              Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
-              Value *DestPtrOperand = destGEPI->getPointerOperand();
-              for (int i = 0; i < memcpy_count; ++i) {
-                Constant *increment;
-                LoadInst *newLoadI;
-                StoreInst *newStoreI;
-                // First, need to increment the correct index for both source
-                // and dest This invluves checking to see how many indeces the
-                // GEP has Assume for now only 1 or 2 are the viable options.
-
-                std::vector<Value *> GEPlIndex;
-                if (sourceGEPI->getNumIndices() == 1) {
-                  Value *Index = sourceGEPI->getOperand(1);
-                  increment = ConstantInt::get(Index->getType(), i, false);
-                  Value *incAdd = Builder.CreateAdd(Index, increment);
-                  DEBUG(errs() << "Add: " << *incAdd << "\n");
-                  GEPlIndex.push_back(incAdd);
-                  Value *newGEPIl = Builder.CreateGEP(
-                      SourcePtrOperand, ArrayRef<Value *>(GEPlIndex));
-                  DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
-                  newLoadI = Builder.CreateLoad(newGEPIl);
-                  DEBUG(errs() << "Load: " << *newLoadI << "\n");
-                } else {
-                  llvm_unreachable("Unhandled case where source GEPI has more "
-                                   "than 1 indices!\n");
-                }
-
-                std::vector<Value *> GEPsIndex;
-                if (destGEPI->getNumIndices() == 1) {
-
-                } else if (destGEPI->getNumIndices() == 2) {
-                  Value *Index0 = destGEPI->getOperand(1);
-                  GEPsIndex.push_back(Index0);
-                  Value *Index1 = destGEPI->getOperand(2);
-                  increment = ConstantInt::get(Index1->getType(), i, false);
-                  Value *incAdd = Builder.CreateAdd(Index1, increment);
-                  DEBUG(errs() << "Add: " << *incAdd << "\n");
-                  GEPsIndex.push_back(incAdd);
-                  Value *newGEPIs = Builder.CreateGEP(
-                      DestPtrOperand, ArrayRef<Value *>(GEPsIndex));
-                  DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
-                  newStoreI = Builder.CreateStore(newLoadI, newGEPIs,
-                                                  MemCpyI->isVolatile());
-                  DEBUG(errs() << "Store: " << *newStoreI << "\n");
-                } else {
-                  llvm_unreachable("Unhandled case where dest GEPI has more "
-                                   "than 2 indices!\n");
-                }
-              }
-              IItoRemove.push_back(sourceGEPI);
-              IItoRemove.push_back(destGEPI);
-              Instruction *destBitcastI =
-                  dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
-              Instruction *sourceBitcastI =
-                  dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
-              IItoRemove.push_back(destBitcastI);
-              IItoRemove.push_back(sourceBitcastI);
-              IItoRemove.push_back(MemCpyI);
-            }
-          }
         }
-      } else {
-        llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
-      }
-      //      llvm_unreachable("HERE!");
-    }
-
-    else if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      DEBUG(errs() << "Found a call: " << *CI << "\n");
-      Function *calleeF =
-          cast<Function>(CI->getCalledValue()->stripPointerCasts());
-      if (calleeF->isDeclaration()) {
-        // Add the declaration to kernel module
-        if (calleeF->getName() == "sqrtf") {
-          calleeF->setName(Twine("sqrt"));
-          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-          DEBUG(errs() << "CI: " << *CI << "\n");
-        } else if (calleeF->getName() == "rsqrtf") {
-          calleeF->setName(Twine("rsqrt"));
-          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-          DEBUG(errs() << "CI: " << *CI << "\n");
-        }
-        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF
-                     << "\n");
-        KernelM->getOrInsertFunction(calleeF->getName(),
-                                     calleeF->getFunctionType());
-      } else {
-        // Check if the called function has already been cloned before.
-        Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
-        // Iterate over the new function to see if it calls any other functions
-        // in the module.
-        for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc);
-             i != e; ++i) {
-          if (auto *Call = dyn_cast<CallInst>(&*i)) {
-            Function *CalledFunc =
-                cast<Function>(Call->getCalledValue()->stripPointerCasts());
-            CloneAndReplaceCall(Call, CalledFunc);
-          }
-        }
-      }
-      // TODO: how to handle address space qualifiers in load/store
-    }
-  }
-  // search for pattern where float is being casted to int and loaded/stored and
-  // change it.
-  DEBUG(errs() << "finding pattern for replacement!\n");
-  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e;
-       ++i) {
-    bool cont = false;
-    bool keepGEPI = false;
-    bool keepGEPI2 = false;
-    Instruction *I = &(*i);
-    GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
 
-    if (!GEPI) {
-      // did nod find pattern start, continue
-      continue;
-    }
-    // may have found pattern, check
-    DEBUG(errs() << "GEPI " << *GEPI << "\n");
-    // print whatever we want for debug
-    Value *PtrOp = GEPI->getPointerOperand();
-    Type *SrcTy = GEPI->getSourceElementType();
-    unsigned GEPIaddrspace = GEPI->getAddressSpace();
-
-    if (SrcTy->isArrayTy())
-      DEBUG(errs() << *SrcTy << " is an array type! "
-                   << *(SrcTy->getArrayElementType()) << "\n");
-    else
-      DEBUG(errs() << *SrcTy << " is not an array type!\n");
-    // check that source element type is float
-    if (SrcTy->isArrayTy()) {
-      if (!(SrcTy->getArrayElementType()->isFloatTy())) {
-        DEBUG(errs() << "GEPI type is array but not float!\n");
-        continue;
-      }
-    } else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) {
-      DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
-      // does not fit this pattern - no float GEP instruction
-      continue;
-    }
-    // check that addressspace is 1
-    //	  if (GEPIaddrspace != 1) {
-    //			// does not fit this pattern - addrspace of pointer argument is
-    //not global 			continue;
-    //		}
-    if (!(GEPI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      // continue;
-      // Keep GEPI around if it has other uses
-      keepGEPI = true;
-    }
-    DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
-
-    // 1st GEPI it has one use
-    //		assert(GEPI->hasOneUse() && "GEPI has a single use");
-
-    // See if it is a bitcast
-    BitCastInst *BitCastI;
-    for (User *U : GEPI->users()) {
-      if (Instruction *ui = dyn_cast<Instruction>(U)) {
-        DEBUG(errs() << "--" << *ui << "\n");
-        if (isa<BitCastInst>(ui)) {
-          BitCastI = dyn_cast<BitCastInst>(ui);
-          DEBUG(errs() << "---Found bitcast as only use of GEP\n");
-          break;
-        }
-      }
-      DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
-      cont = true;
-    }
-    //		for (Value::user_iterator ui = GEPI->user_begin(),
-    //				ue = GEPI->user_end(); ui!=ue; ++ui) {
-    //        DEBUG(errs() << "--" << *ui << "\n");
-    //			if (isa<BitCastInst>(*ui)) {
-    //				BitCastI = dyn_cast<BitCastInst>(*ui);
-    //        DEBUG(errs() << "Found bitcast as only use of GEP\n");
-    //			}
-    //		}
-
-    if (cont /*!BitCastI*/) {
-      continue; // not in pattern
-    }
-
-    //    DEBUG(errs() << *BitCastI << "\n");
-    // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand
-    // has to be the GEP, since this is a use of the GEP.
-    Value *Op2 = BitCastI->getOperand(0);
-    DEBUG(errs() << "----" << *Op2 << "\n");
-    //		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
-    //		Type *OpTy = cast<Type>(Op2);
-    Type *OpTy = BitCastI->getDestTy();
-    DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
-    //    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) <<
-    //    "\n");
-    if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
-      // maybe right syntax is (Type::getInt32Ty)->getPointerTo()
-      continue; // not in pattern
-    }
-
-    DEBUG(errs() << "----Here!\n");
-    // We are in GEP, bitcast.
-
-    // user_iterator, to find the load.
-
-    if (!(BitCastI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      continue;
-    }
-    DEBUG(errs() << "----Bitcast has one use!\n");
-    // it has one use
-    assert(BitCastI->hasOneUse() && "BitCastI has a single use");
-    LoadInst *LoadI;
-    for (User *U : BitCastI->users()) {
-      if (Instruction *ui = dyn_cast<Instruction>(U)) {
-        DEBUG(errs() << "-----" << *ui << "\n");
-        if (isa<LoadInst>(ui)) {
-          LoadI = dyn_cast<LoadInst>(ui);
-          DEBUG(errs() << "-----Found load as only use of bitcast\n");
-          break;
-        }
-      }
-      DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
-      cont = true;
-    }
-    //		for (Value::user_iterator ui = BitCastI->user_begin(),
-    //				ue = BitCastI->user_end(); ui!=ue; ++ui) {
-    //			if (isa<LoadInst>(*ui)) {
-    //				LoadI = dyn_cast<LoadInst>(*ui);
-    //        errs() << "Found load as only use of bitcast\n";
-    //			}
-    //		}
-
-    if (cont) {
-      continue; // not in pattern
-    }
-
-    DEBUG("HERE!\n");
-    // check that we load from pointer we got from bitcast - assert - the unique
-    // argument must be the use we found it from
-    assert(LoadI->getPointerOperand() == BitCastI &&
-           "Unexpected Load Instruction Operand\n");
-
-    // Copy user_iterator, to find the store.
-
-    if (!(LoadI->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      continue;
-      // TODO: generalize: one load can have more than one store users
-    }
-
-    // it has one use
-    assert(LoadI->hasOneUse() && "LoadI has a single use");
-    Value::user_iterator ui = LoadI->user_begin();
-    // skipped loop, because is has a single use
-    StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
-    if (!StoreI) {
-      continue; // not in pattern
-    }
-
-    // Also check that the store uses the loaded value as the value operand
-    if (StoreI->getValueOperand() != LoadI) {
-      continue;
-    }
-
-    DEBUG(errs() << "-------Found store instruction\n");
-
-    // Look for its bitcast, which is its pointer operand
-    Value *StPtrOp = StoreI->getPointerOperand();
-    DEBUG(errs() << "-------" << *StPtrOp << "\n");
-    BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
-    DEBUG(errs() << "-------" << *BitCastI2 << "\n");
-    if (!BitCastI2) {
-      continue; // not in pattern
-    }
-
-    DEBUG(errs() << "-------- Found Bit Cast of store!\n");
-    // found bitcast. Look for the second GEP, its from operand.
-    Value *BCFromOp = BitCastI2->getOperand(0);
-    GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
-    DEBUG(errs() << "---------- " << *GEPI2 << "\n");
-    if (!GEPI2) {
-      continue; // not in pattern
-    }
-
-    if (!(GEPI2->hasOneUse())) {
-      // does not fit this pattern - more than one uses
-      // continue;
-      // Keep GEPI around if it has other uses
-      keepGEPI2 = true;
-    }
-    DEBUG(errs() << "---------- Found GEPI of Bitcast!\n");
-
-    Value *PtrOp2 = GEPI2->getPointerOperand();
-
-    // Found GEPI2. TODO: kind of confused as o what checks I need to add here,
-    // let's add them together- all the code for int-float type checks is
-    // already above.
-
-    // Assume we found pattern
-    if (!keepGEPI) {
-      IItoRemove.push_back(GEPI);
-      DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
-    } else {
-      DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
-    }
-    IItoRemove.push_back(BitCastI);
-    DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
-    IItoRemove.push_back(LoadI);
-    DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
-    IItoRemove.push_back(GEPI2);
-    DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
-    IItoRemove.push_back(BitCastI2);
-    DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
-    if (!keepGEPI2) {
-      IItoRemove.push_back(StoreI);
-      DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
-    } else {
-
-      DEBUG(errs() << "Keeping " << *StoreI
-                   << " since it has multiple uses!\n");
-    }
-
-    std::vector<Value *> GEPlIndex;
-    if (GEPI->hasIndices()) {
-      for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
-        Value *Index = dyn_cast<Value>(&*ii);
-        DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
-        GEPlIndex.push_back(Index);
-      }
-    }
-    //    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
-
-    std::vector<Value *> GEPsIndex;
-    if (GEPI2->hasIndices()) {
-      for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
-        Value *Index = dyn_cast<Value>(&*ii);
-        DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
-        GEPsIndex.push_back(Index);
-      }
-    }
-    //    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
-
-    //    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
-    GetElementPtrInst *newlGEP = GetElementPtrInst::Create(
-        GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()),
-        PtrOp,                        // operand from 1st GEP
-        ArrayRef<Value *>(GEPlIndex), Twine(), StoreI);
-    DEBUG(errs() << "Adding: " << *newlGEP << "\n");
-    // insert load before GEPI
-    LoadInst *newLoadI =
-        new LoadInst(Type::getFloatTy(M.getContext()),
-                     newlGEP, // new GEP
-                     Twine(), LoadI->isVolatile(), LoadI->getAlignment(),
-                     LoadI->getOrdering(), LoadI->getSyncScopeID(), StoreI);
-    DEBUG(errs() << "Adding: " << *newLoadI << "\n");
-    // same for GEP for store, for store operand
-    GetElementPtrInst *newsGEP = GetElementPtrInst::Create(
-        GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
-        PtrOp2,                        // operand from 2nd GEP
-        ArrayRef<Value *>(GEPsIndex), Twine(), StoreI);
-    DEBUG(errs() << "Adding: " << *newsGEP << "\n");
-    // insert store before GEPI
-    StoreInst *newStoreI =
-        new StoreInst(newLoadI,
-                      newsGEP, // new GEP
-                      StoreI->isVolatile(), StoreI->getAlignment(),
-                      StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI);
-    DEBUG(errs() << "Adding: " << *newStoreI << "\n");
-  }
+			 std::string name;
+			 if(II->getIntrinsicID() == Intrinsic::visc_atomic_add)
+				 name = "atomic_add";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
+				 name = "atomic_sub";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
+				 name = "atomic_xchg";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min)
+				 name = "atomic_min";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max)
+				 name = "atomic_max";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and)
+				 name = "atomic_and";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or)
+				 name = "atomic_or";
+			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
+				 name = "atomic_xor";
+			 Type* paramTypes[] = {PtrTy, Val->getType()};
+			 FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false);	
+			 FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT);				
+
+			 Value* Params[] = {Ptr, Val};
+			 CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II);
+			 DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
+			 II->replaceAllUsesWith(AtomCI);
+			 IItoRemove.push_back(II);
+			}
+			break;
+			default:
+			llvm_unreachable("Unknown VISC Intrinsic!");
+			break;
+			}
+
+		}
+		else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
+			IRBuilder<> Builder(I);
+			Value *Source = MemCpyI->getSource();
+			Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
+			Value *Length = MemCpyI->getOperand(2);
+			DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
+			DEBUG(errs() << "Source: " << *Source << "\n"); 
+			DEBUG(errs() << "Destination: " << *Destination << "\n"); 
+			DEBUG(errs() << "Length: " << *Length << "\n");
+
+			size_t memcpy_length;
+			unsigned int memcpy_count;
+			if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) {
+				if (CI->getBitWidth() <= 64) {
+					memcpy_length = CI->getSExtValue();
+					DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
+					Type *Source_Type = Source->getType()->getPointerElementType();
+					DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
+					memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
+					DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
+					if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) {
+						if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) {
+							Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
+							Value *DestPtrOperand = destGEPI->getPointerOperand();
+							for(int i = 0; i < memcpy_count; ++i) {
+								Constant *increment;
+								LoadInst *newLoadI;
+								StoreInst *newStoreI;
+								// First, need to increment the correct index for both source and dest 
+								// This invluves checking to see how many indeces the GEP has
+								// Assume for now only 1 or 2 are the viable options.
+
+								std::vector<Value*> GEPlIndex;
+								if (sourceGEPI->getNumIndices() == 1) {
+									Value *Index = sourceGEPI->getOperand(1);      
+									increment = ConstantInt::get(Index->getType(), i, false);
+									Value *incAdd = Builder.CreateAdd(Index, increment);
+									DEBUG(errs() << "Add: " << *incAdd << "\n");
+									GEPlIndex.push_back(incAdd);
+									Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex));
+									DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
+									newLoadI = Builder.CreateLoad(newGEPIl);
+									DEBUG(errs() << "Load: " << *newLoadI << "\n");
+								} else { 
+									llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n");
+								}
+
+
+								std::vector<Value*> GEPsIndex;
+								if (destGEPI->getNumIndices() == 1) {
+
+								} else if (destGEPI->getNumIndices() == 2) {
+									Value *Index0 = destGEPI->getOperand(1);      
+									GEPsIndex.push_back(Index0);
+									Value *Index1 = destGEPI->getOperand(2);      
+									increment = ConstantInt::get(Index1->getType(), i, false);
+									Value *incAdd = Builder.CreateAdd(Index1, increment);
+									DEBUG(errs() << "Add: " << *incAdd << "\n");
+									GEPsIndex.push_back(incAdd);
+									Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex));
+									DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
+									newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile());
+									DEBUG(errs() << "Store: " << *newStoreI << "\n");
+								} else {
+									llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n");
+								}
+							}
+							IItoRemove.push_back(sourceGEPI);
+							IItoRemove.push_back(destGEPI);
+							Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
+							Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
+							IItoRemove.push_back(destBitcastI);
+							IItoRemove.push_back(sourceBitcastI);
+							IItoRemove.push_back(MemCpyI);
+						}
+					}
+
+				}
+			} else {
+				llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
+			}
+			//      llvm_unreachable("HERE!");
+		}
+
+		else if(CallInst* CI = dyn_cast<CallInst>(I)) {
+			DEBUG(errs() << "Found a call: " << *CI << "\n");
+			Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
+			if(calleeF->isDeclaration()) {
+				// Add the declaration to kernel module
+				if (calleeF->getName() == "sqrtf") {
+					calleeF->setName(Twine("sqrt"));
+					DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
+					DEBUG(errs() << "CI: " << *CI << "\n");
+				} else if (calleeF->getName() == "rsqrtf") {
+					calleeF->setName(Twine("rsqrt"));
+					DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
+					DEBUG(errs() << "CI: " << *CI << "\n");
+				}  
+				DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
+				KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
+			}
+			else {
+				// Check if the called function has already been cloned before.
+				Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
+				// Iterate over the new function to see if it calls any other functions
+				// in the module.
+				for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
+					if(auto *Call = dyn_cast<CallInst>(&*i)) {
+						Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
+						CloneAndReplaceCall(Call, CalledFunc);
+					}
+				}
+			}
+			//TODO: how to handle address space qualifiers in load/store
+		}
+
+	}
+	// search for pattern where float is being casted to int and loaded/stored and change it.	
+	DEBUG(errs() << "finding pattern for replacement!\n");
+	for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
+		bool cont = false;
+		bool keepGEPI = false;
+		bool keepGEPI2= false;
+		Instruction *I = &(*i);
+		GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I);
+
+		if (!GEPI) {
+			// did nod find pattern start, continue
+			continue;
+		}
+		// may have found pattern, check
+		DEBUG(errs() << "GEPI " << *GEPI << "\n");
+		// print whatever we want for debug
+		Value* PtrOp = GEPI->getPointerOperand();
+		Type *SrcTy = GEPI->getSourceElementType();
+		unsigned GEPIaddrspace = GEPI->getAddressSpace();
+
+		if (SrcTy->isArrayTy()) 
+			DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n");
+		else
+			DEBUG(errs() << *SrcTy << " is not an array type!\n");
+		// check that source element type is float
+		if (SrcTy->isArrayTy()) {
+			if (!(SrcTy->getArrayElementType()->isFloatTy())) {
+				DEBUG(errs() << "GEPI type is array but not float!\n");
+				continue;
+			}
+		}
+		else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) {
+			DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
+			// does not fit this pattern - no float GEP instruction
+			continue;
+		}
+		// check that addressspace is 1
+		//	  if (GEPIaddrspace != 1) {
+		//			// does not fit this pattern - addrspace of pointer argument is not global
+		//			continue;
+		//		}
+		if (!(GEPI->hasOneUse())) {
+			// does not fit this pattern - more than one uses
+			//continue;
+			// Keep GEPI around if it has other uses
+			keepGEPI = true;
+		}
+		DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
+
+		// 1st GEPI it has one use
+		//		assert(GEPI->hasOneUse() && "GEPI has a single use");
+
+		// See if it is a bitcast
+		BitCastInst *BitCastI;
+		for (User * U : GEPI->users()) {
+			if(Instruction *ui = dyn_cast<Instruction> (U)) { 
+				DEBUG(errs() << "--" << *ui << "\n");
+				if (isa<BitCastInst>(ui)) {
+					BitCastI = dyn_cast<BitCastInst>(ui);
+					DEBUG(errs() << "---Found bitcast as only use of GEP\n");
+					break;
+				}
+			}
+			DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
+			cont = true;
+		}
+		//		for (Value::user_iterator ui = GEPI->user_begin(),
+		//				ue = GEPI->user_end(); ui!=ue; ++ui) {
+		//        DEBUG(errs() << "--" << *ui << "\n");
+		//			if (isa<BitCastInst>(*ui)) {
+		//				BitCastI = dyn_cast<BitCastInst>(*ui);
+		//        DEBUG(errs() << "Found bitcast as only use of GEP\n");
+		//			}
+		//		}
+
+		if (cont/*!BitCastI*/) {
+			continue; // not in pattern
+		}
+
+		//    DEBUG(errs() << *BitCastI << "\n");
+		// Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP.
+		Value *Op2 = BitCastI->getOperand(0);
+		DEBUG(errs() << "----" << *Op2 << "\n");
+		//		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
+		//		Type *OpTy = cast<Type>(Op2);
+		Type *OpTy = BitCastI->getDestTy();
+		DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
+		//    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n");
+		if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
+			// maybe right syntax is (Type::getInt32Ty)->getPointerTo()
+			continue; // not in pattern
+		}
+
+		DEBUG(errs() << "----Here!\n");
+		// We are in GEP, bitcast.
+
+		// user_iterator, to find the load.
+
+		if (!(BitCastI->hasOneUse())) {
+			// does not fit this pattern - more than one uses
+			continue;
+		}
+		DEBUG(errs() << "----Bitcast has one use!\n");
+		// it has one use
+		assert(BitCastI->hasOneUse() && "BitCastI has a single use");
+		LoadInst *LoadI;
+		for (User * U : BitCastI->users()) { 
+			if (Instruction *ui = dyn_cast<Instruction> (U)) {
+				DEBUG(errs() << "-----" << *ui << "\n");
+				if (isa<LoadInst>(ui)) {
+					LoadI = dyn_cast<LoadInst>(ui);
+					DEBUG(errs() << "-----Found load as only use of bitcast\n");
+					break;
+				}
+			}
+			DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
+			cont = true;
+		}
+		//		for (Value::user_iterator ui = BitCastI->user_begin(),
+		//				ue = BitCastI->user_end(); ui!=ue; ++ui) {
+		//			if (isa<LoadInst>(*ui)) {
+		//				LoadI = dyn_cast<LoadInst>(*ui);
+		//        errs() << "Found load as only use of bitcast\n";
+		//			}
+		//		}
+
+		if (cont) {
+			continue; // not in pattern
+		}
+
+		DEBUG("HERE!\n");
+		// check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from
+		assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n");
+
+		// Copy user_iterator, to find the store.
+
+		if (!(LoadI->hasOneUse())) {
+			// does not fit this pattern - more than one uses
+			continue;
+			// TODO: generalize: one load can have more than one store users
+		}
+
+		// it has one use
+		assert(LoadI->hasOneUse() && "LoadI has a single use");
+		Value::user_iterator ui = LoadI->user_begin();
+		// skipped loop, because is has a single use
+		StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
+		if (!StoreI) {
+			continue; // not in pattern
+		}
+
+		// Also check that the store uses the loaded value as the value operand
+		if (StoreI->getValueOperand() != LoadI) {
+			continue;
+		}
+
+		DEBUG(errs() << "-------Found store instruction\n");
+
+		// Look for its bitcast, which is its pointer operand
+		Value *StPtrOp = StoreI->getPointerOperand();
+		DEBUG(errs() << "-------" << *StPtrOp << "\n");
+		BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
+		DEBUG(errs() << "-------" << *BitCastI2 << "\n");
+		if (!BitCastI2) {
+			continue; //not in pattern
+		}
+
+		DEBUG(errs() << "-------- Found Bit Cast of store!\n" );
+		// found bitcast. Look for the second GEP, its from operand.
+		Value *BCFromOp = BitCastI2->getOperand(0);
+		GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
+		DEBUG(errs() << "---------- " << *GEPI2 << "\n");
+		if (!GEPI2) {
+			continue; //not in pattern
+		}
+
+		if (!(GEPI2->hasOneUse())) {
+			// does not fit this pattern - more than one uses
+			//continue;
+			// Keep GEPI around if it has other uses
+			keepGEPI2 = true;
+		}
+		DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); 
+
+		Value *PtrOp2 = GEPI2->getPointerOperand();
+
+		// Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above.
+
+		// Assume we found pattern
+		if (!keepGEPI) {  
+			IItoRemove.push_back(GEPI);
+			DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
+		} else {
+			DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
+		}
+		IItoRemove.push_back(BitCastI);
+		DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
+		IItoRemove.push_back(LoadI);
+		DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
+		IItoRemove.push_back(GEPI2);
+		DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
+		IItoRemove.push_back(BitCastI2);
+		DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
+		if (!keepGEPI2) {
+			IItoRemove.push_back(StoreI);
+			DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
+		} else {
+
+			DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n");
+		}
+
+		std::vector<Value*> GEPlIndex;
+		if (GEPI->hasIndices()) {
+			for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
+				Value *Index = dyn_cast<Value>(&*ii);
+				DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
+				GEPlIndex.push_back(Index);
+			}
+		}
+		//    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
+
+		std::vector<Value*> GEPsIndex;
+		if (GEPI2->hasIndices()) {
+			for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
+				Value *Index = dyn_cast<Value>(&*ii);
+				DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
+				GEPsIndex.push_back(Index);
+			}
+		}
+		//    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
+
+
+
+		//    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
+		GetElementPtrInst* newlGEP =
+			GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()),
+					PtrOp, // operand from 1st GEP
+					ArrayRef<Value*>(GEPlIndex),
+					Twine(),
+					StoreI);
+		DEBUG(errs() << "Adding: " << *newlGEP << "\n");
+		// insert load before GEPI
+		LoadInst *newLoadI =
+			new LoadInst(Type::getFloatTy(M.getContext()),
+					newlGEP, // new GEP
+					Twine(),
+					LoadI->isVolatile(),
+					LoadI->getAlignment(),
+					LoadI->getOrdering(),
+					LoadI->getSyncScopeID(),
+					StoreI);
+		DEBUG(errs() << "Adding: " << *newLoadI << "\n");
+		// same for GEP for store, for store operand
+		GetElementPtrInst* newsGEP =
+			GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+					PtrOp2, // operand from 2nd GEP
+					ArrayRef<Value*>(GEPsIndex),
+					Twine(),
+					StoreI);
+		DEBUG(errs() << "Adding: " << *newsGEP << "\n");
+		// insert store before GEPI
+		StoreInst *newStoreI =
+			new StoreInst(newLoadI,
+					newsGEP, // new GEP
+					StoreI->isVolatile(),
+					StoreI->getAlignment(),
+					StoreI->getOrdering(),
+					StoreI->getSyncScopeID(),
+					StoreI);
+		DEBUG(errs() << "Adding: " << *newStoreI << "\n");
+
+	}
+
+	// We need to do this explicitly: DCE pass will not remove them because we
+	// have assumed theworst memory behaviour for these function calls
+	// Traverse the vector backwards, otherwise definitions are deleted while
+	// their subsequent uses are still around
+	for (auto *I : reverse(IItoRemove)) {
+		DEBUG(errs() << "Erasing: " << *I << "\n");
+		I->eraseFromParent();
+	}
+
+	// Removed the cloned functions from the parent module into the new module 
+	for(auto *F : FuncToBeRemoved) {
+		F->removeFromParent(); //TODO: MARIA check
+		KernelM->getFunctionList().push_back(F);
+	}
+
+	addCLMetadata(F_nvptx);
+	kernel->KernelFunction = F_nvptx;
+	errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
+	DEBUG(errs() << *KernelM);
+
+	return;
+}
 
-  // We need to do this explicitly: DCE pass will not remove them because we
-  // have assumed theworst memory behaviour for these function calls
-  // Traverse the vector backwards, otherwise definitions are deleted while
-  // their subsequent uses are still around
-  for (auto *I : reverse(IItoRemove)) {
-    DEBUG(errs() << "Erasing: " << *I << "\n");
-    I->eraseFromParent();
-  }
+bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
+	errs() << "\nDFG2LLVM_NVPTX PASS\n";
 
-  // Removed the cloned functions from the parent module into the new module
-  for (auto *F : FuncToBeRemoved) {
-    F->removeFromParent(); // TODO: MARIA check
-    KernelM->getFunctionList().push_back(F);
-  }
+	// Get the BuildDFG Analysis Results:
+	// - Dataflow graph
+	// - Maps from i8* hansles to DFNode and DFEdge
+	BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-  addCLMetadata(F_nvptx);
-  kernel->KernelFunction = F_nvptx;
-  DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName()
-               << "\n");
-  DEBUG(errs() << *KernelM);
+	// DFInternalNode *Root = DFG.getRoot();
+	std::vector<DFInternalNode*> Roots = DFG.getRoots();
+	//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+	//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
-  return;
-}
+	// Visitor for Code Generation Graph Traversal
+	CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
 
-bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
-  DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n");
-
-  // Get the BuildDFG Analysis Results:
-  // - Dataflow graph
-  // - Maps from i8* hansles to DFNode and DFEdge
-  BuildDFG &DFG = getAnalysis<BuildDFG>();
-
-  // DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode *> Roots = DFG.getRoots();
-  //    BuildDFG::HandleToDFNode &HandleToDFNodeMap =
-  //    DFG.getHandleToDFNodeMap(); BuildDFG::HandleToDFEdge &HandleToDFEdgeMap
-  //    = DFG.getHandleToDFEdgeMap();
-
-  // Visitor for Code Generation Graph Traversal
-  CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
-
-  // Iterate over all the DFGs and produce code for each one of them
-  for (auto rootNode : Roots) {
-    // Initiate code generation for root DFNode
-    CGTVisitor->visit(rootNode);
-  }
+	// Iterate over all the DFGs and produce code for each one of them
+	for (auto rootNode: Roots) {
+		// Initiate code generation for root DFNode
+		CGTVisitor->visit(rootNode);
+	}
 
-  CGTVisitor->writeKernelsModule();
+	CGTVisitor->writeKernelsModule();
 
-  // TODO: Edit module epilogue to remove the VISC intrinsic declarations
-  delete CGTVisitor;
+	//TODO: Edit module epilogue to remove the VISC intrinsic declarations
+	delete CGTVisitor;
 
-  return true;
+	return true;
 }
 
 std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
-  /*SmallString<128> currentDir;
-          llvm::sys::fs::current_path(currentDir);
-          std::string fileName = getFilenameFromModule(M);
-          Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-          return output.str().append(".kernels.ll");*/
-  std::string mid = M.getModuleIdentifier();
-  return mid.append(".kernels.ll");
+	/*SmallString<128> currentDir;
+		llvm::sys::fs::current_path(currentDir);
+		std::string fileName = getFilenameFromModule(M);
+		Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+		return output.str().append(".kernels.ll");*/
+	std::string mid = M.getModuleIdentifier();
+	return mid.append(".kernels.ll");
 }
 
-void CGT_NVPTX::fixValueAddrspace(Value *V, unsigned addrspace) {
-  assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!");
-  PointerType *OldTy = cast<PointerType>(V->getType());
-  PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-  V->mutateType(NewTy);
-  for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
-       ui++) {
-    // Change all uses producing pointer type in same address space to new
-    // addressspace.
-    if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) {
-      if (PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-        fixValueAddrspace(*ui, addrspace);
-      }
-    }
-  }
+void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) {
+	assert(isa<PointerType>(V->getType())
+			&& "Value should be of Pointer Type!");
+	PointerType* OldTy = cast<PointerType>(V->getType());
+	PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+	V->mutateType(NewTy);
+	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
+		// Change all uses producing pointer type in same address space to new
+		// addressspace.
+		if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
+			if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+				fixValueAddrspace(*ui, addrspace);
+			}
+		}
+	}
 }
 
-std::vector<unsigned>
-CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs,
-                                     Function *F) {
-  std::vector<unsigned> ConstantMemArgs;
-  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
-       ++ai) {
-    Argument *arg = &*ai;
-    std::vector<unsigned>::iterator pos = std::find(
-        GlobalMemArgs->begin(), GlobalMemArgs->end(), arg->getArgNo());
-    // It has to be a global memory argument to be promotable
-    if (pos == GlobalMemArgs->end())
-      continue;
-
-    // Check if it can/should be promoted
-    if (canBePromoted(arg, F)) {
-      DEBUG(errs() << "Promoting << " << arg->getName()
-                   << " to constant memory."
-                   << "\n");
-      ConstantMemArgs.push_back(arg->getArgNo());
-      GlobalMemArgs->erase(pos);
-    }
-  }
-  return ConstantMemArgs;
-}
-
-Function *CGT_NVPTX::changeArgAddrspace(Function *F,
-                                        std::vector<unsigned> &Args,
-                                        unsigned addrspace) {
-  unsigned idx = 0;
-  std::vector<Type *> ArgTypes;
-  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
-       ++ai) {
-    Argument *arg = &*ai;
-    DEBUG(errs() << *arg << "\n");
-    unsigned argno = arg->getArgNo();
-    if ((idx < Args.size()) && (argno == Args[idx])) {
-      fixValueAddrspace(arg, addrspace);
-      idx++;
-    }
-    ArgTypes.push_back(arg->getType());
-  }
-  FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
 
-  // F->mutateType(PTy);
-  Function *newF = cloneFunction(F, newFT, false);
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) {
+	std::vector<unsigned> ConstantMemArgs;
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Argument* arg = &*ai; 
+		std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
+				GlobalMemArgs->end(), arg->getArgNo());
+		// It has to be a global memory argument to be promotable
+		if(pos == GlobalMemArgs->end())
+			continue;
+
+		// Check if it can/should be promoted
+		if(canBePromoted(arg, F)) {
+			errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n";
+			ConstantMemArgs.push_back(arg->getArgNo());
+			GlobalMemArgs->erase(pos);
+		}
+	}
+	return ConstantMemArgs;
+}
 
-  DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n");
-  return newF;
+Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
+	unsigned idx = 0;
+	std::vector<Type*> ArgTypes;
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Argument *arg = &*ai;
+		DEBUG(errs() << *arg << "\n");
+		unsigned argno = arg->getArgNo();
+		if ((idx < Args.size()) && (argno == Args[idx])) {
+			fixValueAddrspace(arg, addrspace);
+			idx++;
+		}
+		ArgTypes.push_back(arg->getType());
+	}
+	FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
+
+	//F->mutateType(PTy);
+	Function* newF = cloneFunction(F, newFT, false);
+	replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+	DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
+	return newF;
 }
 
 /* Add metadata to module KernelM, for OpenCL kernels */
 void CGT_NVPTX::addCLMetadata(Function *F) {
 
-  IRBuilder<> Builder(&*F->begin());
-
-  SmallVector<Metadata *, 8> KernelMD;
-  KernelMD.push_back(ValueAsMetadata::get(F));
-
-  // TODO: There is additional metadata used by kernel files but we skip them as
-  // they are not mandatory. In future they might be useful to enable
-  // optimizations
-
-  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
-  NamedMDNode *MDN_kernels =
-      KernelM->getOrInsertNamedMetadata("opencl.kernels");
-  MDN_kernels->addOperand(MDKernelNode);
-
-  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
-  // TODO: Replace 1 with the number of the kernel.
-  // Add when support for multiple launces is added
-  KernelMD.push_back(ValueAsMetadata::get(
-      ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)));
-  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
-  NamedMDNode *MDN_annotations =
-      KernelM->getOrInsertNamedMetadata("nvvm.annotations");
-  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
-}
+	IRBuilder<> Builder(&*F->begin());
 
-void CGT_NVPTX::writeKernelsModule() {
+	SmallVector<Metadata*,8> KernelMD;
+	KernelMD.push_back(ValueAsMetadata::get(F));
 
-  // In addition to deleting all other functions, we also want to spiff it
-  // up a little bit.  Do this now.
-  legacy::PassManager Passes;
+	// TODO: There is additional metadata used by kernel files but we skip them as
+	// they are not mandatory. In future they might be useful to enable
+	// optimizations
 
-  DEBUG(errs() << "Writing to File --- ");
-  DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n");
-  std::error_code EC;
-  ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
-  if (EC) {
-    DEBUG(errs() << EC.message() << '\n');
-  }
-
-  Passes.add(createPrintModulePass(Out.os()));
+	MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+	NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
+	MDN_kernels->addOperand(MDKernelNode);
 
-  Passes.run(*KernelM);
+	KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+	// TODO: Replace 1 with the number of the kernel.
+	// Add when support for multiple launces is added
+	KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
+	MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+	NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+	MDN_annotations->addOperand(MDNvvmAnnotationsNode);
 
-  // Declare success.
-  Out.keep();
 }
 
-Function *CGT_NVPTX::transformFunctionToVoid(Function *F) {
-
-  DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
-  // FIXME: Maybe do that using the Node?
-  StructType *FRetTy = dyn_cast<StructType>(F->getReturnType());
-  assert(FRetTy && "Return Type must always be a struct");
-
-  // Keeps return statements, because we will need to replace them
-  std::vector<ReturnInst *> RItoRemove;
-  findReturnInst(F, RItoRemove);
-
-  std::vector<Type *> RetArgTypes;
-  std::vector<Argument *> RetArgs;
-  std::vector<Argument *> Args;
-  // Check for { } return struct, which means that the function returns void
-  if (FRetTy->isEmptyTy()) {
-
-    DEBUG(errs() << "\tFunction output struct is void\n");
-    DEBUG(errs() << "\tNo parameters added\n");
+void CGT_NVPTX::writeKernelsModule() {
 
-    // Replacing return statements with others returning void
-    for (auto *RI : RItoRemove) {
-      ReturnInst::Create((F->getContext()), 0, RI);
-      RI->eraseFromParent();
-    }
-    DEBUG(errs() << "\tChanged return statements to return void\n");
-  } else {
-    // The struct has return values, thus needs to be converted to parameter
-
-    // Iterate over all element types of return struct and add arguments to the
-    // function
-    for (unsigned i = 0; i < FRetTy->getNumElements(); i++) {
-      Argument *RetArg =
-          new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
-      RetArgs.push_back(RetArg);
-      RetArgTypes.push_back(RetArg->getType());
-      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
-    }
+	// In addition to deleting all other functions, we also want to spiff it
+	// up a little bit.  Do this now.
+	legacy::PassManager Passes;
 
-    DEBUG(errs() << "\tReplacing Return statements\n");
-    // Replace return statements with extractValue and store instructions
-    for (auto *RI : RItoRemove) {
-      Value *RetVal = RI->getReturnValue();
-      for (unsigned i = 0; i < RetArgs.size(); i++) {
-        ExtractValueInst *EI = ExtractValueInst::Create(
-            RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI);
-        new StoreInst(EI, RetArgs[i], RI);
-      }
-      // assert(RetVal && "Return value should not be null at this point");
-      // StructType* RetType = cast<StructType>(RetVal->getType());
-      // assert(RetType && "Return type is not a struct");
+	errs() << "Writing to File --- ";
+	errs() << getKernelsModuleName(M).c_str() << "\n";
+	std::error_code EC;
+	ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
+	if (EC) {
+		errs() << EC.message() << '\n';
+	}
 
-      ReturnInst::Create((F->getContext()), 0, RI);
-      RI->eraseFromParent();
-    }
-  }
-  DEBUG(errs() << "\tReplaced return statements\n");
+	Passes.add(
+			createPrintModulePass(Out.os()));
 
-  // Create the argument type list with the added argument's type
-  std::vector<Type *> ArgTypes;
-  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-       ai != ae; ++ai) {
-    ArgTypes.push_back(ai->getType());
-  }
-  for (auto *RATy : RetArgTypes) {
-    ArgTypes.push_back(RATy);
-  }
+	Passes.run(*KernelM);
 
-  // Creating Args vector to use in cloning!
-  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
-       ++ai) {
-    Args.push_back(&*ai);
-  }
-  for (auto *ai : RetArgs) {
-    Args.push_back(ai);
-  }
+	// Declare success.
+	Out.keep();
+}
 
-  // Adding new arguments to the function argument list, would not change the
-  // function type. We need to change the type of this function to reflect the
-  // added arguments
-  Type *VoidRetType = Type::getVoidTy(F->getContext());
-  FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-
-  // Change the function type
-  // F->mutateType(PTy);
-  Function *newF = cloneFunction(F, newFT, false, NULL, &Args);
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-  // F->eraseFromParent();
-  return newF;
+Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
+
+	DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
+	// FIXME: Maybe do that using the Node?
+	StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
+	assert(FRetTy && "Return Type must always be a struct");
+
+	// Keeps return statements, because we will need to replace them
+	std::vector<ReturnInst *> RItoRemove;
+	findReturnInst(F, RItoRemove);
+
+	std::vector<Type *> RetArgTypes;
+	std::vector<Argument*> RetArgs;
+	std::vector<Argument*> Args;
+	// Check for { } return struct, which means that the function returns void
+	if (FRetTy->isEmptyTy()) {
+
+		DEBUG(errs() << "\tFunction output struct is void\n");
+		DEBUG(errs() << "\tNo parameters added\n");
+
+		// Replacing return statements with others returning void
+		for (auto *RI : RItoRemove) {
+			ReturnInst::Create((F->getContext()), 0, RI);
+			RI->eraseFromParent();
+		}
+		DEBUG(errs() << "\tChanged return statements to return void\n");
+	}
+	else {
+		// The struct has return values, thus needs to be converted to parameter
+
+		// Iterate over all element types of return struct and add arguments to the
+		// function
+		for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
+			Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+			RetArgs.push_back(RetArg);
+			RetArgTypes.push_back(RetArg->getType());
+			DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+		}
+
+		DEBUG(errs() << "\tReplacing Return statements\n");
+		// Replace return statements with extractValue and store instructions
+		for (auto *RI : RItoRemove) {
+			Value* RetVal = RI->getReturnValue();
+			for(unsigned i = 0; i < RetArgs.size(); i++) {
+				ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
+						RetArgs[i]->getName()+".val", RI);
+				new StoreInst(EI, RetArgs[i], RI);
+			}
+			// assert(RetVal && "Return value should not be null at this point");
+			// StructType* RetType = cast<StructType>(RetVal->getType());
+			// assert(RetType && "Return type is not a struct");
+
+			ReturnInst::Create((F->getContext()), 0, RI);
+			RI->eraseFromParent();
+
+		}
+	}
+	DEBUG(errs() << "\tReplaced return statements\n");
+
+	// Create the argument type list with the added argument's type
+	std::vector<Type*> ArgTypes;
+	for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		ArgTypes.push_back(ai->getType());
+	}
+	for(auto *RATy: RetArgTypes) {
+		ArgTypes.push_back(RATy);
+	}
+
+	// Creating Args vector to use in cloning!
+	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+			ai != ae; ++ai) {
+		Args.push_back(&*ai);
+	}
+	for(auto *ai : RetArgs) {
+		Args.push_back(ai);
+	}
+
+	// Adding new arguments to the function argument list, would not change the
+	// function type. We need to change the type of this function to reflect the
+	// added arguments
+	Type* VoidRetType = Type::getVoidTy(F->getContext());
+	FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+	// Change the function type
+	//F->mutateType(PTy);
+	Function* newF = cloneFunction(F, newFT, false, NULL, &Args);
+	replaceNodeFunctionInIR(*F->getParent(), F, newF);
+	//F->eraseFromParent();
+	return newF;
 }
 
 /******************************************************************************
@@ -2159,289 +2100,269 @@ Function *CGT_NVPTX::transformFunctionToVoid(Function *F) {
 // 1. No stores
 // 2. Loads not dependent on getNodeInstanceID itrinsic
 
-static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList,
-                              std::vector<Value *> *VisitedList) {
-  if (std::find(VisitedList->begin(), VisitedList->end(), V) !=
-      VisitedList->end()) {
-    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-    return false;
-  }
-  VisitedList->push_back(V);
-  for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
-       ++ui) {
-    Instruction *I = dyn_cast<Instruction>(*ui);
-    if (!I) {
-      // if use is not an instruction, then skip it
-      continue;
-    }
-    DEBUG(errs() << "\t" << *I << "\n");
-    if (isa<LoadInst>(I)) {
-      DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
-      DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
-      UseList->push_back(V);
-    } else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
-      // found a store in use chain
-      DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
-      return true;
-    } else if (BuildDFG::isViscIntrinsic(I)) {
-      // If it is an atomic intrinsic, we found a store
-      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
-      assert(II &&
-             II->getCalledValue()->getName().startswith("llvm.visc.atomic") &&
-             "Only visc atomic intrinsics can have an argument as input");
-      return true;
-    } else {
-      DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
-      if (findLoadStoreUses(I, UseList, VisitedList))
-        return true;
-    }
-  }
-  return false;
+static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) {
+	if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
+		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+		return false;
+	}
+	VisitedList->push_back(V);
+	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
+			ui != ue; ++ui) {
+		Instruction* I = dyn_cast<Instruction>(*ui);
+		if(!I) {
+			// if use is not an instruction, then skip it
+			continue;
+		}
+		DEBUG(errs() << "\t" << *I << "\n");
+		if(isa<LoadInst>(I)) {
+			DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
+			DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
+			UseList->push_back(V);
+		}
+		else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
+			// found a store in use chain
+			DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
+			return true;
+		}
+		else if(BuildDFG::isViscIntrinsic(I)) {
+			// If it is an atomic intrinsic, we found a store
+			IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+			assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
+					&& "Only visc atomic intrinsics can have an argument as input");
+			return true;
+		}
+		else {
+			DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
+			if(findLoadStoreUses(I, UseList, VisitedList))
+				return true;
+		}
+	}
+	return false;
 }
 
-static bool isDependentOnNodeInstanceID(Value *V,
-                                        std::vector<Value *> *DependenceList) {
-  if (std::find(DependenceList->begin(), DependenceList->end(), V) !=
-      DependenceList->end()) {
-    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-    return false;
-  }
-  DependenceList->push_back(V);
-  // If not an instruction, then not dependent on node instance id
-  if (!isa<Instruction>(V) || isa<Constant>(V)) {
-    DEBUG(errs() << "\tStop\n");
-    return false;
-  }
-
-  Instruction *I = cast<Instruction>(V);
-  for (unsigned i = 0; i < I->getNumOperands(); i++) {
-    Value *operand = I->getOperand(i);
-    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) {
-      if ((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x ||
-           II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y ||
-           II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
-        Value *Node = II->getArgOperand(0);
-        IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node);
-        assert(
-            GN &&
-            "NodeInstanceID operande should be node/parent node intrinsic\n");
-        if (GN->getIntrinsicID() == Intrinsic::visc_getNode) {
-          DEBUG(errs() << "\tDependency found on Node instance ID: " << *II
-                       << "\n");
-          return true;
-        }
-      }
-    }
-    if (CmpInst *CI = dyn_cast<CmpInst>(operand)) {
-      DEBUG(errs() << "Found compare instruction: " << *CI
-                   << "\nNot following its dependency list\n");
-      continue;
-    }
-    DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n");
-    if (isDependentOnNodeInstanceID(operand, DependenceList)) {
-      return true;
-    }
-  }
-  return false;
+static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) {
+	if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
+		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+		return false;
+	}
+	DependenceList->push_back(V);
+	// If not an instruction, then not dependent on node instance id
+	if(!isa<Instruction>(V) || isa<Constant>(V)) {
+		DEBUG(errs() << "\tStop\n");
+		return false;
+	}
+
+	Instruction* I = cast<Instruction>(V);
+	for(unsigned i = 0; i < I->getNumOperands(); i++) {
+		Value* operand = I->getOperand(i);
+		if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
+			if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
+						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
+						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
+				Value* Node = II->getArgOperand(0);
+				IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
+				assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
+				if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
+					DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
+					return true;
+				}
+			}
+		}
+		if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
+			DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
+			continue;
+		}
+		DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
+		if(isDependentOnNodeInstanceID(operand, DependenceList)) {
+			return true;
+		}
+	}
+	return false;
 }
 
 // Function to check if argument arg can be changed to a constant memory pointer
-static bool canBePromoted(Argument *arg, Function *F) {
-  DEBUG(errs() << "OPT: Check if Argument " << *arg
-               << " can be changed to constant memory\n");
-  std::vector<Value *> UseList;
-  std::vector<Value *> VisitedList;
-  // recursively traverse use chain
-  // if find a store instruction return false, everything fails, cannot be
-  // promoted
-  // if find a load instruction as use, add the GEP instruction to list
-  bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
-  if (foundStore == true)
-    return false;
-  // See that the GEP instructions are not dependent on getNodeInstanceID
-  // intrinsic
-  DEBUG(errs() << foundStore
-               << "\tNo Store Instruction found. Check dependence on node "
-                  "instance ID\n");
-  std::vector<Value *> DependenceList;
-  for (auto U : UseList) {
-    if (isDependentOnNodeInstanceID(U, &DependenceList))
-      return false;
-  }
-  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
-  return true;
+static bool canBePromoted(Argument* arg, Function* F) {
+	DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
+	std::vector<Value*> UseList;
+	std::vector<Value*> VisitedList;
+	// recursively traverse use chain
+	// if find a store instruction return false, everything fails, cannot be
+	// promoted
+	// if find a load instruction as use, add the GEP instruction to list
+	bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
+	if(foundStore == true)
+		return false;
+	// See that the GEP instructions are not dependent on getNodeInstanceID
+	// intrinsic
+	DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
+	std::vector<Value*>DependenceList;
+	for(auto U: UseList) {
+		if(isDependentOnNodeInstanceID(U, &DependenceList))
+			return false;
+	}
+	DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
+	return true;
 }
 
+
 // Calculate execute node parameters which include, number of diemnsions for
 // dynamic instances of the kernel, local and global work group sizes.
-static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr,
-                                 Value *&GlobalWGPtr, Kernel *kernel,
-                                 ValueToValueMapTy &VMap, Instruction *IB) {
-
-  // Assign number of dimenstions a constant value
-  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
-
-  // If local work group size if null
-  if (!kernel->hasLocalWG()) {
-    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
-  } else {
-    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-      if (isa<Argument>(kernel->localWGSize[i]))
-        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
-    }
-    LocalWGPtr =
-        genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
-  }
-
-  for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
-    if (isa<Argument>(kernel->globalWGSize[i]))
-      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
-  }
-
-  // For OpenCL, global work group size is the total bumber of instances in each
-  // dimension. So, multiply local and global dim limits.
-  std::vector<Value *> globalWGSizeInsts;
-  if (kernel->hasLocalWG()) {
-    for (unsigned i = 0; i < kernel->gridDim; i++) {
-      BinaryOperator *MulInst =
-          BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i],
-                                 kernel->localWGSize[i], "", IB);
-      globalWGSizeInsts.push_back(MulInst);
-    }
-  } else {
-    globalWGSizeInsts = kernel->globalWGSize;
-  }
-  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
-  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
+		&GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
+
+	// Assign number of dimenstions a constant value
+	workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+	// If local work group size if null
+	if(!kernel->hasLocalWG()) {
+		LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+	}
+	else {
+		for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+			if(isa<Argument>(kernel->localWGSize[i]))
+				kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+		}
+		LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+	}
+
+	for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+		if(isa<Argument>(kernel->globalWGSize[i]))
+			kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+	}
+
+	// For OpenCL, global work group size is the total bumber of instances in each
+	// dimension. So, multiply local and global dim limits.
+	std::vector<Value*> globalWGSizeInsts;
+	if(kernel->hasLocalWG()) {
+		for (unsigned i = 0; i < kernel->gridDim; i++) {
+			BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
+			globalWGSizeInsts.push_back(MulInst);
+		}
+	}
+	else {
+		globalWGSizeInsts = kernel->globalWGSize;
+	}
+	GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+	DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
 // CodeGen for allocating space for Work Group on stack and returning a pointer
 // to its address
-static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize,
-                              ValueToValueMapTy &VMap, Instruction *IB,
-                              const Twine &WGName) {
-  Value *WGPtr;
-  // Get int64_t and or ease of use
-  Type *Int64Ty = Type::getInt64Ty(M.getContext());
-
-  // Work Group type is [#dim x i64]
-  Type *WGTy = ArrayType::get(Int64Ty, WGSize.size());
-  // Allocate space of Global work group data on stack and get pointer to
-  // first element.
-  AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB);
-  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(),
-                                         WG->getName() + ".0", IB);
-  Value *nextDim = WGPtr;
-  DEBUG(errs() << *WGPtr << "\n");
-
-  // Iterate over the number of dimensions and store the global work group
-  // size in that dimension
-  for (unsigned i = 0; i < WGSize.size(); i++) {
-    DEBUG(errs() << *WGSize[i] << "\n");
-    assert(WGSize[i]->getType()->isIntegerTy() &&
-           "Dimension not an integer type!");
-
-    if (WGSize[i]->getType() != Int64Ty) {
-      // If number of dimensions are mentioned in any other integer format,
-      // generate code to extend it to i64. We need to use the mapped value in
-      // the new generated function, hence the use of VMap
-      // FIXME: Why are we changing the kernel WGSize vector here?
-      DEBUG(errs() << "Not i64. Zero extend required.\n");
-      DEBUG(errs() << *WGSize[i] << "\n");
-      CastInst *CI =
-          BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
-      DEBUG(errs() << "Bitcast done.\n");
-      StoreInst *SI = new StoreInst(CI, nextDim, IB);
-      DEBUG(errs() << "Zero extend done.\n");
-      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
-    } else {
-      // Store the value representing work group size in ith dimension on
-      // stack
-      StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB);
+static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
+	Value* WGPtr;
+	// Get int64_t and or ease of use
+	Type* Int64Ty = Type::getInt64Ty(M.getContext());
+
+	// Work Group type is [#dim x i64]
+	Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
+	// Allocate space of Global work group data on stack and get pointer to
+	// first element.
+	AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
+	WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
+	Value* nextDim = WGPtr;
+	DEBUG(errs() << *WGPtr << "\n");
+
+	// Iterate over the number of dimensions and store the global work group
+	// size in that dimension
+	for(unsigned i=0; i < WGSize.size(); i++) {
+		DEBUG(errs() << *WGSize[i] << "\n");
+		assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+
+		if(WGSize[i]->getType() != Int64Ty) {
+			// If number of dimensions are mentioned in any other integer format,
+			// generate code to extend it to i64. We need to use the mapped value in
+			// the new generated function, hence the use of VMap
+			// FIXME: Why are we changing the kernel WGSize vector here?
+			DEBUG(errs() << "Not i64. Zero extend required.\n");
+			DEBUG(errs() << *WGSize[i] << "\n");
+			CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+			DEBUG(errs() << "Bitcast done.\n");
+			StoreInst* SI = new StoreInst(CI, nextDim, IB);
+			DEBUG(errs() << "Zero extend done.\n");
+			DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+		} else {
+			// Store the value representing work group size in ith dimension on
+			// stack
+			StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
+
+			DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+		}
+		if(i+1 < WGSize.size()) {
+			// Move to next dimension
+			GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
+					ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
+					WG->getName()+"."+Twine(i+1),
+					IB);
+			DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+			nextDim = GEP;
+		}
+	}
+	return WGPtr;
 
-      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
-    }
-    if (i + 1 < WGSize.size()) {
-      // Move to next dimension
-      GetElementPtrInst *GEP = GetElementPtrInst::Create(
-          nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)),
-          WG->getName() + "." + Twine(i + 1), IB);
-      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
-      nextDim = GEP;
-    }
-  }
-  return WGPtr;
 }
 
 // Get generated PTX binary name
-static std::string getPTXFilename(const Module &M) {
-  std::string moduleID = M.getModuleIdentifier();
-  moduleID.append(".kernels.cl");
-  return moduleID;
+static std::string getPTXFilename(const Module& M) {
+	std::string moduleID = M.getModuleIdentifier();
+	moduleID.append(".kernels.cl");
+	return moduleID;
 }
 
 // Get the name of the input file from module ID
-static std::string getFilenameFromModule(const Module &M) {
-  std::string moduleID = M.getModuleIdentifier();
-  return moduleID.substr(moduleID.find_last_of("/") + 1);
+static std::string getFilenameFromModule(const Module& M) {
+	std::string moduleID = M.getModuleIdentifier();
+	return moduleID.substr(moduleID.find_last_of("/")+1);
 }
 
 // Changes the data layout of the Module to be compiled with NVPTX backend
 // TODO: Figure out when to call it, probably after duplicating the modules
 static void changeDataLayout(Module &M) {
-  std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
-  std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
+	std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+	std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
 
-  if (TARGET_PTX == 32)
-    M.setDataLayout(StringRef(nvptx32_layoutStr));
-  else if (TARGET_PTX == 64)
-    M.setDataLayout(StringRef(nvptx64_layoutStr));
-  else
-    assert(false && "Invalid PTX target");
+	if (TARGET_PTX == 32)
+		M.setDataLayout(StringRef(nvptx32_layoutStr));
+	else if (TARGET_PTX == 64)
+		M.setDataLayout(StringRef(nvptx64_layoutStr));
+	else assert(false && "Invalid PTX target");
 
-  return;
+	return;
 }
 
 static void changeTargetTriple(Module &M) {
-  std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
-  std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+	std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+	std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
-  if (TARGET_PTX == 32)
-    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
-  else if (TARGET_PTX == 64)
-    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
-  else
-    assert(false && "Invalid PTX target");
+	if (TARGET_PTX == 32)
+		M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+	else if (TARGET_PTX == 64)
+		M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+	else assert(false && "Invalid PTX target");
 
-  return;
+	return;
 }
 
 // Helper function, populate a vector with all return statements in a function
-static void findReturnInst(Function *F,
-                           std::vector<ReturnInst *> &ReturnInstVec) {
-  for (auto &BB : *F) {
-    if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
-      ReturnInstVec.push_back(RI);
-  }
+static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
+	for (auto &BB : *F) {
+		if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+			ReturnInstVec.push_back(RI);
+	}	
 }
 
-// Helper function, populate a vector with all IntrinsicID intrinsics in a
-// function
-static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID,
-                              std::vector<IntrinsicInst *> &IntrinsicInstVec) {
-  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-    Instruction *I = &(*i);
-    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
-    if (II && II->getIntrinsicID() == IntrinsicID) {
-      IntrinsicInstVec.push_back(II);
-    }
-  }
+// Helper function, populate a vector with all IntrinsicID intrinsics in a function
+static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
+	for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+		Instruction *I = &(*i);
+		IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+		if (II && II->getIntrinsicID() == IntrinsicID) {
+			IntrinsicInstVec.push_back(II);
+		}
+	}
 }
 
-// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic
-// op
+// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
-<<<<<<< HEAD
 	switch(ID) {
 		case Intrinsic::visc_atomic_add:
 			return AtomicRMWInst::Add;
@@ -2462,39 +2383,11 @@ static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
 		default:
 			llvm_unreachable("Unsupported atomic intrinsic!");
 	};
-=======
-  switch (ID) {
-  case Intrinsic::visc_atomic_add:
-    return AtomicRMWInst::Add;
-  case Intrinsic::visc_atomic_sub:
-    return AtomicRMWInst::Sub;
-  case Intrinsic::visc_atomic_min:
-    return AtomicRMWInst::Min;
-  case Intrinsic::visc_atomic_umin:
-    return AtomicRMWInst::UMin;
-  case Intrinsic::visc_atomic_max:
-    return AtomicRMWInst::Max;
-  case Intrinsic::visc_atomic_umax:
-    return AtomicRMWInst::UMax;
-    // case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
-    // case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
-  case Intrinsic::visc_atomic_xchg:
-    return AtomicRMWInst::Xchg;
-  case Intrinsic::visc_atomic_and:
-    return AtomicRMWInst::And;
-  case Intrinsic::visc_atomic_or:
-    return AtomicRMWInst::Or;
-  case Intrinsic::visc_atomic_xor:
-    return AtomicRMWInst::Xor;
-  default:
-    llvm_unreachable("Unsupported atomic intrinsic!");
-  };
->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f
 }
 
+
 // Helper funtion, returns the OpenCL function name, corresponding to atomic op
 static std::string getAtomicOpName(Intrinsic::ID ID) {
-<<<<<<< HEAD
 	switch(ID) {
 		case Intrinsic::visc_atomic_add:
 			return "atom_add";
@@ -2515,34 +2408,6 @@ static std::string getAtomicOpName(Intrinsic::ID ID) {
 		default:
 			llvm_unreachable("Unsupported atomic intrinsic!");
 	};
-=======
-  switch (ID) {
-  case Intrinsic::visc_atomic_cmpxchg:
-    return "atom_cmpxchg";
-  case Intrinsic::visc_atomic_add:
-    return "atom_add";
-  case Intrinsic::visc_atomic_sub:
-    return "atom_sub";
-  case Intrinsic::visc_atomic_min:
-    return "atom_min";
-  case Intrinsic::visc_atomic_max:
-    return "atom_max";
-  case Intrinsic::visc_atomic_inc:
-    return "atom_inc";
-  case Intrinsic::visc_atomic_dec:
-    return "atom_dec";
-  case Intrinsic::visc_atomic_xchg:
-    return "atom_xchg";
-  case Intrinsic::visc_atomic_and:
-    return "atom_and";
-  case Intrinsic::visc_atomic_or:
-    return "atom_or";
-  case Intrinsic::visc_atomic_xor:
-    return "atom_xor";
-  default:
-    llvm_unreachable("Unsupported atomic intrinsic!");
-  };
->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f
 }
 
 } // End of namespace
@@ -2553,3 +2418,4 @@ static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx",
 		false /* does not modify the CFG */,
 		true /* transformation,   *
 					* not just analysis */);
+
diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
index f6a958f79e..d39003a294 100644
--- a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -10,118 +10,112 @@
 #define DEBUG_TYPE "genvisc"
 #include "GenVISC/GenVISC.h"
 
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/CallSite.h"
-#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "SupportVISC/VISCHint.h"
+#include "SupportVISC/VISCUtils.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "SupportVISC/VISCUtils.h"
+
 
-#define TIMER(X)                                                               \
-  do {                                                                         \
-    if (VISCTimer) {                                                           \
-      X;                                                                       \
-    }                                                                          \
-  } while (0)
+#define TIMER(X) do { if (VISCTimer) { X; } } while (0)
 
 using namespace llvm;
 using namespace viscUtils;
 
+
 // VISC Command line option to use timer or not
-static cl::opt<bool> VISCTimer("visc-timers-gen",
-                               cl::desc("Enable GenVISC timer"));
+static cl::opt<bool>
+VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer"));
 
 namespace genvisc {
 
 // Helper Functions
 
-static inline ConstantInt *getTimerID(Module &, enum visc_TimerID);
-static Function *transformReturnTypeToStruct(Function *F);
-static Type *getReturnTypeFromReturnInst(Function *F);
+static inline ConstantInt* getTimerID(Module&, enum visc_TimerID);
+static Function* transformReturnTypeToStruct(Function* F);
+static Type* getReturnTypeFromReturnInst(Function* F);
 
 // Check if the dummy function call is a __visc__node call
-#define IS_VISC_CALL(callName)                                                 \
-  static bool isVISCCall_##callName(Instruction *I) {                          \
-    if (!isa<CallInst>(I))                                                     \
-      return false;                                                            \
-    CallInst *CI = cast<CallInst>(I);                                          \
-    return (CI->getCalledValue()->stripPointerCasts()->getName())              \
-        .equals("__visc__" #callName);                                         \
+#define IS_VISC_CALL(callName) \
+  static bool isVISCCall_##callName(Instruction* I) { \
+    if(!isa<CallInst>(I)) \
+      return false; \
+    CallInst* CI = cast<CallInst>(I); \
+    return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \
   }
 
-static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID,
-                                     std::vector<Instruction *> *Erase) {
+static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) {
   // Check if the instruction is Call Instruction
   assert(isa<CallInst>(I) && "Expecting CallInst");
-  CallInst *CI = cast<CallInst>(I);
+  CallInst* CI = cast<CallInst>(I);
   DEBUG(errs() << "Found call: " << *CI << "\n");
 
   // Find the correct intrinsic call
-  Module *M = CI->getParent()->getParent()->getParent();
-  Function *F;
-  std::vector<Type *> ArgTypes;
-  std::vector<Value *> args;
-  if (Intrinsic::isOverloaded(IntrinsicID)) {
+  Module* M = CI->getParent()->getParent()->getParent();
+  Function* F;
+  std::vector<Type*> ArgTypes;
+  std::vector<Value*> args;
+  if(Intrinsic::isOverloaded(IntrinsicID)) {
     // This is an overloaded intrinsic. The types must exactly match. Get the
     // argument types
-    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
       ArgTypes.push_back(CI->getArgOperand(i)->getType());
       args.push_back(CI->getArgOperand(i));
     }
     F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes);
     DEBUG(errs() << *F << "\n");
-  } else { // Non-overloaded intrinsic
+  }
+  else { // Non-overloaded intrinsic
     F = Intrinsic::getDeclaration(M, IntrinsicID);
-    FunctionType *FTy = F->getFunctionType();
+    FunctionType* FTy = F->getFunctionType();
     DEBUG(errs() << *F << "\n");
 
     // Create argument list
-    assert(CI->getNumArgOperands() == FTy->getNumParams() &&
-           "Number of arguments of call do not match with Intrinsic");
-    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
-      Value *V = CI->getArgOperand(i);
+    assert(CI->getNumArgOperands() == FTy->getNumParams()
+        && "Number of arguments of call do not match with Intrinsic");
+    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+      Value* V = CI->getArgOperand(i);
       // Either the type should match or both should be of pointer type
       assert((V->getType() == FTy->getParamType(i) ||
-              (V->getType()->isPointerTy() &&
-               FTy->getParamType(i)->isPointerTy())) &&
-             "Dummy function call argument does not match with Intrinsic "
-             "argument!");
+          (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
+          && "Dummy function call argument does not match with Intrinsic argument!");
       // If the types do not match, then both must be pointer type and pointer
       // cast needs to be performed
-      if (V->getType() != FTy->getParamType(i)) {
+      if(V->getType() != FTy->getParamType(i)) {
         V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
       }
       args.push_back(V);
     }
   }
   // Insert call instruction
-  CallInst *Inst = CallInst::Create(
-      F, args, F->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
+  CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
 
   DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
 
   CI->replaceAllUsesWith(Inst);
   // If the previous instruction needs to be erased, insert it in the vector
   // Erased
-  if (Erase != NULL)
+  if(Erase != NULL)
     Erase->push_back(CI);
 }
 
 IS_VISC_CALL(launch) /* Exists but not required */
-IS_VISC_CALL(edge)   /* Exists but not required */
+IS_VISC_CALL(edge) /* Exists but not required */
 IS_VISC_CALL(createNodeND)
-// IS_VISC_CALL(createNode)
-// IS_VISC_CALL(createNode1D)
-// IS_VISC_CALL(createNode2D)
-// IS_VISC_CALL(createNode3D)
+//IS_VISC_CALL(createNode)
+//IS_VISC_CALL(createNode1D)
+//IS_VISC_CALL(createNode2D)
+//IS_VISC_CALL(createNode3D)
 IS_VISC_CALL(bindIn)
 IS_VISC_CALL(bindOut)
 IS_VISC_CALL(push)
@@ -130,7 +124,7 @@ IS_VISC_CALL(getNode)
 IS_VISC_CALL(getParentNode)
 IS_VISC_CALL(barrier)
 IS_VISC_CALL(malloc)
-IS_VISC_CALL(return )
+IS_VISC_CALL(return)
 IS_VISC_CALL(getNodeInstanceID_x)
 IS_VISC_CALL(getNodeInstanceID_y)
 IS_VISC_CALL(getNodeInstanceID_z)
@@ -158,6 +152,7 @@ IS_VISC_CALL(sqrt)
 IS_VISC_CALL(sin)
 IS_VISC_CALL(cos)
 
+
 IS_VISC_CALL(init)
 IS_VISC_CALL(cleanup)
 IS_VISC_CALL(wait)
@@ -168,137 +163,136 @@ IS_VISC_CALL(attributes)
 IS_VISC_CALL(hint)
 
 // Return the constant integer represented by value V
-static unsigned getNumericValue(Value *V) {
-  assert(
-      isa<ConstantInt>(V) &&
-      "Value indicating the number of arguments should be a constant integer");
+static unsigned getNumericValue(Value* V) {
+  assert(isa<ConstantInt>(V)
+         && "Value indicating the number of arguments should be a constant integer");
   return cast<ConstantInt>(V)->getZExtValue();
 }
 
 // Take the __visc__return instruction and generate code for combining the
 // values being returned into a struct and returning it.
 // The first operand is the number of returned values
-static Value *genCodeForReturn(CallInst *CI) {
-  LLVMContext &Ctx = CI->getContext();
-  assert(isVISCCall_return(CI) && "__visc__return instruction expected!");
+static Value* genCodeForReturn(CallInst* CI) {
+  LLVMContext& Ctx = CI->getContext();
+  assert(isVISCCall_return(CI)
+      && "__visc__return instruction expected!");
 
   // Parse the dummy function call here
-  assert(CI->getNumArgOperands() > 0 &&
-         "Too few arguments for __visc_return call!\n");
+  assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n");
   unsigned numRetVals = getNumericValue(CI->getArgOperand(0));
 
-  assert(CI->getNumArgOperands() - 1 == numRetVals &&
+  assert(CI->getNumArgOperands()-1 == numRetVals &&
          "Too few arguments for __visc_return call!\n");
   DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n");
 
-  std::vector<Type *> ArgTypes;
-  for (unsigned i = 1; i < CI->getNumArgOperands(); i++) {
+  std::vector<Type*> ArgTypes;
+  for(unsigned i=1; i < CI->getNumArgOperands(); i++) {
     ArgTypes.push_back(CI->getArgOperand(i)->getType());
   }
   Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName();
-  StructType *RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true);
+  StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true);
 
-  InsertValueInst *IV = InsertValueInst::Create(
-      UndefValue::get(RetTy), CI->getArgOperand(1), 0, "returnStruct", CI);
+  InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy),
+                                                CI->getArgOperand(1),
+                                                0,
+                                                "returnStruct",
+                                                CI);
   DEBUG(errs() << "Code generation for return:\n");
   DEBUG(errs() << *IV << "\n");
 
-  for (unsigned i = 2; i < CI->getNumArgOperands(); i++) {
-    IV = InsertValueInst::Create(IV, CI->getArgOperand(i), i - 1, IV->getName(),
+  for(unsigned i=2; i < CI->getNumArgOperands(); i++) {
+    IV = InsertValueInst::Create(IV,
+                                 CI->getArgOperand(i),
+                                 i-1,
+                                 IV->getName(),
                                  CI);
     DEBUG(errs() << *IV << "\n");
   }
-
+  
   return IV;
 }
 
 // Analyse the attribute call for this function. Add the in and out
 // attributes to pointer parameters.
-static void handleVISCAttributes(Function *F, CallInst *CI) {
-  DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n"
-               << *F << "\n");
+static void handleVISCAttributes(Function* F, CallInst* CI) {
+  DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n");
   // Parse the dummy function call here
   unsigned offset = 0;
   // Find number of In pointers
-  assert(CI->getNumArgOperands() > offset &&
-         "Too few arguments for __visc__attributes call!");
+  assert(CI->getNumArgOperands() > offset
+         && "Too few arguments for __visc__attributes call!");
   unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset));
   DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n");
 
-  for (unsigned i = offset + 1; i < offset + 1 + numInPtrs; i++) {
-    Value *V = CI->getArgOperand(i);
-    if (Argument *arg = dyn_cast<Argument>(V)) {
-      F->addAttribute(1 + arg->getArgNo(), Attribute::In);
-    } else {
-      DEBUG(errs() << "Invalid argument to __visc__attribute: " << *V << "\n");
-      llvm_unreachable(
-          "Only pointer arguments can be passed to __visc__attributes call");
+  for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) {
+    Value* V = CI->getArgOperand(i);
+    if(Argument* arg = dyn_cast<Argument>(V)) {
+      F->addAttribute(1+arg->getArgNo(), Attribute::In);
+    }
+    else {
+      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
+      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
     }
   }
   // Find number of Out Pointers
   offset += 1 + numInPtrs;
-  assert(CI->getNumArgOperands() > offset &&
-         "Too few arguments for __visc__attributes call!");
+  assert(CI->getNumArgOperands() > offset
+         && "Too few arguments for __visc__attributes call!");
   unsigned numOutPtrs = getNumericValue(CI->getOperand(offset));
   DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n");
-  for (unsigned i = offset + 1; i < offset + 1 + numOutPtrs; i++) {
-    Value *V = CI->getArgOperand(i);
-    if (Argument *arg = dyn_cast<Argument>(V)) {
-      F->addAttribute(1 + arg->getArgNo(), Attribute::Out);
-    } else {
-      DEBUG(errs() << "Invalid argument to __visc__attribute: " << *V << "\n");
-      llvm_unreachable(
-          "Only pointer arguments can be passed to __visc__attributes call");
+  for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) {
+    Value* V = CI->getArgOperand(i);
+    if(Argument* arg = dyn_cast<Argument>(V)) {
+      F->addAttribute(1+arg->getArgNo(), Attribute::Out);
+    }
+    else {
+      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
+      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
     }
   }
-  DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n"
-               << *F << "\n");
+  DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n");
 }
 
 // Public Functions of GenVISC pass
 bool GenVISC::runOnModule(Module &M) {
-  DEBUG(errs() << "\nGENVISC PASS\n");
+  errs() << "\nGENVISC PASS\n";
   this->M = &M;
 
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
-  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL &&
+         "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
   Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll";
-  DEBUG(errs() << llvmSrcRoot << "\n");
+  errs() << llvmSrcRoot << "\n";
 
-  std::unique_ptr<Module> runtimeModule =
-      parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
 
-  if (runtimeModule == NULL)
-    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
+  if(runtimeModule == NULL)
+    DEBUG(errs() << Err.getMessage());
   else
     DEBUG(errs() << "Successfully loaded visc-rt API module\n");
 
-  llvm_visc_initializeTimerSet = M.getOrInsertFunction(
-      "llvm_visc_initializeTimerSet",
-      runtimeModule->getFunction("llvm_visc_initializeTimerSet")
-          ->getFunctionType());
-  // DEBUG(errs() << *llvm_visc_initializeTimerSet);
+  llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet",
+                                 runtimeModule->getFunction("llvm_visc_initializeTimerSet")->getFunctionType());
+  //DEBUG(errs() << *llvm_visc_initializeTimerSet);
 
-  llvm_visc_switchToTimer = M.getOrInsertFunction(
-      "llvm_visc_switchToTimer",
-      runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType());
-  // DEBUG(errs() << *llvm_visc_switchToTimer);
+  llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer",
+                            runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType());
+ // DEBUG(errs() << *llvm_visc_switchToTimer);
 
-  llvm_visc_printTimerSet = M.getOrInsertFunction(
-      "llvm_visc_printTimerSet",
-      runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType());
-  // DEBUG(errs() << *llvm_visc_printTimerSet);
+  llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet",
+                            runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType());
+  //DEBUG(errs() << *llvm_visc_printTimerSet);
 
   // Insert init context in main
   DEBUG(errs() << "Locate __visc__init()\n");
-  Function *VI = M.getFunction("__visc__init");
+  Function* VI = M.getFunction("__visc__init");
   assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
-  Instruction *I = cast<Instruction>(*VI->user_begin());
+  Instruction* I = cast<Instruction>(*VI->user_begin());
 
   DEBUG(errs() << "Initialize Timer Set\n");
   initializeTimerSet(I);
@@ -306,17 +300,18 @@ bool GenVISC::runOnModule(Module &M) {
 
   // Insert print instruction at visc exit
   DEBUG(errs() << "Locate __visc__cleanup()\n");
-  Function *VC = M.getFunction("__visc__cleanup");
+  Function* VC = M.getFunction("__visc__cleanup");
   assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
   I = cast<Instruction>(*VC->user_begin());
   printTimerSet(I);
 
+
   DEBUG(errs() << "-------- Searching for launch sites ----------\n");
 
-  std::vector<Instruction *> toBeErased;
-  std::vector<Function *> functions;
+  std::vector<Instruction*> toBeErased;
+  std::vector<Function*> functions;
 
-  for (auto &F : M)
+  for (auto &F : M) 
     functions.push_back(&F);
 
   // Iterate over all functions in the module
@@ -324,7 +319,7 @@ bool GenVISC::runOnModule(Module &M) {
     DEBUG(errs() << "Function: " << f->getName() << "\n");
 
     // List with the required additions in the function's return type
-    std::vector<Type *> FRetTypes;
+    std::vector<Type*> FRetTypes;
 
     enum mutateTypeCause {
       mtc_None,
@@ -335,106 +330,98 @@ bool GenVISC::runOnModule(Module &M) {
     bind = mutateTypeCause::mtc_None;
 
     // Iterate over all the instructions in this function
-    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e; ++i) {
-      Instruction *I = &*i; // Grab pointer to Instruction
+    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) {
+      Instruction* I = &*i; // Grab pointer to Instruction
       // If not a call instruction, move to next instruction
-      if (!isa<CallInst>(I))
+      if(!isa<CallInst>(I))
         continue;
 
-      CallInst *CI = cast<CallInst>(I);
-      LLVMContext &Ctx = CI->getContext();
+      CallInst* CI = cast<CallInst>(I);
+      LLVMContext& Ctx = CI->getContext();
 
-      if (isVISCCall_init(I)) {
+      if(isVISCCall_init(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased);
       }
-      if (isVISCCall_cleanup(I)) {
+      if(isVISCCall_cleanup(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased);
       }
-      if (isVISCCall_wait(I)) {
+      if(isVISCCall_wait(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased);
       }
-      if (isVISCCall_trackMemory(I)) {
+      if(isVISCCall_trackMemory(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased);
       }
-      if (isVISCCall_untrackMemory(I)) {
+      if(isVISCCall_untrackMemory(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased);
       }
-      if (isVISCCall_requestMemory(I)) {
+      if(isVISCCall_requestMemory(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased);
       }
-      if (isVISCCall_hint(I)) {
-        assert(isa<ConstantInt>(CI->getArgOperand(0)) &&
-               "Argument to hint must be constant integer!");
-        ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0));
+      if(isVISCCall_hint(I)) {
+        assert(isa<ConstantInt>(CI->getArgOperand(0))
+               && "Argument to hint must be constant integer!");
+        ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0));
 
-        visc::Target t = (visc::Target)hint->getZExtValue();
+        visc::Target t = (visc::Target) hint->getZExtValue();
         addHint(CI->getParent()->getParent(), t);
         DEBUG(errs() << "Found visc hint call: " << *CI << "\n");
         toBeErased.push_back(CI);
       }
-      if (isVISCCall_launch(I)) {
-        Function *LaunchF =
-            Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
+      if(isVISCCall_launch(I)) {
+        Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
         DEBUG(errs() << *LaunchF << "\n");
         // Get i8* cast to function pointer
-        Function *graphFunc = cast<Function>(CI->getArgOperand(1));
+        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
         graphFunc = transformReturnTypeToStruct(graphFunc);
-        Constant *F =
-            ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
-        assert(
-            F &&
-            "Function invoked by VISC launch has to be define and constant.");
-
-        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(0));
-        assert(Op && "VISC launch's streaming argument is a constant value.");
-        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
-                                          : ConstantInt::getTrue(Ctx);
-
+        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+	assert(F && "Function invoked by VISC launch has to be define and constant.");
+
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0));
+	assert(Op && "VISC launch's streaming argument is a constant value.");
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
+        
         auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType());
         assert(ArgTy && "VISC launch argument should be pointer type.");
         Value *Arg = CI->getArgOperand(2);
-        if (!ArgTy->getElementType()->isIntegerTy(8))
-          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2),
-                                               Type::getInt8PtrTy(Ctx), "", CI);
-        Value *LaunchArgs[] = {F, Arg, isStreaming};
-        CallInst *LaunchInst = CallInst::Create(
-            LaunchF, ArrayRef<Value *>(LaunchArgs, 3), "graphID", CI);
+        if(!ArgTy->getElementType()->isIntegerTy(8))
+          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), Type::getInt8PtrTy(Ctx), "", CI);
+        Value* LaunchArgs[] = {F, Arg, isStreaming};
+        CallInst* LaunchInst = CallInst::Create(LaunchF,
+                                                ArrayRef<Value*>(LaunchArgs, 3),
+                                                "graphID", CI);
         DEBUG(errs() << "Found visc launch call: " << *CI << "\n");
         DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
         CI->replaceAllUsesWith(LaunchInst);
         toBeErased.push_back(CI);
       }
-      if (isVISCCall_push(I)) {
+      if(isVISCCall_push(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased);
       }
-      if (isVISCCall_pop(I)) {
+      if(isVISCCall_pop(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased);
       }
-      if (isVISCCall_createNodeND(I)) {
+      if(isVISCCall_createNodeND(I)) {
         assert(CI->getNumArgOperands() > 0 &&
                "Too few arguments for __visc__createNodeND call");
         unsigned numDims = getNumericValue(CI->getArgOperand(0));
         // We need as meny dimension argments are there are dimensions
-        assert(CI->getNumArgOperands() - 2 == numDims &&
-               "Too few arguments for __visc_createNodeND call!\n");
+        assert(CI->getNumArgOperands()-2 == numDims &&
+              "Too few arguments for __visc_createNodeND call!\n");
 
-        Function *CreateNodeF;
+        Function* CreateNodeF;
         switch (numDims) {
         case 0:
-          CreateNodeF =
-              Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
           break;
         case 1:
-          CreateNodeF =
-              Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
           break;
         case 2:
-          CreateNodeF =
-              Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
           break;
         case 3:
-          CreateNodeF =
-              Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
           break;
         default:
           llvm_unreachable("Unsupported number of dimensions\n");
@@ -442,57 +429,63 @@ bool GenVISC::runOnModule(Module &M) {
         }
         DEBUG(errs() << *CreateNodeF << "\n");
         DEBUG(errs() << *I << "\n");
-        DEBUG(errs() << "in " << I->getParent()->getParent()->getName()
-                     << "\n");
+        DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n");
 
         // Get i8* cast to function pointer
-        Function *graphFunc = cast<Function>(CI->getArgOperand(1));
+        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
         graphFunc = transformReturnTypeToStruct(graphFunc);
-        Constant *F =
-            ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
 
-        CallInst *CreateNodeInst;
+        CallInst* CreateNodeInst;
         switch (numDims) {
         case 0:
-          CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F),
-                                            graphFunc->getName() + ".node", CI);
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(F),
+                                            graphFunc->getName()+".node", CI);
           break;
-        case 1: {
+        case 1:
+          {
           assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 2, expected to be i64\n");
-          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)};
-          CreateNodeInst = CallInst::Create(
-              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2),
-              graphFunc->getName() + ".node", CI);
-        } break;
-        case 2: {
+          Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)};
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(CreateNodeArgs, 2),
+                                            graphFunc->getName()+".node", CI);
+          }
+          break;
+        case 2:
+          {
           assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 2, expected to be i64\n");
           assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 3, expected to be i64\n");
-          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
+          Value* CreateNodeArgs[] = {F,
+                                     CI->getArgOperand(2),
                                      CI->getArgOperand(3)};
-          CreateNodeInst = CallInst::Create(
-              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3),
-              graphFunc->getName() + ".node", CI);
-        } break;
-        case 3: {
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(CreateNodeArgs, 3),
+                                            graphFunc->getName()+".node", CI);
+          }
+          break;
+        case 3:
+          {
           assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 2, expected to be i64\n");
           assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 3, expected to be i64\n");
           assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 4, expected to be i64\n");
-          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
+          Value* CreateNodeArgs[] = {F,
+                                     CI->getArgOperand(2),
                                      CI->getArgOperand(3),
                                      CI->getArgOperand(4)};
-          CreateNodeInst = CallInst::Create(
-              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4),
-              graphFunc->getName() + ".node", CI);
-        } break;
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(CreateNodeArgs, 4),
+                                            graphFunc->getName()+".node", CI);
+          }
+          break;
         default:
-          llvm_unreachable(
-              "Impossible path: number of dimensions is 0, 1, 2, 3\n");
+          llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n");
           break;
         }
 
@@ -502,104 +495,99 @@ bool GenVISC::runOnModule(Module &M) {
         toBeErased.push_back(CI);
       }
 
-      if (isVISCCall_edge(I)) {
-        Function *EdgeF =
-            Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
+      if(isVISCCall_edge(I)) {
+        Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
         DEBUG(errs() << *EdgeF << "\n");
-        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(5));
-        ConstantInt *EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2));
-        assert(Op && EdgeTypeOp &&
-               "Arguments of CreateEdge are not constant integers.");
-        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
-                                          : ConstantInt::getTrue(Ctx);
-        Value *isAllToAll = EdgeTypeOp->isZero() ? ConstantInt::getFalse(Ctx)
-                                                 : ConstantInt::getTrue(Ctx);
-        Value *EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                             isAllToAll,           CI->getArgOperand(3),
-                             CI->getArgOperand(4), isStreaming};
-        CallInst *EdgeInst = CallInst::Create(
-            EdgeF, ArrayRef<Value *>(EdgeArgs, 6), "output", CI);
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5));
+        ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2));
+	assert(Op && EdgeTypeOp && "Arguments of CreateEdge are not constant integers.");
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
+        Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx)
+                                                : ConstantInt::getTrue(Ctx);
+        Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                             isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4),
+                             isStreaming
+                            };
+        CallInst* EdgeInst = CallInst::Create(EdgeF,
+                                              ArrayRef<Value*>(EdgeArgs, 6),
+                                              "output", CI);
         DEBUG(errs() << "Found visc edge call: " << *CI << "\n");
         DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n");
         CI->replaceAllUsesWith(EdgeInst);
         toBeErased.push_back(CI);
       }
-      if (isVISCCall_bindIn(I)) {
-        Function *BindInF =
-            Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
+      if(isVISCCall_bindIn(I)) {
+        Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
         DEBUG(errs() << *BindInF << "\n");
         // Check if this is a streaming bind or not
-        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3));
-        assert(Op && "Streaming argument for bind in intrinsic should be a "
-                     "constant integer.");
-        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
-                                          : ConstantInt::getTrue(Ctx);
-        Value *BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                               CI->getArgOperand(2), isStreaming};
-        CallInst *BindInInst =
-            CallInst::Create(BindInF, ArrayRef<Value *>(BindInArgs, 4), "", CI);
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
+	assert(Op && "Streaming argument for bind in intrinsic should be a constant integer.");
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
+        Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), isStreaming
+                              };
+        CallInst* BindInInst = CallInst::Create(BindInF,
+                                                ArrayRef<Value*>(BindInArgs, 4),
+                                                "", CI);
         DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n");
         DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n");
         CI->replaceAllUsesWith(BindInInst);
         toBeErased.push_back(CI);
       }
-      if (isVISCCall_bindOut(I)) {
-        Function *BindOutF =
-            Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
+      if(isVISCCall_bindOut(I)) {
+        Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
         DEBUG(errs() << *BindOutF << "\n");
         // Check if this is a streaming bind or not
-        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3));
-        assert(Op && "Streaming argument for bind out intrinsic should be a "
-                     "constant integer.");
-        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
-                                          : ConstantInt::getTrue(Ctx);
-        Value *BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                                CI->getArgOperand(2), isStreaming};
-        CallInst *BindOutInst = CallInst::Create(
-            BindOutF, ArrayRef<Value *>(BindOutArgs, 4), "", CI);
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
+	assert(Op && "Streaming argument for bind out intrinsic should be a constant integer.");
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
+        Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                                CI->getArgOperand(2), isStreaming
+                               };
+        CallInst* BindOutInst = CallInst::Create(BindOutF,
+                                ArrayRef<Value*>(BindOutArgs, 4),
+                                "", CI);
         DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n");
         DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n");
 
         DEBUG(errs() << "Fixing the return type of the function\n");
         // FIXME: What if the child node function has not been visited already.
         // i.e., it's return type has not been fixed.
-        Function *F = I->getParent()->getParent();
+        Function* F = I->getParent()->getParent();
         DEBUG(errs() << F->getName() << "\n";);
-        IntrinsicInst *NodeIntrinsic =
-            cast<IntrinsicInst>(CI->getArgOperand(0));
-        assert(NodeIntrinsic &&
-               "Instruction value in bind out is not a create node intrinsic.");
+        IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0));
+	assert(NodeIntrinsic && "Instruction value in bind out is not a create node intrinsic.");
         DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
-        assert(
-            (NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode ||
-             NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D ||
-             NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D ||
-             NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) &&
-            "Instruction value in bind out is not a create node intrinsic.");
-        Function *ChildF = cast<Function>(
-            NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
+	assert((NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode ||
+		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D ||
+		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D ||
+		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) &&
+		"Instruction value in bind out is not a create node intrinsic.");
+        Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
         DEBUG(errs() << ChildF->getName() << "\n";);
         int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
         int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
-        StructType *ChildReturnTy = cast<StructType>(ChildF->getReturnType());
+        StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType());
 
-        Type *ReturnType = F->getReturnType();
+        Type* ReturnType = F->getReturnType();
         DEBUG(errs() << *ReturnType << "\n";);
-        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) &&
-               "Return type should either be a struct or void type!");
+        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType))
+            && "Return type should either be a struct or void type!");
 
-        FRetTypes.insert(FRetTypes.begin() + destpos,
-                         ChildReturnTy->getElementType(srcpos));
+        FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos));
         assert(((bind == mutateTypeCause::mtc_BIND) ||
                 (bind == mutateTypeCause::mtc_None)) &&
-               "Both bind_out and visc_return detected");
+                "Both bind_out and visc_return detected");
         bind = mutateTypeCause::mtc_BIND;
 
         CI->replaceAllUsesWith(BindOutInst);
         toBeErased.push_back(CI);
       }
-      if (isVISCCall_attributes(I)) {
-        Function *F = CI->getParent()->getParent();
+      if(isVISCCall_attributes(I)) {
+        Function* F = CI->getParent()->getParent();
         handleVISCAttributes(F, CI);
         toBeErased.push_back(CI);
       }
@@ -616,80 +604,65 @@ bool GenVISC::runOnModule(Module &M) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased);
       }
       if (isVISCCall_return(I)) {
-        DEBUG(errs() << "Function before visc return processing\n"
-                     << *I->getParent()->getParent() << "\n");
+        DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n");
         // The operands to this call are the values to be returned by the node
-        Value *ReturnVal = genCodeForReturn(CI);
+        Value* ReturnVal = genCodeForReturn(CI);
         DEBUG(errs() << *ReturnVal << "\n");
-        Type *ReturnType = ReturnVal->getType();
-        assert(isa<StructType>(ReturnType) &&
-               "Return type should be a struct type!");
+        Type* ReturnType = ReturnVal->getType();
+        assert(isa<StructType>(ReturnType)
+               && "Return type should be a struct type!");
 
         assert(((bind == mutateTypeCause::mtc_RETURN) ||
                 (bind == mutateTypeCause::mtc_None)) &&
-               "Both bind_out and visc_return detected");
+                "Both bind_out and visc_return detected");
 
         if (bind == mutateTypeCause::mtc_None) {
           // If this is None, this is the first __visc__return
           // instruction we have come upon. Place the return type of the
           // function in the return type vector
           bind = mutateTypeCause::mtc_RETURN;
-          StructType *ReturnStructTy = cast<StructType>(ReturnType);
+          StructType* ReturnStructTy = cast<StructType>(ReturnType);
           for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++)
             FRetTypes.push_back(ReturnStructTy->getElementType(i));
         } else { // bind == mutateTypeCause::mtc_RETURN
           // This is not the first __visc__return
-          // instruction we have come upon.
+          // instruction we have come upon. 
           // Check that the return types are the same
-          assert((ReturnType == FRetTypes[0]) &&
-                 "Multiple returns with mismatching types");
+          assert((ReturnType == FRetTypes[0])
+                 && "Multiple returns with mismatching types");
         }
 
-        ReturnInst *RetInst = ReturnInst::Create(Ctx, ReturnVal);
+        ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal);
         DEBUG(errs() << "Found visc return call: " << *CI << "\n");
-        Instruction *oldReturn = CI->getParent()->getTerminator();
-        assert(isa<ReturnInst>(oldReturn) &&
-               "Expecting a return to be the terminator of this BB!");
+        Instruction* oldReturn = CI->getParent()->getTerminator();
+        assert(isa<ReturnInst>(oldReturn)
+                && "Expecting a return to be the terminator of this BB!");
         DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
         DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
-        // CI->replaceAllUsesWith(RetInst);
+        //CI->replaceAllUsesWith(RetInst);
         toBeErased.push_back(CI);
         ReplaceInstWithInst(oldReturn, RetInst);
-        DEBUG(errs() << "Function after visc return processing\n"
-                     << *I->getParent()->getParent() << "\n");
+        DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n");
       }
 
       if (isVISCCall_getNodeInstanceID_x(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x,
-                                 &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased);
       }
       if (isVISCCall_getNodeInstanceID_y(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y,
-                                 &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased);
       }
       if (isVISCCall_getNodeInstanceID_z(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z,
-                                 &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased);
       }
       if (isVISCCall_getNumNodeInstances_x(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x,
-                                 &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased);
       }
       if (isVISCCall_getNumNodeInstances_y(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y,
-                                 &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased);
       }
       if (isVISCCall_getNumNodeInstances_z(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z,
-                                 &toBeErased);
-      }
-<<<<<<< HEAD
-=======
-      if (isVISCCall_atomic_cmpxchg(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg,
-                                 &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased);
       }
->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f
       if (isVISCCall_atomic_add(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased);
       }
@@ -714,19 +687,6 @@ bool GenVISC::runOnModule(Module &M) {
       if (isVISCCall_atomic_xor(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased);
       }
-<<<<<<< HEAD
-=======
-      if (isVISCCall_floor(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::floor, &toBeErased);
-      }
-      if (isVISCCall_rsqrt(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::nvvm_rsqrt_approx_f,
-                                 &toBeErased);
-      }
-      if (isVISCCall_sqrt(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::sqrt, &toBeErased);
-      }
->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f
       if (isVISCCall_sin(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased);
       }
@@ -737,155 +697,148 @@ bool GenVISC::runOnModule(Module &M) {
 
     // Erase the __visc__node calls
     DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
-    for (auto I : toBeErased) {
+    for(auto I: toBeErased) {
       DEBUG(errs() << *I << "\n");
     }
-    while (!toBeErased.empty()) {
-      Instruction *I = toBeErased.back();
+    while(!toBeErased.empty()) {
+      Instruction* I = toBeErased.back(); 
       DEBUG(errs() << "\tErasing " << *I << "\n");
       I->eraseFromParent();
-      toBeErased.pop_back();
+      toBeErased.pop_back(); 
     }
 
-    if (bind == mutateTypeCause::mtc_BIND ||
-        bind == mutateTypeCause::mtc_RETURN) {
-      DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
-      // Argument type list.
-      std::vector<Type *> FArgTypes;
-      for (Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
-           ai != ae; ++ai) {
-        FArgTypes.push_back(ai->getType());
-      }
-
-      // Find new return type of function
-      Type *NewReturnTy;
-      if (bind == mutateTypeCause::mtc_BIND) {
-
-        std::vector<Type *> TyList;
-        for (unsigned i = 0; i < FRetTypes.size(); i++)
-          TyList.push_back(FRetTypes[i]);
-
-        NewReturnTy =
-            StructType::create(f->getContext(), TyList,
-                               Twine("struct.out." + f->getName()).str(), true);
-      } else {
-        NewReturnTy = getReturnTypeFromReturnInst(f);
-        assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
-      }
-
-      FunctionType *FTy =
-          FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
-
-      // Change the function type
-      Function *newF = cloneFunction(f, FTy, false);
-      DEBUG(errs() << *newF << "\n");
-
-      if (bind == mutateTypeCause::mtc_BIND) {
-        // This is certainly an internal node, and hence just one BB with one
-        // return terminator instruction. Change return statement
-        ReturnInst *RI =
-            cast<ReturnInst>(newF->getEntryBlock().getTerminator());
-        ReturnInst *newRI = ReturnInst::Create(newF->getContext(),
-                                               UndefValue::get(NewReturnTy));
-        ReplaceInstWithInst(RI, newRI);
-      }
-      if (bind == mutateTypeCause::mtc_RETURN) {
-        // Nothing
-      }
-      replaceNodeFunctionInIR(*f->getParent(), f, newF);
-      DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
+    if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) {
+        DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
+        // Argument type list.
+        std::vector<Type*> FArgTypes;
+        for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
+            ai != ae; ++ai) {
+          FArgTypes.push_back(ai->getType());
+        }
+
+        // Find new return type of function
+        Type* NewReturnTy;
+        if(bind == mutateTypeCause::mtc_BIND) {
+
+          std::vector<Type*> TyList;
+          for (unsigned i = 0; i < FRetTypes.size(); i++)
+            TyList.push_back(FRetTypes[i]);
+
+          NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true);
+        }
+        else {
+          NewReturnTy = getReturnTypeFromReturnInst(f);
+          assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
+        }
+
+        FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
+
+        // Change the function type
+        Function* newF = cloneFunction(f, FTy, false);
+        DEBUG(errs() << *newF << "\n");
+
+        if (bind == mutateTypeCause::mtc_BIND) {
+          // This is certainly an internal node, and hence just one BB with one
+          // return terminator instruction. Change return statement
+          ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator());
+          ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy));
+          ReplaceInstWithInst(RI, newRI);        
+        }
+        if (bind == mutateTypeCause::mtc_RETURN) {
+          // Nothing
+        }
+        replaceNodeFunctionInIR(*f->getParent(), f, newF);
+        DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
     }
+
+
   }
-  return false; // TODO: What does returning "false" mean?
+  return false; //TODO: What does returning "false" mean?
 }
 
 // Generate Code for declaring a constant string [L x i8] and return a pointer
 // to the start of it.
-Value *GenVISC::getStringPointer(const Twine &S, Instruction *IB,
-                                 const Twine &Name) {
-  Constant *SConstant =
-      ConstantDataArray::getString(M->getContext(), S.str(), true);
-  Value *SGlobal =
-      new GlobalVariable(*M, SConstant->getType(), true,
-                         GlobalValue::InternalLinkage, SConstant, Name);
-  Value *Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
-  Value *GEPArgs[] = {Zero, Zero};
-  GetElementPtrInst *SPtr = GetElementPtrInst::Create(
-      nullptr, SGlobal, ArrayRef<Value *>(GEPArgs, 2), Name + "Ptr", IB);
+Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
+  Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true);
+  Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true,
+                                      GlobalValue::InternalLinkage, SConstant, Name);
+  Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
+  Value* GEPArgs[] = {Zero, Zero};
+  GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal,
+                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
   return SPtr;
 }
 
-void GenVISC::initializeTimerSet(Instruction *InsertBefore) {
-  Value *TimerSetAddr;
-  StoreInst *SI;
-  TIMER(TimerSet = new GlobalVariable(
-            *M, Type::getInt8PtrTy(M->getContext()), false,
-            GlobalValue::CommonLinkage,
-            Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
-            "viscTimerSet_GenVISC"));
-  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet
-               << "\n");
-  // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet <<
-  // "\n");
-
-  TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, None, "",
+void GenVISC::initializeTimerSet(Instruction* InsertBefore) {
+  Value* TimerSetAddr;
+  StoreInst* SI;
+  TIMER(TimerSet = new GlobalVariable(*M,
+                                      Type::getInt8PtrTy(M->getContext()),
+                                      false,
+                                      GlobalValue::CommonLinkage,
+                                      Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
+                                      "viscTimerSet_GenVISC"));
+  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n");
+  //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n");
+
+  TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
+                                        None,
+                                        "",
                                         InsertBefore));
   DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n");
   TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
   DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n");
 }
 
-void GenVISC::switchToTimer(enum visc_TimerID timer,
-                            Instruction *InsertBefore) {
-  Value *switchArgs[] = {TimerSet, getTimerID(*M, timer)};
+void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) {
+  Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)};
   TIMER(CallInst::Create(llvm_visc_switchToTimer,
-                         ArrayRef<Value *>(switchArgs, 2), "", InsertBefore));
+                         ArrayRef<Value*>(switchArgs, 2),
+                         "",
+                         InsertBefore));
 }
 
-void GenVISC::printTimerSet(Instruction *InsertBefore) {
-  Value *TimerName;
+void GenVISC::printTimerSet(Instruction* InsertBefore) {
+  Value* TimerName;
   TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore));
-  Value *printArgs[] = {TimerSet, TimerName};
+  Value* printArgs[] = {TimerSet, TimerName};
   TIMER(CallInst::Create(llvm_visc_printTimerSet,
-                         ArrayRef<Value *>(printArgs, 2), "", InsertBefore));
+                         ArrayRef<Value*>(printArgs, 2),
+                         "",
+                         InsertBefore));
 }
 
-static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) {
+static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
   return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
 }
 
-static Function *transformReturnTypeToStruct(Function *F) {
+static Function* transformReturnTypeToStruct(Function* F) {
   // Currently only works for void return types
-  DEBUG(errs() << "Transforming return type of function to Struct: "
-               << F->getName() << "\n");
+  DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
 
   if (isa<StructType>(F->getReturnType())) {
-    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": "
-                 << *F->getReturnType() << "\n");
+    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n");
     return F;
   }
 
-  assert(F->getReturnType()->isVoidTy() &&
-         "Unhandled case - Only void return type handled\n");
+  assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n");
 
   // Create the argument type list with added argument types
-  std::vector<Type *> ArgTypes;
-  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-       ai != ae; ++ai) {
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
     ArgTypes.push_back(ai->getType());
   }
-
-  StructType *RetTy =
-      StructType::create(F->getContext(), None, "emptyStruct", true);
-  FunctionType *FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg());
-
-  SmallVector<ReturnInst *, 8> Returns;
-  Function *newF = cloneFunction(F, FTy, false, &Returns);
+  
+  StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true);
+  FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg());
+  
+  SmallVector<ReturnInst*, 8> Returns;
+  Function* newF = cloneFunction(F, FTy, false, &Returns);
   // Replace ret void instruction with ret %RetTy undef
-  for (auto &RI : Returns) {
-    DEBUG(errs() << "Found return inst: " << *RI << "\n");
-    ReturnInst *newRI =
-        ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy));
+  for(auto &RI: Returns) {
+    DEBUG(errs() << "Found return inst: "<< *RI << "\n");
+    ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy));
     ReplaceInstWithInst(RI, newRI);
   }
 
@@ -893,20 +846,19 @@ static Function *transformReturnTypeToStruct(Function *F) {
   return newF;
 }
 
-static Type *getReturnTypeFromReturnInst(Function *F) {
-  for (BasicBlock &BB : *F) {
-    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-      DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType()
-                   << "\n");
+static Type* getReturnTypeFromReturnInst(Function* F) {
+  for(BasicBlock &BB: *F) {
+    if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+      DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() << "\n");
       return RI->getReturnValue()->getType();
     }
   }
 }
 
+
 char genvisc::GenVISC::ID = 0;
-static RegisterPass<genvisc::GenVISC>
-    X("genvisc",
-      "Pass to generate VISC IR from LLVM IR (with dummy function calls)",
-      false, false);
+static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false);
 
 } // End of namespace genvisc
+
+
-- 
GitLab