From a098504f823bc45a613b0620609066b445e5cb40 Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Mon, 17 Nov 2014 20:01:54 +0000
Subject: [PATCH] Refactored code to make insert runtime call function more
 readable

---
 .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         | 2375 +++++++++--------
 1 file changed, 1207 insertions(+), 1168 deletions(-)

diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index fafc892ea7..d7cdfa2e40 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -36,291 +36,637 @@ using namespace builddfg;
 //STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted");
 
 namespace {
+// Helper class declarations
+
+// Class to maintain the tuple of host pointer, device pointer and size
+// in bytes. Would have preferred to use tuple but support not yet available
+class OutputPtr {
+public:
+  OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
+    : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
+
+  Value* h_ptr;
+  Value* d_ptr;
+  Value* bytes;
+};
+
+// Class to maintain important kernel info required for generating runtime
+// calls
+class Kernel {
+public:
+  Kernel(Function* _KF, unsigned _gridDim = 0, std::vector<Value*>
+         _globalWGSize = std::vector<Value*>(), unsigned _blockDim = 0,
+         std::vector<Value*> _localWGSize = std::vector<Value*>())
+    : KernelFunction(_KF),
+      gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
+      localWGSize(_localWGSize) {
+
+    assert(gridDim == globalWGSize.size()
+           && "gridDim should be same as the size of vector globalWGSize");
+    assert(blockDim == localWGSize.size()
+           && "blockDim should be same as the size of vector localWGSize");
+  }
+
+  Function* KernelFunction;
+  unsigned gridDim;
+  unsigned blockDim;
+  std::vector<Value*> globalWGSize;
+  std::vector<Value*> localWGSize;
+};
+
 
 // Helper function declarations
-  static bool hasAttribute(Function*, unsigned, Attribute::AttrKind);
-  static std::string getPTXFilename(const Module&);
-  static std::string getFilenameFromModule(const Module& M);
-  static void changeDataLayout(Module &);
-  static void changeTargetTriple(Module &);
-  static std::string printType(Type*);
-  static std::string convertInt(int);
-  static void findReturnInst(Function *, std::vector<ReturnInst *> &);
-
-  // DFG2LLVM_NVPTX - The first implementation.
-  struct DFG2LLVM_NVPTX : public ModulePass {
-    static char ID; // Pass identification, replacement for typeid
-    DFG2LLVM_NVPTX() : ModulePass(ID) {}
-
-    private:
-    // Member variables
-
-    // Functions
-
-    public:
-    bool runOnModule(Module &M);
-
-    void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<BuildDFG>();
-      AU.addPreserved<BuildDFG>();
-    }
+static void getExecuteNodeParams(Value* &, Value* &, Value* &, Kernel*,
+                                 ValueToValueMapTy&, Instruction*);
+static bool hasAttribute(Function*, unsigned, Attribute::AttrKind);
+static std::string getPTXFilename(const Module&);
+static std::string getFilenameFromModule(const Module& M);
+static void changeDataLayout(Module &);
+static void changeTargetTriple(Module &);
+static std::string printType(Type*);
+static std::string convertInt(int);
+static void findReturnInst(Function *, std::vector<ReturnInst *> &);
+
+// DFG2LLVM_NVPTX - The first implementation.
+struct DFG2LLVM_NVPTX : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_NVPTX() : ModulePass(ID) {}
+
+private:
+  // Member variables
+
+  // Functions
+
+public:
+  bool runOnModule(Module &M);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<BuildDFG>();
+    AU.addPreserved<BuildDFG>();
+  }
 
 
-  };
-
-  // Helper class to maintain the tuple of host pointer, device pointer and size
-  // in bytes. Would have preferred to use tuple but support not yet available
-  class OutputPtr {
-  public:
-    OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
-      : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
-
-    Value* h_ptr;
-    Value* d_ptr;
-    Value* bytes;
-  };
-
-  // Visitor for Code generation traversal (tree traversal for now)
-  class Kernel {
-  public:
-    Kernel(Function* _KF, unsigned _gridDim = 0, std::vector<Value*>
-        _globalWGSize = std::vector<Value*>(), unsigned _blockDim = 0,
-        std::vector<Value*> _localWGSize = std::vector<Value*>()) : KernelFunction(_KF),
-    gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
-    localWGSize(_localWGSize) {
-      assert(gridDim == globalWGSize.size()
-          && "gridDim should be same as the size of vector globalWGSize");
-      assert(blockDim == localWGSize.size()
-          && "blockDim should be same as the size of vector localWGSize");
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CodeGenTraversal : public DFNodeVisitor {
+
+private:
+  //Member variables
+  Module &M;
+  Module &KernelM;
+  BuildDFG &DFG;
+  DFNode* KernelLaunchNode;
+  Kernel* kernel;
+  // Map from a DFNode to the Value generated for it (the call whose result
+  // holds the node's outputs). getInValueAt() uses it to extract a sibling
+  // node's output and to check that code has already been generated for the
+  // source node of an edge (i.e., "Have we visited this node before?")
+  DenseMap<DFNode*, Value*> OutputMap;
+
+  // VISC Runtime API
+  Module* runtimeModule;
+  Function* llvm_visc_ptx_launch;
+  Function* llvm_visc_ptx_wait;
+  Function* llvm_visc_ptx_initContext;
+  Function* llvm_visc_ptx_argument_scalar;
+  Function* llvm_visc_ptx_argument_ptr;
+  Function* llvm_visc_ptx_getOutput;
+  Function* llvm_visc_ptx_executeNode;
+
+
+  //Functions
+  std::string getKernelsModuleName(Module &M);
+  void fixValueAddrspace(Value* V, unsigned addrspace);
+  Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = "");
+  void changeArgAddrspace(Function* F, unsigned i);
+  void addCLMetadata(Function* F);
+  void writeKernelsModule();
+  void transformFunctionToVoid(Function* F);
+  void initRuntimeAPI();
+  void addIdxDimArgs(Function* F);
+  Argument* getArgumentAt(Function* F, unsigned offset);
+  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                      Instruction* InsertBefore);
+  void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName);
+
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+
+  // Constructor
+  CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG), KernelM(*CloneModule(&_M)) {
+    // Initialize Runtime API
+    initRuntimeAPI();
+
+    // Copying instead of creating new, in order to preserve required info (metadata)
+
+    // Remove functions, global variables and aliases
+    std::vector<GlobalVariable*> gvv = std::vector<GlobalVariable*>();
+    for (Module::global_iterator mi = KernelM.global_begin(),
+         me = KernelM.global_end(); (mi != me); ++mi) {
+      GlobalVariable* gv = &*mi;
+      gvv.push_back(gv);
+    }
+    for (std::vector<GlobalVariable*>::iterator vi = gvv.begin(); vi != gvv.end(); ++vi) {
+      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
+      (*vi)->eraseFromParent();
     }
 
-    Function* KernelFunction;
-    unsigned gridDim;
-    unsigned blockDim;
-    std::vector<Value*> globalWGSize;
-    std::vector<Value*> localWGSize;
-  };
-
-  class CodeGenTraversal : public DFNodeVisitor {
-
-  private:
-    //Member variables
-    Module &M;
-    Module &KernelM;
-    BuildDFG &DFG;
-    DFNode* KernelLaunchNode;
-    Kernel* kernel;
-    // Map from Old function associated with DFNode to new cloned function with
-    // extra index and dimension arguments. This map also serves to find out if
-    // we already have an index and dim extended function copy or not (i.e.,
-    // "Have we visited this function before?")
-    DenseMap<DFNode*, Value*> OutputMap;
-
-    // VISC Runtime API
-    Module* runtimeModule;
-    Function* llvm_visc_ptx_launch;
-    Function* llvm_visc_ptx_wait;
-    Function* llvm_visc_ptx_initContext;
-    Function* llvm_visc_ptx_argument_scalar;
-    Function* llvm_visc_ptx_argument_ptr;
-    Function* llvm_visc_ptx_getOutput;
-    Function* llvm_visc_ptx_executeNode;
-
-
-    //Functions
-    std::string getKernelsModuleName(Module &M);
-    void fixValueAddrspace(Value* V, unsigned addrspace);
-    Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = "");
-    void changeArgAddrspace(Function* F, unsigned i);
-    void addCLMetadata(Function* F);
-    void writeKernelsModule();
-    void transformFunctionToVoid(Function* F);
-    void initRuntimeAPI();
-    void addIdxDimArgs(Function* F);
-    Argument* getArgumentAt(Function* F, unsigned offset);
-    Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
-                        Instruction* InsertBefore);
-    void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName);
-
-    void codeGen(DFInternalNode* N);
-    void codeGen(DFLeafNode* N);
-
-  public:
-
-    // Constructor
-    CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG), KernelM(*CloneModule(&_M)) {
-      // Initialize Runtime API
-      initRuntimeAPI();
-
-      // Copying instead of creating new, in order to preserve required info (metadata)
-
-      // Remove functions, global variables and aliases
-      std::vector<GlobalVariable*> gvv = std::vector<GlobalVariable*>();
-      for (Module::global_iterator mi = KernelM.global_begin(),
-           me = KernelM.global_end(); (mi != me); ++mi) {
-        GlobalVariable* gv = &*mi;
-        gvv.push_back(gv);
-      }
-      for (std::vector<GlobalVariable*>::iterator vi = gvv.begin(); vi != gvv.end(); ++vi) {
-        (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
-        (*vi)->eraseFromParent();
-      }
+    std::vector<Function*> fv = std::vector<Function*>();
+    for (Module::iterator mi = KernelM.begin(),
+         me = KernelM.end(); (mi != me); ++mi) {
+      Function* f = &*mi;
+      fv.push_back(f);
+    }
+    for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) {
+      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
+      (*vi)->eraseFromParent();
+    }
 
-      std::vector<Function*> fv = std::vector<Function*>();
-      for (Module::iterator mi = KernelM.begin(),
-           me = KernelM.end(); (mi != me); ++mi) {
-        Function* f = &*mi;
-        fv.push_back(f);
-      }
-      for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) {
-        (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
-        (*vi)->eraseFromParent();
-      }
+    std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>();
+    for (Module::alias_iterator mi = KernelM.alias_begin(),
+         me = KernelM.alias_end(); (mi != me); ++mi) {
+      GlobalAlias* a = &*mi;
+      av.push_back(a);
+    }
+    for (std::vector<GlobalAlias*>::iterator vi = av.begin(); vi != av.end(); ++vi) {
+      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
+      (*vi)->eraseFromParent();
+    }
 
-      std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>();
-      for (Module::alias_iterator mi = KernelM.alias_begin(),
-           me = KernelM.alias_end(); (mi != me); ++mi) {
-        GlobalAlias* a = &*mi;
-        av.push_back(a);
-      }
-      for (std::vector<GlobalAlias*>::iterator vi = av.begin(); vi != av.end(); ++vi) {
-        (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
-        (*vi)->eraseFromParent();
-      }
+    changeDataLayout(KernelM);
+    changeTargetTriple(KernelM);
 
-      changeDataLayout(KernelM);
-      changeTargetTriple(KernelM);
+    DEBUG(errs() << KernelM);
 
-      DEBUG(errs() << KernelM);
+  }
 
+  virtual void visit(DFInternalNode* N) {
+    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
+        e = N->getChildGraph()->end(); i != e; ++i) {
+      DFNode* child = *i;
+      child->applyDFNodeVisitor(*this);
     }
 
-    virtual void visit(DFInternalNode* N) {
-      for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
-          e = N->getChildGraph()->end(); i != e; ++i) {
-        DFNode* child = *i;
-        child->applyDFNodeVisitor(*this);
-      }
-
-      DEBUG(errs() << "Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n");
-      codeGen(N);
-      DEBUG(errs() << "DONE" << "\n");
+    DEBUG(errs() << "Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n");
+    codeGen(N);
+    DEBUG(errs() << "DONE" << "\n");
 
-    }
+  }
 
-    virtual void visit(DFLeafNode* N) {
-      DEBUG(errs() << "Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n");
-      codeGen(N);
-      DEBUG(errs() << "DONE" << "\n");
-    }
+  virtual void visit(DFLeafNode* N) {
+    DEBUG(errs() << "Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n");
+    codeGen(N);
+    DEBUG(errs() << "DONE" << "\n");
+  }
 
-  };
+};
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+void CodeGenTraversal::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+  runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext());
+  if(runtimeModule == NULL)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  llvm_visc_ptx_launch = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_launch",
+                                        runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType()));
+  DEBUG(errs() << *llvm_visc_ptx_launch);
+
+  llvm_visc_ptx_wait = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_wait",
+                                      runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType()));
+  DEBUG(errs() << *llvm_visc_ptx_wait);
+
+  llvm_visc_ptx_initContext = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_initContext"  ,
+                              runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType()));
+  DEBUG(errs() << *llvm_visc_ptx_initContext);
+
+  llvm_visc_ptx_argument_scalar = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_scalar",
+                                  runtimeModule->getFunction("llvm_visc_ptx_argument_scalar")->getFunctionType()));
+  DEBUG(errs() << *llvm_visc_ptx_argument_scalar);
+
+  llvm_visc_ptx_argument_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_ptr",
+                               runtimeModule->getFunction("llvm_visc_ptx_argument_ptr")->getFunctionType()));
+  DEBUG(errs() << *llvm_visc_ptx_argument_ptr);
+
+  llvm_visc_ptx_getOutput = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_getOutput",
+                            runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType()));
+  DEBUG(errs() << *llvm_visc_ptx_getOutput);
+
+  llvm_visc_ptx_executeNode = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_executeNode",
+                              runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType()));
+  DEBUG(errs() << *llvm_visc_ptx_executeNode);
+
+}
+void CodeGenTraversal::addIdxDimArgs(Function* F) {
+  // Add Index and Dim arguments
+  std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
+  for (int i = 0; i < 6; ++i) {
+    new Argument(Type::getInt32Ty(F->getContext()), names[i], F);
+  }
 
-  // Initialize the VISC runtime API. This makes it easier to insert these calls
-  void CodeGenTraversal::initRuntimeAPI() {
+  // Create the argument type list with added argument types
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+  // Adding new arguments to the function's argument list does not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments.
+  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+  PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
+
+  // Change the function type
+  F->mutateType(PTy);
+}
+
+/* Traverse the function F argument list to get argument at offset*/
+Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) {
+  assert((F->getFunctionType()->getNumParams() > offset && offset >= 0)
+         && "Invalid offset to access arguments!");
+
+  Argument* arg;
+  Function::arg_iterator i = F->arg_begin(), e = F->arg_end();
+  for(; offset != 0 && i!=e; i++) {
+    offset--;
+  }
+  arg = i;
+  DEBUG(errs() << *arg <<"\n");
+  return arg;
+}
+
+
+Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                                      Instruction* InsertBefore) {
+  // TODO: Assumption is that each input port of a node has just one
+  // incoming edge. May change later on.
+
+  // Find the incoming edge at the requested input port
+  DFEdge* E = Child->getInDFEdgeAt(i);
+  assert(E && "No incoming edge or binding for input element!");
+  // Find the Source DFNode associated with the incoming edge
+  DFNode* SrcDF = E->getSourceDF();
+
+  // If Source DFNode is a dummyNode, edge is from parent. Get the
+  // argument from argument list of this internal node
+  Value* inputVal;
+  if(SrcDF->isEntryNode()) {
+    inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
+    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+  }
+  else {
+    // edge is from a sibling
+    // Check - code should already be generated for this source dfnode
+    assert(OutputMap.count(SrcDF)
+           && "Source node call not found. Dependency violation!");
+
+    // Find CallInst associated with the Source DFNode using OutputMap
+    Value* CI = OutputMap[SrcDF];
+
+    // Extract element at source position from this call instruction
+    std::vector<unsigned> IndexList;
+    IndexList.push_back(E->getSourcePosition());
+    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                           "", InsertBefore);
+    inputVal = EI;
+  }
+  return inputVal;
+}
+
+// Generate Code for declaring a constant string [L x i8] and return a pointer
+// to the start of it.
+Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
+  Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true);
+  Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true,
+                                      GlobalValue::InternalLinkage, SConstant, Name);
+  Value* Zero = ConstantInt::get(Type::getInt64Ty(getGlobalContext()), 0);
+  Value* GEPArgs[] = {Zero, Zero};
+  GetElementPtrInst* SPtr = GetElementPtrInst::Create(SGlobal,
+                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
+  return SPtr;
+}
+
+// Generate Code to call the kernel
+// The plan is to replace the internal node with a leaf node. This method is
+// used to generate a function to associate with this leaf node. The function
+// is responsible for all the memory allocation/transfer and invoking the
+// kernel call on the device
+void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) {
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before.
+  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+
+  // Useful values
+  Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
+  Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
+
+  // If kernel struct has not been initialized with kernel function, then fail
+  assert(kernel != NULL && "No kernel found!!");
+
+  DEBUG(errs() << "Generating kernel call code\n");
+
+  Function* F = N->getFuncPointer();
+
+
+  // Create a clone of F with no instructions. Only the type is the same as F
+  // without the extra arguments.
+  Function* F_X86;
+
+  // Clone the function, if we are seeing this function for the first time. We
+  // only need a clone in terms of type.
+  ValueToValueMapTy VMap;
+
+  // Create new function with the same type
+  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+
+  // Loop over the arguments, copying the names of arguments over.
+  Function::arg_iterator dest_iterator = F_X86->arg_begin();
+  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+       i != e; ++i) {
+    dest_iterator->setName(i->getName()); // Copy the name over...
+    // Add mapping to VMap and increment dest iterator
+    VMap[i] = dest_iterator++;
+  }
 
-    // Load Runtime API Module
-    SMDiagnostic Err;
-    runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext());
-    if(runtimeModule == NULL)
-      DEBUG(errs() << Err.getMessage());
-    else
-      DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+  // Add a basic block to this empty function
+  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
+  ReturnInst* RI = ReturnInst::Create(M.getContext(),
+                                      UndefValue::get(F_X86->getReturnType()), BB);
 
-    // Get or insert the global declarations for launch/wait functions
-    llvm_visc_ptx_launch = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_launch",
-        runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_launch);
+  //Add the generated function info to DFNode
+  N->setGenFunc(F_X86, DFNode::X86);
 
-    llvm_visc_ptx_wait = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_wait",
-        runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_wait);
+  // FIXME: Adding Index and Dim arguments is probably not required except
+  // for consistency purposes (DFG2LLVM_X86 does assume that all leaf nodes do
+  // have those arguments)
 
-    llvm_visc_ptx_initContext = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_initContext"  ,
-        runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_initContext);
+  // Add Index and Dim arguments except for the root node
+  if(!N->isRoot())
+    addIdxDimArgs(F_X86);
 
-    llvm_visc_ptx_argument_scalar = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_scalar",
-        runtimeModule->getFunction("llvm_visc_ptx_argument_scalar")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_argument_scalar);
-    
-    llvm_visc_ptx_argument_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_ptr",
-        runtimeModule->getFunction("llvm_visc_ptx_argument_ptr")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_argument_ptr);
+  // Sort children in topological order before code generation for kernel call
+  N->getChildGraph()->sortChildren();
 
-    llvm_visc_ptx_getOutput = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_getOutput",
-      runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_getOutput);
+  // The DFNode N has the property that it has only one child (leaving Entry
+  // and Exit dummy nodes). This child is the PTX kernel. This simplifies code
+  // generation for kernel calls significantly. All the inputs to this child
+  // node would either be constants or from the parent node N.
 
-    llvm_visc_ptx_executeNode = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_executeNode",
-        runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_executeNode);
+  assert(N->getChildGraph()->size() == 3
+         && "Node expected to have just one non-dummy node!");
 
+  DFNode* C;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    C = *ci;
+    // Skip dummy node call
+    if (!C->isDummyNode())
+      break;
   }
-  void CodeGenTraversal::addIdxDimArgs(Function* F) {
-    // Add Index and Dim arguments
-    std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
-    for (int i = 0; i < 6; ++i) {
-      new Argument(Type::getInt32Ty(F->getContext()), names[i], F);
-    }
 
-    // Create the argument type list with added argument types
-    std::vector<Type*> ArgTypes;
-    for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-        ai != ae; ++ai) {
-      ArgTypes.push_back(ai->getType());
-    }
-    // Adding new arguments to the function argument list, would not change the
-    // function type. We need to change the type of this function to reflect the
-    // added arguments
-    FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
-    PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
-
-    // Change the function type
-    F->mutateType(PTy);
-  }
+  assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+
+  Function* CF = C->getFuncPointer();
+  // Initialize context
+  DEBUG(errs() << "Initializing context" << "\n");
+  CallInst::Create(llvm_visc_ptx_initContext, None, "", RI);
 
-  /* Traverse the function F argument list to get argument at offset*/
-  Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) {
-    assert((F->getFunctionType()->getNumParams() > offset && offset >= 0)
-           && "Invalid offset to access arguments!");
+  DEBUG(errs() << "Initializing commandQ" << "\n");
+  // Initialize command queue
+  Value* fileStr = getStringPointer(FileName, RI, "Filename");
+  errs() << *fileStr << "\n";
+  errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n";
+  Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName");
 
-    Argument* arg;
-    Function::arg_iterator i = F->arg_begin(), e = F->arg_end();
-    for(; offset != 0 && i!=e; i++) {
-      offset--;
+  Value* LaunchInstArgs[] = {fileStr, kernelStr};
+
+  DEBUG(errs() << "Inserting launch call" << "\n");
+  CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
+                                       ArrayRef<Value*>(LaunchInstArgs, 2),
+                                       "graph"+CF->getName(),
+                                       RI);
+  DEBUG(errs() << *GraphID << "\n");
+  // Iterate over the required input edges of the node and use the visc-rt API
+  // to set inputs
+  DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
+
+  std::vector<OutputPtr> OutputPointers;
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+
+    Value* inputVal = getInValueAt(C, i, F_X86, RI);
+    // input value has been obtained.
+    // Check if input is a scalar value or a pointer operand
+    // For scalar values such as int, float, etc. the size is simply the size of
+    // type on target machine, but for pointers, the size of data would be the
+    // next integer argument
+    if(inputVal->getType()->isPointerTy()) {
+      // CheckAttribute
+      Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False;
+      Value* isInput = ((hasAttribute(CF, i, Attribute::Out))
+                        && !(hasAttribute(CF, i, Attribute::In)))? False : True;
+
+      Argument* A = getArgumentAt(CF, i);
+      if(isOutput == True) {
+        errs() << *A << " is an OUTPUT argument\n";
+      }
+      if(isInput == True) {
+        errs() << *A << " is an INPUT argument\n";
+      }
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
+                             Type::getInt8PtrTy(M.getContext()),
+                             inputVal->getName()+".i8ptr",
+                             RI);
+      // Pointer Input
+      Value* inputSize = getInValueAt(C, i+1, F_X86, RI);
+      assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
+             && "Pointer type input must always be followed by size (integer type)");
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               inputSize,
+                               isInput,
+                               isOutput
+                              };
+      Value* d_ptr = CallInst::Create(llvm_visc_ptx_argument_ptr,
+                                      ArrayRef<Value*>(setInputArgs, 6), "", RI);
+      // If this has out attribute, store the returned device pointer in
+      // memory to read device memory later
+      if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
+    }
+    else { // Scalar Input
+      // Store the scalar value on stack and then pass the pointer to its
+      // location
+      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI);
+      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+                             Type::getInt8PtrTy(M.getContext()),
+                             inputVal->getName()+".i8ptr",
+                             RI);
+
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               ConstantExpr::getSizeOf(inputVal->getType())
+                              };
+      CallInst::Create(llvm_visc_ptx_argument_scalar,
+                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
     }
-    arg = i;
-    DEBUG(errs() << *arg <<"\n");
-    return arg;
-  }
 
+  }
+  DEBUG(errs() << "Setup output edges of node and insert visc api\n");
+  // Setup output
+  // FIXME: Note - There is a tricky question. In X86 we do not need to care
+  // about pointer inputs which modify data in memory implicitly (without
+  // showing it as output). There is no extra cost needed to handle such inputs
+  // For PTX, we need to read back such data from device memory to host memory.
+  // The cost is huge and hence we need to differentiate between readonly
+  // pointer inputs vs read/write pointer inputs. Currently supporting only a
+  // simple model in which all input edges are readonly and output is
+  // writeonly.
+
+  // Set output
+  StructType* OutputTy = C->getOutputType();
+  unsigned outputIndex = CF->getFunctionType()->getNumParams();
+  Value* outputSize = ConstantExpr::getSizeOf(OutputTy);
+  Value* setOutputArgs[] = {GraphID,
+                            Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+                            ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
+                            ConstantExpr::getSizeOf(OutputTy),
+                            False,
+                            True
+                           };
+
+  CallInst* d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr,
+                                        ArrayRef<Value*>(setOutputArgs, 6),
+                                        "d_output."+CF->getName(),
+                                        RI);
 
-  Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
-                                        Instruction* InsertBefore) {
-    // TODO: Assumption is that each input port of a node has just one
-    // incoming edge. May change later on.
+  // Enqueue kernel
+  // Need work dim, localworksize, globalworksize
+  // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, local work
+  // size and global work size
+  // Allocate size_t[numDims] space on stack. Store the work group sizes and
+  // pass it as an argument to ExecNode
+
+  Value *workDim, *LocalWGPtr, *GlobalWGPtr;
+  getExecuteNodeParams(workDim, LocalWGPtr, GlobalWGPtr, kernel, VMap, RI);
+  Value* ExecNodeArgs[] = {GraphID,
+                           workDim,
+                           LocalWGPtr,
+                           GlobalWGPtr
+                          };
+  CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode,
+                                     ArrayRef<Value*>(ExecNodeArgs, 4),
+                                     "event."+CF->getName(),
+                                     RI);
+  DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
+  // Wait for Kernel to Finish
+  CallInst::Create(llvm_visc_ptx_wait,
+                   ArrayRef<Value*>(GraphID),
+                   "",
+                   RI);
+  // Read Output Struct
+  Value* GetOutputArgs[] = {GraphID,
+                            Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+                            d_Output,
+                            outputSize
+                           };
+  CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput,
+                                        ArrayRef<Value*>(GetOutputArgs, 4),
+                                        "h_output."+CF->getName()+".addr",
+                                        RI);
+  // Read each device pointer listed in output struct
+  // Load the output struct
+  CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI);
+  Value* KernelOutput = new LoadInst(BI, "", RI);
+
+  // Read all the pointer arguments which had side effects, i.e., had the out
+  // attribute
+  for(auto output: OutputPointers) {
+    errs() << "Read: " << *output.d_ptr << "\n";
+    errs() << "\t To: " << *output.h_ptr << "\n";
+    errs() << "\t #bytes: " << *output.bytes << "\n";
+    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
+    CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput,
+                                    ArrayRef<Value*>(GetOutputArgs, 4),
+                                    "", RI);
+  }
+  /*for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+    Type* elemTy = OutputTy->getElementType(i);
+    if(elemTy->isPointerTy()) {
+      // Pointer type
+      assert(OutputTy->getElementType(i+1) == Type::getInt64Ty(M.getContext())
+          && "Every Pointer type must be followed by an integer");
+      ExtractValueInst* d_ptr = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i), "", RI);
+      // Change d_ptr to i8*
+      CastInst* d_ptr_i8 = BitCastInst::CreatePointerCast(d_ptr, Type::getInt8PtrTy(M.getContext()), "", RI);
+      ExtractValueInst* len = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i+1), "", RI);
+      // GetOutputPtr call
+      Value* GetOutputArgs[] = {GraphID,
+                                d_ptr_i8,
+                                len};
+      CallInst* h_ptr_i8 = CallInst::Create(llvm_visc_ptx_getOutput,
+                                            ArrayRef<Value*>(GetOutputArgs, 3),
+                                            "",
+                                            RI);
+      // Change h_ptr to correct type
+      CastInst* h_ptr = CastInst::CreatePointerCast(h_ptr_i8,
+                                               cast<StructType>(KernelOutput->getType())->getElementType(i),
+                                               "",
+                                               RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI);
 
+    }
+  }*/
+
+  // Prepare output
+  KernelOutput->setName("output."+CF->getName());
+  OutputMap[C] = KernelOutput;
+
+  DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+  // Generate code for output bindings
+  // Get Exit node
+  C = N->getChildGraph()->getExit();
+  // Get OutputType of this node
+  StructType* OutTy = N->getOutputType();
+  Value *retVal = UndefValue::get(F_X86->getReturnType());
+  // Find all the input edges to exit node
+  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+    DEBUG(errs() << "Output Edge " << i << "\n");
     // Find the incoming edge at the requested input port
-    DFEdge* E = Child->getInDFEdgeAt(i);
-    assert(E && "No incoming edge or binding for input element!");
+    DFEdge* E = C->getInDFEdgeAt(i);
+
+    assert(E && "No Binding for output element!");
     // Find the Source DFNode associated with the incoming edge
     DFNode* SrcDF = E->getSourceDF();
 
+    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+
     // If Source DFNode is a dummyNode, edge is from parent. Get the
     // argument from argument list of this internal node
     Value* inputVal;
     if(SrcDF->isEntryNode()) {
-      inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
+      inputVal = getArgumentAt(F_X86, i);
       DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
     }
     else {
-      // edge is from a sibling
+      // edge is from an internal node
       // Check - code should already be generated for this source dfnode
       assert(OutputMap.count(SrcDF)
              && "Source node call not found. Dependency violation!");
 
-      // Find CallInst associated with the Source DFNode using OutputMap
+      // Find Output Value associated with the Source DFNode using OutputMap
       Value* CI = OutputMap[SrcDF];
 
       // Extract element at source position from this call instruction
@@ -328,1037 +674,730 @@ namespace {
       IndexList.push_back(E->getSourcePosition());
       DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
       ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                             "", InsertBefore);
+                             "",RI);
       inputVal = EI;
     }
-    return inputVal;
-  }
-
-  // Generate Code for declaring a constant string [L x i8] and return a pointer
-  // to the start of it.
-  Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
-    Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true);
-    Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true,
-                          GlobalValue::InternalLinkage, SConstant, Name);
-    Value* Zero = ConstantInt::get(Type::getInt64Ty(getGlobalContext()), 0);
-    Value* GEPArgs[] = {Zero, Zero};
-    GetElementPtrInst* SPtr = GetElementPtrInst::Create(SGlobal,
-        ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
-    return SPtr;
+    std::vector<unsigned> IdxList;
+    IdxList.push_back(i);
+    retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
   }
-
-  // Generate Code to call the kernel
-  // The plan is to replace the internal node with a leaf node. This method is
-  // used to generate a function to associate with this leaf node. The function
-  // is responsible for all the memory allocation/transfer and invoking the
-  // kernel call on the device
-  void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) {
-    // Check if clone already exists. If it does, it means we have visited this
-    // function before.
-    assert(N->getGenFunc() == NULL && "Code already generated for this node");
-
-    // Useful values
-    Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
-    Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
-
-    // If kernel struct has not been initialized with kernel function, then fail
-    assert(kernel != NULL && "No kernel found!!");
-
-    DEBUG(errs() << "Generating kernel call code\n");
-
-    Function* F = N->getFuncPointer();
+  DEBUG(errs() << "Extracted all\n");
+  retVal->setName("output");
+  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+  ReplaceInstWithInst(RI, newRI);
+}
 
 
-    // Create of clone of F with no instructions. Only the type is the same as F
-    // without the extra arguments.
-    Function* F_X86;
+// Host-side code generation for internal nodes. Right now, only targeting the
+// one level case. In general, device functions can return values so we don't need to change them.
+void CodeGenTraversal::codeGen(DFInternalNode* N) {
 
-    // Clone the function, if we are seeing this function for the first time. We
-    // only need a clone in terms of type.
-    ValueToValueMapTy VMap;
+  if (!KernelLaunchNode) { // NULL once the launch node has been handled (reset below)
+    DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
+    return;
+  }
 
-    // Create new function with the same type
-    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+  if (N == KernelLaunchNode) { // N is the node recorded as the kernel launch
+    DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
+    //TODO
 
-    // Loop over the arguments, copying the names of arguments over.
-    Function::arg_iterator dest_iterator = F_X86->arg_begin();
-    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
-         i != e; ++i) {
-      dest_iterator->setName(i->getName()); // Copy the name over...
-      // Add mapping to VMap and increment dest iterator
-      VMap[i] = dest_iterator++;
-    }
+    // Clear the launch node so that the remaining nodes to be visited are ignored
+    KernelLaunchNode = NULL;
+    errs() << "Insert Runtime calls\n"; // NOTE(review): printed unconditionally, not wrapped in DEBUG
+    insertRuntimeCalls(N, getPTXFilename(M)); // generate the host-side runtime calls for N
+    writeKernelsModule(); // presumably emits the kernels module (KernelM) - TODO confirm
 
-    // Add a basic block to this empty function
-    BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
-    ReturnInst* RI = ReturnInst::Create(M.getContext(),
-                                        UndefValue::get(F_X86->getReturnType()), BB);
+  } else { // any internal node other than the launch node
+    DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
+    // TODO: Check that the argument order of root to intermediate matches
+    // that of intermediate to leaf. No code is generated for these nodes yet.
 
-    //Add the generated function info to DFNode
-    N->setGenFunc(F_X86, DFNode::X86);
-
-    // FIXME: Adding Index and Dim arguments are probably not required except
-    // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
-    // have those arguments)
+  }
 
-    // Add Index and Dim arguments except for the root node
-    if(!N->isRoot())
-      addIdxDimArgs(F_X86);
+}
 
-    // Sort children in topological order before code generation for kernel call
-    N->getChildGraph()->sortChildren();
+void CodeGenTraversal::codeGen(DFLeafNode* N) {
 
-    // The DFNode N has the property that it has only one child (leaving Entry
-    // and Exit dummy nodes). This child is the PTX kernel. This simplifies code
-    // generation for kernel calls significantly. All the inputs to this child
-    // node would either be constants or from the parent node N.
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
 
-    assert(N->getChildGraph()->size() == 3
-        && "Node expected to have just one non-dummy node!");
+  // Checking which node is the kernel launch
+  DFNode* PNode = N->getParent();
+  int pLevel = PNode->getLevel();
+  int pReplFactor = PNode->getNumOfDim();
+
+  if (!pLevel || !pReplFactor) {
+    KernelLaunchNode = PNode;
+    kernel = new Kernel(NULL, N->getNumOfDim(), N->getDimLimits());
+    // TODO: Find a better way of choosing parameters
+    //kernel->gridDim = N->getNumOfDim();
+    //kernel->blockDim = N->getNumOfDim();
+    //kernel->globalWGSize = N->getDimLimits();
+    //kernel->localWGSize = N->getDimLimits();
+    //FIXME: Comment this out as we can provide localWGSize as null
+    //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
+    // TODO: How to choose the div factor;
+    //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16);
+    //std::vector<Value*> tmp(kernel->gridDim, divFactor);
+    //for (unsigned i = 0; i < kernel->gridDim; i++) {
+    //  BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]);
+    //  kernel->localWGSize.push_back(SDivInst);
+    //}
+  }
+  else {
+    errs() << "*************** Entering else part **************\n";
+    /*
+    KernelLaunchNode = PNode->getParent();
+    kernel->gridDim = PNode->getNumOfDim();
+    kernel->blockDim = N->getNumOfDim();
+    // TODO: Handle different number of dimensions
+    assert((kernel->gridDim == kernel->blockDim) && "Dimension number must match");
+    std::vector<Value*> numOfBlocks = PNode->getDimLimits();
+    kernel->localWGSize = N->getDimLimits();
+    for (unsigned i = 0; i < kernel->gridDim; i++) {
+      //BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel->localWGSize[i],numOfBlocks[i]);
+      //kernel->globalWGSize.push_back(MulInst);
+    }*/
+  }
 
-    DFNode* C;
-    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-      C = *ci;
-      // Skip dummy node call
-      if (!C->isDummyNode())
-        break;
-    }
+  std::vector<IntrinsicInst *> IItoRemove;
+  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
 
-    assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
 
-    Function* CF = C->getFuncPointer();
-    // Initialize context
-    DEBUG(errs() << "Initializing context" << "\n");
-    CallInst::Create(llvm_visc_ptx_initContext, None, "", RI);
+  // Look up if we have visited this function before. If we have, then just
+  // get the cloned function pointer from DFNode. Otherwise, create the cloned
+  // function and add it to the DFNode GenFunc.
+  Function *F_nvptx = N->getGenFunc();
+  if(F_nvptx == NULL) {
+    // Clone the function
+    ValueToValueMapTy VMap;
+    F_nvptx = CloneFunction(F, VMap, true);
 
-    DEBUG(errs() << "Initializing commandQ" << "\n");
-    // Initialize command queue
-    Value* fileStr = getStringPointer(FileName, RI, "Filename");
-    errs() << *fileStr << "\n";
-    errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n";
-    Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName");
+    // Insert the cloned function into the kernels module
+    KernelM.getFunctionList().push_back(F_nvptx);
 
-    Value* LaunchInstArgs[] = {fileStr, kernelStr};
+    DEBUG(errs() << *F_nvptx->getType());
+    DEBUG(errs() << *F_nvptx);
 
-    DEBUG(errs() << "Inserting launch call" << "\n");
-    CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
-                                        ArrayRef<Value*>(LaunchInstArgs, 2),
-                                        "graph"+CF->getName(),
-                                        RI);
-    DEBUG(errs() << *GraphID << "\n");
-    // Iterate over the required input edges of the node and use the visc-rt API
-    // to set inputs
-    DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
-
-    std::vector<OutputPtr> OutputPointers;
-    for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
-
-      Value* inputVal = getInValueAt(C, i, F_X86, RI);
-      // input value has been obtained.
-      // Check if input is a scalar value or a pointer operand
-      // For scalar values such as int, float, etc. the size is simply the size of
-      // type on target machine, but for pointers, the size of data would be the
-      // next integer argument
-      if(inputVal->getType()->isPointerTy()) {
-        // CheckAttribute
-        Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False;
-        Value* isInput = ((hasAttribute(CF, i, Attribute::Out))
-                          && !(hasAttribute(CF, i, Attribute::In)))? False : True;
-
-        Argument* A = getArgumentAt(CF, i);
-        if(isOutput == True) {
-          errs() << *A << " is an OUTPUT argument\n";
-        }
-        if(isInput == True) {
-          errs() << *A << " is an INPUT argument\n";
-        }
+    //Add generated function info to DFNode
+    N->setGenFunc(F_nvptx, DFNode::PTX);
+  } else {
+    errs() << "WARNING: Visiting a node for which code already generated!\n";
+  }
 
-        Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
-                                                          Type::getInt8PtrTy(M.getContext()),
-                                                          inputVal->getName()+".i8ptr",
-                                                          RI);
-        // Pointer Input
-        Value* inputSize = getInValueAt(C, i+1, F_X86, RI);
-        assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
-            && "Pointer type input must always be followed by size (integer type)");
-        Value* setInputArgs[] = {GraphID,
-                                inputValI8Ptr,
-                                ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                                inputSize,
-                                isInput,
-                                isOutput
-                                };
-        Value* d_ptr = CallInst::Create(llvm_visc_ptx_argument_ptr,
-              ArrayRef<Value*>(setInputArgs, 6), "", RI);
-        // If this has out attribute, store the returned device pointer in
-        // memory to read device memory later
-        if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
+  transformFunctionToVoid(F_nvptx);
+
+  // Go through all the instructions
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
+    Instruction *I = &(*i);
+    // Leaf nodes should not contain VISC graph intrinsics or launch
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+
+    if (BuildDFG::isViscQueryIntrinsic(I)) {
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      IntrinsicInst* ArgII;
+      DFNode* ArgDFNode;
+
+      /************************ Handle VISC Query intrinsics ************************/
+
+      switch (II->getIntrinsicID()) {
+      /**************************** llvm.visc.getNode() *****************************/
+      case Intrinsic::visc_getNode: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n");
+        // add mapping <intrinsic, this node> to the node-specific map
+        Leaf_HandleToDFNodeMap[II] = N;
+        IItoRemove.push_back(II);
       }
-      else { // Scalar Input
-        // Store the scalar value on stack and then pass the pointer to its
-        // location
-        AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI);
-        StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
-
-        Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
-                                                          Type::getInt8PtrTy(M.getContext()),
-                                                          inputVal->getName()+".i8ptr",
-                                                          RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                inputValI8Ptr,
-                                ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                                ConstantExpr::getSizeOf(inputVal->getType())
-                                };
-        CallInst::Create(llvm_visc_ptx_argument_scalar,
-              ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      break;
+      /************************* llvm.visc.getParentNode() **************************/
+      case Intrinsic::visc_getParentNode: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n");
+        // get the parent node of the arg node
+        // get argument node
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // get the parent node of the arg node
+        // Add mapping <intrinsic, parent node> to the node-specific map
+        // the argument node must have been added to the map, or else the
+        // code could not refer to it
+        Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
+
+        IItoRemove.push_back(II);
       }
+      break;
+      /*************************** llvm.visc.getNumDims() ***************************/
+      case Intrinsic::visc_getNumDims: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n");
+        // get node from map
+        // get the appropriate field
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        int numOfDim = ArgDFNode->getNumOfDim();
+        DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
+//            IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
+        IntegerType* IntTy = Type::getInt32Ty(getGlobalContext());
+        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
 
-    }
-    DEBUG(errs() << "Setup output edges of node and insert visc api\n");
-    // Setup output
-    // FIXME: Note - There is a tricky question. In X86 we do not need to care
-    // about pointer inputs which modify data in memory implicitly (without
-    // showing it as output). There is no extra cost needed to handle such inputs
-    // For PTX, we need to read back such data from device memory to host memory.
-    // The cost is huge and hence we need to differentiate between readonly
-    // pointer inputs vs read/write pointer inputs. Currently supporting only a
-    // simple model in which all input edges are readonly and output is
-    // writeonly.
-    
-    // Set output
-    StructType* OutputTy = C->getOutputType();
-    unsigned outputIndex = CF->getFunctionType()->getNumParams();
-    Value* outputSize = ConstantExpr::getSizeOf(OutputTy);
-    Value* setOutputArgs[] = {GraphID,
-                              Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                              ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
-                              ConstantExpr::getSizeOf(OutputTy),
-                              False,
-                              True};
-
-    CallInst* d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr,
-                                        ArrayRef<Value*>(setOutputArgs, 6),
-                                        "d_output."+CF->getName(),
-                                        RI);
+        // Replace the result of the intrinsic with the computed value
+        II->replaceAllUsesWith(numOfDimConstant);
 
-    // Enqueue kernel
-    // Need work dim, localworksize, globalworksize
-    // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, loacal work
-    // size and global work size
-    // Allocate size_t[numDims] space on stack. Store the work group sizes and
-    // pass it as an argument to ExecNode
-    Type* Int64Ty = Type::getInt64Ty(M.getContext());
-    Type* GlobalWGTy = ArrayType::get(Int64Ty, kernel->gridDim);
-    AllocaInst* GlobalWG = new AllocaInst(GlobalWGTy, "GlobalWGSize", RI);
-    Value* GlobalWGPtr = BitCastInst::CreatePointerCast(GlobalWG, Int64Ty->getPointerTo(), GlobalWG->getName()+".0", RI);
-    Value* nextDim = GlobalWGPtr;
-    errs() << *GlobalWGPtr << "\n";
-    Constant* IntOne = ConstantInt::get(Int64Ty, 1);
-    errs() << *IntOne << "\n";
-    for(unsigned i=0; i < kernel->gridDim; i++) {
-      errs() << *kernel->globalWGSize[i]->getType() << "\n";
-      errs() << *nextDim->getType() << "\n";
-      assert(kernel->globalWGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-      if(kernel->globalWGSize[i]->getType() != Int64Ty) {
-        kernel->globalWGSize[i] = BitCastInst::CreateIntegerCast(VMap[kernel->globalWGSize[i]], Int64Ty, true, "", RI);
-        StoreInst* SI = new StoreInst(kernel->globalWGSize[i], nextDim, RI);
-        errs() << *SI << "\n";
-      } else {
-        StoreInst* SI = new StoreInst(VMap[kernel->globalWGSize[i]], nextDim, RI);
-        errs() << *SI << "\n";
+        IItoRemove.push_back(II);
       }
-      if(i+1 < kernel->gridDim) {
-        GetElementPtrInst* GEP = GetElementPtrInst::Create(nextDim, ArrayRef<Value*>(IntOne), GlobalWG->getName()+"."+Twine(i+1), RI);
-        errs() << *GEP << "\n";
-        nextDim = GEP;
-      }
-    }
-    errs() << *llvm_visc_ptx_executeNode << "\n";
-    errs() << *GlobalWGPtr << "\n";
-    Value* ExecNodeArgs[] = {GraphID,
-                            ConstantInt::get(Type::getInt32Ty(M.getContext()), C->getNumOfDim()),
-                            Constant::getNullValue(Type::getInt64PtrTy(M.getContext())),
-                            GlobalWGPtr
-                            };
-    CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode,
-                                       ArrayRef<Value*>(ExecNodeArgs, 4),
-                                       "event."+CF->getName(),
-                                       RI);
-    errs() << *Event << "\n";
-    // Wait for Kernel to Finish
-    CallInst::Create(llvm_visc_ptx_wait,
-                     ArrayRef<Value*>(GraphID),
-                     "",
-                     RI);
-    // Read Output Struct
-    Value* GetOutputArgs[] = {GraphID,
-                              Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                              d_Output,
-                              outputSize};
-    CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput,
-                                          ArrayRef<Value*>(GetOutputArgs, 4),
-                                          "h_output."+CF->getName()+".addr",
-                                          RI);
-    // Read each device pointer listed in output struct
-    // Load the output struct
-    CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI);
-    Value* KernelOutput = new LoadInst(BI, "", RI);
-
-    // Read all the pointer arguments which had side effects i.e., had out
-    // attribute
-    for(auto output: OutputPointers) {
-      errs() << "Read: " << *output.d_ptr << "\n";
-      errs() << "\t To: " << *output.h_ptr << "\n";
-      errs() << "\t #bytes: " << *output.bytes << "\n";
-      Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
-      CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput,
-                                      ArrayRef<Value*>(GetOutputArgs, 4),
-                                      "", RI);
-    }
-    /*for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      Type* elemTy = OutputTy->getElementType(i);
-      if(elemTy->isPointerTy()) {
-        // Pointer type
-        assert(OutputTy->getElementType(i+1) == Type::getInt64Ty(M.getContext())
-            && "Every Pointer type must be followed by an integer");
-        ExtractValueInst* d_ptr = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i), "", RI);
-        // Change d_ptr to i8*
-        CastInst* d_ptr_i8 = BitCastInst::CreatePointerCast(d_ptr, Type::getInt8PtrTy(M.getContext()), "", RI);
-        ExtractValueInst* len = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i+1), "", RI);
-        // GetOutputPtr call
-        Value* GetOutputArgs[] = {GraphID,
-                                  d_ptr_i8,
-                                  len};
-        CallInst* h_ptr_i8 = CallInst::Create(llvm_visc_ptx_getOutput,
-                                              ArrayRef<Value*>(GetOutputArgs, 3),
-                                              "",
-                                              RI);
-        // Change h_ptr to correct type
-        CastInst* h_ptr = CastInst::CreatePointerCast(h_ptr_i8,
-                                                 cast<StructType>(KernelOutput->getType())->getElementType(i),
-                                                 "",
-                                                 RI);
-        KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI);
+      break;
+      /*********************** llvm.visc.getNodeInstanceID() ************************/
+      case Intrinsic::visc_getNodeInstanceID_x:
+      case Intrinsic::visc_getNodeInstanceID_y:
+      case Intrinsic::visc_getNodeInstanceID_z: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n");
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        assert(ArgDFNode && "Arg node is NULL");
+        // A leaf node always has a parent
+        DFNode* ParentDFNode = ArgDFNode->getParent();
+        assert(ParentDFNode && "Parent node of a leaf is NULL");
+
+        // Get the number associated with the required dimension
+        // FIXME: The order is important!
+        // These three intrinsics need to be consecutive x,y,z
+        uint64_t dim = II->getIntrinsicID() -
+                       Intrinsic::visc_getNodeInstanceID_x;
+        assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
+        DEBUG(errs() << "\t  dimension = " << dim << "\n");
+
+        // Argument of the function to be called
+        ConstantInt * DimConstant =
+          ConstantInt::get(Type::getInt32Ty(getGlobalContext()) /*KernelM.getContext()*/ , dim);
+        ArrayRef<Value *> Args(DimConstant);
+
+        // The following is to find which function to call
+        Function * OpenCLFunction;
+        int parentLevel = ParentDFNode->getLevel();
+        int parentReplFactor = ParentDFNode->getNumOfDim();
+
+        if (!parentLevel || !parentReplFactor) {
+          // We only have one level in the hierarchy or the parent node is not
+          // replicated. This indicates that the parent node is the kernel
+          // launch, so we need to specify a global id
+
+          FunctionType* FT =
+            FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                              std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
+                              false);
+          OpenCLFunction = cast<Function>
+                           (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
+          // We are asking for this node's id with respect to its parent
+          // this is a local id call
+          FunctionType* FT =
+            FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                              std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
+                              false);
+          OpenCLFunction = cast<Function>
+                           (KernelM.getOrInsertFunction(StringRef("get_local_id"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
+          // We are asking for this node's parent's id with respect to its
+          // parent: this is a group id call
+          FunctionType* FT =
+            FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                              std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
+                              false);
+          OpenCLFunction = cast<Function>
+                           (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT));
+        } else {
+          assert(false && "Unable to translate this intrinsic");
+        }
 
-      }
-    }*/
+        // Create call instruction, insert it before the intrinsic and
+        // replace the uses of the previous instruction with the new one
+        CallInst* CI = CallInst::Create(OpenCLFunction, Args, "", II);
+        II->replaceAllUsesWith(CI);
 
-    // Prepare output
-    KernelOutput->setName("output."+CF->getName());
-    OutputMap[C] = KernelOutput;
-
-    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
-    // Generate code for output bindings
-    // Get Exit node
-    C = N->getChildGraph()->getExit();
-    // Get OutputType of this node
-    StructType* OutTy = N->getOutputType();
-    Value *retVal = UndefValue::get(F_X86->getReturnType());
-    // Find all the input edges to exit node
-    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
-      DEBUG(errs() << "Output Edge " << i << "\n");
-      // Find the incoming edge at the requested input port
-      DFEdge* E = C->getInDFEdgeAt(i);
-
-      assert(E && "No Binding for output element!");
-      // Find the Source DFNode associated with the incoming edge
-      DFNode* SrcDF = E->getSourceDF();
-
-      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
-
-      // If Source DFNode is a dummyNode, edge is from parent. Get the
-      // argument from argument list of this internal node
-      Value* inputVal;
-      if(SrcDF->isEntryNode()) {
-        inputVal = getArgumentAt(F_X86, i);
-        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-      }
-      else {
-        // edge is from a internal node
-        // Check - code should already be generated for this source dfnode
-        assert(OutputMap.count(SrcDF)
-               && "Source node call not found. Dependency violation!");
-
-        // Find Output Value associated with the Source DFNode using OutputMap
-        Value* CI = OutputMap[SrcDF];
-
-        // Extract element at source position from this call instruction
-        std::vector<unsigned> IndexList;
-        IndexList.push_back(E->getSourcePosition());
-        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                               "",RI);
-        inputVal = EI;
+        IItoRemove.push_back(II);
       }
-      std::vector<unsigned> IdxList;
-      IdxList.push_back(i);
-      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
-    }
-    DEBUG(errs() << "Extracted all\n");
-    retVal->setName("output");
-    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
-    ReplaceInstWithInst(RI, newRI);
-  }
-
-
-// Right now, only targeting the one level case. In general, device functions
-// can return values so we don't need to change them
-  void CodeGenTraversal::codeGen(DFInternalNode* N) {
-
-    if (!KernelLaunchNode) {
-      DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
-      return;
-    }
-
-    if (N == KernelLaunchNode) {
-      DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
-      //TODO
-
-      // Now the remaining nodes to be visited should be ignored
-      KernelLaunchNode = NULL;
-      errs() << "Insert Runtime calls\n";
-      insertRuntimeCalls(N, getPTXFilename(M));
-      writeKernelsModule();
-
-    } else {
-      DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
-      //TODO : Check that the arguments order of root to intermediate matches
-      // the intermediate to leaf.
-
-    }
-
-  }
-
-  void CodeGenTraversal::codeGen(DFLeafNode* N) {
-
-    // Skip code generation if it is a dummy node
-    if(N->isDummyNode()) {
-      DEBUG(errs() << "Skipping dummy node\n");
-      return;
-    }
-
-    // Checking which node is the kernel launch
-    DFNode* PNode = N->getParent();
-    int pLevel = PNode->getLevel();
-    int pReplFactor = PNode->getNumOfDim();
-
-    if (!pLevel || !pReplFactor) {
-      KernelLaunchNode = PNode;
-      kernel = new Kernel(NULL, N->getNumOfDim(), N->getDimLimits());
-      // TODO: Find a better way of choosing parameters
-      //kernel->gridDim = N->getNumOfDim();
-      //kernel->blockDim = N->getNumOfDim();
-      //kernel->globalWGSize = N->getDimLimits();
-      //kernel->localWGSize = N->getDimLimits();
-      //FIXME: Comment this out as we can provide localWGSize as null
-      //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
-      // TODO: How to choose the div factor;
-      //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16);
-      //std::vector<Value*> tmp(kernel->gridDim, divFactor);
-      //for (unsigned i = 0; i < kernel->gridDim; i++) {
-      //  BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]);
-      //  kernel->localWGSize.push_back(SDivInst);
-      //}
-    }
-    else {
-      errs() << "*************** Entering else part **************\n";
-      /*
-      KernelLaunchNode = PNode->getParent();
-      kernel->gridDim = PNode->getNumOfDim();
-      kernel->blockDim = N->getNumOfDim();
-      // TODO: Handle different number of dimensions
-      assert((kernel->gridDim == kernel->blockDim) && "Dimension number must match");
-      std::vector<Value*> numOfBlocks = PNode->getDimLimits();
-      kernel->localWGSize = N->getDimLimits();
-      for (unsigned i = 0; i < kernel->gridDim; i++) {
-        //BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel->localWGSize[i],numOfBlocks[i]);
-        //kernel->globalWGSize.push_back(MulInst);
-      }*/
-    }
-
-    std::vector<IntrinsicInst *> IItoRemove;
-    BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
-
-    // Get the function associated with the dataflow node
-    Function *F = N->getFuncPointer();
-
-    // Look up if we have visited this function before. If we have, then just
-    // get the cloned function pointer from DFNode. Otherwise, create the cloned
-    // function and add it to the DFNode GenFunc.
-    Function *F_nvptx = N->getGenFunc();
-    if(F_nvptx == NULL) {
-      // Clone the function
-      ValueToValueMapTy VMap;
-      F_nvptx = CloneFunction(F, VMap, true);
-
-      // Insert the cloned function into the kernels module
-      KernelM.getFunctionList().push_back(F_nvptx);
-
-      DEBUG(errs() << *F_nvptx->getType());
-      DEBUG(errs() << *F_nvptx);
-
-      //Add generated function info to DFNode
-      N->setGenFunc(F_nvptx, DFNode::PTX);
-    } else {
-      errs() << "WARNING: Visiting a node for which code already generated!\n";
-    }
-
-    transformFunctionToVoid(F_nvptx);
-
-    // Go through all the instructions
-    for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
-      Instruction *I = &(*i);
-      // Leaf nodes should not contain VISC graph intrinsics or launch
-      assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
-      assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
-
-      if (BuildDFG::isViscQueryIntrinsic(I)) {
-        IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-        IntrinsicInst* ArgII;
-        DFNode* ArgDFNode;
-
-/************************ Handle VISC Query intrinsics ************************/
-
-        switch (II->getIntrinsicID()) {
-/**************************** llvm.visc.getNode() *****************************/
-          case Intrinsic::visc_getNode: {
-            DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n");
-            // add mapping <intrinsic, this node> to the node-specific map
-            Leaf_HandleToDFNodeMap[II] = N;
-            IItoRemove.push_back(II);
-            }
-            break;
-/************************* llvm.visc.getParentNode() **************************/
-          case Intrinsic::visc_getParentNode: {
-            DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n");
-            // get the parent node of the arg node
-            // get argument node
-            ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-            ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-            // get the parent node of the arg node
-            // Add mapping <intrinsic, parent node> to the node-specific map
-            // the argument node must have been added to the map, orelse the
-            // code could not refer to it
-            Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
-
-            IItoRemove.push_back(II);
-            }
-            break;
-/*************************** llvm.visc.getNumDims() ***************************/
-          case Intrinsic::visc_getNumDims: {
-            DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n");
-            // get node from map
-            // get the appropriate field
-            ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-            ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-            int numOfDim = ArgDFNode->getNumOfDim();
-            DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
-//            IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
-            IntegerType* IntTy = Type::getInt32Ty(getGlobalContext());
-            ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
-
-            // Replace the result of the intrinsic with the computed value
-            II->replaceAllUsesWith(numOfDimConstant);
-
-            IItoRemove.push_back(II);
-            }
-            break;
-/*********************** llvm.visc.getNodeInstanceID() ************************/
-          case Intrinsic::visc_getNodeInstanceID_x:
-          case Intrinsic::visc_getNodeInstanceID_y:
-          case Intrinsic::visc_getNodeInstanceID_z: {
-            DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n");
-            ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-            ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-            assert(ArgDFNode && "Arg node is NULL");
-            // A leaf node always has a parent
-            DFNode* ParentDFNode = ArgDFNode->getParent();
-            assert(ParentDFNode && "Parent node of a leaf is NULL");
-
-            // Get the number associated with the required dimension
-            // FIXME: The order is important!
-            // These three intrinsics need to be consecutive x,y,z
-            uint64_t dim = II->getIntrinsicID() -
-                           Intrinsic::visc_getNodeInstanceID_x;
-            assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
-            DEBUG(errs() << "\t  dimension = " << dim << "\n");
-
-            // Argument of the function to be called
-            ConstantInt * DimConstant =
-              ConstantInt::get(Type::getInt32Ty(getGlobalContext()) /*KernelM.getContext()*/ , dim);
-            ArrayRef<Value *> Args(DimConstant);
-
-            // The following is to find which function to call
-            Function * OpenCLFunction;
-            int parentLevel = ParentDFNode->getLevel();
-            int parentReplFactor = ParentDFNode->getNumOfDim();
-
-            if (!parentLevel || !parentReplFactor) {
-            // We only have one level in the hierarchy or the parent node is not
-            // replicated. This indicates that the parent node is the kernel
-            // launch, so we need to specify a global id
-
-              FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
-                                  false);
-              OpenCLFunction = cast<Function>
-                (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT));
-            } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
-              // We are asking for this node's id with respect to its parent
-              // this is a local id call
-              FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
-                                  false);
-              OpenCLFunction = cast<Function>
-                (KernelM.getOrInsertFunction(StringRef("get_local_id"), FT));
-            } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
-              // We are asking for this node's parent's id with respect to its
-              // parent: this is a group id call
-              FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
-                                  false);
-              OpenCLFunction = cast<Function>
-                (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT));
-            } else {
-              assert(false && "Unable to translate this intrinsic");
-            }
-
-            // Create call instruction, insert it before the intrinsic and
-            // replace the uses of the previous instruction with the new one
-            CallInst* CI = CallInst::Create(OpenCLFunction, Args, "", II);
-            II->replaceAllUsesWith(CI);
-
-            IItoRemove.push_back(II);
-            }
-            break;
-/********************** llvm.visc.getNumNodeInstances() ***********************/
-          case Intrinsic::visc_getNumNodeInstances_x:
-          case Intrinsic::visc_getNumNodeInstances_y:
-          case Intrinsic::visc_getNumNodeInstances_z: {
+      break;
+      /********************** llvm.visc.getNumNodeInstances() ***********************/
+      case Intrinsic::visc_getNumNodeInstances_x:
+      case Intrinsic::visc_getNumNodeInstances_y:
+      case Intrinsic::visc_getNumNodeInstances_z: {
 //TODO: think about whether this is the best way to go
 // there are hw specific registers. therefore it is good to have the intrinsic
 // but then, why do we need to keep that info in the graph?
 // (only for the kernel configuration during the call)
 
-            DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n");
-            ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-            ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-            // A leaf node always has a parent
-            DFNode* ParentDFNode = ArgDFNode->getParent();
-            assert(ParentDFNode && "Parent node of a leaf is NULL");
-
-            // Get the number associated with the required dimension
-            // FIXME: The order is important!
-            // These three intrinsics need to be consecutive x,y,z
-            uint64_t dim = II->getIntrinsicID() -
-                           Intrinsic::visc_getNumNodeInstances_x;
-            assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
-            DEBUG(errs() << "\t  dimension = " << dim << "\n");
-
-            // Argument of the function to be called
-            ConstantInt * DimConstant =
-              ConstantInt::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), dim);
-            ArrayRef<Value *> Args(DimConstant);
-
-            // The following is to find which function to call
-            Function * OpenCLFunction;
-            int parentLevel = ParentDFNode->getLevel();
-            int parentReplFactor = ParentDFNode->getNumOfDim();
-
-            if (!parentLevel || !parentReplFactor) {
-            // We only have one level in the hierarchy or the parent node is not
-            // replicated. This indicates that the parent node is the kernel
-            // launch, so the instances are global_size (gridDim x blockDim)
-              FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
-                                  false);
-              OpenCLFunction = cast<Function>
-                (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT));
-            } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
-              // We are asking for this node's instances
-              // this is a local size (block dim) call
-              FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
-                                  false);
-              OpenCLFunction = cast<Function>
-                (KernelM.getOrInsertFunction(StringRef("get_local_size"), FT));
-            } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
-              // We are asking for this node's parent's instances
-              // this is a (global_size/local_size) (grid dim) call
-              FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
-                                  false);
-              OpenCLFunction = cast<Function>
-                (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT));
-            } else {
-              assert(false && "Unable to translate this intrinsic");
-            }
-
-            // Create call instruction, insert it before the intrinsic and
-            // replace the uses of the previous instruction with the new one
-            CallInst* CI = CallInst::Create(OpenCLFunction, Args, "", II);
-            II->replaceAllUsesWith(CI);
-
-            IItoRemove.push_back(II);
-            }
-            break;
-          default:
-            assert(false && "Unknown VISC Intrinsic!");
-            break;
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n");
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // A leaf node always has a parent
+        DFNode* ParentDFNode = ArgDFNode->getParent();
+        assert(ParentDFNode && "Parent node of a leaf is NULL");
+
+        // Get the number associated with the required dimension
+        // FIXME: The order is important!
+        // These three intrinsics need to be consecutive x,y,z
+        uint64_t dim = II->getIntrinsicID() -
+                       Intrinsic::visc_getNumNodeInstances_x;
+        assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
+        DEBUG(errs() << "\t  dimension = " << dim << "\n");
+
+        // Argument of the function to be called
+        ConstantInt * DimConstant =
+          ConstantInt::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), dim);
+        ArrayRef<Value *> Args(DimConstant);
+
+        // The following is to find which function to call
+        Function * OpenCLFunction;
+        int parentLevel = ParentDFNode->getLevel();
+        int parentReplFactor = ParentDFNode->getNumOfDim();
+
+        if (!parentLevel || !parentReplFactor) {
+          // We only have one level in the hierarchy or the parent node is not
+          // replicated. This indicates that the parent node is the kernel
+          // launch, so the instances are global_size (gridDim x blockDim)
+          FunctionType* FT =
+            FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                              std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
+                              false);
+          OpenCLFunction = cast<Function>
+                           (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
+          // We are asking for this node's instances:
+          // this is a local size (block dim) call
+          FunctionType* FT =
+            FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                              std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
+                              false);
+          OpenCLFunction = cast<Function>
+                           (KernelM.getOrInsertFunction(StringRef("get_local_size"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
+          // We are asking for this node's parent's instances:
+          // this is a (global_size/local_size) (grid dim) call
+          FunctionType* FT =
+            FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                              std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
+                              false);
+          OpenCLFunction = cast<Function>
+                           (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT));
+        } else {
+          assert(false && "Unable to translate this intrinsic");
         }
 
-      } else {
-        //TODO: how to handle address space qualifiers in load/store
+        // Create call instruction, insert it before the intrinsic and
+        // replace the uses of the previous instruction with the new one
+        CallInst* CI = CallInst::Create(OpenCLFunction, Args, "", II);
+        II->replaceAllUsesWith(CI);
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      default:
+        assert(false && "Unknown VISC Intrinsic!");
+        break;
       }
 
+    } else {
+      //TODO: how to handle address space qualifiers in load/store
     }
 
-    // We need to do this explicitly: DCE pass will not remove them because we
-    // have assumed theworst memory behaviour for these function calls
-    // Traverse the vector backwards, otherwise definitions are deleted while
-    // their subsequent uses are still around
-    for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
-         re = IItoRemove.rend(); ri != re; ++ri)
-      (*ri)->eraseFromParent();
-
-    addCLMetadata(F_nvptx);
-    kernel->KernelFunction = F_nvptx;
-    errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
-    DEBUG(errs() << KernelM);
-
-    return;
   }
 
-  bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
+  // We need to do this explicitly: DCE pass will not remove them because we
+  // have assumed the worst memory behaviour for these function calls
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
+       re = IItoRemove.rend(); ri != re; ++ri)
+    (*ri)->eraseFromParent();
 
-    // Get the BuildDFG Analysis Results:
-    // - Dataflow graph
-    // - Maps from i8* hansles to DFNode and DFEdge
-    BuildDFG &DFG = getAnalysis<BuildDFG>();
+  addCLMetadata(F_nvptx);
+  kernel->KernelFunction = F_nvptx;
+  errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
+  DEBUG(errs() << KernelM);
 
-    DFInternalNode *Root = DFG.getRoot();
-//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+  return;
+}
 
-    // Visitor for Code Generation Graph Traversal
-    CodeGenTraversal *CGTVisitor = new CodeGenTraversal(M, DFG);
+bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
 
-    // Initiate code generation for root DFNode
-    CGTVisitor->visit(Root);
-    //TODO: Edit module epilogue to remove the VISC intrinsic declarations
-    delete CGTVisitor;
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-    return true;
-  }
-
-  std::string CodeGenTraversal::getKernelsModuleName(Module &M) {
-    /*SmallString<128> currentDir;
-    llvm::sys::fs::current_path(currentDir);
-    std::string fileName = getFilenameFromModule(M);
-    Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-    return output.str().append(".kernels.ll");*/
-    std::string mid = M.getModuleIdentifier();
-    return mid.append(".kernels.ll");
-  }
+  DFInternalNode *Root = DFG.getRoot();
+//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
-  void CodeGenTraversal::fixValueAddrspace(Value* V, unsigned addrspace) {
-    assert(isa<PointerType>(V->getType())
-        && "Value should be of Pointer Type!");
-    PointerType* OldTy = cast<PointerType>(V->getType());
-    PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-    V->mutateType(NewTy);
-    for(Value::use_iterator ui = V->use_begin(), ue = V->use_end(); ui != ue; ui++) {
-      // Change all uses producing pointer type in same address space to new
-      // addressspace.
-      if(PointerType* PTy = dyn_cast<PointerType>(ui->getType())) {
-        if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-          fixValueAddrspace(*ui, addrspace);
-        }
+  // Visitor for Code Generation Graph Traversal
+  CodeGenTraversal *CGTVisitor = new CodeGenTraversal(M, DFG);
+
+  // Initiate code generation for root DFNode
+  CGTVisitor->visit(Root);
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+  delete CGTVisitor;
+
+  return true;
+}
+
+std::string CodeGenTraversal::getKernelsModuleName(Module &M) {
+  /*SmallString<128> currentDir;
+  llvm::sys::fs::current_path(currentDir);
+  std::string fileName = getFilenameFromModule(M);
+  Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+  return output.str().append(".kernels.ll");*/
+  std::string mid = M.getModuleIdentifier();
+  return mid.append(".kernels.ll");
+}
+
+void CodeGenTraversal::fixValueAddrspace(Value* V, unsigned addrspace) {
+  assert(isa<PointerType>(V->getType())
+         && "Value should be of Pointer Type!");
+  PointerType* OldTy = cast<PointerType>(V->getType());
+  PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+  V->mutateType(NewTy);
+  for(Value::use_iterator ui = V->use_begin(), ue = V->use_end(); ui != ue; ui++) {
+    // Change all uses producing pointer type in same address space to new
+    // address space.
+    if(PointerType* PTy = dyn_cast<PointerType>(ui->getType())) {
+      if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+        fixValueAddrspace(*ui, addrspace);
       }
     }
   }
-
-  void CodeGenTraversal::changeArgAddrspace(Function* F, unsigned addrspace) {
-    std::vector<Type*> ArgTypes;
-    for(auto& arg: F->getArgumentList()) {
-      DEBUG(errs() << arg << "\n");
-      if(PointerType* argTy = dyn_cast<PointerType>(arg.getType())) {
-        if(argTy->getAddressSpace() == 0) {
-          fixValueAddrspace(&arg, addrspace);
-        }
+}
+
+void CodeGenTraversal::changeArgAddrspace(Function* F, unsigned addrspace) {
+  std::vector<Type*> ArgTypes;
+  for(auto& arg: F->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    if(PointerType* argTy = dyn_cast<PointerType>(arg.getType())) {
+      if(argTy->getAddressSpace() == 0) {
+        fixValueAddrspace(&arg, addrspace);
       }
-      ArgTypes.push_back(arg.getType());
     }
-    FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, false);
-    PointerType* PTy = FTy->getPointerTo(cast<PointerType>(F->getType())->getAddressSpace());
-
-    F->mutateType(PTy);
-    DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n");
+    ArgTypes.push_back(arg.getType());
   }
+  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, false);
+  PointerType* PTy = FTy->getPointerTo(cast<PointerType>(F->getType())->getAddressSpace());
 
-  /* Add metadata to module KernelM, for OpenCL kernels */
-  void CodeGenTraversal::addCLMetadata(Function *F) {
+  F->mutateType(PTy);
+  DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n");
+}
 
-    IRBuilder<true> Builder(F->begin());
+/* Add metadata to module KernelM, for OpenCL kernels */
+void CodeGenTraversal::addCLMetadata(Function *F) {
 
-    SmallVector<Value*,8> KernelMD;
-    KernelMD.push_back(F);
+  IRBuilder<true> Builder(F->begin());
+
+  SmallVector<Value*,8> KernelMD;
+  KernelMD.push_back(F);
 
   //TODO: For now, we don not add any additional metadata
-/*
-    // MDNode for the kernel argument address space qualifiers.
-    SmallVector<llvm::Value*, 8> addressQuals;
-    addressQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_addr_space"));
-
-    // We don't support images
-    // MDNode for the kernel argument access qualifiers (images only).
-//    SmallVector<llvm::Value*, 8> accessQuals;
-//    accessQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_access_qual"));
-
-    // MDNode for the kernel argument type names.
-    SmallVector<llvm::Value*, 8> argTypeNames;
-    argTypeNames.push_back(MDString::get(KernelM.getContext(), "kernel_arg_type"));
-
-    //TODO: MDNode for the kernel argument type qualifiers.
-//    SmallVector<llvm::Value*, 8> argTypeQuals;
-//    argTypeQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_type_qual"));
-
-    // MDNode for the kernel argument names.
-    SmallVector<llvm::Value*, 8> argNames;
-    argNames.push_back(MDString::get(KernelM.getContext(), "kernel_arg_name"));
-
-    for (Function::arg_iterator ai = F->arg_begin(),
-                                  ae = F->arg_end(); ai != ae; ++ai) {
-      Argument *arg = &*ai;
-      Type *argTy = arg->getType();
-
-      if (argTy->isPointerTy()) {
-        Type *pointeeTy = argTy->getPointerElementType();
-        std::string typeName = printType(pointeeTy) + "*";
-        // Get argument type name.
-        argTypeNames.push_back(MDString::get(KernelM.getContext(), typeName));
-
-        // Get address qualifier.
-        addressQuals.push_back(Builder.getInt32(argTy->getPointerAddressSpace()));
-      } else {
-        std::string typeName = printType(argTy);
-        // Get argument type name.
-        argTypeNames.push_back(MDString::get(KernelM.getContext(), typeName));
-
-        // Get address qualifier.
-        addressQuals.push_back(Builder.getInt32(GENERIC_ADDRSPACE));
+  /*
+      // MDNode for the kernel argument address space qualifiers.
+      SmallVector<llvm::Value*, 8> addressQuals;
+      addressQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_addr_space"));
+
+      // We don't support images
+      // MDNode for the kernel argument access qualifiers (images only).
+  //    SmallVector<llvm::Value*, 8> accessQuals;
+  //    accessQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_access_qual"));
+
+      // MDNode for the kernel argument type names.
+      SmallVector<llvm::Value*, 8> argTypeNames;
+      argTypeNames.push_back(MDString::get(KernelM.getContext(), "kernel_arg_type"));
+
+      //TODO: MDNode for the kernel argument type qualifiers.
+  //    SmallVector<llvm::Value*, 8> argTypeQuals;
+  //    argTypeQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_type_qual"));
+
+      // MDNode for the kernel argument names.
+      SmallVector<llvm::Value*, 8> argNames;
+      argNames.push_back(MDString::get(KernelM.getContext(), "kernel_arg_name"));
+
+      for (Function::arg_iterator ai = F->arg_begin(),
+                                    ae = F->arg_end(); ai != ae; ++ai) {
+        Argument *arg = &*ai;
+        Type *argTy = arg->getType();
+
+        if (argTy->isPointerTy()) {
+          Type *pointeeTy = argTy->getPointerElementType();
+          std::string typeName = printType(pointeeTy) + "*";
+          // Get argument type name.
+          argTypeNames.push_back(MDString::get(KernelM.getContext(), typeName));
+
+          // Get address qualifier.
+          addressQuals.push_back(Builder.getInt32(argTy->getPointerAddressSpace()));
+        } else {
+          std::string typeName = printType(argTy);
+          // Get argument type name.
+          argTypeNames.push_back(MDString::get(KernelM.getContext(), typeName));
+
+          // Get address qualifier.
+          addressQuals.push_back(Builder.getInt32(GENERIC_ADDRSPACE));
 
-      }
+        }
 
-      // Get argument name.
-      argNames.push_back(MDString::get(KernelM.getContext(), arg->getName()));
-    }
+        // Get argument name.
+        argNames.push_back(MDString::get(KernelM.getContext(), arg->getName()));
+      }
 
-    KernelMD.push_back(MDNode::get(KernelM.getContext(), addressQuals));
-//    KernelMD.push_back(MDNode::get(KernelM.getContext(), accessQuals));
-    KernelMD.push_back(MDNode::get(KernelM.getContext(), argTypeNames));
-//    KernelMD.push_back(MDNode::get(KernelM.getContext(), argTypeQuals));
-    KernelMD.push_back(MDNode::get(KernelM.getContext(), argNames));
-*/
-    MDNode *MDKernelNode = MDNode::get(KernelM.getContext(), KernelMD);
-    NamedMDNode *MDN_kernels = KernelM.getOrInsertNamedMetadata("opencl.kernels");
-    MDN_kernels->addOperand(MDKernelNode);
-
-    KernelMD.push_back(MDNode::get(KernelM.getContext(),
-                                   MDString::get(KernelM.getContext(), "kernel")));
-    // TODO: Replace 1 with the number of the kernel.
-    // Add when support for multiple launces is added
-    KernelMD.push_back(MDNode::get(KernelM.getContext(),
-                                   ConstantInt::get(Type::getInt32Ty(KernelM.getContext()),1)));
-    MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM.getContext(), KernelMD);
-    NamedMDNode *MDN_annotations = KernelM.getOrInsertNamedMetadata("nvvm.annotations");
-    MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+      KernelMD.push_back(MDNode::get(KernelM.getContext(), addressQuals));
+  //    KernelMD.push_back(MDNode::get(KernelM.getContext(), accessQuals));
+      KernelMD.push_back(MDNode::get(KernelM.getContext(), argTypeNames));
+  //    KernelMD.push_back(MDNode::get(KernelM.getContext(), argTypeQuals));
+      KernelMD.push_back(MDNode::get(KernelM.getContext(), argNames));
+  */
+  MDNode *MDKernelNode = MDNode::get(KernelM.getContext(), KernelMD);
+  NamedMDNode *MDN_kernels = KernelM.getOrInsertNamedMetadata("opencl.kernels");
+  MDN_kernels->addOperand(MDKernelNode);
+
+  KernelMD.push_back(MDNode::get(KernelM.getContext(),
+                                 MDString::get(KernelM.getContext(), "kernel")));
+  // TODO: Replace 1 with the number of the kernel.
+  // Add when support for multiple launches is added
+  KernelMD.push_back(MDNode::get(KernelM.getContext(),
+                                 ConstantInt::get(Type::getInt32Ty(KernelM.getContext()),1)));
+  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM.getContext(), KernelMD);
+  NamedMDNode *MDN_annotations = KernelM.getOrInsertNamedMetadata("nvvm.annotations");
+  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
 
 //!1 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32, i32)* @matrixMul, metadata !"kernel", i32 1}
+}
+
+void CodeGenTraversal::writeKernelsModule() {
+  // Print KernelM to the file named by getKernelsModuleName(M), then dispose of the wrapped module.
+  char* ErrorMessage = NULL;
+  LLVMModuleRef KernelMRef = wrap(&KernelM);  // C-API handle wrapping the address of KernelM
+  errs() << "Writing to File --- ";
+  errs() << getKernelsModuleName(M).c_str() << "\n";
+  LLVMPrintModuleToFile(KernelMRef,
+                        getKernelsModuleName(M).c_str(),
+                        &ErrorMessage);
+  if (ErrorMessage) {
+    LLVMDisposeMessage(ErrorMessage);  // NOTE(review): the message is released without being reported
   }
+  LLVMDisposeModule(KernelMRef);  // NOTE(review): disposes the module wrapped from &KernelM; confirm nothing else owns or uses it afterwards
+}
 
-  void CodeGenTraversal::writeKernelsModule() {
-
-    char* ErrorMessage = NULL;
-    LLVMModuleRef KernelMRef = wrap(&KernelM);
-    errs() << "Writing to File --- ";
-    errs() << getKernelsModuleName(M).c_str() << "\n";
-    LLVMPrintModuleToFile(KernelMRef,
-                          getKernelsModuleName(M).c_str(),
-                          &ErrorMessage);
-    if (ErrorMessage) {
-      LLVMDisposeMessage(ErrorMessage);
-    }
-    LLVMDisposeModule(KernelMRef);
-  }
-
-  void CodeGenTraversal::transformFunctionToVoid(Function* F) {
+void CodeGenTraversal::transformFunctionToVoid(Function* F) {
 
-    // FIXME: Maybe do that using the Node?
-    StructType* FRetTy = cast<StructType>(F->getReturnType());
-    assert(FRetTy && "Return Type must always be a struct");
+  // Rewrite F in place to return void; a non-empty result struct is written out through a new pointer argument. FIXME: Maybe do that using the Node?
+  StructType* FRetTy = cast<StructType>(F->getReturnType());
+  assert(FRetTy && "Return Type must always be a struct");  // NOTE(review): cast<> presumably never yields null, so this may be redundant; confirm
 
-    // Keeps return statements, because we will need to replace them
-    std::vector<ReturnInst *> RItoRemove;
-    findReturnInst(F, RItoRemove);
+  // Collect the return statements up front, because each one is replaced below
+  std::vector<ReturnInst *> RItoRemove;
+  findReturnInst(F, RItoRemove);
 
 
-    // Check for { } return struct, which means that the function returns void
-    if (FRetTy->getNumElements() == 0) {
+  // Check for a { } return struct, which means that the function effectively returns void
+  if (FRetTy->getNumElements() == 0) {
 
-      DEBUG(errs() << "\tFunction output struct is void\n");
-      DEBUG(errs() << "\tNo parameters added\n");
+    DEBUG(errs() << "\tFunction output struct is void\n");
+    DEBUG(errs() << "\tNo parameters added\n");
 
-      // Replacing return statements with others returning void
-      for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(),
-           e = RItoRemove.end(); i != e; ++i) {
-        ReturnInst::Create((F->getContext()), 0, (*i));
-        (*i)->eraseFromParent();
-      }
-      DEBUG(errs() << "\tChanged return statements to return void\n");
-
-      return;
+    // Insert a void return before each collected return, then erase the old one
+    for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(),
+         e = RItoRemove.end(); i != e; ++i) {
+      ReturnInst::Create((F->getContext()), 0, (*i));
+      (*i)->eraseFromParent();
     }
+    DEBUG(errs() << "\tChanged return statements to return void\n");
 
-    // The struct has return values, thus needs to be converted to parameter
+    return;
+  }
 
-    int initialNumParams = F->arg_size();
+  // The struct has return values, so it needs to be converted into a pointer parameter
 
-    Type* ArgType = FRetTy->getPointerTo(GENERIC_ADDRSPACE);
-    new Argument(ArgType, "ret_struct_ptr", F);
-    DEBUG(errs() << "\tCreated parameter\n");
+  int initialNumParams = F->arg_size();  // argument count before ret_struct_ptr is added
 
-    // Create the argument type list with the added argument's type
-    std::vector<Type*> ArgTypes;
-    for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-        ai != ae; ++ai) {
-      ArgTypes.push_back(ai->getType());
-    }
+  Type* ArgType = FRetTy->getPointerTo(GENERIC_ADDRSPACE);
+  new Argument(ArgType, "ret_struct_ptr", F);
+  DEBUG(errs() << "\tCreated parameter\n");
 
-    // Find where the new parameter is in the header
-    Function::arg_iterator ai, ae;
-    int check = 0;
-    for (ai = F->arg_begin(), ae = F->arg_end();
-         ai != ae; ++ai) {
-      if (ai->getName().equals("ret_struct_ptr")) break;
-      check++;
-    }
+  // Create the argument type list, which now includes the added argument's type
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+
+  // Find the new parameter by name; ai is left pointing at it and is reused for the stores below
+  Function::arg_iterator ai, ae;
+  int check = 0;
+  for (ai = F->arg_begin(), ae = F->arg_end();
+       ai != ae; ++ai) {
+    if (ai->getName().equals("ret_struct_ptr")) break;
+    check++;
+  }
 
 //    DEBUG(errs() << "\tcheck = " << check << "\tinitialNumParams = " << initialNumParams << "\n");
-    assert(check == initialNumParams);
-
-    DEBUG(errs() << "\tReplacing Return statements\n");
-    // Replace return statements with extractValue and store instructions
-    for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(),
-         rie = RItoRemove.end(); rii != rie; ++rii) {
-      ReturnInst* RI = (*rii);
-      Value* RetVal = RI->getReturnValue();
-     // assert(RetVal && "Return value should not be null at this point");
-     // StructType* RetType = cast<StructType>(RetVal->getType());
-     // assert(RetType && "Return type is not a struct");
-
-      new StoreInst(RetVal, &(*ai), RI);
-      ReturnInst::Create((F->getContext()), 0, RI);
-      RI->eraseFromParent();
+  assert(check == initialNumParams);  // the new argument must sit right after the original ones
+
+  DEBUG(errs() << "\tReplacing Return statements\n");
+  // Replace each return with a store of the returned value through ret_struct_ptr, followed by a void return
+  for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(),
+       rie = RItoRemove.end(); rii != rie; ++rii) {
+    ReturnInst* RI = (*rii);
+    Value* RetVal = RI->getReturnValue();
+    // assert(RetVal && "Return value should not be null at this point");
+    // StructType* RetType = cast<StructType>(RetVal->getType());
+    // assert(RetType && "Return type is not a struct");
+
+    new StoreInst(RetVal, &(*ai), RI);
+    ReturnInst::Create((F->getContext()), 0, RI);
+    RI->eraseFromParent();
 
-    }
+  }
 
-    DEBUG(errs() << "\tReplaced return statements\n");
+  DEBUG(errs() << "\tReplaced return statements\n");
 
-    // Adding new arguments to the function argument list, would not change the
-    // function type. We need to change the type of this function to reflect the
-    // added arguments
-    Type* VoidRetType = Type::getVoidTy(F->getContext());
-    FunctionType* FTy = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-    PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
+  // Adding new arguments to the function argument list does not change the
+  // function type. We need to change the type of this function to reflect the
+  // added argument and the void return type
+  Type* VoidRetType = Type::getVoidTy(F->getContext());
+  FunctionType* FTy = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+  PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
 
-    // Change the function type
-    F->mutateType(PTy);
+  // Change the function's own type to a pointer to the new signature (same address space as before)
+  F->mutateType(PTy);
 
-  }
+}
 
 /******************************************************************************
  *                              Helper functions                              *
  ******************************************************************************/
 
-  // Find if argument has the given attribute
-  static bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) {
-    return F->getAttributes().hasAttribute(arg_index+1, AK);
-  }
-  // Get generated PTX binary name
-  static std::string getPTXFilename(const Module& M) {
-    std::string moduleID = M.getModuleIdentifier();
-    moduleID.append(".nvptx.s");
-    return moduleID;
-  }
-
-  // Get the name of the input file from module ID
-  static std::string getFilenameFromModule(const Module& M) {
-    std::string moduleID = M.getModuleIdentifier();
-    return moduleID.substr(moduleID.find_last_of("/")+1);
-  }
-
-  // Changes the data layout of the Module to be compiled with NVPTX backend
-  // TODO: Figure out when to call it, probably after duplicating the modules
-  static void changeDataLayout(Module &M) {
-    std::string nvptx32_layoutStr = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
-    std::string nvptx64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
-
-    if (TARGET_PTX == 32)
-      M.setDataLayout(StringRef(nvptx32_layoutStr));
-    else if (TARGET_PTX == 64)
-      M.setDataLayout(StringRef(nvptx64_layoutStr));
-    else assert(false && "Invalid PTX target");
-
-    return;
+// Calculate execute node parameters: workDim (number of dimensions, an i32 constant taken from kernel->gridDim),
+// LocalWGPtr (a null i64*) and GlobalWGPtr (i64* to a stack array of global sizes). Instructions are inserted before IB.
+static void getExecuteNodeParams(Value* &workDim, Value* &LocalWGPtr, Value*
+                                 &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
+
+  // Get the i64 type once, for ease of use
+  Type* Int64Ty = Type::getInt64Ty(getGlobalContext());
+
+  // Assign the number of dimensions as a constant value
+  workDim = ConstantInt::get(Type::getInt32Ty(getGlobalContext()), kernel->gridDim);
+
+  // For now, the local work group size is null
+  LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(getGlobalContext()));
+
+  // Global Work Group type is [#dim x i64]
+  Type* GlobalWGTy = ArrayType::get(Int64Ty, kernel->gridDim);
+  // Allocate space for the global work group data on the stack and get a pointer
+  // to its first element.
+  AllocaInst* GlobalWG = new AllocaInst(GlobalWGTy, "GlobalWGSize", IB);
+  GlobalWGPtr = BitCastInst::CreatePointerCast(GlobalWG, Int64Ty->getPointerTo(), GlobalWG->getName()+".0", IB);
+  Value* nextDim = GlobalWGPtr;
+  DEBUG(errs() << *GlobalWGPtr << "\n");
+
+  // Iterate over the number of dimensions and store the global work group
+  // size in that dimension
+  for(unsigned i=0; i < kernel->gridDim; i++) {
+    assert(kernel->globalWGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+    if(kernel->globalWGSize[i]->getType() != Int64Ty) {
+      // If the size is given in any other integer type, generate an integer cast to i64
+      // (the `true` argument requests a signed cast) of the value mapped through VMap into the
+      // new generated function. NOTE(review): the cast result overwrites kernel->globalWGSize[i]; confirm intended
+      kernel->globalWGSize[i] = BitCastInst::CreateIntegerCast(VMap[kernel->globalWGSize[i]], Int64Ty, true, "", IB);
+      StoreInst* SI = new StoreInst(kernel->globalWGSize[i], nextDim, IB);
+      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+    } else {
+      // Already i64: store the VMap-mapped value representing the work group size in
+      // the ith dimension on the stack
+      StoreInst* SI = new StoreInst(VMap[kernel->globalWGSize[i]], nextDim, IB);
+      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+    }
+    if(i+1 < kernel->gridDim) {
+      // Move to the next dimension: advance the pointer by one i64 element
+      GetElementPtrInst* GEP = GetElementPtrInst::Create(nextDim,
+                               ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
+                               GlobalWG->getName()+"."+Twine(i+1),
+                               IB);
+      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+      nextDim = GEP;
+    }
+  }
+
+  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+}
+
+// Find if argument arg_index of F has attribute AK; the attribute set is queried at arg_index+1 (presumably index 0 is the return value; confirm)
+static bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) {
+  return F->getAttributes().hasAttribute(arg_index+1, AK);
+}
+// Get generated PTX binary name: the module identifier of M with ".nvptx.s" appended
+static std::string getPTXFilename(const Module& M) {
+  std::string moduleID = M.getModuleIdentifier();
+  moduleID.append(".nvptx.s");
+  return moduleID;
+}
+
+// Get the name of the input file from module ID: the text after the last '/' (the whole ID when it contains no '/')
+static std::string getFilenameFromModule(const Module& M) {
+  std::string moduleID = M.getModuleIdentifier();
+  return moduleID.substr(moduleID.find_last_of("/")+1);
+}
+
+// Changes the data layout of the Module to be compiled with NVPTX backend: the 32- or 64-bit layout string is chosen by TARGET_PTX
+// TODO: Figure out when to call it, probably after duplicating the modules
+static void changeDataLayout(Module &M) {
+  std::string nvptx32_layoutStr = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
+  std::string nvptx64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
+
+  if (TARGET_PTX == 32)
+    M.setDataLayout(StringRef(nvptx32_layoutStr));
+  else if (TARGET_PTX == 64)
+    M.setDataLayout(StringRef(nvptx64_layoutStr));
+  else assert(false && "Invalid PTX target");  // any other TARGET_PTX value is rejected; M is left unchanged
+
+  return;
+}
+
+static void changeTargetTriple(Module &M) {  // sets M's target triple to the 32- or 64-bit string below, chosen by TARGET_PTX
+  std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+  std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+
+  if (TARGET_PTX == 32)
+    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+  else if (TARGET_PTX == 64)
+    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+  else assert(false && "Invalid PTX target");  // any other TARGET_PTX value is rejected; M is left unchanged
+
+  return;
+}
 
 // Helper function, generate a string representation of a type
-  static std::string printType(Type* ty) {
-    std::string type_str;
-    raw_string_ostream rso(type_str);
-    ty->print(rso);
-    return rso.str();
-  }
+static std::string printType(Type* ty) {  // returns the text that ty->print() writes
+  std::string type_str;
+  raw_string_ostream rso(type_str);  // stream that writes into type_str
+  ty->print(rso);
+  return rso.str();
+}
 
 // Helper function, convert int to string
-  static std::string convertInt(int number) {
-   std::stringstream ss;//create a stringstream
-   ss << number;//add number to the stream
-   return ss.str();//return a string with the contents of the stream
-  }
+static std::string convertInt(int number) {
+  std::stringstream ss;  // scratch stream used for the conversion
+  ss << number;  // format the number into the stream
+  return ss.str();  // return a string with the contents of the stream
+}
 
 // Helper function, populate a vector with all return statements in a function
-  static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
-    for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-      Instruction *I = &(*i);
-      ReturnInst* RI = dyn_cast<ReturnInst>(I);
-      if (RI) {
-        ReturnInstVec.push_back(RI);
-      }
+static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {  // appends to ReturnInstVec; the vector is not cleared first
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {  // walk every instruction of F
+    Instruction *I = &(*i);
+    ReturnInst* RI = dyn_cast<ReturnInst>(I);  // null unless I is a return instruction
+    if (RI) {
+      ReturnInstVec.push_back(RI);
     }
   }
+}
 
 } // End of namespace
 
-- 
GitLab