From 86a02888b4a94e3aaf4fcda01bcb2e2cb133bf34 Mon Sep 17 00:00:00 2001
From: Akash Kothari <akashk4@tyler.cs.illinois.edu>
Date: Sun, 20 Dec 2020 15:45:06 -0600
Subject: [PATCH] Copy out the ApproxHPVM headers and passes

---
 include/BuildDFG/BuildDFG.h                   |   82 +
 include/DFGraph.h                             |  415 +++
 .../ExtractHPVMLeafNodes.h                    |   25 +
 .../FuseHPVMTensorNodes/FuseHPVMTensorNodes.h |  178 ++
 include/GenVISC/GenVISC.h                     |   52 +
 include/InPlaceDFG/InPlaceDFGAnalysis.h       |   52 +
 include/SupportVISC/DFG2LLVM.h                |  497 ++++
 include/SupportVISC/DFGTreeTraversal.h        |   64 +
 include/SupportVISC/VISCHint.h                |   35 +
 include/SupportVISC/VISCTimer.h               |  159 ++
 include/SupportVISC/VISCUtils.h               |  601 +++++
 lib/BuildDFG/BuildDFG.cpp                     |  395 +++
 lib/BuildDFG/BuildDFG.exports                 |    0
 lib/BuildDFG/CMakeLists.txt                   |   12 +
 lib/BuildDFG/LLVMBuild.txt                    |   21 +
 lib/ClearDFG/CMakeLists.txt                   |   12 +
 lib/ClearDFG/ClearDFG.cpp                     |  172 ++
 lib/ClearDFG/ClearDFG.exports                 |    0
 lib/ClearDFG/LLVMBuild.txt                    |   21 +
 lib/DFG2LLVM_CUDNN/CMakeLists.txt             |   12 +
 lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp         |  645 +++++
 lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.exports     |    0
 lib/DFG2LLVM_CUDNN/LLVMBuild.txt              |   21 +
 lib/DFG2LLVM_NVPTX/CMakeLists.txt             |   12 +
 lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         | 2075 +++++++++++++++
 lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.exports     |    0
 lib/DFG2LLVM_NVPTX/LLVMBuild.txt              |   21 +
 lib/DFG2LLVM_PROMISE/CMakeLists.txt           |   12 +
 lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp     | 1283 +++++++++
 lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.exports |    0
 lib/DFG2LLVM_PROMISE/LLVMBuild.txt            |   21 +
 lib/DFG2LLVM_SPIR/CMakeLists.txt              |   12 +
 lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp           | 2010 ++++++++++++++
 lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports       |    0
 lib/DFG2LLVM_SPIR/LLVMBuild.txt               |   21 +
 lib/DFG2LLVM_WrapperAPI/CMakeLists.txt        |   12 +
 .../DFG2LLVM_WrapperAPI.cpp                   | 1532 +++++++++++
 .../DFG2LLVM_WrapperAPI.exports               |    0
 lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt         |   21 +
 lib/DFG2LLVM_X86/CMakeLists.txt               |   11 +
 lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp             | 2082 +++++++++++++++
 lib/DFG2LLVM_X86/DFG2LLVM_X86.exports         |    0
 lib/DFG2LLVM_X86/LLVMBuild.txt                |   21 +
 lib/DFG2LLVM_X86_dsoc/CMakeLists.txt          |   13 +
 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports    |    0
 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp   | 2128 +++++++++++++++
 lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt           |   22 +
 lib/ExtractHPVMLeafNodes/CMakeLists.txt       |   13 +
 .../ExtractHPVMLeafNodes.cpp                  |  246 ++
 .../ExtractHPVMLeafNodes.exports              |    0
 lib/ExtractHPVMLeafNodes/LLVMBuild.txt        |   22 +
 lib/FuseHPVMTensorNodes/CMakeLists.txt        |   12 +
 .../FuseHPVMTensorNodes.cpp                   | 1007 +++++++
 .../FuseHPVMTensorNodes.exports               |    0
 lib/FuseHPVMTensorNodes/LLVMBuild.txt         |   21 +
 lib/GenVISC/CMakeLists.txt                    |   12 +
 lib/GenVISC/GenVISC.cpp                       | 1590 +++++++++++
 lib/GenVISC/GenVISC.exports                   |    0
 lib/GenVISC/LLVMBuild.txt                     |   21 +
 lib/InPlaceDFG/CMakeLists.txt                 |   12 +
 lib/InPlaceDFG/InPlaceDFGAnalysis.cpp         |  318 +++
 lib/InPlaceDFG/InPlaceDFGAnalysis.exports     |    0
 lib/InPlaceDFG/LLVMBuild.txt                  |   21 +
 lib/InlineTensorCalls/CMakeLists.txt          |   13 +
 lib/InlineTensorCalls/InlineTensorCalls.cpp   |   77 +
 .../InlineTensorCalls.exports                 |    0
 lib/InlineTensorCalls/LLVMBuild.txt           |   22 +
 lib/InsertApproxInfo/CMakeLists.txt           |   12 +
 lib/InsertApproxInfo/InsertApproxInfo.cpp     |  498 ++++
 lib/InsertApproxInfo/LLVMBuild.txt            |   21 +
 lib/LocalMem/CMakeLists.txt                   |   12 +
 lib/LocalMem/LLVMBuild.txt                    |   21 +
 lib/LocalMem/LocalMem.cpp                     |  224 ++
 lib/LocalMem/LocalMem.exports                 |    0
 lib/MergeDFN/CMakeLists.txt                   |   12 +
 lib/MergeDFN/LLVMBuild.txt                    |   21 +
 lib/MergeDFN/MergeDFN.cpp                     | 2338 +++++++++++++++++
 lib/MergeDFN/MergeDFN.exports                 |    0
 lib/ReplaceIntrinsics/CMakeLists.txt          |   13 +
 lib/ReplaceIntrinsics/LLVMBuild.txt           |   22 +
 lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp   |  516 ++++
 .../ReplaceIntrinsics.exports                 |    0
 82 files changed, 21897 insertions(+)
 create mode 100644 include/BuildDFG/BuildDFG.h
 create mode 100644 include/DFGraph.h
 create mode 100644 include/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h
 create mode 100644 include/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h
 create mode 100644 include/GenVISC/GenVISC.h
 create mode 100644 include/InPlaceDFG/InPlaceDFGAnalysis.h
 create mode 100644 include/SupportVISC/DFG2LLVM.h
 create mode 100644 include/SupportVISC/DFGTreeTraversal.h
 create mode 100644 include/SupportVISC/VISCHint.h
 create mode 100644 include/SupportVISC/VISCTimer.h
 create mode 100644 include/SupportVISC/VISCUtils.h
 create mode 100644 lib/BuildDFG/BuildDFG.cpp
 create mode 100644 lib/BuildDFG/BuildDFG.exports
 create mode 100644 lib/BuildDFG/CMakeLists.txt
 create mode 100644 lib/BuildDFG/LLVMBuild.txt
 create mode 100644 lib/ClearDFG/CMakeLists.txt
 create mode 100644 lib/ClearDFG/ClearDFG.cpp
 create mode 100644 lib/ClearDFG/ClearDFG.exports
 create mode 100644 lib/ClearDFG/LLVMBuild.txt
 create mode 100644 lib/DFG2LLVM_CUDNN/CMakeLists.txt
 create mode 100644 lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
 create mode 100644 lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.exports
 create mode 100644 lib/DFG2LLVM_CUDNN/LLVMBuild.txt
 create mode 100644 lib/DFG2LLVM_NVPTX/CMakeLists.txt
 create mode 100644 lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
 create mode 100644 lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.exports
 create mode 100644 lib/DFG2LLVM_NVPTX/LLVMBuild.txt
 create mode 100644 lib/DFG2LLVM_PROMISE/CMakeLists.txt
 create mode 100644 lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp
 create mode 100644 lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.exports
 create mode 100644 lib/DFG2LLVM_PROMISE/LLVMBuild.txt
 create mode 100644 lib/DFG2LLVM_SPIR/CMakeLists.txt
 create mode 100644 lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
 create mode 100644 lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports
 create mode 100644 lib/DFG2LLVM_SPIR/LLVMBuild.txt
 create mode 100644 lib/DFG2LLVM_WrapperAPI/CMakeLists.txt
 create mode 100644 lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
 create mode 100644 lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.exports
 create mode 100644 lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt
 create mode 100644 lib/DFG2LLVM_X86/CMakeLists.txt
 create mode 100644 lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp
 create mode 100644 lib/DFG2LLVM_X86/DFG2LLVM_X86.exports
 create mode 100644 lib/DFG2LLVM_X86/LLVMBuild.txt
 create mode 100644 lib/DFG2LLVM_X86_dsoc/CMakeLists.txt
 create mode 100644 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports
 create mode 100644 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
 create mode 100644 lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt
 create mode 100644 lib/ExtractHPVMLeafNodes/CMakeLists.txt
 create mode 100644 lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.cpp
 create mode 100644 lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.exports
 create mode 100644 lib/ExtractHPVMLeafNodes/LLVMBuild.txt
 create mode 100644 lib/FuseHPVMTensorNodes/CMakeLists.txt
 create mode 100644 lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
 create mode 100644 lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.exports
 create mode 100644 lib/FuseHPVMTensorNodes/LLVMBuild.txt
 create mode 100644 lib/GenVISC/CMakeLists.txt
 create mode 100644 lib/GenVISC/GenVISC.cpp
 create mode 100644 lib/GenVISC/GenVISC.exports
 create mode 100644 lib/GenVISC/LLVMBuild.txt
 create mode 100644 lib/InPlaceDFG/CMakeLists.txt
 create mode 100644 lib/InPlaceDFG/InPlaceDFGAnalysis.cpp
 create mode 100644 lib/InPlaceDFG/InPlaceDFGAnalysis.exports
 create mode 100644 lib/InPlaceDFG/LLVMBuild.txt
 create mode 100644 lib/InlineTensorCalls/CMakeLists.txt
 create mode 100644 lib/InlineTensorCalls/InlineTensorCalls.cpp
 create mode 100644 lib/InlineTensorCalls/InlineTensorCalls.exports
 create mode 100644 lib/InlineTensorCalls/LLVMBuild.txt
 create mode 100644 lib/InsertApproxInfo/CMakeLists.txt
 create mode 100644 lib/InsertApproxInfo/InsertApproxInfo.cpp
 create mode 100644 lib/InsertApproxInfo/LLVMBuild.txt
 create mode 100644 lib/LocalMem/CMakeLists.txt
 create mode 100644 lib/LocalMem/LLVMBuild.txt
 create mode 100644 lib/LocalMem/LocalMem.cpp
 create mode 100644 lib/LocalMem/LocalMem.exports
 create mode 100644 lib/MergeDFN/CMakeLists.txt
 create mode 100644 lib/MergeDFN/LLVMBuild.txt
 create mode 100644 lib/MergeDFN/MergeDFN.cpp
 create mode 100644 lib/MergeDFN/MergeDFN.exports
 create mode 100644 lib/ReplaceIntrinsics/CMakeLists.txt
 create mode 100644 lib/ReplaceIntrinsics/LLVMBuild.txt
 create mode 100644 lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp
 create mode 100644 lib/ReplaceIntrinsics/ReplaceIntrinsics.exports

diff --git a/include/BuildDFG/BuildDFG.h b/include/BuildDFG/BuildDFG.h
new file mode 100644
index 0000000000..7d51d32022
--- /dev/null
+++ b/include/BuildDFG/BuildDFG.h
@@ -0,0 +1,82 @@
+#ifndef __BUILD_DFG_H__
+#define __BUILD_DFG_H__
+
+//== BuildDFG.h - Header file for "Hierarchical Dataflow Graph Builder Pass" =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/ValueMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/DFGraph.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace builddfg {
+// BuildDFG - The first implementation.
+struct BuildDFG : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  BuildDFG() : ModulePass(ID) {}
+
+  typedef ValueMap<Value*, DFNode*> HandleToDFNode;
+  typedef ValueMap<Value*, DFEdge*> HandleToDFEdge;
+
+private:
+  // Member variables
+  DFInternalNode *Root;
+  std::vector<DFInternalNode*> Roots;
+
+  HandleToDFNode HandleToDFNodeMap;   // This map associates the i8* pointer
+  // with the DFNode structure that it
+  // represents
+  HandleToDFEdge HandleToDFEdgeMap;   // This map associates the i8* pointer
+  // with the DFEdge structure that it
+  // represents
+
+
+  // Functions
+public:
+  void handleCreateNode (DFInternalNode* N, IntrinsicInst* II);
+private:
+  void handleCreateEdge (DFInternalNode* N, IntrinsicInst* II);
+  void handleGetParentNode (DFInternalNode* N, IntrinsicInst* II);
+  void handleBindInput (DFInternalNode* N, IntrinsicInst* II);
+  void handleBindOutput (DFInternalNode* N, IntrinsicInst* II);
+
+  void BuildGraph (DFInternalNode* N, Function* F);
+
+public:
+  // Functions
+  virtual bool runOnModule(Module &M);
+
+  static bool isViscLaunchIntrinsic(Instruction * I);
+  static bool isViscGraphIntrinsic(Instruction * I);
+  static bool isViscQueryIntrinsic(Instruction* I);
+  static bool isViscIntrinsic(Instruction* I);
+  static bool isTypeCongruent(Type *L, Type *R);
+
+  //TODO: Maybe make these fields const
+  DFInternalNode *getRoot() const;
+  std::vector<DFInternalNode*> &getRoots();
+  HandleToDFNode &getHandleToDFNodeMap();
+  HandleToDFEdge &getHandleToDFEdgeMap();
+  void addElementToHandleToDFNodeMap(Value* V, DFNode* N);
+  void removeElementFromHandleToDFNodeMap(Value* V);
+  void addElementToHandleToDFEdgeMap(Value* V, DFEdge* E);
+  void removeElementFromHandleToDFEdgeMap(Value* V);
+
+};
+
+} // End of namespace
+
+#endif
+
diff --git a/include/DFGraph.h b/include/DFGraph.h
new file mode 100644
index 0000000000..8307e56889
--- /dev/null
+++ b/include/DFGraph.h
@@ -0,0 +1,415 @@
+//===----- llvm/IR/DFGraph.h - Classes to represent a Dataflow Graph ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definition of the following classes: 
+// 1. DFNode
+// 2. DFGraph
+// 3. DFInternalNode
+// 4. DFLeafNode
+// 5. DFEdge.
+//
+// FIXME : We still need to figure out whether these functions are independent
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_DFGRAPH_H
+#define LLVM_IR_DFGRAPH_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/GraphWriter.h"
+
+
+namespace llvm {
+
+class DFNode;
+class DFInternalNode;
+class DFLeafNode;
+class DFEdge;
+class DFNodeVisitor;
+class DFTreeTraversal;
+class DFEdgeVisitor;
+class DFGraph;
+
+//template<> struct GraphTraits
+
+typedef std::vector<DFNode*> DFNodeListType;
+
+class DFGraph {
+
+private:
+  typedef std::vector<DFEdge*> DFEdgeListType;
+
+  // Important things that make up a Dataflow graph
+  //  DFLeafNode* Entry;
+  DFInternalNode* Parent;
+  DFNodeListType ChildrenList;    ///< List of children Dataflow Nodes
+  DFEdgeListType DFEdgeList;      ///< List of Dataflow edges among children
+
+public:
+  DFGraph(DFInternalNode* P) {
+    //ChildrenList.push_back(llvm::DFLeafNode::Create(NULL, NULL, NULL));
+    Parent = P;
+  }
+
+  void addChildDFNode(DFNode* child) {
+    ChildrenList.push_back(child);
+  }
+
+  void addDFEdge(DFEdge* E) {
+    DFEdgeList.push_back(E);
+  }
+
+  // Iterators
+  typedef DFNodeListType::iterator children_iterator;
+  typedef DFNodeListType::const_iterator const_children_iterator;
+
+  typedef DFEdgeListType::iterator dfedge_iterator;
+  typedef DFEdgeListType::const_iterator const_dfedge_iterator;
+
+  //===--------------------------------------------------------------------===//
+  // DFNodeList iterator forwarding functions
+  //
+  children_iterator       begin()       { return ChildrenList.begin(); }
+  const_children_iterator begin() const { return ChildrenList.begin(); }
+  children_iterator       end  ()       { return ChildrenList.end();   }
+  const_children_iterator end  () const { return ChildrenList.end();   }
+
+  size_t                   size() const { return ChildrenList.size();  }
+  bool                    empty() const { return ChildrenList.empty(); }
+  const DFNode           *front() const { return ChildrenList.front(); }
+        DFNode           *front()       { return ChildrenList.front(); }
+  const DFNode            *back() const { return ChildrenList.back();  }
+        DFNode            *back()       { return ChildrenList.back();  }
+
+  //===--------------------------------------------------------------------===//
+
+  //===--------------------------------------------------------------------===//
+  // DFEdgeList iterator forwarding functions
+  //
+  dfedge_iterator       dfedge_begin()       { return DFEdgeList.begin(); }
+  const_dfedge_iterator dfedge_begin() const { return DFEdgeList.begin(); }
+  dfedge_iterator       dfedge_end  ()       { return DFEdgeList.end();   }
+  const_dfedge_iterator dfedge_end  () const { return DFEdgeList.end();   }
+
+  size_t                 dfedge_size() const { return DFEdgeList.size();  }
+  bool                  dfedge_empty() const { return DFEdgeList.empty(); }
+  const DFEdge         *dfedge_front() const { return DFEdgeList.front(); }
+        DFEdge         *dfedge_front()       { return DFEdgeList.front(); }
+  const DFEdge          *dfedge_back() const { return DFEdgeList.back();  }
+        DFEdge          *dfedge_back()       { return DFEdgeList.back();  }
+
+  //===--------------------------------------------------------------------===//
+
+  DFInternalNode* getParent() {
+    return Parent;
+  }
+
+};
+
+// DFNode represents a single VISC Dataflow Node in LLVM.
+// This is an abstract class.
+//
+// A Dataflow Node basically consists of
+// 1. Pointer to a function describing this dataflow node
+// 2. Number of dimensions in which the node is replicated
+// 3. Number of instances in each dimension
+// 4. Pointer to parent Dataflow Node
+// 5. List of children Dataflow Nodes (empty if it is a leaf node)
+// 6. List of Dataflow Edges among children
+
+class DFNode {
+
+  public:
+  enum DFNodeKind {
+    Internal,
+    Leaf
+  };
+
+  private:
+
+  const DFNodeKind Kind;
+
+  // Important things that make up a Dataflow Node
+  IntrinsicInst* II;              ///< Associated IntrinsicInst/Value
+  Function* FuncPointer;          ///< Associated Function
+  DFNode* Parent;                 ///< Pointer to parent dataflow Node
+  int NumOfDim;                   ///< Number of dimensions
+  std::vector<Value*> DimLimits;  ///< Number of instances in each dimension
+  DFNodeListType Successors;      ///< List of successors i.e.,
+                                  ///< destination DFNodes to DFEdges
+                                  ///< originating from this DFNode
+
+  public:
+  DFNodeKind getKind() const {return Kind;}
+
+  // Iterators
+  typedef DFNodeListType::iterator successor_iterator;
+  typedef DFNodeListType::const_iterator const_successor_iterator;
+
+  //===--------------------------------------------------------------------===//
+  // DFNodeList iterator forwarding functions
+  //
+  successor_iterator       successors_begin()        { return Successors.begin(); }
+  const_successor_iterator successors_begin()  const { return Successors.begin(); }
+  successor_iterator       successors_end  ()        { return Successors.end();   }
+  const_successor_iterator successors_end  ()  const { return Successors.end();   }
+
+  size_t                   successors_size()   const { return Successors.size();  }
+  bool                     successors_empty()  const { return Successors.empty(); }
+  const DFNode*            successors_front()  const { return Successors.front(); }
+        DFNode*            successors_front()        { return Successors.front(); }
+  const DFNode*            successors_back()   const { return Successors.back();  }
+        DFNode*            successors_back()         { return Successors.back();  }
+
+  //===--------------------------------------------------------------------===//
+
+  // Functions
+  DFNode(DFNodeKind _Kind, IntrinsicInst* _II, Function* _FuncPointer, DFNode* _Parent,
+         int _NumOfDim, std::vector<Value*> _DimLimits) : Kind(_Kind), II(_II),
+         FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim),
+         DimLimits(_DimLimits) {}
+
+  void addSuccessor(DFNode* N) {
+    Successors.push_back(N);
+  }
+
+  Function* getFuncPointer() {
+    return FuncPointer;
+  }
+
+
+
+  virtual void applyDFNodeVisitor(DFNodeVisitor &V, DFNodeListType *L = NULL) = 0;
+//  virtual void applyDFEdgeVisitor(DFEdgeVisitor &V) = 0;
+
+};
+
+
+class DFInternalNode : public DFNode {
+
+  private:
+  DFGraph* childGraph;
+
+  // Constructor
+  DFInternalNode(IntrinsicInst* II, Function* FuncPointer, DFNode* Parent,
+                 int NumOfDim, std::vector<Value*> DimLimits) :
+                 DFNode(Internal, II, FuncPointer, Parent, NumOfDim, DimLimits) {
+    childGraph = new DFGraph(this);
+    //childGraph->addChildDFNode(DFLeafNode::Create(NULL, NULL, this));
+  }
+
+  public:
+  static DFInternalNode *Create(IntrinsicInst* II, Function* FuncPointer,
+                                DFNode* Parent = NULL, int NumOfDim = 0,
+                                std::vector<Value*> DimLimits = std::vector<Value*>()) {
+    return new DFInternalNode(II, FuncPointer, Parent, NumOfDim, DimLimits);
+  }
+
+  static bool classof(const DFNode *N) {
+    return N->getKind() == Internal;
+  }
+ 
+
+  void addChildToDFGraph(DFNode* N) {
+    childGraph->addChildDFNode(N);
+  }
+
+  void addEdgeToDFGraph(DFEdge* E) {
+    childGraph->addDFEdge(E);
+  }
+
+  DFGraph* getChildGraph() {
+    return childGraph;
+  }
+
+  void applyDFNodeVisitor(DFNodeVisitor &V, DFNodeListType *L = NULL); /*virtual*/
+//  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
+};
+
+class DFLeafNode : public DFNode {
+  private:
+  // Constructor
+  DFLeafNode(IntrinsicInst* II, Function* FuncPointer, DFNode* Parent,
+             int NumOfDim = 0, std::vector<Value*> DimLimits = std::vector<Value*>())
+            : DFNode(Leaf, II, FuncPointer, Parent, NumOfDim, DimLimits) {}
+
+  public:
+
+  static DFLeafNode *Create(IntrinsicInst* II, Function* FuncPointer,
+                            DFNode* Parent, int NumOfDim = 0,
+                            std::vector<Value*> DimLimits = std::vector<Value*>()) {
+    return new DFLeafNode(II, FuncPointer, Parent, NumOfDim, DimLimits);
+  }
+
+  static bool classof(const DFNode *N) {
+    return N->getKind() == Leaf;
+  }
+
+
+  void applyDFNodeVisitor(DFNodeVisitor &V, DFNodeListType *L = NULL); /*virtual*/
+//  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
+
+};
+
+// DFEdge represents a single VISC Dataflow Edge in LLVM.
+//
+// A Dataflow Edge basically consists of
+// 1. Pointer to the dataflow node that is the source of this edge
+// 2. Pointer to the dataflow node that is the destination of this edge
+// 3. Pointer to a function that describes which instances of the source
+//    dataflow node are connected to which instances of the destination
+//    dataflow node via this edge
+// 4. Pointer to a function that describes which input arguments of the
+//    destination dataflow node are connected to which outputs of the source
+//    dataflow node via this edge
+
+class DFEdge {
+  private:
+  // Important things that make up a Dataflow Edge
+  DFNode* SrcDF;                ///< Pointer to source dataflow Node
+  DFNode* DestDF;               ///< Pointer to destination dataflow Node
+  Function* DFMapFuncPointer;   ///< Function that associates the appropriate
+                                ///< instances of source and destination
+                                ///< dataflow nodes
+  Function* ArgMapFuncPointer;  ///< Function that associates the input
+                                ///< arguments of destination with the outputs
+                                ///< of source dataflow node
+  // Functions
+  DFEdge(DFNode* _SrcDF, DFNode* _DestDF, Function* _DFMapFuncPointer,
+         Function* _ArgMapFuncPointer) : SrcDF(_SrcDF), DestDF(_DestDF),
+         DFMapFuncPointer(_DFMapFuncPointer),
+         ArgMapFuncPointer(_ArgMapFuncPointer) {}
+
+  public:
+
+  static DFEdge *Create(DFNode* SrcDF, DFNode* DestDF, Function* DFMapFuncPtr,
+                        Function* ArgMapFuncPtr) {
+    return new DFEdge(SrcDF, DestDF, DFMapFuncPtr, ArgMapFuncPtr);
+
+  }
+};
+
+
+//===-------------------------- Visitor Classes ---------------------------===//
+// Visitor for DFNode objects
+class DFNodeVisitor {
+  public:
+  virtual void visit(DFInternalNode* N, DFNodeListType* L = NULL) = 0;
+  virtual void visit(DFLeafNode* N, DFNodeListType* L = NULL) = 0;
+};
+
+class DFTreeTraversal : public DFNodeVisitor {
+
+  public:
+  virtual void visit(DFInternalNode* N, DFNodeListType *L = NULL){
+    errs() << "Visted Node (I) - " << N->getFuncPointer()->getName() << "\n";
+    if (L != NULL)
+      L->push_back(N);
+    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
+        e = N->getChildGraph()->end(); i != e; ++i) {
+      DFNode* child = *i;
+      child->applyDFNodeVisitor(*this, L);
+    }
+  }
+
+  virtual void visit(DFLeafNode* N, DFNodeListType *L = NULL) {
+    errs() << "Visted Node (L) - " << N->getFuncPointer()->getName() << "\n";
+    if (L != NULL)
+      L->push_back(N);
+  }
+
+};
+
+class FollowSuccessors : public DFNodeVisitor {
+
+  public:
+  virtual void visit(DFInternalNode* N, DFNodeListType *L = NULL) {
+    errs() << "Visted Node (I) - " << N->getFuncPointer()->getName() << "\n";
+    for(DFInternalNode::successor_iterator i = N->successors_begin(),
+        e = N->successors_end(); i != e; ++i) {
+      /* Traverse the graph.
+       * Choose the kind of traversal we want
+       * Do we do a DAG kind of traversal?
+       */
+    }
+  }
+
+  virtual void visit(DFLeafNode* N, DFNodeListType* L = NULL) {
+    errs() << "Visted Node (L) - " << N->getFuncPointer()->getName() << "\n";
+  }
+};
+
+// Print functions
+inline raw_ostream& operator<<(raw_ostream &O, DFInternalNode &N) {
+    O << N.getFuncPointer()->getName();
+    return O;
+}
+
+inline raw_ostream& operator<<(raw_ostream &O, DFLeafNode &N) {
+    O << N.getFuncPointer()->getName();
+    return O;
+}
+
+/*
+// Visitor for DFEdge objects
+class DFEdgeVisitor {
+public:
+  virtual void visit(DFEdge* E) = 0;
+};
+
+
+//===--------------------------------------------------------------------===//
+// GraphTraits specializations for DFNode graph (DFG)
+//===--------------------------------------------------------------------===//
+
+// Provide specializations of GraphTraits to be able to treat a DFNode as a
+// graph of DFNodes...struct GraphTraits {
+  // Elements to provide:
+
+  // typedef NodeType          - Type of Node in the graph
+  // typedef ChildIteratorType - Type used to iterate over children in graph
+
+  // static NodeType *getEntryNode(const GraphType &)
+  //    Return the entry node of the graph
+
+  // static ChildIteratorType child_begin(NodeType *)
+  // static ChildIteratorType child_end  (NodeType *)
+  //    Return iterators that point to the beginning and ending of the child
+  //    node list for the specified node.
+  //
+
+
+  // typedef  ...iterator nodes_iterator;
+  // static nodes_iterator nodes_begin(GraphType *G)
+  // static nodes_iterator nodes_end  (GraphType *G)
+  //    nodes_iterator/begin/end - Allow iteration over all nodes in the graph
+
+  // static unsigned       size       (GraphType *G)
+  //    Return total number of nodes in the graph
+  //
+
+
+  // If anyone tries to use this class without having an appropriate
+  // specialization, make an error.  If you get this error, it's because you
+  // need to include the appropriate specialization of GraphTraits<> for your
+  // graph, or you need to define it for a new graph type. Either that or
+  // your argument to XXX_begin(...) is unknown or needs to have the proper .h
+  // file #include'd.
+  //
+  typedef typename GraphType::UnknownGraphTypeError NodeType;
+//};
+*/
+
+} // End llvm namespace
+
+#endif
diff --git a/include/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h b/include/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h
new file mode 100644
index 0000000000..dfbd09402d
--- /dev/null
+++ b/include/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h
@@ -0,0 +1,25 @@
+#ifndef __EXTRACT_HPVM_LEAF_NODE_FUNCTIONS_H__
+#define __EXTRACT_HPVM_LEAF_NODE_FUNCTIONS_H__
+
+//===-------------------- ExtractHPVMLeafNodeFunctions.h ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Module.h"
+#include "llvm/BuildDFG/BuildDFG.h"
+
+namespace extracthpvmleaf {
+
+class ExtractHPVMLeafNodeFunctions {
+public:
+  void run(Module &M, builddfg::BuildDFG &DFG);
+};
+
+} // end namespace extracthpvmleaf
+
+#endif
\ No newline at end of file
diff --git a/include/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h b/include/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h
new file mode 100644
index 0000000000..72812071a3
--- /dev/null
+++ b/include/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h
@@ -0,0 +1,178 @@
+#ifndef __FUSE_HPVM_TENSOR_NODES_H__
+#define __FUSE_HPVM_TENSOR_NODES_H__
+
+//===                         FuseHPVMTensorNodes.h                        ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/DFGraph.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+#include "llvm/BuildDFG/BuildDFG.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+
+using namespace llvm;
+
+namespace tensorfuse {
+
+class FuseHPVMTensorNodes {
+public:
+  typedef std::vector< std::vector< IntrinsicInst* > > FusionTargets;
+private:
+  // Member variables
+
+  // Functions
+
+/* Create an identical bind (in or out, depending on the argument intrinsic)  *
+ * with different src (true) or dst (false) port                              */
+  IntrinsicInst* createIdenticalBindWithDifferentPort(IntrinsicInst* II,
+                                                      unsigned port,
+                                                      bool srcport);
+/* Given two createNode intrinsics describing connected nodes, this function  *
+ * returns the argument list type of the fused function                       */
+  void createArgTypes(IntrinsicInst* II1,
+                      IntrinsicInst* II2,
+                      std::vector<Type*> &ArgTypes);
+/* Get the return type of the function for fused node II1-II2                 */
+  StructType* createReturnType(IntrinsicInst* II1, IntrinsicInst* II2);
+/* Copy argument names, from functions of II1 and II2 to F                    */
+  void copyArgumentNames(IntrinsicInst* II1,
+                         IntrinsicInst* II2,
+                         Function* F);
+/* Copy attributes, from functions of II1 and II2 to F                        */
+  void copyAttrList(IntrinsicInst* II1,
+                    IntrinsicInst* II2,
+                    Function* F);
+/* Creates and inserts an empty function of the right type for the fused node */
+  Function* createEmptyDFNodeFunction(IntrinsicInst* II1,
+                                      IntrinsicInst* II2,
+                                      Module &M);
+/* Inline first node function, updating required mappings                     *
+ * - F1: first node function                                                  *
+ * - M:  module containing the node function                                  *
+ * - Ffused: fused node function                                              *
+ * - VMap: maps values used in the body of F1 to those that must be used in   *
+           the body of the fused function instead                             *
+ * OutVs: This maps the output struct field index to the stored value         */
+  void inlineFirstNodeFunction(Module &M,
+                               Function *F1,
+                               Function *Ffused,
+                               ValueMap<Value*, Value*> &VMap,
+                               std::vector<Value*> &OutVs);
+/* Inline second node function, updating required mappings                    *
+ * - F2: second node function                                                 *
+ * - M:  module containing the node function                                  *
+ * - Ffused: fused node function                                              *
+ * - VMap: maps values used in the body of F2 to those that must be used in   *
+           the body of the fused function instead                             */
+  void inlineSecondNodeFunction(Module &M,
+                                Function *F2,
+                                Function *Ffused,
+                                ValueMap<Value*, Value*> &VMap);
+/* Create function of leaf node after fusion                                  *
+ * - create type                                                              *
+ * - create empty function of the type                                        *
+ * - inline body of first function (applying and updating appropriate         *
+ *   mappings)                                                                *
+ * - inline body of second function (applying and updating appropriate        *
+ *   mappings)                                                                */
+  Function* createLeafDFNodeFunction(IntrinsicInst* II1,
+                                     IntrinsicInst* II2,
+                                     Module &M);
+/* Updates parent of fused nodes to use the new node intrinsic                */
+  void updateParentNodeFunction(IntrinsicInst* II1,
+                                IntrinsicInst* II2,
+                                IntrinsicInst* IInew);
+/* Performs all operations required at the IR level for fusion of HPVM tensor *
+ * nodes with intrinsic instructions II1 and II2                              *
+ * - Creates fused node function                                              *
+ * - Creates createNode intrinsic for it and returns it                       *
+ * - Updates parent function:                                                 *
+ * - - adds new intrinsic                                                     *
+ * - - edges and binds consistently use the new intrinsic                     *
+ * - Removes old functions                                                    */
+  IntrinsicInst* FuseHPVMTensorNodesStep(IntrinsicInst* II1,
+                                         IntrinsicInst* II2,
+                                         Module &M);
+/* Fuse node sequence described by createNode intrinsics in IIs.              *
+ * Contents of IIs are cleared.                                               */
+  void FuseHPVMTensorNodeSequence(std::vector<IntrinsicInst*> &IIs, Module &M);
+public:
+  void run(Module &M, FusionTargets &FTs);
+
+  void printFusionTargets(FusionTargets &FTs);
+};
+
+// Visitor for finding nodes to fuse
+class FindFusionTargetsTraversal : public dfg2llvm::CodeGenTraversal {
+
+private:
+  typedef std::map< visc::Target, std::vector< std::vector<Intrinsic::ID> > >
+          FusePatterns;
+  //Member variables
+
+  /* Map, from HPVM target to sequences of intrinsic IDs that if found,
+     need to be fused                                                   */
+  /* TODO: use this in the future. Current (for PLDI 2018) implementation
+   * - assumes only two patterns, for PROMISE
+   * - assumes that nodes belonging to a single pattern only, if any.  */
+//  FusePatterns FPs;
+  FuseHPVMTensorNodes::FusionTargets FTs;
+  //Functions
+
+  // Virtual Functions
+  void init() {}
+  void initRuntimeAPI() {}
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+  // Constructor
+
+  FindFusionTargetsTraversal(Module &_M, builddfg::BuildDFG &_DFG) :
+    CodeGenTraversal(_M, _DFG) {
+/*    FPs[visc::PROMISE_TARGET] = { {Intrinsic::visc_tensor_conv,
+                                   Intrinsic::visc_tensor_add,
+                                   Intrinsic::visc_tensor_relu,
+                                   Intrinsic::visc_tensor_pooling
+                                  },
+                                  {Intrinsic::visc_tensor_mul,
+                                   Intrinsic::visc_tensor_add,
+                                   Intrinsic::visc_tensor_relu
+                                  }
+                                }
+*/
+  }
+
+  FuseHPVMTensorNodes::FusionTargets &getFusionTargets() {
+    return FTs;
+  }
+
+};
+
+struct FuseHPVMTensorNodesWrapper : public ModulePass { // legacy-PM driver for FuseHPVMTensorNodes
+  static char ID; // Pass identification, replacement for typeid
+  FuseHPVMTensorNodesWrapper() : ModulePass(ID) {}
+
+private:
+  // Member variables
+
+public:
+  // Functions
+  void getAnalysisUsage(AnalysisUsage &AU) const { // fusion needs the DFG built first
+    AU.addRequired<builddfg::BuildDFG>();
+  }
+
+  bool runOnModule(Module &M);
+
+};
+
+} // End of namespace
+
+#endif
diff --git a/include/GenVISC/GenVISC.h b/include/GenVISC/GenVISC.h
new file mode 100644
index 0000000000..fcdb636a05
--- /dev/null
+++ b/include/GenVISC/GenVISC.h
@@ -0,0 +1,56 @@
+#ifndef __GEN_VISC_H__
+#define __GEN_VISC_H__
+
+//== GenVISC.h - Header file for "LLVM IR to VISC IR Pass" =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+
+using namespace llvm;
+
+namespace genvisc {
+// GenVISC - The first implementation.
+struct GenVISC : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  GenVISC() : ModulePass(ID) {}
+
+
+private:
+  // Member variables
+  Module* M;
+  Constant* llvm_visc_initializeTimerSet;
+  Constant* llvm_visc_switchToTimer;
+  Constant* llvm_visc_printTimerSet;
+
+  GlobalVariable* TimerSet;
+
+  // Functions
+  void initializeTimerSet(Instruction*);
+  void switchToTimer(enum visc_TimerID, Instruction*);
+  void printTimerSet(Instruction*);
+  Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = "");
+
+public:
+  // Functions
+  virtual bool runOnModule(Module &M);
+
+  void generateTest(CallInst* CI);
+  Function* genKernel(Function* KernelFunction, CallInst* CI, StructType* RetTy);
+  void genHost(CallInst*, Function*, unsigned, unsigned, unsigned, unsigned, StructType*);
+};
+
+} // End of namespace
+
+#endif
diff --git a/include/InPlaceDFG/InPlaceDFGAnalysis.h b/include/InPlaceDFG/InPlaceDFGAnalysis.h
new file mode 100644
index 0000000000..fc4c7f3ee9
--- /dev/null
+++ b/include/InPlaceDFG/InPlaceDFGAnalysis.h
@@ -0,0 +1,52 @@
+#ifndef __IN_PLACE_DFG_ANALYSIS_H__
+#define __IN_PLACE_DFG_ANALYSIS_H__
+
+//===------------------------- InPlaceDFGAnalysis.h -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/DFGraph.h"
+#include "llvm/BuildDFG/BuildDFG.h"
+
+using namespace llvm;
+
+namespace inplacedfg {
+
+// InPlaceDFGAnalysis - fills a per-DFNode vector<bool> (in-place eligibility flags, presumably one per pointer argument — TODO confirm against the .cpp)
+class InPlaceDFGAnalysis{
+public:
+  typedef std::map<DFNode*, std::vector<bool> > InPlaceDFGParameter;
+
+  void run(Module &M, builddfg::BuildDFG &DFG, InPlaceDFGParameter &IPP); // populates IPP for the nodes of DFG
+};
+
+// InPlaceDFGAnalysisWrapper pass for ApproxHPVM - The first implementation.
+struct InPlaceDFGAnalysisWrapper : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  InPlaceDFGAnalysisWrapper() : ModulePass(ID) {}
+
+private:
+  // Member variables
+  InPlaceDFGAnalysis::InPlaceDFGParameter IPP;
+
+public:
+  // Functions
+  bool runOnModule(Module &M);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  const InPlaceDFGAnalysis::InPlaceDFGParameter &getIPP();
+};
+
+// Helper Functions
+void printInPlaceDFGParameter(InPlaceDFGAnalysis::InPlaceDFGParameter &IPP);
+
+} // End of namespace
+
+#endif
diff --git a/include/SupportVISC/DFG2LLVM.h b/include/SupportVISC/DFG2LLVM.h
new file mode 100644
index 0000000000..355fb18570
--- /dev/null
+++ b/include/SupportVISC/DFG2LLVM.h
@@ -0,0 +1,497 @@
+#ifndef __DFG2LLVM_H__
+#define __DFG2LLVM_H__
+
+//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/BuildDFG/BuildDFG.h"
+#include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/VISCUtils.h"
+
+using namespace llvm;
+using namespace builddfg;
+
+#define TIMER(X) do { if (VISCTimer) { X; } } while (0)
+#define DECLARE(X) X = M.getOrInsertFunction(#X, \
+    runtimeModule->getFunction(#X)->getFunctionType()); \
+    DEBUG(errs() << *X)
+
+namespace dfg2llvm {
+// Helper Functions
+static inline ConstantInt* getTimerID(Module&, enum visc_TimerID);
+static inline ConstantInt* getTargetID(Module&, enum visc::Target);
+
+bool hasAttribute(Function*, unsigned, Attribute::AttrKind);
+
+// DFG2LLVM abstract class implementation
+class DFG2LLVM : public ModulePass {
+protected:
+  DFG2LLVM(char ID) : ModulePass(ID) {}
+
+  // Member variables
+
+  // Functions
+
+public:
+  // Pure Virtual Functions
+  virtual bool runOnModule(Module &M) = 0;
+
+  // Functions
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<BuildDFG>();
+    AU.addPreserved<BuildDFG>();
+  }
+
+};
+
+// Abstract Visitor for Code generation traversal (tree traversal for now)
+class CodeGenTraversal : public DFNodeVisitor {
+
+protected:
+  //Member variables
+  Module &M;
+  BuildDFG &DFG;
+  bool VISCTimer = false;
+  std::string TargetName = "None";
+  
+  // Map from Old function associated with DFNode to new cloned function with
+  // extra index and dimension arguments. This map also serves to find out if
+  // we already have an index and dim extended function copy or not (i.e.,
+  // "Have we visited this function before?")
+  DenseMap<DFNode*, Value*> OutputMap;
+
+  // VISC Runtime API
+  std::unique_ptr<Module> runtimeModule;
+
+  Constant* llvm_visc_initializeTimerSet;
+  Constant* llvm_visc_switchToTimer;
+  Constant* llvm_visc_printTimerSet;
+  GlobalVariable* TimerSet;
+  GlobalVariable* GraphIDAddr;
+  Instruction* InitCall;
+  Instruction* CleanupCall;
+
+
+  // Functions
+  Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = "");
+//  void addArgument(Function*, Type*, const Twine& Name = "");
+  Function *addArgument(Function*, Type*, const Twine& Name = "");
+//  void addIdxDimArgs(Function* F);
+  Function *addIdxDimArgs(Function* F);
+  std::vector<Value*> extractElements(Value*, std::vector<Type*>,
+      std::vector<std::string>, Instruction*);
+  Argument* getArgumentAt(Function* F, unsigned offset);
+  void initTimerAPI();
+
+  // Pure Virtual Functions
+  virtual void init() = 0;
+  virtual void initRuntimeAPI() = 0;
+  virtual void codeGen(DFInternalNode* N) = 0;
+  virtual void codeGen(DFLeafNode* N) = 0;
+
+  // Virtual Functions
+  virtual void initializeTimerSet(Instruction*);
+  virtual void switchToTimer(enum visc_TimerID, Instruction*);
+  virtual void printTimerSet(Instruction*);
+
+  virtual ~CodeGenTraversal() {}
+
+
+public:
+
+  // Constructor
+  CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
+
+  static bool checkPreferredTarget(DFNode* N, visc::Target T);
+  static bool preferredTargetIncludes(DFNode* N, visc::Target T);
+  visc::Target getPreferredTarget(DFNode *N);
+
+  virtual void visit(DFInternalNode* N) {
+    // If code has already been generated for this internal node, skip the
+    // children
+    if(N->getGenFunc() != NULL)
+      return;
+
+    errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n";
+
+    // Follows a bottom-up approach for code generation.
+    // First generate code for all the child nodes
+    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
+        e = N->getChildGraph()->end(); i != e; ++i) {
+      DFNode* child = *i;
+      child->applyDFNodeVisitor(*this);
+    }
+    // Generate code for this internal node now. This way all the cloned
+    // functions for children exist.
+    codeGen(N);
+    errs() << "DONE: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n";
+  }
+
+  virtual void visit(DFLeafNode* N) {
+    errs() << "Start: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n";
+    codeGen(N);
+    errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n";
+  }
+};
+
+// -------------- CodeGenTraversal Implementation -----------------
+
+bool CodeGenTraversal::checkPreferredTarget(DFNode* N, visc::Target T) {
+  Function* F = N->getFuncPointer();
+  Module* M = F->getParent();
+  NamedMDNode* HintNode;
+  switch (T) {
+    case visc::GPU_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+      break;
+    case visc::SPIR_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+      break;
+    case visc::CUDNN_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn");
+      break;
+    case visc::PROMISE_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_promise");
+      break;
+    case visc::CPU_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+      break;
+    default:
+      llvm_unreachable("Target Not supported yet!");
+  }
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MetaNode = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return true;
+  }
+  return false;
+}
+
+visc::Target CodeGenTraversal::getPreferredTarget(DFNode *N) {
+
+  Function* F = N->getFuncPointer();
+  Module* M = F->getParent();
+  NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MetaNode = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::CPU_TARGET;
+  }
+
+  HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MetaNode = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::GPU_TARGET;
+  }
+
+  HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MetaNode = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::SPIR_TARGET;
+  }
+
+  HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn");
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MetaNode = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::CUDNN_TARGET;
+  }
+
+  HintNode = M->getOrInsertNamedMetadata("visc_hint_promise");
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MetaNode = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::PROMISE_TARGET;
+  }
+
+  HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MetaNode = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::CPU_OR_GPU_TARGET;
+  }
+
+  HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir");
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MetaNode = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::CPU_OR_SPIR_TARGET;
+  }
+
+  return visc::None;
+}
+
+bool CodeGenTraversal::preferredTargetIncludes(DFNode* N, visc::Target T) {
+
+  Function* F = N->getFuncPointer();
+  Module* M = F->getParent();
+  std::vector<NamedMDNode *> HintNode;
+  switch (T) {
+    case visc::GPU_TARGET:
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu"));
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
+      break;
+    case visc::SPIR_TARGET:
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir"));
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+      break;
+    case visc::CPU_TARGET:
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu"));
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+      break;
+    case visc::CUDNN_TARGET:
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn"));
+      break;
+    case visc::PROMISE_TARGET:
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise"));
+      break;
+    case visc::CPU_OR_GPU_TARGET:
+    case visc::CPU_OR_SPIR_TARGET:
+      assert(false && "Target should be one of CPU/GPU/SPIR\n");
+      break;
+    default:
+      llvm_unreachable("Target Not supported yet!");
+  }
+
+  for (unsigned h = 0; h < HintNode.size(); h++) {
+    for (unsigned i = 0; i < HintNode[h]->getNumOperands(); i++) {
+      MDNode *MetaNode = HintNode[h]->getOperand(i);
+      Value *FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+      if (F == FHint)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+
+// Generate Code for declaring a constant string [L x i8] and return a pointer
+// to the start of it.
+Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
+  Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true);
+  Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true,
+                                      GlobalValue::InternalLinkage, SConstant, Name);
+  Value* Zero = ConstantInt::get(Type::getInt64Ty(M.getContext()), 0);
+  Value* GEPArgs[] = {Zero, Zero};
+  GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal,
+                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
+  return SPtr;
+}
+
+// Add an argument of type Ty to the given function F
+//void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) {
+//  // Add the argument to argument list
+//  new Argument(Ty, name, F);
+//
+//  // Create the argument type list with added argument types
+//  std::vector<Type*> ArgTypes;
+//  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+//      ai != ae; ++ai) {
+//    ArgTypes.push_back(ai->getType());
+//  }
+//  // Adding new arguments to the function argument list, would not change the
+//  // function type. We need to change the type of this function to reflect the
+//  // added arguments
+//  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+//  PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
+//
+//  // Change the function type
+//  F->mutateType(PTy);
+//}
+
+// Creates a function with an additional argument of the specified type and
+// name. The previous function is not deleted.
+Function *CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) {
+  // Add the argument to argument list
+  new Argument(Ty, name, F);
+
+  // Create the argument type list with added argument types
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+  // Adding new arguments to the function argument list, would not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments. So, we create a clone of this function with the correct
+  // type.
+  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+  Function *newF = viscUtils::cloneFunction(F, FTy, false);
+
+  // Check if the function is used by a metadata node
+  if(F->isUsedByMetadata()) {
+    viscUtils::fixHintMetadata(*F->getParent(), F, newF);
+  }
+
+  return newF;
+}
+
+// Change the argument list of function F to add index and limit arguments
+//void CodeGenTraversal::addIdxDimArgs(Function* F) {
+//  // Add Index and Dim arguments
+//  std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
+//  for (int i = 0; i < 6; ++i) {
+//    addArgument(F, Type::getInt32Ty(F->getContext()), names[i]);
+//  }
+//}
+
+// Return new function with additional index and limit arguments.
+// The original function is removed from the module and erased.
+Function *CodeGenTraversal::addIdxDimArgs(Function* F) {
+  errs() << "Function Type: " << *F->getFunctionType() << "\n";
+  // Add Index and Dim arguments
+  std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
+  Function *newF;
+  for (int i = 0; i < 6; ++i) {
+    newF = addArgument(F, Type::getInt64Ty(F->getContext()), names[i]);
+    F->replaceAllUsesWith(UndefValue::get(F->getType()));
+    F->eraseFromParent();
+    F = newF;
+  }
+  errs() << "Function Type after adding args: " << *newF->getFunctionType() << "\n";
+  return newF;
+}
+
+// Extract elements from an aggregate value. TyList contains the type of each
+// element, and names vector contains a name. IB is the instruction before which
+// all the generated code would be inserted.
+std::vector<Value*> CodeGenTraversal::extractElements(Value* Aggregate,
+    std::vector<Type*> TyList, std::vector<std::string> names, Instruction* IB) {
+  // Extract input data from i8* Aggregate.addr and store them in a vector.
+  // For each argument
+  std::vector<Value*> Elements;
+  GetElementPtrInst* GEP = nullptr; // stays null until a "nextArg" GEP is emitted
+  unsigned argNum = 0;
+  for(Type* Ty: TyList) {
+    // BitCast: %arg.addr = bitcast i8* Aggregate.addr to <pointer-to-argType>
+    CastInst* BI = BitCastInst::CreatePointerCast(Aggregate,
+                   Ty->getPointerTo(),
+                   names[argNum]+".addr",
+                   IB);
+    // Load: %arg = load <pointer-to-argType> %arg.addr
+    LoadInst* LI = new LoadInst(BI, names[argNum], IB);
+    // Patch argument to call instruction
+    Elements.push_back(LI);
+    //errs() << "Pushing element " << *LI << "\n";
+    //CI->setArgOperand(argNum, LI);
+
+    // The last GEP statement is left out, as there are no more arguments.
+    // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr, i64 1
+    // This essentially takes us to the next argument in memory
+    Constant* IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1);
+    if (argNum < TyList.size()-1)
+      GEP = GetElementPtrInst::Create(nullptr, BI,
+                                                        ArrayRef<Value*>(IntOne),
+                                                        "nextArg",
+                                                        IB);
+    // Increment argNum and for the next iteration use result of this GEP to
+    // extract next argument
+    argNum++;
+    // Guard: with a single-element TyList, GEP was never assigned (this read
+    // was previously UB — uninitialized pointer).
+    if (GEP) Aggregate = GEP;
+  }
+  return Elements;
+}
+
+// Traverse the function F argument list to get argument at offset
+Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) {
+  DEBUG(errs() << "Finding argument " << offset << ":\n");
+  // offset is unsigned, so the former "offset >= 0" clause was a tautology.
+  assert(F->getFunctionType()->getNumParams() > offset
+         && "Invalid offset to access arguments!");
+  Argument* arg;
+  Function::arg_iterator i = F->arg_begin(), e = F->arg_end();
+  for(; offset != 0 && i!=e; i++) {
+    offset--;
+  }
+  arg = &*i;
+  DEBUG(errs() << "\t" << *arg <<"\n");
+  return arg;
+}
+
+void CodeGenTraversal::initTimerAPI() {
+  DECLARE(llvm_visc_initializeTimerSet);
+  DECLARE(llvm_visc_switchToTimer);
+  DECLARE(llvm_visc_printTimerSet);
+}
+
+// Timer Routines
+// Initialize the timer set
+void CodeGenTraversal::initializeTimerSet(Instruction* InsertBefore) {
+  DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n");
+  TIMER(TimerSet = new GlobalVariable(M,
+                                      Type::getInt8PtrTy(M.getContext()),
+                                      false,
+                                      GlobalValue::CommonLinkage,
+                                      Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+                                      Twine("viscTimerSet_")+TargetName);
+    errs() << "New global variable: " << *TimerSet << "\n";
+
+    Value* TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
+                                          None,
+                                          "",
+                                          InsertBefore);
+    new StoreInst(TimerSetAddr, TimerSet, InsertBefore);
+  );
+}
+
+void CodeGenTraversal::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) {
+  Value* switchArgs[] = {TimerSet, getTimerID(M, timer)};
+  TIMER(CallInst::Create(llvm_visc_switchToTimer,
+                         ArrayRef<Value*>(switchArgs, 2),
+                         "",
+                         InsertBefore));
+}
+
+void CodeGenTraversal::printTimerSet(Instruction* InsertBefore) {
+  Value* TimerName;
+  TIMER(TimerName = getStringPointer(TargetName+Twine("_Timer"), InsertBefore));
+  Value* printArgs[] = {TimerSet, TimerName};
+  TIMER(CallInst::Create(llvm_visc_printTimerSet,
+                         ArrayRef<Value*>(printArgs, 2),
+                         "",
+                         InsertBefore));
+}
+
+// Implementation of Helper Functions
+static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
+  return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
+}
+
+static inline ConstantInt* getTargetID(Module& M, enum visc::Target T) {
+  return ConstantInt::get(Type::getInt32Ty(M.getContext()), T);
+}
+
+// Find if argument has the given attribute
+bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) {
+  return F->getAttributes().hasAttribute(arg_index+1, AK);
+}
+
+} // End of namespace
+
+#endif
+
diff --git a/include/SupportVISC/DFGTreeTraversal.h b/include/SupportVISC/DFGTreeTraversal.h
new file mode 100644
index 0000000000..c031c112fe
--- /dev/null
+++ b/include/SupportVISC/DFGTreeTraversal.h
@@ -0,0 +1,64 @@
+#ifndef __DFGTREETRAVERSAL_H__
+#define __DFGTREETRAVERSAL_H__
+	
+//=== DFGTreeTraversal.h - Header file for Tree Traversal of the HPVM DFG ====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+	
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/BuildDFG/BuildDFG.h"
+	
+using namespace llvm;
+using namespace builddfg;
+	
+namespace dfg2llvm {
+	
+  class DFGTreeTraversal : public DFNodeVisitor {
+
+  protected:
+    // Member variables
+    Module &M;
+    BuildDFG &DFG;
+
+    // Subclasses implement per-node processing for internal and leaf nodes.
+    virtual void process(DFInternalNode* N) = 0;
+    virtual void process(DFLeafNode* N) = 0;
+
+    virtual ~DFGTreeTraversal() {}
+
+  public:
+    // Constructor
+    DFGTreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
+
+    void visit(DFInternalNode* N) {
+      // May visit a node more than once; nodes are not marked as visited.
+      errs() << "Start: In Node (I) - " << N->getFuncPointer()->getName() << "\n";
+
+      // Follows a bottom-up approach: visit all children first.
+      for (DFGraph::children_iterator i = N->getChildGraph()->begin(),
+           e = N->getChildGraph()->end(); i != e; ++i) {
+        DFNode* child = *i;
+        child->applyDFNodeVisitor(*this);
+      }
+
+      // Process this internal node now.
+      process(N);
+      errs() << "DONE: In Node (I) - " << N->getFuncPointer()->getName() << "\n";
+    }
+
+    void visit(DFLeafNode* N) {
+      errs() << "Start: In Node (L) - " << N->getFuncPointer()->getName() << "\n";
+      process(N);
+      errs() << "DONE: In Node (L) - " << N->getFuncPointer()->getName() << "\n";
+    }
+  };
+	
+} // end namespace dfg2llvm
+	
+#endif
diff --git a/include/SupportVISC/VISCHint.h b/include/SupportVISC/VISCHint.h
new file mode 100644
index 0000000000..5324c0fabd
--- /dev/null
+++ b/include/SupportVISC/VISCHint.h
@@ -0,0 +1,35 @@
+//===------------- VISCHint.h - Header file for "VISC Hint API" -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef VISC_HINT_HEADER
+#define VISC_HINT_HEADER
+
+/************************** Hint Routines ***************************/
+#ifdef __cplusplus
+namespace visc {
+#endif
+
+  // Device-target hint attached to a dataflow node function. Single-device
+  // values may be combined into the CPU_OR_* tags (see VISCUtils helpers).
+  enum Target {
+    None,                 // no target hint recorded
+    CPU_TARGET,
+    GPU_TARGET,
+    SPIR_TARGET,
+    CUDNN_TARGET,
+    PROMISE_TARGET,
+    CPU_OR_GPU_TARGET,    // node may run on either CPU or GPU
+    CPU_OR_SPIR_TARGET,   // node may run on either CPU or SPIR device
+//    ALL_TARGETS,
+    NUM_TARGETS           // sentinel: count of targets, keep last
+  };
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //VISC_HINT_HEADER
diff --git a/include/SupportVISC/VISCTimer.h b/include/SupportVISC/VISCTimer.h
new file mode 100644
index 0000000000..4dbadbd34f
--- /dev/null
+++ b/include/SupportVISC/VISCTimer.h
@@ -0,0 +1,159 @@
+//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef VISC_TIMER_HEADER
+#define VISC_TIMER_HEADER
+
+/************************** Timer Routines ***************************/
+extern "C" {
+
+/* A time or duration. */
+//#if _POSIX_VERSION >= 200112L
+typedef unsigned long long visc_Timestamp; /* time in microseconds */
+//#else
+//# error "Timestamps not implemented"
+//#endif
+
+/* Stopwatch states: a timer accumulates elapsed time only while RUNNING. */
+enum visc_TimerState {
+  visc_Timer_STOPPED,
+  visc_Timer_RUNNING,
+};
+
+/* A single stopwatch-style timer. */
+struct visc_Timer {
+  enum visc_TimerState state;
+  visc_Timestamp elapsed;       /* Amount of time elapsed so far */
+  visc_Timestamp init;          /* Beginning of the current time interval,
+                                 * if state is RUNNING.  End of the last
+                                 * recorded time interval otherwise.  */
+};
+
+/* Reset a timer.
+ * Use this to initialize a timer or to clear
+ * its elapsed time.  The reset timer is stopped.
+ */
+void
+visc_ResetTimer(struct visc_Timer *timer);
+
+/* Start a timer.  The timer is set to RUNNING mode and
+ * time elapsed while the timer is running is added to
+ * the timer.
+ * The timer should not already be running.
+ */
+void
+visc_StartTimer(struct visc_Timer *timer);
+
+/* Stop a timer.
+ * This stops adding elapsed time to the timer.
+ * The timer should not already be stopped.
+ */
+void
+visc_StopTimer(struct visc_Timer *timer);
+
+/* Get the elapsed time in seconds. */
+double
+visc_GetElapsedTime(struct visc_Timer *timer);
+
+/* Execution time is assigned to one of these categories. */
+enum visc_TimerID {
+  visc_TimerID_NONE = 0,
+  visc_TimerID_IO,              /* Time spent in input/output */
+  visc_TimerID_KERNEL,          /* Time spent computing on the device,
+                                 * recorded asynchronously */
+  visc_TimerID_COPY,            /* Time spent synchronously moving data
+                                 * to/from device and allocating/freeing
+                                 * memory on the device */
+  visc_TimerID_DRIVER,          /* Time spent in the host interacting with the
+                                 * driver, primarily for recording the time
+                                 * spent queueing asynchronous operations */
+  visc_TimerID_COPY_ASYNC,      /* Time spent in asynchronous transfers */
+  visc_TimerID_COMPUTE,         /* Time for all program execution other
+                                 * than parsing command line arguments,
+                                 * I/O, kernel, and copy */
+  visc_TimerID_OVERLAP,         /* Time double-counted in asynchronous and
+                                 * host activity: automatically filled in,
+                                 * not intended for direct usage */
+  // GPU FUNCTION
+  visc_TimerID_INIT_CTX,
+  visc_TimerID_CLEAR_CTX,
+  visc_TimerID_COPY_SCALAR,
+  visc_TimerID_COPY_PTR,
+  visc_TimerID_MEM_FREE,
+  visc_TimerID_READ_OUTPUT,
+  visc_TimerID_SETUP,
+  visc_TimerID_MEM_TRACK,
+  visc_TimerID_MEM_UNTRACK,
+  visc_TimerID_MISC,
+  // LAUNCH FUNCTION
+  visc_TimerID_PTHREAD_CREATE,
+  visc_TimerID_ARG_PACK,
+  visc_TimerID_ARG_UNPACK,
+  visc_TimerID_COMPUTATION,
+  visc_TimerID_OUTPUT_PACK,
+  visc_TimerID_OUTPUT_UNPACK,
+
+  visc_TimerID_LAST             /* Number of timer IDs */
+};
+
+/* Dynamic list of asynchronously tracked times between events */
+struct visc_async_time_marker_list {
+  char *label; // actually just a pointer to a string
+  enum visc_TimerID timerID;    /* The ID to which the interval beginning
+                                 * with this marker should be attributed */
+  void * marker;
+  //cudaEvent_t marker;         /* The driver event for this marker */
+  struct visc_async_time_marker_list *next;
+};
+
+/* A named timer nested under a top-level timer category. */
+struct visc_SubTimer {
+  char *label;
+  struct visc_Timer timer;
+  struct visc_SubTimer *next;
+};
+
+/* Linked list of sub-timers plus the one currently selected. */
+struct visc_SubTimerList {
+  struct visc_SubTimer *current;
+  struct visc_SubTimer *subtimer_list;
+};
+
+/* A set of timers for recording execution times. */
+struct visc_TimerSet {
+  enum visc_TimerID current;    /* category currently accumulating time */
+  struct visc_async_time_marker_list* async_markers;
+  visc_Timestamp async_begin;   /* NOTE(review): presumably the start of the
+                                 * async-tracked interval — confirm in runtime */
+  visc_Timestamp wall_begin;    /* NOTE(review): presumably wall-clock time at
+                                 * initialization — confirm in runtime */
+  struct visc_Timer timers[visc_TimerID_LAST];
+  struct visc_SubTimerList *sub_timer_list[visc_TimerID_LAST];
+};
+
+/* Reset all timers in the set. */
+void
+visc_InitializeTimerSet(struct visc_TimerSet *timers);
+
+/* Register a sub-timer named `label` under the given top-level category. */
+void
+visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category);
+
+/* Select which timer the next interval of time should be accounted
+ * to. The selected timer is started and other timers are stopped.
+ * Using visc_TimerID_NONE stops all timers. */
+/* NOTE(review): declared `inline` without a definition in this header; if the
+ * definition lives in a single translation unit this can fail to link under
+ * C++ inline rules — verify against the runtime implementation. */
+inline void
+visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer);
+
+/* Like visc_SwitchToTimer, but accounts time to a labelled sub-timer of the
+ * given category. */
+void
+visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category);
+
+/* Print timer values to standard output. */
+void
+visc_PrintTimerSet(struct visc_TimerSet *timers);
+
+/* Release timer resources */
+void
+visc_DestroyTimerSet(struct visc_TimerSet * timers);
+
+}
+#endif //VISC_TIMER_HEADER
diff --git a/include/SupportVISC/VISCUtils.h b/include/SupportVISC/VISCUtils.h
new file mode 100644
index 0000000000..a20ce8bccd
--- /dev/null
+++ b/include/SupportVISC/VISCUtils.h
@@ -0,0 +1,601 @@
+//
+//===---- VISCUtils.h - Header file for "VISC Utility Functions" ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef VISC_UTILS_HEADER
+#define VISC_UTILS_HEADER
+
+#include <assert.h>
+ 
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+namespace viscUtils {
+// Helper Functions
+
+// True iff I is one of the llvm.visc.createNode* intrinsics.
+static bool isViscCreateNodeIntrinsic(Instruction* I) {
+  IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+  if (!II)
+    return false;
+  StringRef Name = II->getCalledFunction()->getName();
+  return Name.startswith("llvm.visc.createNode");
+}
+
+// True iff I is a call to one of the __visc__createNode* wrapper functions
+// (callee compared after stripping pointer casts).
+static bool isViscCreateNodeCall(Instruction* I) {
+  CallInst* CI = dyn_cast<CallInst>(I);
+  if (!CI)
+    return false;
+  StringRef CalleeName = CI->getCalledValue()->stripPointerCasts()->getName();
+  return CalleeName.startswith("__visc__createNode");
+}
+
+// True iff I is a call to the __visc__launch wrapper function
+// (callee compared after stripping pointer casts).
+static bool isViscLaunchCall(Instruction* I) {
+  CallInst* CI = dyn_cast<CallInst>(I);
+  if (!CI)
+    return false;
+  StringRef CalleeName = CI->getCalledValue()->stripPointerCasts()->getName();
+  return CalleeName.startswith("__visc__launch");
+}
+// Creates a new createNode intrinsic, similar to II but with different
+// associated function F instead.
+// Marked static: this function is defined in a header included from several
+// passes, so external linkage would cause multiple-definition link errors.
+static IntrinsicInst* createIdenticalCreateNodeIntrinsicWithDifferentFunction(
+    Function* F, IntrinsicInst* II) {
+  Module* M = F->getParent();
+
+  // Find which createNode intrinsic we need to create
+  Function* CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID());
+  Constant* Fp = ConstantExpr::getPointerCast(F,
+                                          Type::getInt8PtrTy(II->getContext()));
+
+  // Build the argument list in storage that outlives the switch.
+  // (The original code constructed ArrayRefs over case-local arrays, which
+  // dangled once each case's scope ended — undefined behavior when the
+  // ArrayRef was later read by CallInst::Create.)
+  SmallVector<Value*, 4> CreateNodeArgs;
+  CreateNodeArgs.push_back(Fp);
+
+  // Number of dimension arguments to copy over from the original intrinsic.
+  unsigned NumDims;
+  switch (II->getIntrinsicID()) {
+    case Intrinsic::visc_createNode:   NumDims = 0; break;
+    case Intrinsic::visc_createNode1D: NumDims = 1; break;
+    case Intrinsic::visc_createNode2D: NumDims = 2; break;
+    case Intrinsic::visc_createNode3D: NumDims = 3; break;
+    default:
+      llvm_unreachable("Unknown createNode intrinsic");
+  }
+  for (unsigned i = 1; i <= NumDims; i++)
+    CreateNodeArgs.push_back(II->getArgOperand(i));
+
+  CallInst* CI = CallInst::Create(CreateNodeF,
+                                  CreateNodeArgs,
+                                  F->getName()+".node");
+  return cast<IntrinsicInst>(CI);
+}
+/*
+CallInst* createIdenticalCreateNodeCallWithDifferentFunction(Function* F,
+                                                             CallInst* CI) {
+
+  // Find which createNode function call we need to create
+  Function* CreateNodeF = CI->getCalledValue();
+
+  ArrayRef<Value*> CreateNodeArgs;
+  if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode")) {
+    // This is a createNode call
+    CreateNodeArgs = ArrayRef<Value*>(CreateNodeF);
+  } else if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode1D")) {
+    // This is a createNode1D call
+    Value* CreateNode1DArgs[] = {CreateNodeF, CI->getArgOperand(1)};
+    CreateNodeArgs = ArrayRef<Value*>(CreateNode1DArgs, 2);
+  } else if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode2D")) {
+    // This is a createNode2D call
+    Value* CreateNode2DArgs[] = {CreateNodeF,
+                                 CI->getArgOperand(1),
+                                 CI->getArgOperand(2)};
+    CreateNodeArgs = ArrayRef<Value*>(CreateNode2DArgs, 3);
+  } else if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode3D")) {
+    // This is a createNode3D call
+    Value* CreateNode3DArgs[] = {CreateNodeF,
+                                 CI->getArgOperand(1),
+                                 CI->getArgOperand(2),
+                                 CI->getArgOperand(3)};
+    CreateNodeArgs = ArrayRef<Value*>(CreateNode3DArgs, 4);
+  } else {
+    assert(false && "Unknown createNode call");
+  }
+
+  CallInst* newCI = CallInst::Create(CreateNodeF,
+                                     CreateNodeArgs,
+                                     F->getName()+".cncall");
+  return newCI;
+}
+*/
+
+// Fix VISC hints for this function: every hint metadata entry that refers to
+// F is redirected to refer to G instead.
+// Marked static: defined in a header shared by several passes.
+static void fixHintMetadata(Module &M, Function* F, Function* G) {
+  Metadata* MD_F = ValueAsMetadata::getIfExists(F);
+  MDTuple* MDT_F = MDTuple::getIfExists(F->getContext(), ArrayRef<Metadata*>(MD_F));
+  DEBUG(errs() << "Associated Metadata: " << *MDT_F << "\n");
+  MDTuple* MDT_G = MDNode::get(F->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(G)));
+  DEBUG(errs() << "New Metadata: " << *MDT_G << "\n");
+
+  // Every hint kind that may mention a node function. (The original repeated
+  // the same search-and-replace loop once per name.)
+  const char* HintNames[] = {
+    "visc_hint_gpu",     "visc_hint_spir",    "visc_hint_cudnn",
+    "visc_hint_promise", "visc_hint_cpu",     "visc_hint_cpu_gpu",
+    "visc_hint_cpu_spir"
+  };
+  for (const char* Name : HintNames) {
+    NamedMDNode* HintNode = M.getOrInsertNamedMetadata(Name);
+    for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+      if (HintNode->getOperand(i) == MDT_F)
+        HintNode->setOperand(i, MDT_G);
+    }
+  }
+}
+
+// Assuming that the changed function is a node function, it is only used as a
+// first operand of createNode*. It is enough to iterate through all createNode*
+// calls in the program.
+// Replaces every reference to node function F (in createNode* intrinsics,
+// __visc__createNode* calls, and __visc__launch calls) with G, fixes hint
+// metadata, and finally erases F from the module.
+void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) {
+
+  for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) {
+    Function* f = &*mi;
+    DEBUG(errs() << "Function: " << f->getName() << "\n");
+
+    // Replaced intrinsics are collected and erased only after the instruction
+    // walk, so the inst_iterator is never invalidated mid-loop.
+    std::vector<Instruction*> toBeErased;
+
+    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) {
+      Instruction* I = &*i; // Grab pointer to Instruction
+
+      if (isViscCreateNodeIntrinsic(I)) {
+        IntrinsicInst* II = cast<IntrinsicInst>(I);
+        // The found createNode is not associated with the changed function
+        if (II->getArgOperand(0) != F)
+          continue; // skip it
+
+        // Otherwise, create a new createNode similar to the other one,
+        // but with the changed function as first operand.
+        IntrinsicInst* CreateNodeII =
+          createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II);
+        II->replaceAllUsesWith(CreateNodeII);
+        toBeErased.push_back(II);
+      } else if (isViscCreateNodeCall(I)) {
+        CallInst* CI = cast<CallInst>(I);
+        // The found createNode is not associated with the changed function.
+        // NOTE(review): the wrapper call checks operand 1 while the intrinsic
+        // checks operand 0 — presumably __visc__createNode takes an extra
+        // leading argument; confirm against its declaration.
+        if (CI->getArgOperand(1) != F)
+          continue; // skip it
+
+        DEBUG(errs() << "Fixing use: " << *CI << "\n");
+        DEBUG(errs() << "in function: " << f->getName() << "\n");
+        // Replace use of F with use of G
+        CI->setArgOperand(1, G);
+        DEBUG(errs() << "Fixed use: " << *CI << "\n");
+      } else if(isViscLaunchCall(I)) {
+        CallInst* CI = cast<CallInst>(I);
+        // The found launch call is not associated with the changed function
+        // (the function pointer may be bitcast, hence stripPointerCasts).
+        if (CI->getArgOperand(1)->stripPointerCasts() != F)
+          continue;
+
+        // Otherwise, replace F with G
+        errs() << *G->getType() << "\n";
+        errs() << *CI->getArgOperand(1)->getType() << "\n";
+        CI->setArgOperand(1, G);
+      }
+
+    }
+
+    // Safe to erase now that iteration over f's instructions is finished.
+    for(auto I: toBeErased) {
+      DEBUG(errs() << "\tErasing " << *I << "\n");
+      I->eraseFromParent();
+    }
+  }
+
+  // Check if the function is used by a metadata node
+  if(F->isUsedByMetadata()) {
+    fixHintMetadata(M, F, G);
+  }
+  DEBUG(errs() << "DONE: Replacing function " << F->getName() << " with " << G->getName() << "\n");
+
+  // Remove replaced function from the module
+  //assert(F->user_empty() && "Still some uses of older function left\n");
+  // Any remaining uses are redirected to undef before erasing F.
+  F->replaceAllUsesWith(UndefValue::get(F->getType()));
+  F->eraseFromParent();
+
+}
+
+
+// Create new function F' as a copy of old function F with a new signature.
+// The following two most used cases are handled by this function.
+// 1. When some extra arguments need to be added to this function
+//    - Here we can map the old function arguments to
+//      new ones
+// 2. When each pointer argument needs an additional size argument
+//    - Here, in the absence of VMap, we map the arguments in order, skipping
+//      over extra pointer arguments.
+// The function returns the list of return instructions to the caller to fix in
+// case the return type is also changed.
+Function* cloneFunction(Function* F, FunctionType* newFT, bool
+    isAddingPtrSizeArg, SmallVectorImpl<ReturnInst*>* Returns = NULL) {
+
+  DEBUG(errs() << "Cloning Function: " << F->getName() << "\n");
+  DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n");
+  DEBUG(errs() << "New Function Type: " << *newFT << "\n");
+
+  assert(F->getFunctionType()->getNumParams() <= newFT->getNumParams()
+      && "This function assumes that the new function has more arguments than the old function!");
+
+  // Create Function of specified type
+  Function* newF = Function::Create(newFT, F->getLinkage(), F->getName()+"_cloned", F->getParent());
+  DEBUG(errs() << "Old Function name: " << F->getName() << "\n");
+  DEBUG(errs() << "New Function name: " << newF->getName() << "\n");
+  ValueToValueMapTy VMap;
+  DEBUG(errs() << "No value map provided. Creating default value map\n");
+  if(isAddingPtrSizeArg) {
+    DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in new function\n");
+    Function::arg_iterator new_ai = newF->arg_begin();
+    for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+        ai != ae; ++ai) {
+      DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
+      assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
+      VMap[&*ai] = &*new_ai;
+      new_ai->takeName(&*ai);
+      if(ai->getType()->isPointerTy()) {
+        std::string oldName = new_ai->getName();
+        // If the current argument is pointer type, the next argument in new
+        // function would be an i64 type containing the data size of this
+        // argument. Hence, skip the next argument in new function.
+        ++new_ai;
+        new_ai->setName("bytes_"+oldName);
+      }
+      ++new_ai;
+    }
+  }
+  else {
+    DEBUG(errs() << "Case 2: Extra arguments are added at the end of old function\n");
+    Function::arg_iterator new_ai = newF->arg_begin();
+    for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+        ai != ae; ++ai, ++new_ai) {
+      DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
+      assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
+      VMap[&*ai] = &*new_ai;
+      new_ai->takeName(&*ai);
+    }
+  }
+
+  // Clone function. If the caller did not supply a Returns vector, collect
+  // the return instructions into a stack-local one instead. (The original
+  // code heap-allocated a SmallVector here and never freed it — memory leak.)
+  SmallVector<ReturnInst*, 8> LocalReturns;
+  if (Returns == NULL)
+    Returns = &LocalReturns;
+  CloneFunctionInto(newF, F, VMap, false, *Returns);
+
+  return newF;
+}
+
+ //------------------- Helper Functions For Handling Hints -------------------//
+  
+// Return true if 1st arg (tag) contains 2nd (target).
+// A single-target tag contains only itself; the combined CPU_OR_* tags
+// contain each constituent target and the combined tag itself.
+// (The original default case was `assert(false)` with no return, which falls
+// off the end of a value-returning function under NDEBUG — undefined
+// behavior. llvm_unreachable covers both build modes.)
+bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
+  switch (Tag) {
+    case visc::None:
+      return false;
+    case visc::CPU_TARGET:
+      return T == visc::CPU_TARGET;
+    case visc::GPU_TARGET:
+      return T == visc::GPU_TARGET;
+    case visc::SPIR_TARGET:
+      return T == visc::SPIR_TARGET;
+    case visc::CUDNN_TARGET:
+      return T == visc::CUDNN_TARGET;
+    case visc::PROMISE_TARGET:
+      return T == visc::PROMISE_TARGET;
+    case visc::CPU_OR_GPU_TARGET:
+      return (T == visc::CPU_TARGET) ||
+             (T == visc::GPU_TARGET) ||
+             (T == visc::CPU_OR_GPU_TARGET);
+    case visc::CPU_OR_SPIR_TARGET:
+      return (T == visc::CPU_TARGET) ||
+             (T == visc::SPIR_TARGET) ||
+             (T == visc::CPU_OR_SPIR_TARGET);
+    default:
+      llvm_unreachable("Unknown Target");
+  }
+}
+
+// A tag is "single-target" when it names exactly one device target
+// (i.e. it is not None and not a combined CPU_OR_* tag).
+bool isSingleTargetTag(visc::Target T) {
+  switch (T) {
+    case visc::CPU_TARGET:
+    case visc::GPU_TARGET:
+    case visc::SPIR_TARGET:
+    case visc::CUDNN_TARGET:
+    case visc::PROMISE_TARGET:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Add the specified target to the given tag and return the combined tag.
+// T must be a single target (CPU, GPU, SPIR, CUDNN, PROMISE).
+// (The original had a stray `break;` before each if-ladder, making the
+// combination logic unreachable and letting every non-None case fall off
+// the end of the function without a return value — undefined behavior.)
+visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) {
+  assert(((T == visc::CPU_TARGET)    ||
+          (T == visc::GPU_TARGET)    ||
+          (T == visc::SPIR_TARGET)   ||
+          (T == visc::CUDNN_TARGET)  ||
+          (T == visc::PROMISE_TARGET)) &&
+         "The target is only allowed to be a single target: CPU, GPU, SPIR, CUDNN, PROMISE\n");
+
+  switch (Tag) {
+    case visc::None:
+      return T;
+    case visc::CPU_TARGET:
+      // CPU combines with GPU or SPIR; CUDNN/PROMISE cannot be combined.
+      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
+             "Unsupported target combination\n");
+      if (T == visc::GPU_TARGET)
+        return visc::CPU_OR_GPU_TARGET;
+      if (T == visc::SPIR_TARGET)
+        return visc::CPU_OR_SPIR_TARGET;
+      return visc::CPU_TARGET;
+    case visc::GPU_TARGET:
+      assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n");
+      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
+             "Unsupported target combination\n");
+      if (T == visc::CPU_TARGET)
+        return visc::CPU_OR_GPU_TARGET;
+      return visc::GPU_TARGET;
+    case visc::SPIR_TARGET:
+      assert((T != visc::GPU_TARGET) && "Unsupported target combination\n");
+      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
+             "Unsupported target combination\n");
+      if (T == visc::CPU_TARGET)
+        return visc::CPU_OR_SPIR_TARGET;
+      return visc::SPIR_TARGET;
+    case visc::CPU_OR_GPU_TARGET:
+      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
+             "Unsupported target combination\n");
+      assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n");
+      return visc::CPU_OR_GPU_TARGET;
+    case visc::CPU_OR_SPIR_TARGET:
+      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
+             "Unsupported target combination\n");
+      assert((T != visc::GPU_TARGET) && "Unsupported target combination\n");
+      return visc::CPU_OR_SPIR_TARGET;
+    default:
+      llvm_unreachable("Unknown Target");
+  }
+}
+
+// This function adds the hint as metadata in visc code: the function F is
+// recorded under the named-metadata node that corresponds to target T.
+void addHint(Function* F, visc::Target T) {
+  // Get Module
+  Module* M = F->getParent();
+  DEBUG(errs() << "Set preferred target for " << F->getName() << ": ");
+
+  // Map the target hint onto the name of its named-metadata node.
+  const char* HintName;
+  switch (T) {
+    case visc::GPU_TARGET:
+      DEBUG(errs() << "GPU Target\n");
+      HintName = "visc_hint_gpu";
+      break;
+    case visc::SPIR_TARGET:
+      DEBUG(errs() << "SPIR Target\n");
+      HintName = "visc_hint_spir";
+      break;
+    case visc::CUDNN_TARGET:
+      DEBUG(errs() << "CUDNN Target\n");
+      HintName = "visc_hint_cudnn";
+      break;
+    case visc::PROMISE_TARGET:
+      DEBUG(errs() << "PROMISE Target\n");
+      HintName = "visc_hint_promise";
+      break;
+    case visc::CPU_TARGET:
+      DEBUG(errs() << "CPU Target\n");
+      HintName = "visc_hint_cpu";
+      break;
+    case visc::CPU_OR_GPU_TARGET:
+      DEBUG(errs() << "CPU or GPU Target\n");
+      HintName = "visc_hint_cpu_gpu";
+      break;
+    case visc::CPU_OR_SPIR_TARGET:
+      DEBUG(errs() << "CPU or SPIR Target\n");
+      HintName = "visc_hint_cpu_spir";
+      break;
+    default:
+      llvm_unreachable("Unsupported Target Hint!");
+      break;
+  }
+
+  // Create a node naming the function and append it to the hint list.
+  NamedMDNode* HintNode = M->getOrInsertNamedMetadata(HintName);
+  MDTuple* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F)));
+  HintNode->addOperand(N);
+}
+
+// This function removes the hint as metadata in visc code: every entry
+// naming F is dropped from the named-metadata node for target T.
+void removeHint(Function* F, visc::Target T) {
+  // Get Module
+  Module* M = F->getParent();
+  DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T << "\n");
+
+  // Map the target hint onto the name of its named-metadata node.
+  const char* HintName;
+  switch (T) {
+    case visc::GPU_TARGET:
+      HintName = "visc_hint_gpu";
+      break;
+    case visc::SPIR_TARGET:
+      HintName = "visc_hint_spir";
+      break;
+    case visc::CUDNN_TARGET:
+      HintName = "visc_hint_cudnn";
+      break;
+    case visc::PROMISE_TARGET:
+      HintName = "visc_hint_promise";
+      break;
+    case visc::CPU_OR_GPU_TARGET:
+      HintName = "visc_hint_cpu_gpu";
+      break;
+    case visc::CPU_OR_SPIR_TARGET:
+      HintName = "visc_hint_cpu_spir";
+      break;
+    case visc::CPU_TARGET:
+      HintName = "visc_hint_cpu";
+      break;
+    default:
+      llvm_unreachable("Unsupported Target Hint!");
+      break;
+  }
+  NamedMDNode* HintNode = M->getOrInsertNamedMetadata(HintName);
+
+  // Rebuild the operand list without the node naming F: gather the keepers,
+  // drop every operand, then re-add the keepers in their original order.
+  MDNode* N = MDNode::get(M->getContext(),
+                          ArrayRef<Metadata*>(ValueAsMetadata::get(F)));
+  std::vector<MDNode*> Keep;
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MDN = HintNode->getOperand(i);
+    if (MDN != N)
+      Keep.push_back(MDN);
+  }
+
+  HintNode->dropAllReferences();
+  for (MDNode* MDN : Keep)
+    HintNode->addOperand(MDN);
+}
+
+// Look up the preferred target recorded for F in the module's hint metadata.
+// The hint kinds are scanned in a fixed priority order; if no hint names F,
+// CPU is the default. (visc_hint_cpu itself is intentionally not scanned,
+// matching the original behavior: absence of any other hint means CPU.)
+visc::Target getPreferredTarget(Function* F) {
+  DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
+  Module* M = F->getParent();
+
+  struct HintEntry { const char* Name; visc::Target T; };
+  const HintEntry Hints[] = {
+    { "visc_hint_gpu",      visc::GPU_TARGET         },
+    { "visc_hint_spir",     visc::SPIR_TARGET        },
+    { "visc_hint_cudnn",    visc::CUDNN_TARGET       },
+    { "visc_hint_promise",  visc::PROMISE_TARGET     },
+    { "visc_hint_cpu_gpu",  visc::CPU_OR_GPU_TARGET  },
+    { "visc_hint_cpu_spir", visc::CPU_OR_SPIR_TARGET },
+  };
+
+  for (const HintEntry &H : Hints) {
+    NamedMDNode* HintNode = M->getOrInsertNamedMetadata(H.Name);
+    for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+      MDNode* N = HintNode->getOperand(i);
+      Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
+      if (F == FHint)
+        return H.T;
+    }
+  }
+
+  return visc::CPU_TARGET;
+}
+
+
+} // End of namespace
+
+#endif //VISC_UTILS_HEADER
diff --git a/lib/BuildDFG/BuildDFG.cpp b/lib/BuildDFG/BuildDFG.cpp
new file mode 100644
index 0000000000..04b01e332b
--- /dev/null
+++ b/lib/BuildDFG/BuildDFG.cpp
@@ -0,0 +1,395 @@
+//=== BuildDFG.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "buildDFG"
+#include "llvm/BuildDFG/BuildDFG.h"
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/SupportVISC/VISCUtils.h"
+
+using namespace llvm;
+
+namespace builddfg {
+
+// Pass entry point: scan every function in the module for llvm.visc.launch
+// intrinsics. Each launch site names a root dataflow-graph function; a root
+// DFInternalNode is created for it and its child graph is built recursively.
+bool BuildDFG::runOnModule(Module &M) {
+  errs() << "\nBUILDDFG PASS\n";
+  DEBUG(errs() << "-------- Searching for launch sites ----------\n");
+
+  IntrinsicInst* II;
+
+  // Iterate over all functions in the module
+  for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) {
+    Function* f = &*mi;
+    DEBUG(errs() << "Function: " << f->getName() << "\n");
+
+    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) {
+      Instruction* I = &*i; // Grab pointer to Instruction
+      if (isViscLaunchIntrinsic(I)) {
+        DEBUG(errs() << "------------ Found launch site --------------\n");
+        II = cast<IntrinsicInst>(I);
+
+        assert(II && "Launch intrinsic not recognized.");
+
+        // Intrinsic Instruction has been initialized from this point on.
+        // Operand 0 of the launch intrinsic is the root graph function
+        // (possibly bitcast, hence stripPointerCasts).
+        Function* F = cast<Function>(II->getOperand(0)->stripPointerCasts());
+        Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F));
+        Roots.push_back(Root);
+        BuildGraph(Root, F);
+
+        // Debug dump of the child list before and after sorting.
+        for(DFGraph::children_iterator i = Root->getChildGraph()->begin(),
+            e = Root->getChildGraph()->end(); i!=e; i++) {
+          DFNode* N = *i;
+          DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n");
+        }
+        Root->getChildGraph()->sortChildren();
+        for(DFGraph::children_iterator i = Root->getChildGraph()->begin(),
+            e = Root->getChildGraph()->end(); i!=e; i++) {
+          DFNode* N = *i;
+          DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n");
+        }
+        viewDFGraph(Root->getChildGraph());
+
+      }
+    }
+  }
+
+  // Checking that we found at least one launch site
+  assert((Roots.size() != 0) && "Launch site not found.");
+
+  // false => the module IR was not modified (BuildDFG only builds an
+  // in-memory graph; it is analysis-like).
+  return false;
+}
+
+// Returns the most recently built root node (the last launch site processed).
+DFInternalNode *BuildDFG::getRoot() const {
+  return Root;
+}
+
+// Returns all root nodes, one per llvm.visc.launch site found in the module.
+std::vector<DFInternalNode*> &BuildDFG::getRoots() {
+  return Roots;
+}
+
+//TODO: Maybe make this const
+// Map from createNode intrinsic call (the graph handle value) to its DFNode.
+BuildDFG::HandleToDFNode &BuildDFG::getHandleToDFNodeMap() {
+  return HandleToDFNodeMap;
+}
+
+//TODO: Maybe make this const
+// Map from createEdge/bind intrinsic call (the edge handle value) to its DFEdge.
+BuildDFG::HandleToDFEdge &BuildDFG::getHandleToDFEdgeMap() {
+  return HandleToDFEdgeMap;
+}
+
+// Registers the DFNode created for handle V. A given handle may only be
+// registered once; a duplicate registration indicates a builder bug.
+void BuildDFG::addElementToHandleToDFNodeMap(Value* V, DFNode* N) {
+  assert((HandleToDFNodeMap.find(V) == HandleToDFNodeMap.end()) &&
+         "Attempted to insert duplicate key in HandleToDFNodeMap");
+  HandleToDFNodeMap.insert(std::make_pair(V, N));
+}
+
+//TODO: check if the removed element was not there
+// Drops the mapping for handle V; currently a no-op if V was never registered.
+void BuildDFG::removeElementFromHandleToDFNodeMap(Value* V) {
+  HandleToDFNodeMap.erase(V);
+}
+
+// Registers the DFEdge created for handle V. A given handle may only be
+// registered once; a duplicate registration indicates a builder bug.
+void BuildDFG::addElementToHandleToDFEdgeMap(Value* V, DFEdge* E) {
+  assert((HandleToDFEdgeMap.find(V) == HandleToDFEdgeMap.end()) &&
+         "Attempted to insert duplicate key in HandleToDFEdgeMap");
+  HandleToDFEdgeMap.insert(std::make_pair(V, E));
+}
+
+//TODO: check if the removed element was not there
+// Drops the mapping for handle V; currently a no-op if V was never registered.
+void BuildDFG::removeElementFromHandleToDFEdgeMap(Value* V) {
+  HandleToDFEdgeMap.erase(V);
+}
+
+// Returns true iff I is a call to the llvm.visc.launch intrinsic.
+bool BuildDFG::isViscLaunchIntrinsic(Instruction* I) {
+  IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+  if (!II)
+    return false;
+  return II->getCalledFunction()->getName().equals("llvm.visc.launch");
+}
+
+// Returns true iff I is a graph-construction intrinsic:
+// llvm.visc.createNode*/createEdge or llvm.visc.bind.*.
+bool BuildDFG::isViscGraphIntrinsic(Instruction* I) {
+  IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+  if (!II)
+    return false;
+  StringRef Name = II->getCalledFunction()->getName();
+  return Name.startswith("llvm.visc.create") || Name.startswith("llvm.visc.bind");
+}
+
+// Returns true if instruction I is a visc query intrinsic (llvm.visc.get*),
+// false otherwise
+bool BuildDFG::isViscQueryIntrinsic(Instruction* I) {
+  if(!isa<IntrinsicInst>(I))
+    return false;
+  IntrinsicInst* II = cast<IntrinsicInst>(I);
+  return (II->getCalledFunction()->getName()).startswith("llvm.visc.get");
+}
+
+// Returns true if instruction I is any visc intrinsic (name starts with
+// "llvm.visc"), false otherwise
+bool BuildDFG::isViscIntrinsic(Instruction* I) {
+  if(!isa<IntrinsicInst>(I))
+    return false;
+  IntrinsicInst* II = cast<IntrinsicInst>(I);
+  return (II->getCalledFunction()->getName()).startswith("llvm.visc");
+}
+
+// Two types are "congruent" if they are identical, or if both are pointer
+// types in the same address space (the pointee types may differ).
+bool BuildDFG::isTypeCongruent(Type* L, Type* R) {
+  if (L == R)
+    return true;
+  if (PointerType *PL = dyn_cast<PointerType>(L))
+    if (PointerType *PR = dyn_cast<PointerType>(R))
+      return PL->getAddressSpace() == PR->getAddressSpace();
+  return false;
+}
+
+// Handles all the createNodeXX visc intrinsics.
+// Creates an internal or leaf DFNode for the function referenced by the
+// intrinsic, records it in HandleToDFNodeMap, and (for internal nodes)
+// recursively builds the child dataflow graph.
+void BuildDFG::handleCreateNode(DFInternalNode* N, IntrinsicInst* II) {
+  bool isInternalNode = false;
+
+  Function* F = cast<Function>((II->getOperand(0))->stripPointerCasts());
+
+  // A node function that contains any graph-construction intrinsic is an
+  // internal node. Stop scanning as soon as one is found instead of
+  // walking the entire function body.
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    if (isViscGraphIntrinsic(&*i)) {
+      isInternalNode = true;
+      break;
+    }
+  }
+
+  // Number of Dimensions would be equal to the (number of operands - 1) as
+  // the first operand is the pointer to associated Function and the
+  // remaining operands are the limits in each dimension.
+  unsigned numOfDim = II->getCalledFunction()->getFunctionType()->getNumParams()-1;
+  assert(numOfDim <= 3
+         && "Invalid number of dimensions for createNode intrinsic!");
+  std::vector<Value*> dimLimits;
+  for (unsigned i = 1; i <= numOfDim; i++) {
+    // The operands of II are same as the operands of the called
+    // intrinsic. It has one extra operand at the end, which is the intrinsic
+    // being called.
+    dimLimits.push_back(cast<Value> (II->getOperand(i)));
+  }
+
+  if(isInternalNode) {
+    // Create Internal DFNode, add it to the map and recursively build its
+    // dataflow graph
+    DFInternalNode* childDFNode = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+    N->addChildToDFGraph(childDFNode);
+    HandleToDFNodeMap[II] = childDFNode;
+    BuildGraph(childDFNode, F);
+  }
+  else {
+    // Create Leaf DFnode and add it to the map.
+    DFLeafNode* childDFNode = DFLeafNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+    N->addChildToDFGraph(childDFNode);
+    HandleToDFNodeMap[II] = childDFNode;
+  }
+}
+
+// Handles the llvm.visc.createEdge intrinsic: connects an output of one
+// child node of N to an input of another child node of N.
+// Operand layout: 0 = source node handle, 1 = destination node handle,
+// 2 = edge-type flag, 3 = source output position, 4 = destination input
+// position, 5 = streaming flag.
+void BuildDFG::handleCreateEdge (DFInternalNode* N, IntrinsicInst* II) {
+  // The DFNode structures must be in the map before the edge is processed
+  HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0));
+  assert(DFI != HandleToDFNodeMap.end());
+  DFI = HandleToDFNodeMap.find(II->getOperand(1));
+  assert(DFI != HandleToDFNodeMap.end());
+
+  DFNode* SrcDF = HandleToDFNodeMap[II->getOperand(0)];
+  DFNode* DestDF = HandleToDFNodeMap[II->getOperand(1)];
+
+  bool EdgeType = !cast<ConstantInt>(II->getOperand(2))->isZero();
+
+  unsigned SourcePosition = cast<ConstantInt>(II->getOperand(3))->getZExtValue();
+  unsigned DestPosition = cast<ConstantInt>(II->getOperand(4))->getZExtValue();
+
+  bool isStreaming = !cast<ConstantInt>(II->getOperand(5))->isZero();
+
+  Type *SrcTy, *DestTy;
+
+  // Get destination type: parameter DestPosition of the destination node's
+  // function.
+  FunctionType *FT = DestDF->getFuncPointer()->getFunctionType();
+  assert((FT->getNumParams() > DestPosition)
+         && "Invalid argument number for destination dataflow node!");
+  DestTy = FT->getParamType(DestPosition);
+
+  // Get source type: element SourcePosition of the source node's output
+  // struct.
+  StructType* OutTy = SrcDF->getOutputType();
+  assert((OutTy->getNumElements() > SourcePosition)
+         && "Invalid argument number for source dataflow node!");
+  SrcTy = OutTy->getElementType(SourcePosition);
+
+  // check if the types are compatible
+  assert(isTypeCongruent(SrcTy, DestTy)
+         && "Source and destination type of edge do not match");
+
+  DFEdge* newDFEdge = DFEdge::Create(SrcDF,
+                                     DestDF,
+                                     EdgeType,
+                                     SourcePosition,
+                                     DestPosition,
+                                     DestTy,
+                                     isStreaming);
+
+  HandleToDFEdgeMap[II] = newDFEdge;
+
+  // Add Edge to the dataflow graph associated with the parent node
+  N->addEdgeToDFGraph(newDFEdge);
+}
+
+// Handles llvm.visc.bind.input: binds input SourcePosition of parent node N
+// to input DestPosition of the child node named by operand 0. The binding
+// is represented as an edge from the child graph's dummy Entry node.
+// Operand layout: 0 = child node handle, 1 = parent input position,
+// 2 = child input position, 3 = streaming flag.
+void BuildDFG::handleBindInput(DFInternalNode* N, IntrinsicInst* II) {
+  // The DFNode structures must be in the map before the edge is processed
+  HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0));
+  assert(DFI != HandleToDFNodeMap.end());
+
+  DFNode* SrcDF = N->getChildGraph()->getEntry();
+  DFNode* DestDF = HandleToDFNodeMap[II->getOperand(0)];
+
+  unsigned SourcePosition = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
+  unsigned DestPosition = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
+
+  bool isStreaming = !cast<ConstantInt>(II->getOperand(3))->isZero();
+  
+  // Get destination type: parameter DestPosition of the child node function.
+  FunctionType *FT = DestDF->getFuncPointer()->getFunctionType();
+  assert((FT->getNumParams() > DestPosition)
+         && "Invalid argument number for destination dataflow node!");
+  Type* DestTy = FT->getParamType(DestPosition);
+
+  // Get source type: parameter SourcePosition of the parent node function.
+  FT = SrcDF->getFuncPointer()->getFunctionType();
+  assert((FT->getNumParams() > SourcePosition)
+         && "Invalid argument number for parent dataflow node!");
+  Type* SrcTy = FT->getParamType(SourcePosition);
+
+  // check if the types are compatible
+  assert(isTypeCongruent(SrcTy, DestTy)
+         && "Source and destination type of edge do not match");
+
+  // Add Binding as an edge between Entry and child Node
+  DFEdge* newDFEdge = DFEdge::Create(SrcDF,
+                                     DestDF,
+                                     false,
+                                     SourcePosition,
+                                     DestPosition,
+                                     DestTy,
+                                     isStreaming);
+
+  HandleToDFEdgeMap[II] = newDFEdge;
+
+  // Add Edge to the dataflow graph associated with the parent node
+  N->addEdgeToDFGraph(newDFEdge);
+}
+
+// Handles llvm.visc.bind.output: binds output SourcePosition of the child
+// node named by operand 0 to output DestPosition of parent node N. The
+// binding is represented as an edge to the child graph's dummy Exit node.
+// Operand layout: 0 = child node handle, 1 = child output position,
+// 2 = parent output position, 3 = streaming flag.
+void BuildDFG::handleBindOutput(DFInternalNode* N, IntrinsicInst* II) {
+  // The DFNode structures must be in the map before the edge is processed
+  HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0));
+  assert(DFI != HandleToDFNodeMap.end());
+
+  DFNode* SrcDF = HandleToDFNodeMap[II->getOperand(0)];
+  DFNode* DestDF = N->getChildGraph()->getExit();
+
+  unsigned SourcePosition = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
+  unsigned DestPosition = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
+
+  bool isStreaming = !cast<ConstantInt>(II->getOperand(3))->isZero();
+  
+  // Get destination type: element DestPosition of the parent's output struct.
+  StructType* OutTy = DestDF->getOutputType();
+  assert((OutTy->getNumElements() > DestPosition)
+         && "Invalid argument number for destination parent dataflow node!");
+  Type* DestTy = OutTy->getElementType(DestPosition);
+
+  // Get source type: element SourcePosition of the child's output struct.
+  OutTy = SrcDF->getOutputType();
+  assert((OutTy->getNumElements() > SourcePosition)
+         && "Invalid argument number for source dataflow node!");
+  Type* SrcTy = OutTy->getElementType(SourcePosition);
+
+  // check if the types are compatible
+  assert(isTypeCongruent(SrcTy, DestTy)
+         && "Source and destination type of edge do not match");
+
+  // Add Binding as an edge between child and exit node
+  DFEdge* newDFEdge = DFEdge::Create(SrcDF,
+                                     DestDF,
+                                     false,
+                                     SourcePosition,
+                                     DestPosition,
+                                     DestTy,
+                                     isStreaming);
+
+  HandleToDFEdgeMap[II] = newDFEdge;
+
+  // Add Edge to the dataflow graph associated with the parent node
+  N->addEdgeToDFGraph(newDFEdge);
+}
+
+// Scans function F (the body of internal node N) and dispatches each VISC
+// graph-construction intrinsic to its handler. An internal node body may
+// contain only VISC intrinsics and a return instruction; anything else
+// aborts the pass.
+void BuildDFG::BuildGraph (DFInternalNode* N, Function *F) {
+
+  // TODO: Place checks for valid visc functions. For example one of the
+  // check can be that any function that contains visc dataflow graph
+  // construction intrinsics should not have other llvm IR statements.
+
+  // Iterate over all the instructions of a function and look for visc
+  // intrinsics.
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction* I = &*i; // Grab pointer to instruction reference
+    DEBUG(errs() << *I << "\n");
+    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) {
+      DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": " << II->getCalledFunction()->getName()<<"\n");
+      switch(II->getIntrinsicID()) {
+
+      case Intrinsic::visc_createNode:
+      case Intrinsic::visc_createNode1D:
+      case Intrinsic::visc_createNode2D:
+      case Intrinsic::visc_createNode3D:
+        handleCreateNode (N, II);
+        break;
+
+      case Intrinsic::visc_createEdge:
+        handleCreateEdge(N, II);
+        break;
+      case Intrinsic::visc_bind_input:
+        handleBindInput(N, II);
+        break;
+      case Intrinsic::visc_bind_output:
+        handleBindOutput(N, II);
+        break;
+
+      //TODO: Reconsider launch within a dataflow graph (recursion?)
+      case Intrinsic::visc_wait:
+      case Intrinsic::visc_launch:
+        errs() << "Error: Launch/wait intrinsic used within a dataflow graph\n\t" << *II << "\n";
+        break;
+
+      default:
+        errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" << *II << "\n";
+        break;
+      }
+    }
+    else if(!isa<ReturnInst>(I)) {
+      errs() << "Non-intrinsic instruction: " << *I << "\n";
+      llvm_unreachable("Found non-intrinsic instruction inside an internal node. Only return instruction is allowed!");
+
+    }
+
+  }
+}
+
+// Register the pass with opt under -buildDFG. RegisterPass arguments are
+// (arg, name, CFGOnly, is_analysis).
+char BuildDFG::ID = 0;
+static RegisterPass<BuildDFG> X("buildDFG", "Hierarchical Dataflow Graph Builder Pass", false, false);
+
+} // End of namespace builddfg
+
diff --git a/lib/BuildDFG/BuildDFG.exports b/lib/BuildDFG/BuildDFG.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/BuildDFG/CMakeLists.txt b/lib/BuildDFG/CMakeLists.txt
new file mode 100644
index 0000000000..0b1fa4837c
--- /dev/null
+++ b/lib/BuildDFG/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMBuildDFG
+  BuildDFG.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/BuildDFG/LLVMBuild.txt b/lib/BuildDFG/LLVMBuild.txt
new file mode 100644
index 0000000000..26d8856162
--- /dev/null
+++ b/lib/BuildDFG/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/BuildDFG/LLVMBuild.txt ------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = BuildDFG
+parent = Transforms
diff --git a/lib/ClearDFG/CMakeLists.txt b/lib/ClearDFG/CMakeLists.txt
new file mode 100644
index 0000000000..f928c8acda
--- /dev/null
+++ b/lib/ClearDFG/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMClearDFG
+  ClearDFG.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/ClearDFG/ClearDFG.cpp b/lib/ClearDFG/ClearDFG.cpp
new file mode 100644
index 0000000000..84f9bec04f
--- /dev/null
+++ b/lib/ClearDFG/ClearDFG.cpp
@@ -0,0 +1,172 @@
+//===-------------------------- ClearDFG.cpp --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ClearDFG"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/BuildDFG/BuildDFG.h"
+
+using namespace llvm;
+using namespace builddfg;
+
+//STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted");
+
+namespace {
+
+// ClearDFG - module pass that deletes all DFG node functions and VISC
+// bookkeeping intrinsics after code generation has consumed them.
+struct ClearDFG : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  ClearDFG() : ModulePass(ID) {}
+
+private:
+  // Member variables
+
+  // Functions
+
+public:
+  bool runOnModule(Module &M);
+
+  // Requires the dataflow graph built by the BuildDFG analysis.
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<BuildDFG>();
+  }
+
+
+};
+
+// Visitor that erases DFG node functions via a post-order (bottom-up)
+// tree traversal: children are deleted before their parent.
+class TreeTraversal : public DFNodeVisitor {
+
+private:
+  //Member variables
+  Module &M;
+  BuildDFG &DFG;
+
+  // NOTE(review): FMap and CallMap are never used by the deletion logic
+  // visible in this file; the comment they originally carried described a
+  // code-generation visitor, so they appear to be leftovers from the class
+  // this was adapted from -- confirm and consider removing.
+  ValueMap<Function*, Function*> FMap;
+  DenseMap<DFNode*, CallInst*> CallMap;
+
+  // Erases the function backing node N (and, for roots, the launch
+  // intrinsic referencing it).
+  void deleteNode(DFNode* N);
+  
+public:
+  // Constructor
+  TreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { }
+
+  // Internal node: delete all child nodes first, then this node, so the
+  // parent's function is never erased while children still exist.
+  virtual void visit(DFInternalNode* N) {
+    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
+        e = N->getChildGraph()->end(); i != e; ++i) {
+      DFNode* child = *i;
+      child->applyDFNodeVisitor(*this);
+    }
+    DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n");
+    deleteNode(N);
+    DEBUG(errs() << "\tDone - " << "\n");
+  }
+
+  // Leaf node: erase directly.
+  virtual void visit(DFLeafNode* N) {
+    DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n");
+    deleteNode(N);
+    DEBUG(errs() << "DONE" << "\n");
+  }
+
+};
+
+// Pass entry point: removes the llvm.visc.init / llvm.visc.cleanup /
+// llvm.visc.node.id intrinsic calls and declarations, then deletes every
+// DFG node function bottom-up.
+bool ClearDFG::runOnModule(Module &M) {
+
+  errs() << "\nCLEARDFG PASS\n";
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Erase every call to the given intrinsic declaration, then the
+  // declaration itself. Users are collected into a worklist first:
+  // erasing an instruction while walking the use list invalidates the
+  // user_iterator (the original code did exactly that).
+  auto eraseIntrinsicAndUsers = [](Function *F) {
+    std::vector<Instruction*> Users;
+    for (Value::user_iterator ui = F->user_begin(), ue = F->user_end(); ui != ue; ++ui) {
+      Instruction *I = dyn_cast<Instruction>(*ui);
+      assert(I && "Expected all users of a visc intrinsic to be instructions");
+      Users.push_back(I);
+    }
+    for (Instruction *I : Users)
+      I->eraseFromParent();
+    F->replaceAllUsesWith(UndefValue::get(F->getType()));
+    F->eraseFromParent();
+  };
+
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n");
+  eraseIntrinsicAndUsers(VI);
+
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n");
+  eraseIntrinsicAndUsers(VC);
+
+  // Delete visc.node.id intrinsic calls if they exist (the declaration is
+  // optional, unlike init/cleanup).
+  Function* VN = M.getFunction("llvm.visc.node.id");
+  if (VN != NULL)
+    eraseIntrinsicAndUsers(VN);
+
+  // Visitor for the deletion graph traversal; stack-allocated (RAII)
+  // instead of the original new/delete pair.
+  TreeTraversal Visitor(M, DFG);
+
+  // Initiate deletion for each root DFNode.
+  for (auto rootNode : Roots)
+    Visitor.visit(rootNode);
+
+  // true => the module was modified.
+  return true;
+}
+
+// Erases the function backing node N. Dummy (entry/exit) nodes have no
+// function of their own and are skipped. For root nodes, the launch
+// intrinsic referencing the graph is erased as well, since it would
+// otherwise keep a dangling reference to the deleted function.
+void TreeTraversal::deleteNode(DFNode* N) {
+  if(N->isDummyNode())
+    return;
+  // Erase Function associated with this node
+  Function* F = N->getFuncPointer();
+  F->replaceAllUsesWith(UndefValue::get(F->getType()));
+  F->eraseFromParent();
+  // If N is not a root node, we are done. Return.
+  if(!N->isRoot())
+    return;
+  // N is a root node. Delete the Launch Intrinsic associated it with as well.
+  IntrinsicInst* LI = N->getInstruction();
+  LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+  LI->eraseFromParent();
+}
+
+} // End of namespace
+
+char ClearDFG::ID = 0;
+// RegisterPass arguments are (arg, name, CFGOnly, is_analysis).
+// NOTE(review): the original inline comments mislabeled these flags, and
+// is_analysis=true looks suspicious for a pass whose runOnModule mutates
+// the module and returns true -- confirm whether it is intentional.
+static RegisterPass<ClearDFG> X("clearDFG",
+				"Delete all DFG functions for which code has been generated",
+				false /* CFGOnly */,
+				true /* is_analysis */);
+
diff --git a/lib/ClearDFG/ClearDFG.exports b/lib/ClearDFG/ClearDFG.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/ClearDFG/LLVMBuild.txt b/lib/ClearDFG/LLVMBuild.txt
new file mode 100644
index 0000000000..ebca891469
--- /dev/null
+++ b/lib/ClearDFG/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/ClearDFG/LLVMBuild.txt ------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ClearDFG
+parent = Transforms
diff --git a/lib/DFG2LLVM_CUDNN/CMakeLists.txt b/lib/DFG2LLVM_CUDNN/CMakeLists.txt
new file mode 100644
index 0000000000..dc98faafec
--- /dev/null
+++ b/lib/DFG2LLVM_CUDNN/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMDFG2LLVM_CUDNN
+  DFG2LLVM_CUDNN.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp b/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
new file mode 100644
index 0000000000..f18325588c
--- /dev/null
+++ b/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
@@ -0,0 +1,645 @@
+//=== DFG2LLVM_CUDNN.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "DFG2LLVM_CUDNN"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm-c/Core.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h"
+#include <sstream>
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+using namespace inplacedfg;
+
+namespace {
+// Helper class declarations
+
+// DFG2LLVM_CUDNN - module pass driving code generation of CUDNN-targeted
+// tensor nodes into HPVM tensor-runtime calls.
+
+struct DFG2LLVM_CUDNN : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_CUDNN() : DFG2LLVM(ID) {}
+private:
+
+public:
+
+  // Needs the DFG and the in-place analysis; neither is invalidated here.
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<BuildDFG>();
+    AU.addRequired<InPlaceDFGAnalysisWrapper>();
+    AU.addPreserved<BuildDFG>();
+    AU.addPreserved<InPlaceDFGAnalysisWrapper>();
+  }
+
+  bool runOnModule(Module &M);
+};
+
+// Visitor for Code generation traversal (tree traversal for now): walks the
+// DFG and rewrites CUDNN-targeted leaf nodes into tensor-runtime calls.
+class CGT_CUDNN : public CodeGenTraversal {
+
+private:
+  // Result of the in-place DFG analysis, consulted to decide whether a
+  // tensor argument may be overwritten in place.
+  InPlaceDFGAnalysis::InPlaceDFGParameter *IPP;
+
+  // VISC Runtime API and Tensor runtime API declarations; resolved from the
+  // tensor runtime module in initRuntimeAPI().
+  Constant* llvm_hpvm_initTensorRt;
+  Constant* llvm_hpvm_cleanupTensorRt;
+  Constant* hpvm_request_tensor;
+
+  // Functions
+  bool isValidOperandForInPlaceOperation(Value *Op, Function *Fgen, DFNode *N);
+
+
+
+  // Virtual Functions
+  void init();
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+
+  // Constructor; resolves runtime declarations immediately.
+  CGT_CUDNN(Module &_M, BuildDFG &_DFG, InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP)
+  : CodeGenTraversal(_M, _DFG), IPP(&_IPP) {
+    initRuntimeAPI();
+  }
+
+};
+
+// Returns true if Op may be overwritten in place by the tensor operation
+// being generated for node N (whose body is Fgen):
+// - Function arguments: consult the in-place DFG analysis result (IPP).
+// - Results of other intrinsics: freshly produced tensors, always safe.
+// - Anything else: not safe.
+bool CGT_CUDNN::isValidOperandForInPlaceOperation(Value *Op,
+                                                  Function *Fgen,
+                                                  DFNode *N) {
+
+  if (Argument *Arg = dyn_cast<Argument>(Op)) {
+    DEBUG(errs() << *Arg << "\t: argument, candidate for in place\n");
+    assert((Arg->getParent() == Fgen) &&
+          "Extra Parameter in body of Function\n");
+    // Candidate parameter is a function argument.
+    // In this case, consult the result of in place analysis.
+    // Find position in arg list
+    unsigned pos = Arg->getArgNo();
+    // If this parameter cannot be used for in place operation
+    // code gen cannot continue
+    if (IPP->at(N)[pos]) {
+      DEBUG(errs() << *Arg << "\t: argument, suitable for in place\n");
+      return true;
+    } else {
+      DEBUG(errs() << *Arg << "\t: argument, not suitable for in place\n");
+      return false;
+    }
+  }
+  else {
+    // If it is not an argument, then it needs to be the result of
+    // another intrinsic. These are new objects that are allocated,
+    // and consumed by next intrinsic. 
+    DEBUG(errs() << *Op << "\t: Test for result of intrinsic operation\n");
+    // BUGFIX: the DEBUG statements below previously printed *Arg, which is
+    // guaranteed null in this branch (Arg is only non-null inside the `if`
+    // above), dereferencing a null pointer in debug builds. Print *Op.
+    if (isa<IntrinsicInst>(Op)) {
+      DEBUG(errs() << *Op << "\t: local, suitable for in place\n");
+      return true;
+    } else {
+      DEBUG(errs() << *Op << "\t: local, not suitable for in place\n");
+      return false;
+    }
+  }
+}
+
+
+// No pass-specific initialization needed; runtime declaration setup happens
+// in initRuntimeAPI(), which the constructor calls directly.
+void CGT_CUDNN::init() {
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls.
+// Loads the tensor runtime's .ll module (source of declarations) and inserts
+// tensor-runtime init/cleanup calls next to llvm.visc.init / llvm.visc.cleanup.
+void CGT_CUDNN::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n");
+
+  // FIXME: set correct path
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_runtime.ll";
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  // NOTE(review): a parse failure is only reported under DEBUG and execution
+  // continues with a null runtimeModule, which the DECLAREs below would then
+  // dereference -- consider making this a hard error.
+  if(runtimeModule == nullptr)
+    DEBUG(errs() << Err.getMessage());
+   else
+    DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n");
+
+  // Get or insert Global declarations for
+  // - initialization
+  // - cleanup
+  // - request a tensor
+  // DECLARE is a macro (presumably from SupportVISC/DFG2LLVM.h) that copies
+  // the named declaration from runtimeModule into M -- TODO confirm.
+  DECLARE(llvm_hpvm_initTensorRt);
+  DECLARE(llvm_hpvm_cleanupTensorRt);
+  DECLARE(hpvm_request_tensor);
+
+  // Find visc.init and visc.cleanup calls, and add placeholder methods
+  // for initialization and cleanup of the hpvm tensor runtime
+  /*
+  LLVMContext &C = M.getContext();
+  auto *FuncType = FunctionType::get(Type::getVoidTy(C), ArrayRef<Type *>({Type::getInt32Ty(C)}), false);
+  llvm_hpvm_initTensorRt = M.getOrInsertFunction(StringRef("llvm_hpvm_initTensorRt"), FuncType);
+  FuncType = FunctionType::get(Type::getVoidTy(C), ArrayRef<Type *>({}), false);
+  llvm_hpvm_cleanupTensorRt = M.getOrInsertFunction(StringRef("llvm_hpvm_cleanupTensorRt"), FuncType);
+  FuncType = FunctionType::get(Type::getVoidTy(C), ArrayRef<Type *>({Type::getInt8PtrTy(C), Type::getInt32Ty(C)}), false);
+  hpvm_request_tensor = M.getOrInsertFunction(StringRef("hpvm_request_tensor"), FuncType); 
+*/
+  // Insert llvm_hpvm_initTensorRt(0) immediately before the single
+  // llvm.visc.init call site.
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n");
+  InitCall = cast<Instruction>(*VI->user_begin());
+  CallInst::Create(llvm_hpvm_initTensorRt,
+                   ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)),
+                   "", InitCall);
+
+
+  // Insert llvm_hpvm_cleanupTensorRt() immediately before the single
+  // llvm.visc.cleanup call site.
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n");
+  CleanupCall = cast<Instruction>(*VC->user_begin());
+  CallInst::Create(llvm_hpvm_cleanupTensorRt, ArrayRef<Value*>(), "", CleanupCall);
+
+}
+
+// Internal nodes carry no tensor work in this backend; code is generated
+// only for leaf nodes.
+void CGT_CUDNN::codeGen(DFInternalNode* N) {
+  errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n";
+  errs () << "Skipping internal node\n";
+}
+
+  
+// Generate CUDNN-backend code for a leaf DFG node: clone the node's
+// function, insert llvm_visc_request_tensor calls for every pointer
+// (tensor) argument, replace each supported llvm.visc.tensor.* intrinsic
+// with the matching hpvm-tensor-rt runtime call, and finally erase the
+// lowered intrinsics.
+void CGT_CUDNN::codeGen(DFLeafNode* N) {
+
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Abort code generation if it is an allocation node
+  if(N->isAllocationNode()) {
+    assert(false && "Allocation Node not expected in ApproxHPVM");
+    return;
+  }
+
+  // Generate code only if it has the right hint
+  if (!checkPreferredTarget(N, visc::CUDNN_TARGET)) {
+    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+    return;
+  }
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+  errs()<<"function name = "<< F->getName()<<"\n";
+
+  /* Removing HPVM in/out/inout function attributes */
+  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ai++){
+    Argument *Arg = &*ai;
+    if(Arg->hasAttribute(Attribute::In))
+      Arg->removeAttr(Attribute::In);
+    if(Arg->hasAttribute(Attribute::Out))
+      Arg->removeAttr(Attribute::Out);
+    if(Arg->hasAttribute(Attribute::InOut))
+      Arg->removeAttr(Attribute::InOut);    
+  }
+
+  // Look up if we have visited this function before. If we have, then just
+  // get the cloned function pointer from DFNode. Otherwise, create the cloned
+  // function and add it to the DFNode GenFunc.
+  Function *F_cudnn = N->getGenFuncForTarget(visc::CUDNN_TARGET);
+
+  // Each node must be visited at most once per target.
+  assert((F_cudnn == NULL) &&
+         "Error: Visiting a node for which code already generated");
+  
+  // Clone the function
+  ValueToValueMapTy VMap;
+  std::string FName(F->getName().data());
+  F_cudnn = CloneFunction(F, VMap);
+  F_cudnn->setName(FName + "_cudnn");
+  errs()<<"Cloned function name2 = "<<F_cudnn->getName()<<"\n";
+  // Detach and re-append so the clone sits at the end of M's function list.
+  F_cudnn->removeFromParent();  
+  M.getFunctionList().push_back(F_cudnn);
+
+  N->addGenFunc(F_cudnn, visc::CUDNN_TARGET, true);
+
+  // Adding nounwind to generated function : FIXME: needed?
+  DEBUG(errs() << "Adding nounwind to generated function\n");
+  F_cudnn->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
+
+  // Add llvm_visc_requestTensor calls for every pointer argument of the function
+  // (they are all expected to be tensors), at the beginning of the function.
+  // This is the first instruction of the function, insert them before this
+  Instruction* FI = &*(F_cudnn->getEntryBlock().begin());
+
+  // In this backend, the target device is GPU, represented by i32 1.
+  ConstantInt *TargetDeviceID =
+    ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+
+  for (Function::arg_iterator ai = F_cudnn->arg_begin(),
+       ae = F_cudnn->arg_end(); ai != ae; ++ai) {
+    Argument* Arg = &*ai;
+    if (Arg->getType()->isPointerTy()) {
+      Value *Args[] = {Arg, TargetDeviceID};
+      CallInst::Create(hpvm_request_tensor,
+                       ArrayRef<Value*>(Args, 2),
+                       "", FI);
+    }
+  }
+
+  // Intrinsics lowered below; erased only after the full walk so that
+  // iteration over the instruction stream stays valid.
+  std::vector<IntrinsicInst *> IItoRemove;
+
+  for (inst_iterator i = inst_begin(F_cudnn), e = inst_end(F_cudnn); i != e; ++i) {
+    Instruction *I = &(*i);
+
+    if (BuildDFG::isViscIntrinsic(I)) {
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      //assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+      //  && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+
+      //if (!(II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")){
+      //continue; // skip non-tensor ops 
+      //}
+      
+      /********************* Handle VISC Tensor intrinsics ********************/
+      switch (II->getIntrinsicID()) {
+
+      case Intrinsic::visc_tensor_convolution:
+      { /* llvm.visc.tensor.convolution */
+        // Tensor convolution is not in place.
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor convolution \n");
+
+        // Argument list for the runtime call: the first six intrinsic
+        // operands are forwarded unchanged.
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+	Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+
+	// Fixed mode/precision selectors appended for the runtime call.
+	Constant* conv_mode = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+	Constant* conv_precision = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+
+        Args.push_back(conv_mode);
+        Args.push_back(conv_precision);
+	
+        // Create cudnn runtime function call
+        Constant* tensorConvolution;
+        DECLARE(tensorConvolution);
+	
+        CallInst* CI = CallInst::Create(tensorConvolution,
+                                        Args, "", II);
+        // We can replace the call to hpvm.tensor.convolution with the runtime call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_group_convolution:
+      { /* llvm.visc.tensor.group.convolution */
+        // Grouped convolution is not in place.
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor convolution \n");
+
+        // Argument list for the runtime call: operands 0-5 forwarded,
+        // then a constant conv_mode, then operand 7 (operand 6 is not
+        // forwarded -- presumably superseded by conv_mode; confirm against
+        // the intrinsic definition).
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+	Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+
+	Constant* conv_mode = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+
+        Args.push_back(conv_mode);
+        Args.push_back(II->getOperand(7));
+	
+        // Create cudnn runtime function call (same runtime entry point as
+        // plain convolution).
+        Constant* tensorConvolution;
+        DECLARE(tensorConvolution);
+	
+        CallInst* CI = CallInst::Create(tensorConvolution,
+                                        Args, "", II);
+        // We can replace the call to the intrinsic with the runtime call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_batchnorm:
+      { /* llvm.visc.tensor.batchnorm */
+        // Tensor batchnorm is in place.
+	// FIXME: Add Check for InPlace Analysis 
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor batch normalization \n");
+
+        // Argument list for the runtime call: operands 0-5 forwarded.
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+	Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+	
+        // Create cudnn runtime function call
+        Constant* tensorBatchNorm;
+        DECLARE(tensorBatchNorm);
+	
+        CallInst* CI = CallInst::Create(tensorBatchNorm,
+                                        Args, "", II);
+        // We can replace the call to hpvm.tensor.batchnorm with the TensorRT call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      
+      case Intrinsic::visc_tensor_mul:
+      { /* llvm.visc.tensor.mul */
+        // Tensor mul is not in place.
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor mul\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+
+        // Create cudnn runtime function call (GEMM on the GPU)
+        Constant* tensorGemmGPU;
+        DECLARE(tensorGemmGPU);
+	
+        CallInst* CI = CallInst::Create(tensorGemmGPU,
+                                        Args, "", II);
+        // We can replace the call to hpvm.tensor.mul with the runtime call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+      case Intrinsic::visc_tensor_add:
+      { /* llvm.visc.tensor.add */
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor add\n");
+        // Tensor add(a,b) is in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        // NOTE(review): 'inplace' is computed but the assert below is
+        // disabled, so the result is currently unused.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F_cudnn, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+
+	// FIXME: remove this comment - must check for in-place
+        //assert(inplace &&
+        //       "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+
+        // Create cudnn runtime function call
+        Constant* tensorAdd;
+        DECLARE(tensorAdd);
+        CallInst::Create(tensorAdd, Args, "", II);
+        // We can replace the call to hpvm.tensor.add with the 1st argument
+        // that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+      case Intrinsic::visc_tensor_pool_max:
+      case Intrinsic::visc_tensor_pool_mean:
+      { /* llvm.visc.tensor.pool.max / llvm.visc.tensor.pool.mean */
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor_pool_max\n");
+
+        // Argument list - tensorPooling(input, poolFunction, window_height,
+	//                               window_width, vertical_pad, horizontal_pad,
+	//                               vertical_stride, horizontal_stride);
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+	// Pooling-function selector: 0 = max, 1 = mean.
+	int pool_type = 0;
+	if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_max){
+          pool_type = 0;
+	}
+        if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean){
+          pool_type = 1;
+	}	
+	
+	Constant* constPoolType = ConstantInt::get(Type::getInt32Ty(M.getContext()), pool_type);
+        Args.push_back(constPoolType); // ID for max pool. Min/Avg have different IDs (non-zero)	
+	Args.push_back(II->getOperand(1));
+        Args.push_back(II->getOperand(2));
+	Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+	Args.push_back(II->getOperand(5));
+	Args.push_back(II->getOperand(6));
+
+        // Create cudnn runtime function call
+        Constant* tensorPooling;
+        DECLARE(tensorPooling);
+        CallInst* CI = CallInst::Create(tensorPooling, Args, "", II);
+
+	// Replacing intrinsic result uses with the result of the tensor runtime operation
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+      
+      case Intrinsic::visc_tensor_relu:
+      case Intrinsic::visc_tensor_clipped_relu:
+      case Intrinsic::visc_tensor_tanh:
+      { /* llvm.visc.tensor.{relu,clipped.relu,tanh} */
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor activation functions \n");
+        // Tensor relu(a) (and the other activations) is in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F_cudnn, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+	// Dispatch to the matching runtime entry point per activation kind.
+	if (II->getIntrinsicID() == Intrinsic::visc_tensor_relu){
+          // Create cudnn runtime function call
+          Constant* tensorRelu;
+          DECLARE(tensorRelu);
+          CallInst::Create(tensorRelu, Args, "", II);
+	}
+	else if (II->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu){
+          // Create cudnn runtime function call
+          //-- Constant* tensorClippedRelu;
+	  Constant* tensorRelu2;
+          DECLARE(tensorRelu2);
+          CallInst::Create(tensorRelu2, Args, "", II);
+	}
+	else if (II->getIntrinsicID() == Intrinsic::visc_tensor_tanh){
+          // Create cudnn runtime function call
+          Constant* tensorTanh;
+	  errs()<<"tensorTanh Call = \n\n";
+          DECLARE(tensorTanh);
+	  //errs()<<"tensorTanh Call = "<<*tensorTanh<<"\l";
+          CallInst::Create(tensorTanh, Args, "", II);
+	}
+     
+        // We can replace the call to hpvm.tensor.relu with the 1st argument
+        // that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+      case Intrinsic::visc_tensor_softmax:
+      { /* llvm.visc.tensor.softmax */
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor softmax\n");
+        // Tensor softmax(a) is in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F_cudnn, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        // Create cudnn runtime function call
+        Constant* tensorSoftmax;
+        DECLARE(tensorSoftmax);
+        CallInst::Create(tensorSoftmax, Args, "", II);
+        // We can replace the call to hpvm.tensor.softmax with the 1st argument
+        // that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_node_id:
+      { /* llvm.visc.node.id */
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling Node ID Intrinsic \n");
+        // Get uint32 argument
+        // NOTE(review): 'Op' is unused; Args below re-reads operand 0.
+        Value *Op = II->getOperand(0);
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        // Create hpvm-tensor-rt function call
+        Constant* tensor_set_node_id;
+        DECLARE(tensor_set_node_id);
+        CallInst::Create(tensor_set_node_id, Args, "", II);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      
+      default:
+        llvm_unreachable("Unknown VISC Intrinsic!");
+        break;
+      }
+    }
+  }
+
+  //--- errs()<<"IIToRemove.size() = "<<IItoRemove.size()<<"\n\n";
+
+  // We need to do this explicitly: DCE pass may not remove them.
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around.
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
+       re = IItoRemove.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    errs() << "Erasing: " << **ri << "\n";
+    (*ri)->eraseFromParent();
+  }
+
+  return;
+}
+
+// Pass entry point: collect the dataflow graph and in-place analysis
+// results, then run the CUDNN code-generation visitor over every root DFG.
+bool DFG2LLVM_CUDNN::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_CUDNN PASS\n";
+
+  // Dataflow graph built by the BuildDFG analysis pass.
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // In-place analysis parameter (which operands may be overwritten).
+  InPlaceDFGAnalysis::InPlaceDFGParameter IPP =
+    (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP();
+  // Print results
+  printInPlaceDFGParameter(IPP);
+
+  std::vector<DFInternalNode*> RootNodes = DFG.getRoots();
+
+  // One visitor instance handles every DFG in the module.
+  CGT_CUDNN *Visitor = new CGT_CUDNN(M, DFG, IPP);
+
+  // Initiate code generation for each root DFNode in turn.
+  for (std::vector<DFInternalNode*>::iterator ri = RootNodes.begin(),
+       re = RootNodes.end(); ri != re; ++ri) {
+    Visitor->visit(*ri);
+  }
+
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+  delete Visitor;
+
+  return true;
+}
+
+
+/******************************************************************************
+ *                              Helper functions                              *
+ ******************************************************************************/
+
+
+} // End of namespace
+
+char DFG2LLVM_CUDNN::ID = 0;
+static RegisterPass<DFG2LLVM_CUDNN> X("dfg2llvm-cudnn",
+                                      "Dataflow Graph to LLVM for CUDNN Pass",
+                                      false /* CFGOnly: pass inspects more
+                                             * than just the CFG         */,
+                                      true /* is_analysis. NOTE(review):
+                                            * runOnModule transforms the IR,
+                                            * so marking this an analysis
+                                            * looks wrong -- confirm against
+                                            * the RegisterPass docs       */);
+
diff --git a/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.exports b/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/DFG2LLVM_CUDNN/LLVMBuild.txt b/lib/DFG2LLVM_CUDNN/LLVMBuild.txt
new file mode 100644
index 0000000000..1579b2fc47
--- /dev/null
+++ b/lib/DFG2LLVM_CUDNN/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/DFG2LLVM_CUDNN/LLVMBuild.txt ------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_CUDNN
+parent = Transforms
diff --git a/lib/DFG2LLVM_NVPTX/CMakeLists.txt b/lib/DFG2LLVM_NVPTX/CMakeLists.txt
new file mode 100644
index 0000000000..430bea7693
--- /dev/null
+++ b/lib/DFG2LLVM_NVPTX/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMDFG2LLVM_NVPTX
+  DFG2LLVM_NVPTX.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
new file mode 100644
index 0000000000..c0cbd4df14
--- /dev/null
+++ b/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -0,0 +1,2075 @@
+//=== DFG2LLVM_NVPTX.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define ENABLE_ASSERTS
+#define TARGET_PTX 64
+#define GENERIC_ADDRSPACE 0
+#define GLOBAL_ADDRSPACE 1
+#define CONSTANT_ADDRSPACE 4
+#define SHARED_ADDRSPACE 3
+
+#define DEBUG_TYPE "DFG2LLVM_NVPTX"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm-c/Core.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/SupportVISC/VISCUtils.h"
+
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/IR/UseListOrder.h"
+
+
+#include <sstream>
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+using namespace viscUtils;
+
+// VISC Command line option to use timer or not
+static cl::opt<bool>
+VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers"));
+
+namespace {
+// Helper class declarations
+
+// Class to maintain the tuple of host pointer, device pointer and size
+// in bytes. Would have preferred to use tuple but support not yet available
+class OutputPtr {
+public:
+  OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
+    : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
+
+  Value* h_ptr;   // Host-side pointer
+  Value* d_ptr;   // Device-side pointer
+  Value* bytes;   // Buffer size in bytes
+};
+
+// Class to maintain important kernel info required for generating runtime
+// calls
+class Kernel {
+public:
+  // All map/vector parameters default to empty; gridDim/blockDim must equal
+  // the lengths of globalWGSize/localWGSize (asserted below).
+  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
+           std::map<unsigned, unsigned>(),
+         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
+           std::map<unsigned, std::pair<Value*, unsigned> >(),
+         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
+         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
+         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
+    : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
+      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
+      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
+
+    assert(gridDim == globalWGSize.size()
+           && "gridDim should be same as the size of vector globalWGSize");
+    assert(blockDim == localWGSize.size()
+           && "blockDim should be same as the size of vector localWGSize");
+  }
+
+  Function* KernelFunction;     // The function emitted as the device kernel
+  DFLeafNode* KernelLeafNode;   // Leaf DFG node the kernel was generated from
+  // Argument-index remapping for kernel inputs (presumably kernel arg
+  // position -> source position; confirm at use sites).
+  std::map<unsigned, unsigned> inArgMap;
+  // Map for shared memory arguments
+  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
+  // Fields for (potential) allocation node
+  DFLeafNode* AllocationNode;
+  Function* AllocationFunction;
+  std::map<unsigned, unsigned> allocInArgMap;
+
+  std::vector<unsigned> outArgMap;   // Indices of kernel output arguments
+  unsigned gridDim;                  // Number of global work-group dimensions
+  std::vector<Value*> globalWGSize;  // Global work-group size per dimension
+  unsigned blockDim;                 // Number of local work-group dimensions
+  std::vector<Value*> localWGSize;   // Local work-group size per dimension
+  std::vector<int> localDimMap;
+
+  // NOTE: the getters below return by value (a copy of the container).
+  std::map<unsigned, unsigned> getInArgMap() {
+    return inArgMap;
+  }
+  void setInArgMap(std::map<unsigned, unsigned> map) {
+    inArgMap = map;
+  }
+
+  std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap() {
+    return sharedInArgMap;
+  }
+  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
+    sharedInArgMap = map;
+  }
+
+  std::vector<unsigned> getOutArgMap() {
+    return outArgMap;
+  }
+  void setOutArgMap(std::vector<unsigned> map) {
+    outArgMap = map;
+  }
+
+  void setLocalWGSize(std::vector<Value*> V) {
+    localWGSize = V;
+  }
+
+  // A local (block-level) work-group size was supplied iff blockDim != 0.
+  bool hasLocalWG() {
+    return blockDim != 0;
+  }
+};
+
+// Helper function declarations
+static bool canBePromoted(Argument* arg, Function* F);
+static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*,
+                                 ValueToValueMapTy&, Instruction*);
+static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&,
+                              Instruction*, const Twine& WGName = "WGSize");
+static std::string getPTXFilename(const Module&);
+static std::string getFilenameFromModule(const Module& M);
+static void changeDataLayout(Module &);
+static void changeTargetTriple(Module &);
+static void findReturnInst(Function *, std::vector<ReturnInst *> &);
+static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
+static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
+static std::string getAtomicOpName(Intrinsic::ID);
+
+// DFG2LLVM_NVPTX - The first implementation.
+struct DFG2LLVM_NVPTX : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_NVPTX() : DFG2LLVM(ID) {}
+
+private:
+
+public:
+  // Entry point: lowers every DFG in the module to NVPTX kernel launches.
+  bool runOnModule(Module &M);
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+// Code-generation visitor for the NVPTX backend. Maintains a second module
+// (KernelM) that will hold only the generated kernels, plus declarations of
+// the visc-rt OpenCL-style runtime API used by the host-side code.
+class CGT_NVPTX : public CodeGenTraversal {
+
+private:
+  //Member variables
+  std::unique_ptr<Module> KernelM;      // Clone of the input module, stripped
+                                        // down to hold only kernel code
+  DFNode* KernelLaunchNode = NULL;      // Node at which the kernel is launched
+  Kernel* kernel;                       // Info for the kernel being generated
+
+  // VISC Runtime API
+  Constant* llvm_visc_ocl_launch;
+  Constant* llvm_visc_ocl_wait;
+  Constant* llvm_visc_ocl_initContext;
+  Constant* llvm_visc_ocl_clearContext;
+  Constant* llvm_visc_ocl_argument_shared;
+  Constant* llvm_visc_ocl_argument_scalar;
+  Constant* llvm_visc_ocl_argument_ptr;
+  Constant* llvm_visc_ocl_output_ptr;
+  Constant* llvm_visc_ocl_free;
+  Constant* llvm_visc_ocl_getOutput;
+  Constant* llvm_visc_ocl_executeNode;
+
+  //Functions
+  std::string getKernelsModuleName(Module &M);
+  void fixValueAddrspace(Value* V, unsigned addrspace);
+  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*);
+  Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i);
+  void addCLMetadata(Function* F);
+  Function* transformFunctionToVoid(Function* F);
+  void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
+
+  // Virtual Functions
+  void init() {
+    VISCTimer = VISCTimer_NVPTX;
+    TargetName = "NVPTX";
+  }
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+
+  // Constructor: clones the input module into KernelM, then empties the
+  // clone (globals, functions, aliases) while keeping module-level info
+  // such as metadata, and finally retargets it for the device.
+  CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(&_M)) {
+    init();
+    initRuntimeAPI();
+    errs() << "Old module pointer: " << &_M << "\n";
+    errs() << "New module pointer: " <<  KernelM.get() << "\n";
+
+    // Copying instead of creating new, in order to preserve required info (metadata)
+    // Remove functions, global variables and aliases.
+    // Each category is collected first and erased afterwards so the module
+    // iterators are never invalidated mid-walk; uses are redirected to
+    // undef before erasure so cross-references do not block deletion.
+    std::vector<GlobalVariable*> gvv = std::vector<GlobalVariable*>();
+    for (Module::global_iterator mi = KernelM->global_begin(),
+         me = KernelM->global_end(); (mi != me); ++mi) {
+      GlobalVariable* gv = &*mi;
+      gvv.push_back(gv);
+    }
+    for (std::vector<GlobalVariable*>::iterator vi = gvv.begin(); vi != gvv.end(); ++vi) {
+      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
+      (*vi)->eraseFromParent();
+    }
+
+    std::vector<Function*> fv = std::vector<Function*>();
+    for (Module::iterator mi = KernelM->begin(),
+         me = KernelM->end(); (mi != me); ++mi) {
+      Function* f = &*mi;
+      fv.push_back(f);
+    }
+    for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) {
+      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
+      (*vi)->eraseFromParent();
+    }
+
+    std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>();
+    for (Module::alias_iterator mi = KernelM->alias_begin(),
+         me = KernelM->alias_end(); (mi != me); ++mi) {
+      GlobalAlias* a = &*mi;
+      av.push_back(a);
+    }
+    for (std::vector<GlobalAlias*>::iterator vi = av.begin(); vi != av.end(); ++vi) {
+      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
+      (*vi)->eraseFromParent();
+    }
+
+    // Retarget the emptied module for device code generation.
+    changeDataLayout(*KernelM);
+    changeTargetTriple(*KernelM);
+
+
+    DEBUG(errs() << *KernelM);
+
+  }
+
+  void writeKernelsModule();
+};
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+// Load the visc-rt runtime module and declare the OpenCL-style runtime API
+// in the host module; also inserts context initialization at the (single)
+// llvm.visc.init call site and timer printing at the llvm.visc.cleanup site.
+void CGT_NVPTX::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  // The runtime bitcode is located relative to $LLVM_SRC_ROOT.
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
+
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/../build/projects/visc-rt/visc-rt.ll";
+
+  // NOTE(review): a parse failure is only reported under DEBUG and
+  // runtimeModule stays null; the DECLAREs below still run -- confirm
+  // whether a hard error is intended here.
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  if(runtimeModule == nullptr)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  DECLARE(llvm_visc_ocl_launch);
+  DECLARE(llvm_visc_ocl_wait);
+  DECLARE(llvm_visc_ocl_initContext);
+  DECLARE(llvm_visc_ocl_clearContext);
+  DECLARE(llvm_visc_ocl_argument_shared);
+  DECLARE(llvm_visc_ocl_argument_scalar);
+  DECLARE(llvm_visc_ocl_argument_ptr);
+  DECLARE(llvm_visc_ocl_output_ptr);
+  DECLARE(llvm_visc_ocl_free);
+  DECLARE(llvm_visc_ocl_getOutput);
+  DECLARE(llvm_visc_ocl_executeNode);
+
+  // Get or insert timerAPI functions as well if you plan to use timers
+  initTimerAPI();
+
+  // Insert init context in main
+  DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n");
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+
+  // All initialization code is inserted just before the llvm.visc.init call.
+  InitCall = cast<Instruction>(*VI->user_begin());
+  initializeTimerSet(InitCall);
+  switchToTimer(visc_TimerID_INIT_CTX, InitCall);
+  CallInst::Create(llvm_visc_ocl_initContext,
+                   ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)),
+                   "", InitCall);
+  switchToTimer(visc_TimerID_NONE, InitCall);
+
+  // Insert print instruction at visc exit
+  DEBUG(errs() << "Gen Code to print NVPTX Timer\n");
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  DEBUG(errs() << *VC << "\n");
+  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once");
+
+  // Timer report is emitted just before the llvm.visc.cleanup call.
+  CleanupCall = cast<Instruction>(*VC->user_begin());
+  printTimerSet(CleanupCall);
+
+
+}
+
+// Generate Code to call the kernel
+// The plan is to replace the internal node with a leaf node. This method is
+// used to generate a function to associate with this leaf node. The function
+// is responsible for all the memory allocation/transfer and invoking the
+// kernel call on the device
+void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before.
+//  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+
+  assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL &&
+         "Code already generated for this node");
+
+  // Useful values
+  Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
+  Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
+
+  // If kernel struct has not been initialized with kernel function, then fail
+  assert(K != NULL && "No kernel found!!");
+
+  DEBUG(errs() << "Generating kernel call code\n");
+
+  Function* F = N->getFuncPointer();
+
+
+  // Create of clone of F with no instructions. Only the type is the same as F
+  // without the extra arguments.
+  Function* F_X86;
+
+  // Clone the function, if we are seeing this function for the first time. We
+  // only need a clone in terms of type.
+  ValueToValueMapTy VMap;
+
+  // Create new function with the same type
+  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+
+  // Loop over the arguments, copying the names of arguments over.
+  Function::arg_iterator dest_iterator = F_X86->arg_begin();
+  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+       i != e; ++i) {
+    dest_iterator->setName(i->getName()); // Copy the name over...
+    // Increment dest iterator
+    ++dest_iterator;
+  }
+
+  // Add a basic block to this empty function
+  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
+  ReturnInst* RI = ReturnInst::Create(M.getContext(),
+                                      UndefValue::get(F_X86->getReturnType()), BB);
+
+  // FIXME: Adding Index and Dim arguments are probably not required except
+  // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
+  // have those arguments)
+
+  // Add Index and Dim arguments except for the root node
+  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+    F_X86 = addIdxDimArgs(F_X86);
+
+  BB = &*F_X86->begin();
+  RI = cast<ReturnInst>(BB->getTerminator());
+
+  //Add the generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::GPU_TARGET, true);
+  errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node "
+         << N->getFuncPointer()->getName() << "\n";
+
+
+  // Loop over the arguments, to create the VMap
+  dest_iterator = F_X86->arg_begin();
+  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+       i != e; ++i) {
+    // Add mapping to VMap and increment dest iterator
+    VMap[&*i] = &*dest_iterator;
+    ++dest_iterator;
+  }
+
+  /* TODO: Use this code to verufy if this is a good pattern for PTX kernel
+
+  // Sort children in topological order before code generation for kernel call
+  N->getChildGraph()->sortChildren();
+
+  // The DFNode N has the property that it has only one child (leaving Entry
+  // and Exit dummy nodes). This child is the PTX kernel. This simplifies code
+  // generation for kernel calls significantly. All the inputs to this child
+  // node would either be constants or from the parent node N.
+
+  assert(N->getChildGraph()->size() == 3
+         && "Node expected to have just one non-dummy node!");
+
+  DFNode* C;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    C = *ci;
+    // Skip dummy node call
+    if (!C->isDummyNode())
+      break;
+  }
+
+  assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+
+  Function* CF = C->getFuncPointer();
+  */
+  Function* KF = K->KernelLeafNode->getFuncPointer();
+  // Initialize context
+  //DEBUG(errs() << "Initializing context" << "\n");
+  //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
+
+  DEBUG(errs() << "Initializing commandQ" << "\n");
+  // Initialize command queue
+  switchToTimer(visc_TimerID_SETUP, InitCall);
+  Value* fileStr = getStringPointer(FileName, InitCall, "Filename");
+  DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
+  DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n");
+  Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName");
+
+  Value* LaunchInstArgs[] = {fileStr, kernelStr};
+
+  DEBUG(errs() << "Inserting launch call" << "\n");
+  CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch,
+                                         ArrayRef<Value*>(LaunchInstArgs, 2),
+                                         "graph"+KF->getName(),
+                                         InitCall);
+  DEBUG(errs() << *NVPTX_Ctx << "\n");
+  GraphIDAddr = new GlobalVariable(M,
+                                   NVPTX_Ctx->getType(),
+                                   false,
+                                   GlobalValue::CommonLinkage,
+                                   Constant::getNullValue(NVPTX_Ctx->getType()),
+                                   "graph"+KF->getName()+".addr");
+  DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
+  StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
+  DEBUG(errs() << *SI << "\n");
+  switchToTimer(visc_TimerID_NONE, InitCall);
+  switchToTimer(visc_TimerID_SETUP, RI);
+  Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI);
+
+  // Iterate over the required input edges of the node and use the visc-rt API
+  // to set inputs
+  DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
+  std::vector<OutputPtr> OutputPointers;
+  // Vector to hold the device memory object that need to be cleared before we release
+  // context
+  std::vector<Value*> DevicePointers;
+
+  std::map<unsigned, unsigned> kernelInArgMap = K->getInArgMap();
+  /*
+    for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
+
+      // The kernel object gives us the mapping of arguments from kernel launch
+      // node function (F_X86) to kernel (kernel->KF)
+      Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]);
+
+  */
+
+  for(std::map<unsigned, unsigned>::iterator ib = kernelInArgMap.begin(),
+      ie = kernelInArgMap.end(); ib != ie; ++ib) {
+    unsigned i = ib->first;
+    Value* inputVal = getArgumentAt(F_X86, ib->second);
+    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+
+    // input value has been obtained.
+    // Check if input is a scalar value or a pointer operand
+    // For scalar values such as int, float, etc. the size is simply the size of
+    // type on target machine, but for pointers, the size of data would be the
+    // next integer argument
+    if(inputVal->getType()->isPointerTy()) {
+
+      switchToTimer(visc_TimerID_COPY_PTR, RI);
+      // Pointer Input
+      // CheckAttribute
+      Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False;
+      Value* isInput = ((hasAttribute(KF, i, Attribute::Out))
+                        && !(hasAttribute(KF, i, Attribute::In)))? False : True;
+
+      Argument* A = getArgumentAt(KF, i);
+      if(isOutput == True) {
+        DEBUG(errs() << *A << " is an OUTPUT argument\n");
+      }
+      if(isInput == True) {
+        DEBUG(errs() << *A << " is an INPUT argument\n");
+      }
+
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
+                             Type::getInt8PtrTy(M.getContext()),
+                             inputVal->getName()+".i8ptr",
+                             RI);
+
+      // Assert that the pointer argument size (next argument) is in the map
+      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
+
+      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
+      assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
+             && "Pointer type input must always be followed by size (integer type)");
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               inputSize,
+                               isInput,
+                               isOutput
+                              };
+      Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr,
+                                      ArrayRef<Value*>(setInputArgs, 6), "", RI);
+      DevicePointers.push_back(d_ptr);
+      // If this has out attribute, store the returned device pointer in
+      // memory to read device memory later
+      if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
+    }
+    else {
+      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+      // Scalar Input
+      // Store the scalar value on stack and then pass the pointer to its
+      // location
+      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI);
+      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+                             Type::getInt8PtrTy(M.getContext()),
+                             inputVal->getName()+".i8ptr",
+                             RI);
+
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               ConstantExpr::getSizeOf(inputVal->getType())
+                              };
+      CallInst::Create(llvm_visc_ocl_argument_scalar,
+                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
+    }
+  }
+
+  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
+
+  // Check to see if all the allocation sizes are constant (determined
+  // statically)
+  bool constSizes = true;
+  for (auto& e: K->getSharedInArgMap()) {
+    constSizes &= isa<Constant>(e.second.first);
+  }
+
+  // If the sizes are all constant
+  if (constSizes) {
+    for (auto& e: K->getSharedInArgMap()) {
+      unsigned argNum = e.first;
+      Value* allocSize = e.second.first;
+
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+
+      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
+        // Shared memory ptr argument - scalar at size position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
+
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_shared,
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
+        // Shared memory size argument - scalar at address position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        // Store the scalar value on stack and then pass the pointer to its
+        // location
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
+            allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
+                                Type::getInt8PtrTy(M.getContext()),
+                                allocSize->getName()+".sharedMem.i8ptr",
+                                RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+    }
+  } else {
+
+    Function *F_alloc = K->AllocationFunction;
+    StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType());
+    assert(FAllocRetTy && "Allocation node with no struct return type");
+
+    std::vector<Value *> AllocInputArgs;
+    for (unsigned i = 0; i < K->allocInArgMap.size(); i++) {
+      AllocInputArgs.push_back(getArgumentAt(F_X86, K->allocInArgMap.at(i)));
+    }
+
+    CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI);
+    std::vector<ExtractValueInst *> ExtractValueInstVec;
+    for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) {
+      ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI);
+      ExtractValueInstVec.push_back(EI);
+    }
+
+    for (auto& e: K->getSharedInArgMap()) {
+      unsigned argNum = e.first;
+      Value* allocSize = ExtractValueInstVec[e.second.second/2];
+
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+
+      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
+        // Shared memory ptr argument - scalar at size position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_shared,
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
+        // Shared memory size argument - scalar at address position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        // Store the scalar value on stack and then pass the pointer to its
+        // location
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
+            allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
+                                Type::getInt8PtrTy(M.getContext()),
+                                allocSize->getName()+".sharedMem.i8ptr",
+                                RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+    }
+  }
+
+
+  DEBUG(errs() << "Setup output edges of node and insert visc api\n");
+  // Set output if struct is not an empty struct
+  StructType* OutputTy = K->KernelLeafNode->getOutputType();
+  std::vector<Value*> d_Outputs;
+  if(!OutputTy->isEmptyTy()) {
+    switchToTimer(visc_TimerID_COPY_PTR, RI);
+    // Not an empty struct
+    // Iterate over all elements of the struct and put them in
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
+      Value* setOutputArgs[] = {GraphID,
+                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
+                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
+                               };
+
+      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
+                                            ArrayRef<Value*>(setOutputArgs, 3),
+                                            "d_output."+KF->getName(),
+                                            RI);
+      d_Outputs.push_back(d_Output);
+    }
+  }
+
+  // Enqueue kernel
+  // Need work dim, localworksize, globalworksize
+  // Allocate size_t[numDims] space on stack. Store the work group sizes and
+  // pass it as an argument to ExecNode
+
+  switchToTimer(visc_TimerID_MISC, RI);
+  Value *workDim, *LocalWGPtr, *GlobalWGPtr;
+  getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
+  switchToTimer(visc_TimerID_KERNEL, RI);
+  Value* ExecNodeArgs[] = {GraphID,
+                           workDim,
+                           LocalWGPtr,
+                           GlobalWGPtr
+                          };
+  CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode,
+                                     ArrayRef<Value*>(ExecNodeArgs, 4),
+                                     "event."+KF->getName(),
+                                     RI);
+  DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
+
+  // Wait for Kernel to Finish
+  CallInst::Create(llvm_visc_ocl_wait,
+                   ArrayRef<Value*>(GraphID),
+                   "",
+                   RI);
+
+  switchToTimer(visc_TimerID_READ_OUTPUT, RI);
+  // Read Output Struct if not empty
+  if(!OutputTy->isEmptyTy()) {
+    std::vector<Value*>h_Outputs;
+    Value* KernelOutput = UndefValue::get(OutputTy);
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      Value* GetOutputArgs[] = {GraphID,
+                                Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+                                d_Outputs[i],
+                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
+                               };
+      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
+                                            ArrayRef<Value*>(GetOutputArgs, 4),
+                                            "h_output."+KF->getName()+".addr",
+                                            RI);
+      // Read each device pointer listed in output struct
+      // Load the output struct
+      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
+                     OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
+
+      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
+                                             KF->getName()+"output", RI);
+    }
+    OutputMap[K->KernelLeafNode] = KernelOutput;
+  }
+
+  // Read all the pointer arguments which had side effects i.e., had out
+  // attribute
+  DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n");
+  // FIXME: Not reading output pointers anymore as we read them when data is
+  // actually requested
+  /*for(auto output: OutputPointers) {
+    DEBUG(errs() << "Read: " << *output.d_ptr << "\n");
+    DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
+    DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
+
+    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
+    CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
+                                    ArrayRef<Value*>(GetOutputArgs, 4),
+                                    "", RI);
+  }*/
+  switchToTimer(visc_TimerID_MEM_FREE, RI);
+  // Clear Context and free device memory
+  DEBUG(errs() << "Clearing context" << "\n");
+  // Free Device Memory
+  for(auto d_ptr: DevicePointers) {
+    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI);
+  }
+  switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall);
+  // Clear Context
+  LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall);
+  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall);
+  switchToTimer(visc_TimerID_NONE, CleanupCall);
+
+  switchToTimer(visc_TimerID_MISC, RI);
+  DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+  // Generate code for output bindings
+  // Get Exit node
+  DFNode* C = N->getChildGraph()->getExit();
+  // Get OutputType of this node
+  StructType* OutTy = N->getOutputType();
+  Value *retVal = UndefValue::get(F_X86->getReturnType());
+  // Find the kernel's output arg map, to use instead of the bindings
+  std::vector<unsigned> outArgMap = kernel->getOutArgMap();
+  // Find all the input edges to exit node
+  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+    DEBUG(errs() << "Output Edge " << i << "\n");
+    // Find the incoming edge at the requested input port
+    DFEdge* E = C->getInDFEdgeAt(i);
+
+    assert(E && "No Binding for output element!");
+    // Find the Source DFNode associated with the incoming edge
+    DFNode* SrcDF = E->getSourceDF();
+
+    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+
+    // If Source DFNode is a dummyNode, edge is from parent. Get the
+    // argument from argument list of this internal node
+    Value* inputVal;
+    if(SrcDF->isEntryNode()) {
+      inputVal = getArgumentAt(F_X86, i);
+      DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+    }
+    else {
+      // edge is from a internal node
+      // Check - code should already be generated for this source dfnode
+      // FIXME: Since the 2-level kernel code gen has a specific structure, we
+      // can assume the SrcDF is same as Kernel Leaf node.
+      // Use outArgMap to get correct mapping
+      SrcDF = K->KernelLeafNode;
+      assert(OutputMap.count(SrcDF)
+             && "Source node call not found. Dependency violation!");
+
+      // Find Output Value associated with the Source DFNode using OutputMap
+      Value* CI = OutputMap[SrcDF];
+
+      // Extract element at source position from this call instruction
+      std::vector<unsigned> IndexList;
+      // i is the destination of DFEdge E
+      // Use the mapping instead of the bindings
+//      IndexList.push_back(E->getSourcePosition());
+      IndexList.push_back(outArgMap[i]);
+      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                             "",RI);
+      inputVal = EI;
+    }
+    std::vector<unsigned> IdxList;
+    IdxList.push_back(i);
+    retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+  }
+
+  DEBUG(errs() << "Extracted all\n");
+  switchToTimer(visc_TimerID_NONE, RI);
+  retVal->setName("output");
+  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+  ReplaceInstWithInst(RI, newRI);
+}
+
+
+// Right now, only targeting the one level case. In general, device functions
+// can return values so we don't need to change them
+//
+// Host-side code generation for an internal (non-leaf) DFG node.
+// Precondition: the leaf-node visit has already run and populated
+// `KernelLaunchNode` and `kernel`. Depending on which node we are at, this
+// either (a) emits the host-side kernel-launch code (when N is the chosen
+// launch node), or (b) folds N's argument bindings into the kernel's in/out
+// argument maps and local work-group sizes, so that the eventual launch code
+// can address the launch node's own function arguments directly.
+void CGT_NVPTX::codeGen(DFInternalNode* N) {
+  errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n";
+  if(KernelLaunchNode == NULL)
+    errs () << "No kernel launch node\n";
+  else {
+    errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
+  }
+
+  // KernelLaunchNode is reset to NULL below once host code has been emitted;
+  // internal nodes visited after that point require no further work.
+  if (!KernelLaunchNode) {
+    DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
+    return;
+  }
+
+  if (N == KernelLaunchNode) {
+    DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
+    //TODO
+
+    // Now the remaining nodes to be visited should be ignored
+    KernelLaunchNode = NULL;
+    DEBUG(errs() << "Insert Runtime calls\n");
+    insertRuntimeCalls(N, kernel, getPTXFilename(M));
+
+  } else {
+    DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
+    // Keep track of the arguments order.
+    // Compose the two input maps: kernel arg position -> N's arg position
+    // (inmap2) followed by N's arg position -> parent-of-N arg position
+    // (inmap1), yielding kernel arg -> parent-of-N arg.
+    // NOTE(review): inmap1[ib->second] default-inserts 0 when the key is
+    // absent -- presumably every kernel input is bound through N; confirm.
+    std::map<unsigned, unsigned> inmap1 = N->getInArgMap();
+    std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap();
+    // TODO: Structure assumed: one thread node, one allocation node (at most),
+    // TB node
+    std::map<unsigned, unsigned> inmapFinal;
+    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
+         ib != ie; ++ib) {
+      inmapFinal[ib->first] = inmap1[ib->second];
+    }
+
+    kernel->setInArgMap(inmapFinal);
+
+    // Keep track of the output arguments order.
+    // Same composition for outputs, but over position vectors instead of maps.
+    std::vector<unsigned> outmap1 = N->getOutArgMap();
+    std::vector<unsigned> outmap2 = kernel->getOutArgMap();
+
+    // TODO: Change when we have incoming edges to the dummy exit node from more
+    // than one nodes. In this case, the number of bindings is the same, but
+    // their destination position, thus the index in outmap1, is not
+    // 0 ... outmap2.size()-1
+    // The limit is the size of outmap2, because this is the number of kernel
+    // output arguments for which the mapping matters
+    // For now, it is reasonable to assume that all the kernel arguments are
+    // returned, maybe plus some others from other nodes, thus
+    // outmap2.size() <= outmap1.size()
+    for (unsigned i = 0; i < outmap2.size(); i++) {
+      outmap1[i] = outmap2[outmap1[i]];
+    }
+    kernel->setOutArgMap(outmap1);
+
+    // Track the source of local dimlimits for the kernel
+    // Dimension limit can either be a constant or an argument of parent
+    // function. Since Internal node would no longer exist, we need to insert the
+    // localWGSize with values from the parent of N.
+    std::vector<Value*> localWGSizeMapped;
+    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if (isa<Constant>(kernel->localWGSize[i])) {
+        // if constant, use as it is
+        localWGSizeMapped.push_back(kernel->localWGSize[i]);
+      }
+      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
+        // if argument, find the argument location in N. Use InArgMap of N to
+        // find the source location in Parent of N. Retrieve the argument from
+        // parent to insert in the vector.
+        unsigned argNum = Arg->getArgNo();
+        // This argument will be coming from the parent node, not the allocation
+        // Node
+        assert(N->getInArgMap().find(argNum) != N->getInArgMap().end());
+
+        unsigned parentArgNum = N->getInArgMap()[argNum];
+        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
+        localWGSizeMapped.push_back(A);
+      }
+      else {
+        // Work-group sizes may only originate from constants or parent-function
+        // arguments; anything else indicates a malformed DFG.
+        assert(false && "LocalWGsize using value which is neither argument nor constant!");
+      }
+    }
+    // Update localWGSize vector of kernel
+    kernel->setLocalWGSize(localWGSizeMapped);
+  }
+
+}
+
+void CGT_NVPTX::codeGen(DFLeafNode* N) {
+  errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n";
+
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Skip code generation if it is an allocation node
+  if(N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
+
+  // Generate code only if it has the right hint
+//  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
+//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+//    return;
+//  }
+  if(!preferredTargetIncludes(N, visc::GPU_TARGET)) {
+    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+    return;
+  }
+
+  // Checking which node is the kernel launch
+  DFNode* PNode = N->getParent();
+  int pLevel = PNode->getLevel();
+  int pReplFactor = PNode->getNumOfDim();
+
+  // Choose parent node as kernel launch if:
+  // (1) Parent is the top level node i.e., Root of DFG
+  //                    OR
+  // (2) Parent does not have multiple instances
+  errs() << "pLevel = " << pLevel << "\n";
+  errs() << "pReplFactor = " << pReplFactor << "\n";
+  if (!pLevel || !pReplFactor) {
+    errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n";
+    KernelLaunchNode = PNode;
+    kernel = new Kernel(NULL,
+                        N,
+                        N->getInArgMap(),
+                        N->getSharedInArgMap(),
+                        N->getOutArgMap(),
+                        N->getNumOfDim(),
+                        N->getDimLimits());
+  }
+  else {
+    // Converting a 2-level DFG to opencl kernel
+    errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n";
+    KernelLaunchNode = PNode->getParent();
+    assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
+    // Contains the instructions generating the kernel configuration parameters
+    kernel = new Kernel(NULL,                 // kernel function
+                        N,                    // kernel leaf node
+                        N->getInArgMap(),     // kenel argument mapping
+                        N->getSharedInArgMap(),
+                        N->getOutArgMap(),    // kernel output mapping from the leaf to the interemediate node
+                        PNode->getNumOfDim(), // gridDim
+                        PNode->getDimLimits(),// grid size
+                        N->getNumOfDim(),     // blockDim
+                        N->getDimLimits());   // block size
+
+  }
+
+  std::vector<IntrinsicInst *> IItoRemove;
+  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+
+  // Look up if we have visited this function before. If we have, then just
+  // get the cloned function pointer from DFNode. Otherwise, create the cloned
+  // function and add it to the DFNode GenFunc.
+//  Function *F_nvptx = N->getGenFunc();
+  Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET);
+
+  assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated");
+  // Clone the function
+  ValueToValueMapTy VMap;
+
+  Twine FName = F->getName();
+  F_nvptx = CloneFunction(F, VMap);
+  F_nvptx->setName(FName+"_nvptx");
+//  errs() << "Old Function Name: " << F->getName() << "\n";
+//  errs() << "New Function Name: " << F_nvptx->getName() << "\n";
+
+  F_nvptx->removeFromParent();
+
+
+  // Insert the cloned function into the kernels module
+  KernelM->getFunctionList().push_back(F_nvptx);
+
+
+  //TODO: Iterate over all the instructions of F_nvptx and identify the
+  //callees and clone them into this module.
+  DEBUG(errs() << *F_nvptx->getType());
+  DEBUG(errs() << *F_nvptx);
+
+  // Transform  the function to void and remove all target dependent attributes
+  // from the function
+  F_nvptx = transformFunctionToVoid(F_nvptx);
+  
+  //Add generated function info to DFNode
+//  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
+  N->addGenFunc(F_nvptx, visc::GPU_TARGET, false);
+
+  DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n");
+  F_nvptx->removeAttributes(AttributeSet::FunctionIndex, F_nvptx->getAttributes().getFnAttributes());
+  F_nvptx->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
+
+  //FIXME: For now, assume only one allocation node
+  kernel->AllocationNode = NULL;
+
+  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
+       ieb != iee; ++ieb) {
+    DFNode *SrcDFNode = (*ieb)->getSourceDF();
+    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
+    if (!SrcDFNode->isDummyNode()) {
+      assert(SrcDFNode->isAllocationNode());
+      kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
+      kernel->allocInArgMap = SrcDFNode->getInArgMap();
+      break;
+    }
+  }
+
+  // Vector for shared memory arguments
+  std::vector<unsigned> SharedMemArgs;
+
+  // If no allocation node was found, SharedMemArgs is empty
+  if (kernel->AllocationNode) {
+
+    ValueToValueMapTy VMap;
+    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
+    //F_alloc->removeFromParent();
+    // Insert the cloned function into the kernels module
+    //M.getFunctionList().push_back(F_alloc);
+
+    std::vector<IntrinsicInst *> ViscMallocInstVec;
+    findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
+
+    for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
+      IntrinsicInst *II = ViscMallocInstVec[i];
+      assert(II->hasOneUse() && "visc_malloc result is used more than once");
+      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
+      II->eraseFromParent();
+    }
+    kernel->AllocationFunction = F_alloc;
+
+    // This could be used to check that the allocation node has the appropriate
+    // number of fields in its return struct
+    /*
+        ReturnInst *RI = ReturnInstVec[0];
+        Value *RetVal = RI->getReturnValue();
+        Type *RetTy = RetVal->getType();
+        StructType *RetStructTy = dyn_cast<StructType>(RetTy);
+        assert(RetStructTy && "Allocation node does not return a struct type");
+        unsigned numFields = RetStructTy->getNumElements();
+    */
+    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
+    AllocationNodeProperty* APN =
+      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
+    for (auto& AllocPair: APN->getAllocationList()) {
+      unsigned destPos = AllocPair.first->getDestPosition();
+      unsigned srcPos = AllocPair.first->getSourcePosition();
+      SharedMemArgs.push_back(destPos);
+      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+    }
+    kernel->setSharedInArgMap(sharedInMap);
+  }
+  std::sort(SharedMemArgs.begin(), SharedMemArgs.end());
+
+  // All pointer args which are not shared memory pointers have to be moved to
+  // global address space
+  unsigned argIndex = 0;
+  std::vector<unsigned> GlobalMemArgs;
+  for(auto& Arg: F_nvptx->getArgumentList()) {
+    if (Arg.getType()->isPointerTy()) {
+      // If the argument is already chosen for the shared memory argument list, skip.
+      // Else put it in the global memory argument list
+      if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) {
+        GlobalMemArgs.push_back(argIndex);
+      }
+    }
+    argIndex++;
+  }
+  std::sort(GlobalMemArgs.begin(), GlobalMemArgs.end());
+
+  /* At this point, we assume that checks for the fact that SharedMemArgs only
+     contains pointer arguments to GLOBAL_ADDRSPACE have been performed by the
+     analysis pass */
+  // Optimization: Global memory arguments, which are not modified and whose
+  // loads are not dependent on node id of current node, should be moved to
+  // constant memory, subject to size of course
+  std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
+
+  F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, CONSTANT_ADDRSPACE);
+  F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
+  F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE);
+
+
+  // Go through all the instructions
+  std::vector<CallInst *> CItoRemove;
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
+    Instruction *I = &(*i);
+    // Leaf nodes should not contain VISC graph intrinsics or launch
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+
+    if (BuildDFG::isViscIntrinsic(I)) {
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      IntrinsicInst* ArgII;
+      DFNode* ArgDFNode;
+
+      /************************ Handle VISC Query intrinsics ************************/
+
+      switch (II->getIntrinsicID()) {
+      /**************************** llvm.visc.getNode() *****************************/
+      case Intrinsic::visc_getNode: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n");
+        // add mapping <intrinsic, this node> to the node-specific map
+        Leaf_HandleToDFNodeMap[II] = N;
+        IItoRemove.push_back(II);
+      }
+      break;
+      /************************* llvm.visc.getParentNode() **************************/
+      case Intrinsic::visc_getParentNode: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n");
+        // get the parent node of the arg node
+        // get argument node
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // get the parent node of the arg node
+        // Add mapping <intrinsic, parent node> to the node-specific map
+        // the argument node must have been added to the map, orelse the
+        // code could not refer to it
+        Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      /*************************** llvm.visc.getNumDims() ***************************/
+      case Intrinsic::visc_getNumDims: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n");
+        // get node from map
+        // get the appropriate field
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        int numOfDim = ArgDFNode->getNumOfDim();
+        DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
+        IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext());
+        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+
+        // Replace the result of the intrinsic with the computed value
+        II->replaceAllUsesWith(numOfDimConstant);
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      /*********************** llvm.visc.getNodeInstanceID() ************************/
+      case Intrinsic::visc_getNodeInstanceID_x:
+      case Intrinsic::visc_getNodeInstanceID_y:
+      case Intrinsic::visc_getNodeInstanceID_z: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n");
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        assert(ArgDFNode && "Arg node is NULL");
+        // A leaf node always has a parent
+        DFNode* ParentDFNode = ArgDFNode->getParent();
+        assert(ParentDFNode && "Parent node of a leaf is NULL");
+
+        // Get the number associated with the required dimension
+        // FIXME: The order is important!
+        // These three intrinsics need to be consecutive x,y,z
+        uint64_t dim = II->getIntrinsicID() -
+                       Intrinsic::visc_getNodeInstanceID_x;
+        assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
+        DEBUG(errs() << "\t  dimension = " << dim << "\n");
+
+        // Argument of the function to be called
+        ConstantInt * DimConstant =
+          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        //ArrayRef<Value *> Args(DimConstant);
+
+        // The following is to find which function to call
+        Function * OpenCLFunction;
+        int parentLevel = N->getParent()->getLevel();
+        int parentReplFactor = N->getParent()->getNumOfDim();
+        DEBUG(errs() << "Parent Level = " << parentLevel << "\n");
+        DEBUG(errs() << "Parent Repl factor = " << parentReplFactor << "\n");
+
+        FunctionType* FT =
+          FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                            Type::getInt32Ty(KernelM->getContext()),
+                            false);
+        if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) {
+          // We only have one level in the hierarchy or the parent node is not
+          // replicated. This indicates that the parent node is the kernel
+          // launch, so we need to specify a global id.
+          // We can translate this only if the argument is the current node
+          // itself
+          DEBUG(errs() << "Substitute with get_global_id()\n");
+          DEBUG(errs() << *II << "\n");
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
+          //DEBUG(errs() << "Here inside cond 2\n");
+          // We are asking for this node's id with respect to its parent
+          // this is a local id call
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT));
+          //DEBUG(errs() << "exiting condition 2\n");
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
+          // We are asking for this node's parent's id with respect to its
+          // parent: this is a group id call
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT));
+        } else {
+          errs() << N->getFuncPointer()->getName() << "\n";
+          errs() << N->getParent()->getFuncPointer()->getName() << "\n";
+          errs() << *II << "\n";
+
+          assert(false && "Unable to translate getNodeInstanceID intrinsic");
+        }
+
+        //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n");
+        //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n");
+        //DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
+        //DEBUG(errs() << "Argument: " << Args[0] << "\n");
+        //DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
+        // Create call instruction, insert it before the intrinsic and
+        // replace the uses of the previous instruction with the new one
+        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        //DEBUG(errs() << "Replace uses\n");
+        II->replaceAllUsesWith(CI);
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      /********************** llvm.visc.getNumNodeInstances() ***********************/
+      case Intrinsic::visc_getNumNodeInstances_x:
+      case Intrinsic::visc_getNumNodeInstances_y:
+      case Intrinsic::visc_getNumNodeInstances_z: {
+        // TODO: think about whether this is the best way to go there are hw
+        // specific registers. therefore it is good to have the intrinsic but
+        // then, why do we need to keep that info in the graph?  (only for the
+        // kernel configuration during the call)
+
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n");
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // A leaf node always has a parent
+        DFNode* ParentDFNode = ArgDFNode->getParent();
+        assert(ParentDFNode && "Parent node of a leaf is NULL");
+
+        // Get the number associated with the required dimension
+        // FIXME: The order is important!
+        // These three intrinsics need to be consecutive x,y,z
+        uint64_t dim = II->getIntrinsicID() -
+                       Intrinsic::visc_getNumNodeInstances_x;
+        assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
+        DEBUG(errs() << "\t  dimension = " << dim << "\n");
+
+        // Argument of the function to be called
+        ConstantInt * DimConstant =
+          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        //ArrayRef<Value *> Args(DimConstant);
+
+        // The following is to find which function to call
+        Function * OpenCLFunction;
+        int parentLevel = ParentDFNode->getLevel();
+        int parentReplFactor = ParentDFNode->getNumOfDim();
+        FunctionType* FT =
+            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                              Type::getInt32Ty(KernelM->getContext()),
+                              false);
+
+        if ((N == ArgDFNode) && (!parentLevel || !parentReplFactor)) {
+          // We only have one level in the hierarchy or the parent node is not
+          // replicated. This indicates that the parent node is the kernel
+          // launch, so the instances are global_size (gridDim x blockDim)
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
+          // We are asking for this node's instances
+          // this is a local size (block dim) call
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
+          // We are asking for this node's parent's instances
+          // this is a (global_size/local_size) (grid dim) call
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT));
+        } else {
+          assert(false && "Unable to translate getNumNodeInstances intrinsic");
+        }
+
+        // Create call instruction, insert it before the intrinsic and
+        // replace the uses of the previous instruction with the new one
+        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        II->replaceAllUsesWith(CI);
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      case Intrinsic::visc_barrier:
+      {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n");
+        DEBUG(errs() << "Substitute with barrier()\n");
+        DEBUG(errs() << *II << "\n");
+        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()),
+                                             std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())),
+                                             false);
+        Function* OpenCLFunction = cast<Function>
+                                   (KernelM->getOrInsertFunction(StringRef("barrier"), FT));
+        CallInst* CI = CallInst::Create(OpenCLFunction,
+                                        ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)),
+                                        "", II);
+        II->replaceAllUsesWith(CI);
+        IItoRemove.push_back(II);
+      }
+      break;
+      case Intrinsic::visc_atomic_cmpxchg:
+        break;
+      case Intrinsic::visc_atomic_add:
+      case Intrinsic::visc_atomic_sub:
+      case Intrinsic::visc_atomic_xchg:
+      case Intrinsic::visc_atomic_min:
+      case Intrinsic::visc_atomic_umin:
+      case Intrinsic::visc_atomic_max:
+      case Intrinsic::visc_atomic_umax:
+      case Intrinsic::visc_atomic_and:
+      case Intrinsic::visc_atomic_or:
+      case Intrinsic::visc_atomic_xor:
+        //case Intrinsic::visc_atomic_inc:
+        //case Intrinsic::visc_atomic_dec:
+      {
+        DEBUG(errs() << *II << "\n");
+        // Only have support for i32 atomic intrinsics
+        assert(II->getType() == Type::getInt32Ty(II->getContext())
+               && "Only support i32 atomic intrinsics for now");
+        // Substitute with atomicrmw instruction
+        assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics");
+        Value* Ptr = II->getArgOperand(0);
+        Value* Val = II->getArgOperand(1);
+        assert(Ptr->getType()->isPointerTy()
+               && "First argument of supported atomics is expected to be a pointer");
+        PointerType* PtrTy = cast<PointerType>(Ptr->getType());
+        if(PtrTy != Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) {
+          Ptr = CastInst::CreatePointerCast(Ptr, Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()), "", II);
+        }
+        AtomicRMWInst* AtomicInst = new AtomicRMWInst(getAtomicOp(II->getIntrinsicID()),
+            Ptr, Val, AtomicOrdering::SequentiallyConsistent, llvm::CrossThread, II);
+        AtomicInst->setVolatile(true);
+        DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n");
+        II->replaceAllUsesWith(AtomicInst);
+        IItoRemove.push_back(II);
+      }
+      break;
+      default:
+        llvm_unreachable("Unknown VISC Intrinsic!");
+        break;
+      }
+
+    }
+    else if(CallInst* CI = dyn_cast<CallInst>(I)) {
+      DEBUG(errs() << "Found a call: " << *CI << "\n");
+      Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
+      if(calleeF->isDeclaration()) {
+        // Add the declaration to kernel module
+        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
+        KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
+        if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
+          // Now handle a few specific intrinsics
+          // For now, sin and cos are translated to their libclc equivalent
+          switch(II->getIntrinsicID()) {
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+          {
+            DEBUG(errs() << "Found sincos: " << *II << "\n");
+            // Get the libclc function
+            // libclc uses mangled name for sin cos
+            assert(II->getType()->isFloatTy()
+                   && "Only handling sin(float) and cos(float)!");
+            std::string name;
+            if(II->getIntrinsicID() == Intrinsic::sin)
+              name = "_Z3sinf";
+            else
+              name = "_Z3cosf";
+
+            FunctionType* SinCosFT = FunctionType::get(II->getType(),
+                                     Type::getFloatTy(KernelM->getContext()),
+                                     false);
+            Function* LibclcFunction = cast<Function>
+                                       (KernelM->getOrInsertFunction(name, SinCosFT));
+            CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II);
+
+            II->replaceAllUsesWith(CI);
+            IItoRemove.push_back(II);
+            break;
+          }
+          case Intrinsic::floor:
+          {
+            DEBUG(errs() << "Found floor intrinsic\n");
+            F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f);
+            FunctionType* FTy = F->getFunctionType();
+            DEBUG(errs() << *F << "\n");
+
+            // Create argument list
+            std::vector<Value*> args;
+            assert(CI->getNumArgOperands() == FTy->getNumParams()
+                   && "Number of arguments of call do not match with Intrinsic");
+            for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+              Value* V = CI->getArgOperand(i);
+              // Either the type should match or both should be of pointer type
+              assert(V->getType() == FTy->getParamType(i) ||
+                     (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy())
+                     && "Dummy function call argument does not match with Intrinsic argument!");
+              // If the types do not match, then both must be pointer type and pointer
+              // cast needs to be performed
+              if(V->getType() != FTy->getParamType(i)) {
+                V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
+              }
+              args.push_back(V);
+            }
+            // Insert call instruction
+            CallInst* Inst = CallInst::Create(F, args,
+                  F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
+            DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
+            CI->replaceAllUsesWith(Inst);
+            IItoRemove.push_back(II);
+            break;
+          }
+          default:
+            errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ;
+          }
+        }
+
+      }
+      else {
+        // Clone the function
+        ValueToValueMapTy VMap;
+        Function* newCalleeF = CloneFunction(calleeF, VMap);
+        newCalleeF->removeFromParent(); //TODO: MARIA check
+        KernelM->getFunctionList().push_back(newCalleeF);
+        CallInst *CInew = CallInst::Create(newCalleeF, CI->getArgOperand(0), CI->getName(), CI);
+        CI->replaceAllUsesWith(CInew);
+        CItoRemove.push_back(CI);
+
+      }
+      //TODO: how to handle address space qualifiers in load/store
+    }
+
+  }
+
+  // We need to do this explicitly: DCE pass will not remove them because we
+  // have assumed theworst memory behaviour for these function calls
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
+       re = IItoRemove.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    (*ri)->eraseFromParent();
+  }
+  for(auto *CI : reverse(CItoRemove)) {
+    DEBUG(errs() << "Erasing: " << *CI << "\n");
+    CI->eraseFromParent();
+
+  }
+
+
+  addCLMetadata(F_nvptx);
+  kernel->KernelFunction = F_nvptx;
+  errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
+  DEBUG(errs() << *KernelM);
+
+  return;
+}
+
+// Pass entry point: for every dataflow graph rooted in this module, run the
+// NVPTX code-generation traversal and then emit the collected kernels into a
+// separate kernels module on disk.
+// Returns true because the host module is always modified.
+bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_NVPTX PASS\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // There may be one DFG per launch site.
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Visitor for Code Generation Graph Traversal.
+  // Stack allocation: scoped lifetime, no manual delete (and no leak if a
+  // visit exits early).
+  CGT_NVPTX CGTVisitor(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor.visit(rootNode);
+  }
+
+  // Serialize the accumulated kernels module.
+  CGTVisitor.writeKernelsModule();
+
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+
+  return true;
+}
+
+// Derive the on-disk name of the kernels module from the host module's
+// identifier: "<module-id>.kernels.ll".
+std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
+  return M.getModuleIdentifier() + ".kernels.ll";
+}
+
+// Rewrite the pointer type of V to point into 'addrspace', then recursively
+// propagate the change through every user whose result is a pointer in V's
+// original address space (e.g. GEPs/bitcasts derived from V).
+// NOTE(review): mutateType changes the type in place without rewriting IR,
+// so correctness relies on all affected users being visited by this walk.
+void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) {
+  assert(isa<PointerType>(V->getType())
+         && "Value should be of Pointer Type!");
+  PointerType* OldTy = cast<PointerType>(V->getType());
+  PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+  // In-place type change: users still reference the same Value object.
+  V->mutateType(NewTy);
+  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
+    // Change all uses producing pointer type in same address space to new
+    // addressspace.
+    if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
+      if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+        fixValueAddrspace(*ui, addrspace);
+      }
+    }
+  }
+}
+
+
+// Scan F's arguments currently assigned to global memory and move those that
+// qualify (no stores in the use chain, loads independent of the node-instance
+// id — checked by canBePromoted) to constant memory. Promoted argument
+// indices are removed from *GlobalMemArgs and returned in a separate list.
+std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) {
+  std::vector<unsigned> ConstantMemArgs;
+  for(auto& arg: F->getArgumentList()) {
+    std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
+        GlobalMemArgs->end(), arg.getArgNo());
+    // It has to be a global memory argument to be promotable
+    if(pos == GlobalMemArgs->end())
+      continue;
+
+    // Check if it can/should be promoted
+    if(canBePromoted(&arg, F)) {
+      // Fixed message: a stray "<< " was previously embedded in the string.
+      errs() << "Promoting " << arg.getName() << " to constant memory." << "\n";
+      ConstantMemArgs.push_back(arg.getArgNo());
+      // Erase via the iterator found above; safe because std::find is re-run
+      // on the next loop iteration.
+      GlobalMemArgs->erase(pos);
+    }
+  }
+  return ConstantMemArgs;
+}
+
+// Rebuild F so that the arguments whose indices appear in Args live in
+// address space 'addrspace'. Returns the replacement function; F is swapped
+// out of the parent module via replaceNodeFunctionInIR.
+// NOTE(review): the idx/argno walk below assumes Args is sorted in ascending
+// argument-index order (GlobalMemArgs is sorted by the caller; confirm the
+// same holds for the shared/constant memory argument lists).
+Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
+  unsigned idx = 0;
+  std::vector<Type*> ArgTypes;
+  for(auto& arg: F->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    unsigned argno = arg.getArgNo();
+    // Args is consumed front-to-back: retype this argument only when its
+    // index matches the next pending entry.
+    if ((idx < Args.size()) && (argno == Args[idx])) {
+      fixValueAddrspace(&arg, addrspace);
+      idx++;
+    }
+    ArgTypes.push_back(arg.getType());
+  }
+  // The in-place type mutation does not update the function's type, so build
+  // a fresh FunctionType and clone F into a function of that type.
+  FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
+
+  //F->mutateType(PTy);
+  Function* newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+  DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
+  return newF;
+}
+
+/* Add metadata to module KernelM, for OpenCL kernels */
+// Registers F in the "opencl.kernels" named metadata and adds the
+// {F, "kernel", i32 1} entry to "nvvm.annotations" so the NVPTX backend
+// treats F as a kernel entry point.
+void CGT_NVPTX::addCLMetadata(Function *F) {
+
+  // Removed an unused 'IRBuilder<> Builder(&*F->begin())' local: it was never
+  // used and dereferenced F->begin(), which is invalid for a body-less F.
+  SmallVector<Metadata*,8> KernelMD;
+  KernelMD.push_back(ValueAsMetadata::get(F));
+
+  // TODO: There is additional metadata used by kernel files but we skip them as
+  // they are not mandatory. In future they might be useful to enable
+  // optimizations
+
+  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
+  MDN_kernels->addOperand(MDKernelNode);
+
+  // Extend the same tuple into an nvvm.annotations entry: {F, "kernel", 1}.
+  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+  // TODO: Replace 1 with the number of the kernel.
+  // Add when support for multiple launches is added
+  KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
+  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+
+}
+
+// Serialize the accumulated kernels module (KernelM) to
+// "<module-id>.kernels.ll" via a PrintModulePass.
+void CGT_NVPTX::writeKernelsModule() {
+
+  // In addition to deleting all other functions, we also want to spiff it
+  // up a little bit.  Do this now.
+  legacy::PassManager Passes;
+
+  // Compute the output name once instead of twice.
+  const std::string OutName = getKernelsModuleName(M);
+  errs() << "Writing to File --- ";
+  errs() << OutName << "\n";
+  std::error_code EC;
+  tool_output_file Out(OutName.c_str(), EC, sys::fs::F_None);
+  if (EC) {
+    // Bail out instead of printing the module into a stream that failed to
+    // open (the original fell through and wrote to the broken stream).
+    errs() << EC.message() << '\n';
+    return;
+  }
+
+  Passes.add(
+      createPrintModulePass(Out.os()));
+
+  Passes.run(*KernelM);
+
+  // Declare success.
+  Out.keep();
+}
+
+// Rewrite F (which must return a struct) into an equivalent function that
+// returns void: each element of the return struct becomes a pointer
+// "ret_arg" parameter that the function stores its result through before
+// returning. Returns the cloned replacement function; F is replaced in its
+// parent module via replaceNodeFunctionInIR.
+Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
+
+  DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
+  // FIXME: Maybe do that using the Node?
+  StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
+  assert(FRetTy && "Return Type must always be a struct");
+
+  // Keeps return statements, because we will need to replace them
+  std::vector<ReturnInst *> RItoRemove;
+  findReturnInst(F, RItoRemove);
+
+
+  // Check for { } return struct, which means that the function returns void
+  if (FRetTy->isEmptyTy()) {
+
+    DEBUG(errs() << "\tFunction output struct is void\n");
+    DEBUG(errs() << "\tNo parameters added\n");
+
+    // Replacing return statements with others returning void
+    for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(),
+         e = RItoRemove.end(); i != e; ++i) {
+      ReturnInst::Create((F->getContext()), 0, (*i));
+      (*i)->eraseFromParent();
+    }
+    DEBUG(errs() << "\tChanged return statements to return void\n");
+  }
+  else {
+    // The struct has return values, thus needs to be converted to parameter
+
+    // Iterate over all element types of return struct and add arguments to the
+    // function
+    std::vector<Argument*> Args;
+    for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
+      Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+      Args.push_back(RetArg);
+      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+    }
+
+    // (Removed an unused 'Function::arg_iterator ai, ae;' declaration here.)
+
+    DEBUG(errs() << "\tReplacing Return statements\n");
+    // Replace return statements with extractValue and store instructions
+    for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(),
+         rie = RItoRemove.end(); rii != rie; ++rii) {
+      ReturnInst* RI = (*rii);
+      Value* RetVal = RI->getReturnValue();
+      // Store each struct element through the corresponding ret_arg pointer.
+      for(unsigned i = 0; i < Args.size(); i++) {
+        ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
+                               Args[i]->getName()+".val", RI);
+        new StoreInst(EI, Args[i], RI);
+      }
+      // assert(RetVal && "Return value should not be null at this point");
+      // StructType* RetType = cast<StructType>(RetVal->getType());
+      // assert(RetType && "Return type is not a struct");
+
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+
+    }
+  }
+  DEBUG(errs() << "\tReplaced return statements\n");
+
+  // Create the argument type list with the added argument's type
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+
+  // Adding new arguments to the function argument list, would not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments
+  Type* VoidRetType = Type::getVoidTy(F->getContext());
+  FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+  // Change the function type
+  //F->mutateType(PTy);
+  Function* newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+  //F->eraseFromParent();
+  return newF;
+}
+
+/******************************************************************************
+ *                              Helper functions                              *
+ ******************************************************************************/
+// Check if argument arg can be promoted to constant memory in Function F
+// Condition:
+// 1. No stores
+// 2. Loads not dependent on getNodeInstanceID intrinsic
+
+// Walk the transitive use chain of V. Returns true as soon as a store (plain
+// store, atomicrmw, or a visc atomic intrinsic) is found anywhere in the
+// chain. Values that feed a load are appended to *UseList; *VisitedList is
+// the visited set that prevents revisiting values (and cycles).
+static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) {
+  if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  VisitedList->push_back(V);
+  for(User* U : V->users()) {
+    Instruction* UserI = dyn_cast<Instruction>(U);
+    // Non-instruction users cannot load or store; skip them.
+    if(!UserI)
+      continue;
+    DEBUG(errs() << "\t" << *UserI << "\n");
+    if(isa<LoadInst>(UserI)) {
+      // A load reads through V: remember V as a load source.
+      DEBUG(errs() << "\tFound load instruction: " << *UserI << "\n");
+      DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
+      UseList->push_back(V);
+      continue;
+    }
+    if(isa<StoreInst>(UserI) || isa<AtomicRMWInst>(UserI)) {
+      // found a store in use chain
+      DEBUG(errs() << "Found store/atomicrmw instruction: " << *UserI << "\n");
+      return true;
+    }
+    if(BuildDFG::isViscIntrinsic(UserI)) {
+      // If it is an atomic intrinsic, we found a store
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(UserI);
+      assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
+          && "Only visc atomic intrinsics can have an argument as input");
+      return true;
+    }
+    // Anything else: keep following the chain through this user.
+    DEBUG(errs() << "\tTraverse use chain of: " << *UserI << "\n");
+    if(findLoadStoreUses(UserI, UseList, VisitedList))
+      return true;
+  }
+  return false;
+}
+
+// Returns true if V depends (transitively through its operand chain) on a
+// llvm.visc.getNodeInstanceID_{x,y,z} intrinsic whose node handle comes from
+// llvm.visc.getNode (i.e. an instance id of the current node).
+// DependenceList doubles as the visited set, cutting cycles in the chain.
+static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) {
+  if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  DependenceList->push_back(V);
+  // If not an instruction, then not dependent on node instance id
+  if(!isa<Instruction>(V) || isa<Constant>(V)) {
+    DEBUG(errs() << "\tStop\n");
+    return false;
+  }
+
+  Instruction* I = cast<Instruction>(V);
+  for(unsigned i = 0; i < I->getNumOperands(); i++) {
+    Value* operand = I->getOperand(i);
+    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
+      if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
+          || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
+            || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
+        Value* Node = II->getArgOperand(0);
+        IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
+        assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
+        // Only an id relative to the node itself (getNode, not getParentNode)
+        // counts as a node-instance-id dependence.
+        if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
+          DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
+          return true;
+        }
+      }
+    }
+    // Comparison results are treated as opaque: their operand chains are not
+    // followed (CI is referenced only by the debug print).
+    if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
+      DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
+      continue;
+    }
+    DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
+    if(isDependentOnNodeInstanceID(operand, DependenceList)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Decide whether argument arg of F may be rewritten as a constant-memory
+// pointer: its transitive use chain must contain no store (or atomic), and
+// no value feeding a load may depend on this node's getNodeInstanceID
+// intrinsic.
+static bool canBePromoted(Argument* arg, Function* F) {
+  DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
+  std::vector<Value*> loadFeeders;
+  std::vector<Value*> visited;
+  // Walk the use chain: a store anywhere makes the argument non-promotable;
+  // values read by loads are collected into loadFeeders.
+  const bool hasStore = findLoadStoreUses(arg, &loadFeeders, &visited);
+  if (hasStore)
+    return false;
+  // No store found: also require that no load address depends on the node
+  // instance id intrinsic.
+  DEBUG(errs() << hasStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
+  std::vector<Value*> depChain;
+  for (Value* src : loadFeeders) {
+    if (isDependentOnNodeInstanceID(src, &depChain))
+      return false;
+  }
+  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
+  return true;
+}
+
+
+// Calculate execute node parameters which include, number of dimensions for
+// dynamic instances of the kernel, local and global work group sizes.
+// workDim/LocalWGPtr/GlobalWGPtr are output parameters; code is generated at
+// instruction IB in module M. VMap maps values from the original function
+// into the generated host function.
+static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
+                                 &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
+
+  // Assign number of dimensions a constant value
+  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+  // If local work group size is null, pass a null pointer; presumably the
+  // runtime then chooses a local size itself — TODO confirm.
+  if(!kernel->hasLocalWG()) {
+    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+  }
+  else {
+    // Remap argument-derived sizes into the generated function before use.
+    for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if(isa<Argument>(kernel->localWGSize[i]))
+        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+    }
+    LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+  }
+
+  // Remap argument-derived global sizes as well.
+  for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+    if(isa<Argument>(kernel->globalWGSize[i]))
+      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+  }
+
+  // For OpenCL, global work group size is the total number of instances in
+  // each dimension. So, multiply local and global dim limits.
+  std::vector<Value*> globalWGSizeInsts;
+  if(kernel->hasLocalWG()) {
+    for (unsigned i = 0; i < kernel->gridDim; i++) {
+      BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
+      globalWGSizeInsts.push_back(MulInst);
+    }
+  }
+  else {
+    globalWGSizeInsts = kernel->globalWGSize;
+  }
+  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+}
+
+// CodeGen for allocating space for Work Group on stack and returning a pointer
+// to its address. Emits an [#dim x i64] alloca before IB, stores each WGSize
+// entry into consecutive slots, and returns the i64* to the first slot.
+// NOTE(review): the VMap parameter is not used in this function (callers remap
+// Argument values before calling) -- confirm it can be dropped.
+static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
+  Value* WGPtr;
+  // Get the i64 type, for ease of use
+  Type* Int64Ty = Type::getInt64Ty(M.getContext());
+
+  // Work Group type is [#dim x i64]
+  Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
+  // Allocate space of Global work group data on stack and get pointer to
+  // first element.
+  AllocaInst* WG = new AllocaInst(WGTy, WGName, IB);
+  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
+  Value* nextDim = WGPtr;
+  DEBUG(errs() << *WGPtr << "\n");
+
+  // Iterate over the number of dimensions and store the work group
+  // size in that dimension
+  for(unsigned i=0; i < WGSize.size(); i++) {
+    DEBUG(errs() << *WGSize[i] << "\n");
+    assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+
+    if(WGSize[i]->getType() != Int64Ty) {
+      // If number of dimensions are mentioned in any other integer format,
+      // generate code to extend it to i64. We need to use the mapped value in
+      // the new generated function, hence the use of VMap
+      // FIXME: Why are we changing the kernel WGSize vector here?
+      // NOTE(review): CreateIntegerCast is called with isSigned=true, which
+      // emits a *sign* extension even though the debug messages below say
+      // "zero extend" -- confirm which is intended for work-group sizes.
+      DEBUG(errs() << "Not i64. Zero extend required.\n");
+      DEBUG(errs() << *WGSize[i] << "\n");
+      CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+      DEBUG(errs() << "Bitcast done.\n");
+      StoreInst* SI = new StoreInst(CI, nextDim, IB);
+      DEBUG(errs() << "Zero extend done.\n");
+      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+    } else {
+      // Store the value representing work group size in ith dimension on
+      // stack
+      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
+
+      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+    }
+    if(i+1 < WGSize.size()) {
+      // Move to next dimension (GEP by one i64 from the current slot)
+      GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
+                               ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
+                               WG->getName()+"."+Twine(i+1),
+                               IB);
+      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+      nextDim = GEP;
+    }
+  }
+  return WGPtr;
+
+}
+
+// Name of the PTX assembly file emitted for this module:
+// "<module identifier>.nvptx.s".
+static std::string getPTXFilename(const Module& M) {
+  return M.getModuleIdentifier() + ".nvptx.s";
+}
+
+// Strip any directory components from the module identifier, leaving just
+// the input file name. (find_last_of returns npos when there is no '/',
+// and npos + 1 == 0, so the whole identifier is returned in that case.)
+static std::string getFilenameFromModule(const Module& M) {
+  const std::string moduleID = M.getModuleIdentifier();
+  const std::string::size_type lastSlash = moduleID.find_last_of("/");
+  return moduleID.substr(lastSlash + 1);
+}
+
+// Changes the data layout of the Module to the one expected by the NVPTX
+// backend (32- or 64-bit variant, selected by TARGET_PTX).
+// TODO: Figure out when to call it, probably after duplicating the modules
+static void changeDataLayout(Module &M) {
+  const std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+  const std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
+
+  switch (TARGET_PTX) {
+  case 32:
+    M.setDataLayout(StringRef(nvptx32_layoutStr));
+    break;
+  case 64:
+    M.setDataLayout(StringRef(nvptx64_layoutStr));
+    break;
+  default:
+    assert(false && "Invalid PTX target");
+  }
+}
+
+// Changes the target triple of the Module to the NVPTX one matching
+// TARGET_PTX (32- or 64-bit).
+static void changeTargetTriple(Module &M) {
+  const std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+  const std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+
+  switch (TARGET_PTX) {
+  case 32:
+    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+    break;
+  case 64:
+    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+    break;
+  default:
+    assert(false && "Invalid PTX target");
+  }
+}
+
+// Helper function: append every ReturnInst found in F to ReturnInstVec.
+static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
+  for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(&*it))
+      ReturnInstVec.push_back(RI);
+}
+
+// Helper function: append to IntrinsicInstVec every call in F to the
+// intrinsic identified by IntrinsicID.
+static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
+  for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*it);
+    if (II && II->getIntrinsicID() == IntrinsicID)
+      IntrinsicInstVec.push_back(II);
+  }
+}
+
+// Helper function: map a VISC atomic intrinsic to the corresponding
+// LLVM atomicrmw binary operation.
+static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::visc_atomic_add:  return AtomicRMWInst::Add;
+  case Intrinsic::visc_atomic_sub:  return AtomicRMWInst::Sub;
+  case Intrinsic::visc_atomic_min:  return AtomicRMWInst::Min;
+  case Intrinsic::visc_atomic_umin: return AtomicRMWInst::UMin;
+  case Intrinsic::visc_atomic_max:  return AtomicRMWInst::Max;
+  case Intrinsic::visc_atomic_umax: return AtomicRMWInst::UMax;
+  // inc/dec have no atomicrmw counterpart mapped here:
+  // case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
+  // case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
+  case Intrinsic::visc_atomic_xchg: return AtomicRMWInst::Xchg;
+  case Intrinsic::visc_atomic_and:  return AtomicRMWInst::And;
+  case Intrinsic::visc_atomic_or:   return AtomicRMWInst::Or;
+  case Intrinsic::visc_atomic_xor:  return AtomicRMWInst::Xor;
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  }
+}
+
+
+// Helper function: map a VISC atomic intrinsic to the name of the OpenCL
+// atomic builtin implementing it.
+static std::string getAtomicOpName(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::visc_atomic_cmpxchg: return "atom_cmpxchg";
+  case Intrinsic::visc_atomic_add:     return "atom_add";
+  case Intrinsic::visc_atomic_sub:     return "atom_sub";
+  case Intrinsic::visc_atomic_min:     return "atom_min";
+  case Intrinsic::visc_atomic_max:     return "atom_max";
+  case Intrinsic::visc_atomic_inc:     return "atom_inc";
+  case Intrinsic::visc_atomic_dec:     return "atom_dec";
+  case Intrinsic::visc_atomic_xchg:    return "atom_xchg";
+  case Intrinsic::visc_atomic_and:     return "atom_and";
+  case Intrinsic::visc_atomic_or:      return "atom_or";
+  case Intrinsic::visc_atomic_xor:     return "atom_xor";
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  }
+}
+
+} // End of namespace
+
+char DFG2LLVM_NVPTX::ID = 0;
+// RegisterPass<T>(arg, name, CFGOnly, is_analysis).
+// NOTE(review): the third parameter of RegisterPass is CFGOnly (not
+// "modifies the CFG"), and the fourth marks the pass as analysis-only.
+// Passing true here for a transformation pass looks inconsistent with the
+// original comment ("transformation, not just analysis") -- confirm the
+// intended flag values.
+static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx",
+                                      "Dataflow Graph to LLVM for NVPTX Pass",
+                                      false /* CFGOnly */,
+                                      true /* is_analysis */);
+
diff --git a/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.exports b/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/DFG2LLVM_NVPTX/LLVMBuild.txt b/lib/DFG2LLVM_NVPTX/LLVMBuild.txt
new file mode 100644
index 0000000000..fb7cae49f8
--- /dev/null
+++ b/lib/DFG2LLVM_NVPTX/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_NVPTX
+parent = Transforms
diff --git a/lib/DFG2LLVM_PROMISE/CMakeLists.txt b/lib/DFG2LLVM_PROMISE/CMakeLists.txt
new file mode 100644
index 0000000000..5b5d2677d0
--- /dev/null
+++ b/lib/DFG2LLVM_PROMISE/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMDFG2LLVM_PROMISE
+  DFG2LLVM_PROMISE.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp b/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp
new file mode 100644
index 0000000000..184f92910a
--- /dev/null
+++ b/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp
@@ -0,0 +1,1283 @@
+//=== DFG2LLVM_PROMISE.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "DFG2LLVM_PROMISE"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm-c/Core.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h"
+#include <sstream>
+#include <fstream>
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+namespace {
+
+// Required command line option: path to the file holding the PROMISE
+// quantization levels, which are consumed in pattern order by the code
+// generation state machine below.
+cl::opt<std::string> QuantizationInputsFilename(
+  "quantization-levels-filename",
+  cl::desc("<PROMISE quantization levels input file (path)>"),
+  cl::value_desc("filename"),
+  cl::Required);
+
+// Helper class declarations
+
+// State machine definition for pattern identification
+
+/* An assumption is made for the PROMISE simulator:                           *
+ * a leaf node will contain consecutive operations that will map to a         *
+ * single PROMISE simulator call                                              *
+
+ * To alleviate that, the states that correspond to valid patterns            *
+ * - FullyConnectedLayer_(2,3,x), ConvolutionLayer_(2,3,4,x) -                *
+ * can invoke codeGen when detecting the beginning of a new pattern, then     *
+ * clear the collected IIs and Args, then go to initial and invoke its        *
+ * transition.                                                                */
+
+class AbstractState;
+
+// Drives pattern recognition over a leaf node's HPVM tensor intrinsics.
+// Intrinsics are fed one at a time to the current AbstractState via
+// transition() (nullptr marks end of stream); matched intrinsics (IIs) and
+// the arguments for the eventual runtime call (Args), including quantization
+// levels read in order from qin, are accumulated until codeGen() is invoked.
+class CodeGenStateMachine {
+private:
+  Module *M;   // module being transformed
+  Module *RtM; // presumably the PROMISE runtime module -- confirm at ctor
+
+  std::ifstream &qin; // Quantization levels input stream reference
+  std::vector<Value*> Args;          // arguments collected for code generation
+  std::vector<IntrinsicInst*> IIs;   // intrinsics matched so far, in order
+  AbstractState *current;            // current state of the pattern matcher
+
+public:
+  CodeGenStateMachine(Module *, Module *, std::ifstream &);
+
+  // Replace the current state (called by states during transition).
+  void setCurrent(AbstractState *s) {
+    current = s;
+  }
+
+  // Feed the next intrinsic (or nullptr at end of stream) to the current state.
+  void transition(IntrinsicInst *II);
+
+  Module *getModule() {
+    return M;
+  }
+
+  // Read the next quantization level from the input file.
+  // NOTE(review): no stream-error check -- on a failed extraction ql is
+  // value-initialized (C++11 semantics); confirm the file always has enough
+  // values for the patterns encountered.
+  void getNextQuantizationLevel(float &ql) {
+    qin >> ql;
+  }
+
+  void addArgument(Value *Arg) {
+    Args.push_back(Arg);
+  }
+
+  void addIntrinsicInst(IntrinsicInst *II) {
+    IIs.push_back(II);
+  }
+
+  // Access a previously matched intrinsic (no bounds check).
+  IntrinsicInst *getIntrinsicInstAt(unsigned idx) {
+    return IIs[idx];
+  }
+
+  void codeGen();
+
+};
+
+// Base class for the pattern-identification state machine. Each concrete
+// state implements transition(), which consumes the next HPVM intrinsic
+// (nullptr at end of stream), updates the machine (arguments, matched
+// intrinsics, successor state) and usually deletes itself afterwards.
+class AbstractState {
+public:
+  // State identifiers. The *_LAYER states are accepting states; NO_PATTERN
+  // is the sink reached by any unmatched intrinsic sequence.
+  enum ID
+  {
+    INITIAL_STATE,
+    FULLY_CONNECTED_LAYER_1,
+    FULLY_CONNECTED_LAYER_2,
+    FULLY_CONNECTED_LAYER_3,
+    FULLY_CONNECTED_LAYER,
+    CONVOLUTION_LAYER_1,
+    CONVOLUTION_LAYER_2,
+    CONVOLUTION_LAYER_3,
+    CONVOLUTION_LAYER_4,
+    CONVOLUTION_LAYER,
+    NO_PATTERN,
+  };
+
+protected:
+  enum ID StateID; // set once by each concrete state's constructor
+
+public:
+  enum ID getStateID() {
+    return StateID;
+  }
+
+  // Consume the next intrinsic (nullptr == end of stream) and advance Mch.
+  virtual void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) = 0;
+  virtual ~AbstractState() {}
+};
+
+// Concrete states of the pattern matcher. Each constructor only records its
+// state ID (and logs creation); all matching logic lives in the transition()
+// definitions that follow. FullyConnectedLayer_1..3 and ConvolutionLayer_1..4
+// are intermediate states; FullyConnectedLayer and ConvolutionLayer are the
+// accepting states; NoPattern absorbs any unmatched sequence.
+class InitialState : public AbstractState {
+public:
+  InitialState() {
+    StateID = ID::INITIAL_STATE;
+    DEBUG(errs() << "new InitialState\n");
+  }
+  ~InitialState() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+class FullyConnectedLayer_1 : public AbstractState {
+public:
+  FullyConnectedLayer_1() {
+    StateID = ID::FULLY_CONNECTED_LAYER_1;
+    DEBUG(errs() << "new FullyConnectedLayer_1\n");
+  }
+  ~FullyConnectedLayer_1() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+class FullyConnectedLayer_2 : public AbstractState {
+public:
+  FullyConnectedLayer_2() {
+    StateID = ID::FULLY_CONNECTED_LAYER_2;
+    DEBUG(errs() << "new FullyConnectedLayer_2\n");
+  }
+  ~FullyConnectedLayer_2() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+class FullyConnectedLayer_3 : public AbstractState {
+public:
+  FullyConnectedLayer_3() {
+    StateID = ID::FULLY_CONNECTED_LAYER_3;
+    DEBUG(errs() << "new FullyConnectedLayer_3\n");
+  }
+  ~FullyConnectedLayer_3() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// Accepting state for the fully connected layer pattern.
+class FullyConnectedLayer : public AbstractState {
+public:
+  FullyConnectedLayer() {
+    StateID = ID::FULLY_CONNECTED_LAYER;
+    DEBUG(errs() << "new FullyConnectedLayer\n");
+  }
+  ~FullyConnectedLayer() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+class ConvolutionLayer_1 : public AbstractState {
+public:
+  ConvolutionLayer_1() {
+    StateID = ID::CONVOLUTION_LAYER_1;
+    DEBUG(errs() << "new ConvolutionLayer_1\n");
+  }
+  ~ConvolutionLayer_1() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+class ConvolutionLayer_2 : public AbstractState {
+public:
+  ConvolutionLayer_2() {
+    StateID = ID::CONVOLUTION_LAYER_2;
+    DEBUG(errs() << "new ConvolutionLayer_2\n");
+  }
+  ~ConvolutionLayer_2() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+class ConvolutionLayer_3 : public AbstractState {
+public:
+  ConvolutionLayer_3() {
+    StateID = ID::CONVOLUTION_LAYER_3;
+    DEBUG(errs() << "new ConvolutionLayer_3\n");
+  }
+  ~ConvolutionLayer_3() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+class ConvolutionLayer_4 : public AbstractState {
+public:
+  ConvolutionLayer_4() {
+    StateID = ID::CONVOLUTION_LAYER_4;
+    DEBUG(errs() << "new ConvolutionLayer_4\n");
+  }
+  ~ConvolutionLayer_4() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// Accepting state for the convolution layer pattern.
+class ConvolutionLayer : public AbstractState {
+public:
+  ConvolutionLayer() {
+    StateID = ID::CONVOLUTION_LAYER;
+    DEBUG(errs() << "new ConvolutionLayer\n");
+  }
+  ~ConvolutionLayer() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// Sink state: entered when the intrinsic stream matches no known pattern.
+class NoPattern : public AbstractState {
+public:
+  NoPattern() {
+    StateID = ID::NO_PATTERN;
+    DEBUG(errs() << "new NoPattern\n");
+  }
+  ~NoPattern() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// From the initial state: a tensor convolution starts the ConvolutionLayer
+// pattern and a tensor mul starts the FullyConnectedLayer pattern. In both
+// cases the input and weight operands are collected as arguments, each
+// followed by its (min, max) quantization levels read in order from the
+// quantization file. Any other intrinsic goes to NoPattern; a null II
+// (end of stream) leaves the machine in the initial state.
+void InitialState::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) {
+  if (II) { // Not end of instruction stream
+    switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_convolution:
+        {
+        Mch->addIntrinsicInst(II);
+        Mch->addArgument(II->getOperand(0)); // conv input
+
+        // Read quantization levels for input
+        float i_min, i_max;
+        Mch->getNextQuantizationLevel(i_min);
+        Mch->getNextQuantizationLevel(i_max);
+        errs() << "i_min: " << i_min << "\n";
+        errs() << "i_max: " << i_max << "\n";
+
+        // Create associated arguments for the quantization levels
+        Constant *IminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) i_min);
+//      errs() << "IminC : "
+//             << dyn_cast<ConstantFP>(IminC)->getValueAPF().convertToFloat()
+//             << "\n";
+        Mch->addArgument(IminC);
+        Constant *ImaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) i_max);
+        Mch->addArgument(ImaxC);
+
+        Mch->addArgument(II->getOperand(1)); // conv kernel
+
+        // Read quantization levels for filter
+        float w_min, w_max;
+        Mch->getNextQuantizationLevel(w_min);
+        Mch->getNextQuantizationLevel(w_max);
+        errs() << "w_min: " << w_min << "\n";
+        errs() << "w_max: " << w_max << "\n";
+        Constant *WminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) w_min);
+        Mch->addArgument(WminC);
+        Constant *WmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) w_max);
+        Mch->addArgument(WmaxC);
+
+        Mch->setCurrent(new ConvolutionLayer_1());
+        }
+        break;
+      case Intrinsic::visc_tensor_mul:
+        {
+        Mch->addIntrinsicInst(II);
+        Mch->addArgument(II->getOperand(0)); // 1st gemm input
+
+        // Read quantization levels for input
+        float i_min, i_max;
+        Mch->getNextQuantizationLevel(i_min);
+        Mch->getNextQuantizationLevel(i_max);
+        errs() << "i_min: " << i_min << "\n";
+        errs() << "i_max: " << i_max << "\n";
+
+        Constant *IminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) i_min);
+        Mch->addArgument(IminC);
+        Constant *ImaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) i_max);
+        Mch->addArgument(ImaxC);
+
+        Mch->addArgument(II->getOperand(1)); // 2nd gemm input
+
+        // Read quantization levels for weight
+        float w_min, w_max;
+        Mch->getNextQuantizationLevel(w_min);
+        Mch->getNextQuantizationLevel(w_max);
+        errs() << "w_min: " << w_min << "\n";
+        errs() << "w_max: " << w_max << "\n";
+
+        Constant *WminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) w_min);
+        Mch->addArgument(WminC);
+        Constant *WmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) w_max);
+        Mch->addArgument(WmaxC);
+
+        Mch->setCurrent(new FullyConnectedLayer_1());
+        }
+        break;
+      default: // Other HPVM intrinsic
+        Mch->setCurrent(new NoPattern());
+        break;
+    }
+    delete this;
+  } // else {} // No HPVM intrinsic received. Remain at initial 
+}
+
+// After a tensor mul, only a tensor add (bias) whose first operand is the
+// mul's result continues the fully connected pattern; the bias operand and
+// its quantization levels are collected. Anything else, or end of stream,
+// goes to NoPattern.
+void FullyConnectedLayer_1::transition(CodeGenStateMachine *Mch,
+                                       IntrinsicInst *II) {
+  if (II) { // Not end of instruction stream
+    switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_add:
+        {
+        IntrinsicInst *MulII = Mch->getIntrinsicInstAt(0);
+        assert((MulII == II->getOperand(0)) &&
+               "Output of mul must be used as 1st operand of add");
+        Mch->addIntrinsicInst(II);
+
+        Mch->addArgument(II->getOperand(1));     // bias
+
+        // Read quantization levels for bias
+        float b_min, b_max;
+        Mch->getNextQuantizationLevel(b_min);
+        Mch->getNextQuantizationLevel(b_max);
+        errs() << "b_min: " << b_min << "\n";
+        errs() << "b_max: " << b_max << "\n";
+
+        Constant *BminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) b_min);
+        Mch->addArgument(BminC);
+        Constant *BmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) b_max);
+        Mch->addArgument(BmaxC);
+
+        Mch->setCurrent(new FullyConnectedLayer_2());
+        }
+        break;
+      default:
+        Mch->setCurrent(new NoPattern());
+        break;
+    }
+  } else {
+    Mch->setCurrent(new NoPattern());
+  }
+  delete this;
+}
+
+// After mul+add, an optional activation follows. TanH/ReLU/ClippedReLU add
+// the activation selector (0/1/2) plus output quantization levels and move to
+// FullyConnectedLayer_3; end of stream means no activation (-1) and the
+// pattern is accepted immediately. Any other intrinsic goes to NoPattern.
+// NOTE(review): the three activation cases are duplicated except for the
+// selector constant -- a candidate for factoring out.
+void FullyConnectedLayer_2::transition(CodeGenStateMachine *Mch,
+                                       IntrinsicInst *II) {
+  if (II) { // Not end of instruction stream
+    switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_tanh:
+        {
+        // Type of activation : TanH
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+
+        // Read quantization levels for output
+        float out_min, out_max;
+        Mch->getNextQuantizationLevel(out_min);
+        Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+        Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_min);
+        Mch->addArgument(OutminC);
+        Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_max);
+        Mch->addArgument(OutmaxC);
+
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new FullyConnectedLayer_3());
+        }
+        break;
+      case Intrinsic::visc_tensor_relu:
+        {
+        // Type of activation : ReLU
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 1));
+
+        // Read quantization levels for output
+        float out_min, out_max;
+        Mch->getNextQuantizationLevel(out_min);
+        Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+        Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_min);
+        Mch->addArgument(OutminC);
+        Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_max);
+        Mch->addArgument(OutmaxC);
+
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new FullyConnectedLayer_3());
+        }
+        break;
+      case Intrinsic::visc_tensor_clipped_relu:
+        {
+        // Type of activation : Clipped ReLU
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 2));
+
+        // Read quantization levels for output
+        float out_min, out_max;
+        Mch->getNextQuantizationLevel(out_min);
+        Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+        Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_min);
+        Mch->addArgument(OutminC);
+        Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_max);
+        Mch->addArgument(OutmaxC);
+
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new FullyConnectedLayer_3());
+        }
+        break;
+      default: // No activation, but HPVM intrinsic
+        Mch->setCurrent(new NoPattern());
+        break;
+    }
+  } else { // End of instruction stream
+    // No activation (selector -1)
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+
+    // Read quantization levels for output
+    float out_min, out_max;
+    Mch->getNextQuantizationLevel(out_min);
+    Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+    Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                        (double) out_min);
+    Mch->addArgument(OutminC);
+    Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                        (double) out_max);
+    Mch->addArgument(OutmaxC);
+
+    Mch->setCurrent(new FullyConnectedLayer());
+  }
+  delete this;
+}
+
+// After the activation, the only legal continuation is end of stream, which
+// completes (accepts) the fully connected layer pattern; any further
+// intrinsic invalidates it.
+void FullyConnectedLayer_3::transition(CodeGenStateMachine *Mch,
+                                       IntrinsicInst *II) {
+  Mch->setCurrent(II ? static_cast<AbstractState *>(new NoPattern())
+                     : static_cast<AbstractState *>(new FullyConnectedLayer()));
+  delete this;
+}
+
+// Accepting state: any further HPVM intrinsic invalidates the pattern; at
+// end of stream (null II) the machine stays in this state and the state
+// object is intentionally not deleted here.
+void FullyConnectedLayer::transition(CodeGenStateMachine *Mch,
+                                     IntrinsicInst *II) {
+  if (!II)
+    return; // end of instruction stream: remain in the accepting state
+  Mch->setCurrent(new NoPattern());
+  delete this;
+}
+
+// After a convolution: a tensor add (bias) continues the pattern, collecting
+// the bias, its quantization levels and the convolution's four numeric
+// operands. End of stream means a bias-less convolution layer: a null bias
+// pointer is passed, the bias quantization levels are still consumed from
+// the file, no pooling (0,0) and no activation (-1) are encoded, the output
+// quantization levels are read, and the pattern is accepted. Any other
+// intrinsic goes to NoPattern.
+void ConvolutionLayer_1::transition(CodeGenStateMachine *Mch,
+                                    IntrinsicInst *II) {
+  if (II) { // Not end of instruction stream
+    switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_add:
+        {
+        IntrinsicInst *ConvII = Mch->getIntrinsicInstAt(0);
+        assert((ConvII == II->getOperand(0)) &&
+               "Output of conv must be used as 1st operand of add");
+        Mch->addIntrinsicInst(II);
+
+        Mch->addArgument(II->getOperand(1));     // bias
+        // Read quantization levels for bias
+        float b_min, b_max;
+        Mch->getNextQuantizationLevel(b_min);
+        Mch->getNextQuantizationLevel(b_max);
+        errs() << "b_min: " << b_min << "\n";
+        errs() << "b_max: " << b_max << "\n";
+
+        Constant *BminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) b_min);
+        Mch->addArgument(BminC);
+        Constant *BmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                          (double) b_max);
+        Mch->addArgument(BmaxC);
+
+        Mch->addArgument(ConvII->getOperand(2)); // 1st numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(3)); // 2nd numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv
+
+        Mch->setCurrent(new ConvolutionLayer_2());
+        }
+        break;
+      default:
+        Mch->setCurrent(new NoPattern());
+        break;
+    }
+  } else {
+    // No addition: pass a null bias pointer
+    Mch->addArgument(ConstantPointerNull::get(
+                     Type::getInt8PtrTy(Mch->getModule()->getContext())));
+    // Still need to add the quantization constants - and remove them from file
+    float b_min, b_max;
+    Mch->getNextQuantizationLevel(b_min);
+    Mch->getNextQuantizationLevel(b_max);
+        errs() << "b_min: " << b_min << "\n";
+        errs() << "b_max: " << b_max << "\n";
+    Constant *BminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                      (double) b_min);
+    Mch->addArgument(BminC);
+    Constant *BmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                      (double) b_max);
+    Mch->addArgument(BmaxC);
+
+    // Zero for all convolution numeric arguments FIXME???
+        IntrinsicInst *ConvII = Mch->getIntrinsicInstAt(0);
+        Mch->addArgument(ConvII->getOperand(2)); // 1st numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(3)); // 2nd numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv
+//    Mch->addArgument(ConstantInt::get(
+//                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+//    Mch->addArgument(ConstantInt::get(
+//                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+//    Mch->addArgument(ConstantInt::get(
+//                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+//    Mch->addArgument(ConstantInt::get(
+//                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+
+    // No pooling
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    // 0 for unused pool argument
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    // No activation
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+
+    // Read quantization levels for output
+    float out_min, out_max;
+    Mch->getNextQuantizationLevel(out_min);
+    Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+    Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                        (double) out_min);
+    Mch->addArgument(OutminC);
+    Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                        (double) out_max);
+    Mch->addArgument(OutmaxC);
+
+    Mch->setCurrent(new ConvolutionLayer());
+  }
+  delete this;
+}
+
+void ConvolutionLayer_2::transition(CodeGenStateMachine *Mch,
+                                    IntrinsicInst *II) {
+  if (II) { // Not end of instruction stream
+    switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_tanh:
+        {
+        // Type of activation : TanH
+//        Mch->addArgument(ConstantInt::get(
+//                         Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_3());
+        }
+        break;
+      case Intrinsic::visc_tensor_relu:
+        {
+        // Type of activation : ReLU
+//        Mch->addArgument(ConstantInt::get(
+//                         Type::getInt32Ty(Mch->getModule()->getContext()), 1));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_3());
+        }
+        break;
+      case Intrinsic::visc_tensor_clipped_relu:
+        {
+        // Type of activation : Clipped ReLU
+//        Mch->addArgument(ConstantInt::get(
+//                         Type::getInt32Ty(Mch->getModule()->getContext()), 2));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_3());
+        }
+        break;
+      case Intrinsic::visc_tensor_pool_max:
+        {
+        // pool max
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+        // poolSize
+        Mch->addArgument(II->getOperand(1));
+        // No activation
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+        Mch->addIntrinsicInst(II);
+
+        // Read quantization levels for output
+        float out_min, out_max;
+        Mch->getNextQuantizationLevel(out_min);
+        Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+        Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_min);
+        Mch->addArgument(OutminC);
+        Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_max);
+        Mch->addArgument(OutmaxC);
+
+        Mch->setCurrent(new ConvolutionLayer_4());
+        }
+        break;
+      case Intrinsic::visc_tensor_pool_min:
+        {
+        // pool min FIXME: 2: supported?
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 2));
+        // poolSize
+        Mch->addArgument(II->getOperand(1));
+        // No activation
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+        Mch->addIntrinsicInst(II);
+
+        // Read quantization levels for output
+        float out_min, out_max;
+        Mch->getNextQuantizationLevel(out_min);
+        Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+        Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_min);
+        Mch->addArgument(OutminC);
+        Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_max);
+        Mch->addArgument(OutmaxC);
+
+        Mch->setCurrent(new ConvolutionLayer_4());
+        }
+        break;
+      case Intrinsic::visc_tensor_pool_mean:
+        {
+        // pool mean
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 1));
+        // poolSize
+        Mch->addArgument(II->getOperand(1));
+        // No activation
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+        Mch->addIntrinsicInst(II);
+
+        // Read quantization levels for output
+        float out_min, out_max;
+        Mch->getNextQuantizationLevel(out_min);
+        Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+        Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_min);
+        Mch->addArgument(OutminC);
+        Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                            (double) out_max);
+        Mch->addArgument(OutmaxC);
+
+        Mch->setCurrent(new ConvolutionLayer_4());
+        }
+        break;
+      default: // No activation, No pooling, but HPVM intrinsic
+        Mch->setCurrent(new NoPattern());
+        break;
+    }
+  } else { // End of instruction stream
+    // No pooling
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    // 0 for unused pool argument
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    // No activation
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+
+    // Read quantization levels for output
+    float out_min, out_max;
+    Mch->getNextQuantizationLevel(out_min);
+    Mch->getNextQuantizationLevel(out_max);
+        errs() << "out_min: " << out_min << "\n";
+        errs() << "out_max: " << out_max << "\n";
+
+    Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                        (double) out_min);
+    Mch->addArgument(OutminC);
+    Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                        (double) out_max);
+    Mch->addArgument(OutmaxC);
+
+    Mch->setCurrent(new ConvolutionLayer());
+  }
+  delete this;
+}
+
+// ConvolutionLayer_3: a convolution + bias + activation (tanh / relu /
+// clipped relu) sequence has been matched. An optional pooling intrinsic
+// may follow; otherwise the stream must end for the pattern to be accepted.
+// NOTE(review): the pool_mean case was previously commented "pool max";
+// the selector value (1) is unchanged here, only the comment is fixed.
+void ConvolutionLayer_3::transition(CodeGenStateMachine *Mch,
+                                    IntrinsicInst *II) {
+  // Append the activation selector argument (0: tanh, 1: relu,
+  // 2: clipped relu), read back from the third recorded intrinsic.
+  auto addActivationArgument = [Mch]() {
+    // Revisit last intrinsic, to add argument for activation operation
+    IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2);
+    // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU
+    Intrinsic::ID ActIID = ActII->getIntrinsicID();
+    int ActCode;
+    if (ActIID == Intrinsic::visc_tensor_tanh)
+      ActCode = 0;
+    else if (ActIID == Intrinsic::visc_tensor_relu)
+      ActCode = 1;
+    else // ActIID == Intrinsic::visc_tensor_clipped_relu
+      ActCode = 2;
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), ActCode));
+  };
+
+  // Read the next two quantization levels (min, max) for the layer output
+  // and append them as float constant arguments.
+  auto addOutputQuantizationArguments = [Mch]() {
+    float out_min, out_max;
+    Mch->getNextQuantizationLevel(out_min);
+    Mch->getNextQuantizationLevel(out_max);
+    errs() << "out_min: " << out_min << "\n";
+    errs() << "out_max: " << out_max << "\n";
+
+    Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                        (double) out_min);
+    Mch->addArgument(OutminC);
+    Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+                        (double) out_max);
+    Mch->addArgument(OutmaxC);
+  };
+
+  if (II) { // Not end of instruction stream
+    switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_pool_max:
+      case Intrinsic::visc_tensor_pool_min:
+      case Intrinsic::visc_tensor_pool_mean:
+        {
+        // Pooling function selector: 0 = max, 1 = mean, 2 = min
+        // FIXME: is min (2) supported?
+        Intrinsic::ID PoolIID = II->getIntrinsicID();
+        int PoolCode;
+        if (PoolIID == Intrinsic::visc_tensor_pool_max)
+          PoolCode = 0;
+        else if (PoolIID == Intrinsic::visc_tensor_pool_mean)
+          PoolCode = 1;
+        else // PoolIID == Intrinsic::visc_tensor_pool_min
+          PoolCode = 2;
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), PoolCode));
+        // poolSize
+        Mch->addArgument(II->getOperand(1));
+        Mch->addIntrinsicInst(II);
+
+        addActivationArgument();
+        addOutputQuantizationArguments();
+
+        Mch->setCurrent(new ConvolutionLayer_4());
+        }
+        break;
+      default: // HPVM intrinsic, but not a pooling operation
+        Mch->setCurrent(new NoPattern());
+        break;
+    }
+  } else { // End of instruction stream
+    // No pooling
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    // 0 for unused pool argument
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+
+    addActivationArgument();
+    addOutputQuantizationArguments();
+
+    Mch->setCurrent(new ConvolutionLayer());
+  }
+  delete this;
+}
+
+// ConvolutionLayer_4: the full convolution-layer pattern (with pooling)
+// has been matched. Only the end of the intrinsic stream (null II) is
+// accepted from here; any further intrinsic invalidates the pattern.
+void ConvolutionLayer_4::transition(CodeGenStateMachine *Mch,
+                                    IntrinsicInst *II) {
+  if (II) { // Another intrinsic: the sequence no longer matches a layer
+    Mch->setCurrent(new NoPattern());
+  } else {  // End of instruction stream: pattern complete
+    Mch->setCurrent(new ConvolutionLayer());
+  }
+  delete this;
+}
+
+// ConvolutionLayer is an accepting state: at end of stream (null II) the
+// machine stays here so codeGen() can read this state's ID. Any further
+// intrinsic breaks the pattern.
+void ConvolutionLayer::transition(CodeGenStateMachine *Mch,
+                                  IntrinsicInst *II) {
+  if (!II)
+    return;
+  Mch->setCurrent(new NoPattern());
+  delete this;
+}
+
+// NoPattern is the sink state: once the intrinsic sequence fails to match
+// any supported layer pattern, all further input is ignored.
+void NoPattern::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) {}
+
+// Construct a pattern-matching state machine over module _M, using the
+// tensor runtime module _RtM for callee signatures and _qin as the stream
+// of quantization levels. Matching begins in the initial state.
+CodeGenStateMachine::CodeGenStateMachine(Module *_M, Module *_RtM, std::ifstream &_qin) :
+  M(_M), RtM(_RtM), qin(_qin), current(new InitialState()) {}
+
+// Feed the next intrinsic (or null at end of the instruction stream) to
+// the current state, which decides the machine's next state.
+void CodeGenStateMachine::transition(IntrinsicInst *II) {
+  current->transition(this, II);
+}
+
+// Emit a single PROMISE simulator call for the matched intrinsic sequence,
+// replace all uses of the sequence's result with it, and erase the
+// translated intrinsics. Requires the machine to be in an accepting state
+// (fully connected or convolution layer).
+void CodeGenStateMachine::codeGen() {
+
+  if ((current->getStateID() != AbstractState::ID::FULLY_CONNECTED_LAYER) &&
+      (current->getStateID() != AbstractState::ID::CONVOLUTION_LAYER)) {
+    // Not a valid instruction sequence.
+    assert(false && "Unsupported instruction sequence by PROMISE simulator\n");
+  }
+
+  // We have a valid instruction sequence.
+  // Make sure that the instruction sequence can be translated:
+  // each instruction's result must be used only by the next one in sequence.
+  // Note: p + 1 < IIs.size() (not p < IIs.size()-1) avoids unsigned
+  // wrap-around when IIs is empty.
+  for (unsigned p = 0; p + 1 < IIs.size(); p++) {
+    IntrinsicInst *II = IIs[p];
+    assert((II->hasOneUse()) &&
+          "Instruction sequence does not fit expected pattern: not single use\n");
+
+    Value::user_iterator ui = II->user_begin(); // The only use
+    assert((*ui == IIs[p+1]) &&
+           "Instruction sequence does not fit expected pattern: not used by next instruction\n");
+  }
+
+  // Create corresponding PROMISE simulator call
+  CallInst *CI = nullptr; // set in every reachable switch case
+  switch (current->getStateID()) {
+    case AbstractState::ID::CONVOLUTION_LAYER:
+      {
+        Constant* ConvLayer_PROMISE =
+          M->getOrInsertFunction(StringRef("ConvLayer_PROMISE"),
+                 RtM->getFunction(StringRef("ConvLayer_PROMISE"))->getFunctionType());
+        DEBUG(errs() << *ConvLayer_PROMISE);
+
+        // FIXME: get last argument from some intrinsic. For now, 7
+        Args.push_back(ConstantInt::get(Type::getInt32Ty(M->getContext()), 7));
+        // Create PROMISE simulator function call
+        CI = CallInst::Create(ConvLayer_PROMISE, Args, "");
+      }
+      break;
+    case AbstractState::ID::FULLY_CONNECTED_LAYER:
+      {
+        Constant* FCLayer_PROMISE =
+          M->getOrInsertFunction(StringRef("FCLayer_PROMISE"),
+              RtM->getFunction(StringRef("FCLayer_PROMISE"))->getFunctionType());
+        DEBUG(errs() << *FCLayer_PROMISE);
+
+        // FIXME: get last argument from some intrinsic. For now, 7
+        Args.push_back(ConstantInt::get(Type::getInt32Ty(M->getContext()), 7));
+        // Create PROMISE simulator function call
+        CI = CallInst::Create(FCLayer_PROMISE, Args, "");
+      }
+      break;
+    default:
+      llvm_unreachable("Unexpected CodeGenStateMachine State\n");
+      break;
+  }
+
+  // Insert new call and replace all uses of pattern result with
+  // the PROMISE simulator call
+  IntrinsicInst *IIlast = *(IIs.rbegin());
+  CI->insertBefore(IIlast);
+  IIlast->replaceAllUsesWith(CI);
+
+  // Remove the instructions we translated to the simulator call.
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around.
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IIs.rbegin(),
+       re = IIs.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    (*ri)->eraseFromParent();
+  }
+
+  // Was an unconditional dump of the whole generated function; keep it
+  // available for debug builds only.
+  DEBUG(errs() << "****** GenF:\n" << *(CI->getParent()->getParent()));
+}
+
+// DFG2LLVM_PROMISE - Module pass driving code generation for the PROMISE
+// simulator: each leaf node hinted for PROMISE is translated into a single
+// simulator runtime call.
+
+struct DFG2LLVM_PROMISE : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_PROMISE() : DFG2LLVM(ID) {}
+
+  // Requires the dataflow graph built by BuildDFG and leaves it intact.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BuildDFG>();
+    AU.addPreserved<BuildDFG>();
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_PROMISE : public CodeGenTraversal {
+
+private:
+  // Stream of quantization levels consumed during pattern matching
+  std::ifstream qin;
+
+  // VISC Runtime API and Tensor runtime API declarations
+  // (filled in by initRuntimeAPI via the DECLARE macro)
+  Constant* llvm_hpvm_initTensorRt;
+  Constant* llvm_hpvm_cleanupTensorRt;
+  Constant* hpvm_request_tensor;
+
+  // Functions
+
+  // Virtual Functions (overrides of CodeGenTraversal)
+  void init() override;
+  void initRuntimeAPI() override;
+  void codeGen(DFInternalNode* N) override;
+  void codeGen(DFLeafNode* N) override;
+
+public:
+
+  // Constructor: _str names the quantization-levels input file
+  CGT_PROMISE(Module &_M, BuildDFG &_DFG, std::string &_str) : CodeGenTraversal(_M, _DFG) {
+    qin.open(_str.c_str());
+    assert(qin && "Failed to open quantization levels input file\n");
+    initRuntimeAPI();
+  }
+
+  ~CGT_PROMISE() {
+    qin.close();
+  }
+
+};
+
+// Target-specific initialization hook (virtual). Nothing is needed for the
+// PROMISE backend here; runtime setup happens in initRuntimeAPI().
+void CGT_PROMISE::init() {
+  // FIXME: what to do here? If anything?
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+void CGT_PROMISE::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n");
+
+  // FIXME: set correct path
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_runtime.ll";
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  if(runtimeModule == nullptr)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n");
+  // Fail fast: the DECLARE lookups below dereference runtimeModule, so
+  // silently continuing with a null module would crash with no diagnostic.
+  assert(runtimeModule != nullptr && "Failed to parse hpvm-tensor-rt API module\n");
+
+  // Get or insert Global declarations for
+  // - initialization
+  // - cleanup
+  // - request a tensor
+  DECLARE(llvm_hpvm_initTensorRt);
+  DECLARE(llvm_hpvm_cleanupTensorRt);
+  DECLARE(hpvm_request_tensor);
+
+  // Find visc.init and visc.cleanup calls, and add placeholder methods
+  // for initialization and cleanup of the hpvm tensor runtime
+
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI != NULL && "Module must contain a use of llvm.visc.init\n");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n");
+  InitCall = cast<Instruction>(*VI->user_begin());
+  CallInst::Create(llvm_hpvm_initTensorRt,
+                   ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)),
+                   "", InitCall);
+
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC != NULL && "Module must contain a use of llvm.visc.cleanup\n");
+  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n");
+  CleanupCall = cast<Instruction>(*VC->user_begin());
+  CallInst::Create(llvm_hpvm_cleanupTensorRt, ArrayRef<Value*>(), "", CleanupCall);
+
+}
+
+// Internal (non-leaf) dataflow nodes carry no tensor computation for this
+// backend; they are only reported and skipped.
+void CGT_PROMISE::codeGen(DFInternalNode* N) {
+  errs() << "Inside node: " << N->getFuncPointer()->getName() << "\n"
+         << "Skipping internal node\n";
+}
+
+// Generate code for a leaf dataflow node: clone the node function, strip
+// HPVM argument attributes, insert tensor-request calls for every pointer
+// argument, then run the code-gen state machine over the intrinsic
+// sequence to emit a single PROMISE simulator call.
+void CGT_PROMISE::codeGen(DFLeafNode* N) {
+
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Abort code generation if it is an allocation node
+  if(N->isAllocationNode()) {
+    assert(false && "Allocation Node not expected in ApproxHPVM");
+    return;
+  }
+
+  // Generate code only if it has the right hint
+  if (!checkPreferredTarget(N, visc::PROMISE_TARGET)) {
+    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+    return;
+  }
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+  // Was an unconditional dump of the whole node function; debug-only now.
+  DEBUG(errs() << "Node Function: " << *F << "\n");
+  // Look up if we have visited this function before. If we have, then just
+  // get the cloned function pointer from DFNode. Otherwise, create the cloned
+  // function and add it to the DFNode GenFunc.
+  Function *F_promise = N->getGenFuncForTarget(visc::PROMISE_TARGET);
+
+  assert((F_promise == NULL) &&
+         "Error: Visiting a node for which code already generated");
+
+  // Clone the function
+  ValueToValueMapTy VMap;
+  std::string FName(F->getName().data());
+  F_promise = CloneFunction(F, VMap);
+  F_promise->setName(FName+"_promise");
+  // Re-insert the clone at the end of this module's function list
+  F_promise->removeFromParent();
+  M.getFunctionList().push_back(F_promise);
+
+  N->addGenFunc(F_promise, visc::PROMISE_TARGET, true);
+
+  /* Removing HPVM in/out/inout function attributes */
+  for(Function::arg_iterator ai = F_promise->arg_begin(), ae = F_promise->arg_end();
+      ai != ae; ai++){
+    Argument *Arg = &*ai;
+    if(Arg->hasAttribute(Attribute::In))
+      Arg->removeAttr(Attribute::In);
+    if(Arg->hasAttribute(Attribute::Out))
+      Arg->removeAttr(Attribute::Out);
+    if(Arg->hasAttribute(Attribute::InOut))
+      Arg->removeAttr(Attribute::InOut);
+  }
+
+  // Adding nounwind to generated function : FIXME: needed?
+  DEBUG(errs() << "Adding nounwind to generated function\n");
+  F_promise->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
+
+  // Add llvm_visc_requestTensor calls for every pointer argument of the function
+  // (they are all expected to be tensors), at the beginning of the function.
+  // This is the first instruction of the function, insert them before this
+  Instruction* FI = &*(F_promise->getEntryBlock().begin());
+
+  // FIXME: verify that we want 0 as a target device
+  // In this backend, the target device is CPU, represented by i32 0.
+  ConstantInt *TargetDeviceID =
+    ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+
+  for (Function::arg_iterator ai = F_promise->arg_begin(),
+       ae = F_promise->arg_end(); ai != ae; ++ai) {
+    Argument* Arg = &*ai;
+    if (Arg->getType()->isPointerTy()) {
+      Value *Args[] = {Arg, TargetDeviceID};
+      CallInst::Create(hpvm_request_tensor,
+                       ArrayRef<Value*>(Args, 2),
+                       "", FI);
+    }
+  }
+
+  CodeGenStateMachine CGM(&M, runtimeModule.get(), qin);
+
+  /* An assumption is made for the PROMISE simulator:                         *
+   * a leaf node will contain consecutive operations that will map to a       *
+   * single PROMISE simulator call                                            */
+
+  for (inst_iterator i = inst_begin(F_promise), e = inst_end(F_promise);
+       i != e; ++i) {
+    Instruction *I = &(*i);
+    // dyn_cast yields null for non-intrinsic instructions (e.g. the
+    // return), which the states treat as end of the intrinsic stream.
+    CGM.transition(dyn_cast<IntrinsicInst>(I));
+  }
+
+  CGM.codeGen();
+
+  return;
+}
+
+// Pass entry point: walk every dataflow graph root and generate PROMISE
+// simulator code for qualifying leaf nodes. Always reports the module as
+// modified.
+bool DFG2LLVM_PROMISE::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_PROMISE PASS\n";
+
+  errs() << QuantizationInputsFilename << "\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Visitor for Code Generation Graph Traversal.
+  // Stack allocation (RAII) guarantees the visitor -- and the
+  // quantization-levels stream it opens -- is released on every exit
+  // path, unlike the previous new/delete pair.
+  CGT_PROMISE CGTVisitor(M, DFG, QuantizationInputsFilename);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor.visit(rootNode);
+  }
+
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+
+  return true;
+}
+
+
+/******************************************************************************
+ *                              Helper functions                              *
+ ******************************************************************************/
+
+} // End of namespace
+
+char DFG2LLVM_PROMISE::ID = 0;
+static RegisterPass<DFG2LLVM_PROMISE> X("dfg2llvm-promise",
+                                      "Dataflow Graph to LLVM for PROMISE Pass",
+                                      false /* CFGOnly: pass inspects more  *
+                                             * than the CFG */,
+                                      true /* is_analysis -- NOTE(review): *
+                                            * runOnModule transforms the   *
+                                            * module, so 'true' (analysis) *
+                                            * looks wrong; confirm intent  */);
+
diff --git a/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.exports b/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/DFG2LLVM_PROMISE/LLVMBuild.txt b/lib/DFG2LLVM_PROMISE/LLVMBuild.txt
new file mode 100644
index 0000000000..714ad14f18
--- /dev/null
+++ b/lib/DFG2LLVM_PROMISE/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/DFG2LLVM_PROMISE/LLVMBuild.txt ---------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_PROMISE
+parent = Transforms
diff --git a/lib/DFG2LLVM_SPIR/CMakeLists.txt b/lib/DFG2LLVM_SPIR/CMakeLists.txt
new file mode 100644
index 0000000000..43e2254c79
--- /dev/null
+++ b/lib/DFG2LLVM_SPIR/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMDFG2LLVM_SPIR
+  DFG2LLVM_SPIR.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
new file mode 100644
index 0000000000..48b1492047
--- /dev/null
+++ b/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
@@ -0,0 +1,2010 @@
+//=== DFG2LLVM_SPIR.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define ENABLE_ASSERTS
+#define TARGET_PTX 32
+#define GENERIC_ADDRSPACE 0
+#define GLOBAL_ADDRSPACE 1
+#define SHARED_ADDRSPACE 3
+
+#define DEBUG_TYPE "DFG2LLVM_SPIR"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm-c/Core.h"
+
+#include "llvm/SupportVISC/VISCUtils.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/IR/UseListOrder.h"
+
+#include <sstream>
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+using namespace viscUtils;
+
+// VISC Command line option to use timer or not
+static cl::opt<bool>
+VISCTimer_SPIR("visc-timers-spir", cl::desc("Enable visc timers"));
+
+namespace {
+// Helper class declarations
+
+// Tuple of (host pointer, device pointer, size in bytes) describing one
+// kernel output buffer. Would have preferred std::tuple but support was
+// not yet available.
+class OutputPtr {
+public:
+  OutputPtr(Value* HostPtr, Value* DevicePtr, Value* SizeBytes)
+    : h_ptr(HostPtr), d_ptr(DevicePtr), bytes(SizeBytes) {}
+
+  Value* h_ptr;   // pointer on the host
+  Value* d_ptr;   // corresponding pointer on the device
+  Value* bytes;   // buffer size in bytes
+};
+
+// Class to maintain important kernel info required for generating runtime
+// calls
+class Kernel {
+public:
+  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
+         std::map<unsigned, unsigned>(),
+         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
+         std::map<unsigned, std::pair<Value*, unsigned> >(),
+         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
+         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
+         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
+    : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
+      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
+      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
+
+    assert(gridDim == globalWGSize.size()
+           && "gridDim should be same as the size of vector globalWGSize");
+    assert(blockDim == localWGSize.size()
+           && "blockDim should be same as the size of vector localWGSize");
+  }
+
+  Function* KernelFunction;
+  DFLeafNode* KernelLeafNode;
+  std::map<unsigned, unsigned> inArgMap;
+  // Map for shared memory arguments
+  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
+  // Fields for (potential) allocation node. Default to null so they can
+  // be tested safely (they were previously left uninitialized).
+  DFLeafNode* AllocationNode = nullptr;
+  Function* AllocationFunction = nullptr;
+  std::map<unsigned, unsigned> allocInArgMap;
+
+  std::vector<unsigned> outArgMap;
+  unsigned gridDim;
+  std::vector<Value*> globalWGSize;
+  unsigned blockDim;
+  std::vector<Value*> localWGSize;
+  std::vector<int> localDimMap;
+
+  std::map<unsigned, unsigned> getInArgMap() {
+    return inArgMap;
+  }
+  void setInArgMap(std::map<unsigned, unsigned> map) {
+    inArgMap = map;
+  }
+
+  std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap() {
+    return sharedInArgMap;
+  }
+  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
+    sharedInArgMap = map;
+  }
+
+  std::vector<unsigned> getOutArgMap() {
+    return outArgMap;
+  }
+  void setOutArgMap(std::vector<unsigned> map) {
+    outArgMap = map;
+  }
+
+  void setLocalWGSize(std::vector<Value*> V) {
+    localWGSize = V;
+  }
+
+  // True when a local (block) work-group size was provided
+  bool hasLocalWG() {
+    return blockDim != 0;
+  }
+};
+
+// Helper function declarations
+static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*,
+                                 ValueToValueMapTy&, Instruction*);
+static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&,
+                              Instruction*, const Twine& WGName = "WGSize");
+static std::string getSPIRFilename(const Module&);
+static std::string getFilenameFromModule(const Module& M);
+static void changeDataLayout(Module &);
+static void changeTargetTriple(Module &);
+static std::string printType(Type*);
+static StringRef getMangledName(std::string);
+static StringRef getAtomicMangledName(std::string, unsigned, bool);
+static void findReturnInst(Function *, std::vector<ReturnInst *> &);
+static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
+static StringRef getAtomicOpName(Intrinsic::ID, unsigned);
+static std::string getMathFunctionName(Intrinsic::ID);
+
+// DFG2LLVM_SPIR - Module pass generating SPIR kernels and the host-side
+// runtime calls that launch them, from the HPVM dataflow graph.
+struct DFG2LLVM_SPIR : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_SPIR() : DFG2LLVM(ID) {}
+
+  bool runOnModule(Module &M) override;
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_SPIR : public CodeGenTraversal {
+
+private:
+  //Member variables
+  std::unique_ptr<Module> KernelM;   // Clone of the host module; stripped down to hold kernels only
+  DFNode* KernelLaunchNode = nullptr; // Internal node that hosts the kernel launch
+  Kernel* kernel;                     // Kernel descriptor built during leaf-node codegen
+
+  // VISC Runtime API
+  Constant* llvm_visc_ocl_launch;
+  Constant* llvm_visc_ocl_wait;
+  Constant* llvm_visc_ocl_initContext;
+  Constant* llvm_visc_ocl_clearContext;
+  Constant* llvm_visc_ocl_argument_shared;
+  Constant* llvm_visc_ocl_argument_scalar;
+  Constant* llvm_visc_ocl_argument_ptr;
+  Constant* llvm_visc_ocl_output_ptr;
+  Constant* llvm_visc_ocl_free;
+  Constant* llvm_visc_ocl_getOutput;
+  Constant* llvm_visc_ocl_executeNode;
+
+  //Functions
+  std::string getKernelsModuleName(Module &M);
+  void fixValueAddrspace(Value* V, unsigned addrspace);
+  Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned i);
+  void removeAttributeAtArguments(Function* F, std::vector<unsigned> &Ags, Attribute::AttrKind attrKind);
+  void addCLMetadata(Function* F);
+  Function* transformFunctionToVoid(Function* F);
+  void removeInOutAttributes(Function* F);
+  void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
+
+  // Replace all uses of every global in [begin, end) with undef and erase it
+  // from its module. The worklist is collected first so that erasing does not
+  // invalidate the iteration. Shared by the constructor for globals,
+  // functions, and aliases (previously three copy-pasted loops).
+  template <typename IterTy>
+  static void undefAndEraseAll(IterTy begin, IterTy end) {
+    std::vector<decltype(&*begin)> worklist;
+    for (IterTy it = begin; it != end; ++it)
+      worklist.push_back(&*it);
+    for (unsigned i = 0, e = worklist.size(); i != e; ++i) {
+      worklist[i]->replaceAllUsesWith(UndefValue::get(worklist[i]->getType()));
+      worklist[i]->eraseFromParent();
+    }
+  }
+
+  // Virtual Functions
+  void init() {
+    VISCTimer = VISCTimer_SPIR;
+    TargetName = "SPIR";
+  }
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+
+  // Constructor
+  CGT_SPIR(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(&_M)) {
+    KernelLaunchNode = NULL;
+    init();
+    initRuntimeAPI();
+    errs() << "Old module pointer: " << &_M << "\n";
+    errs() << "New module pointer: " <<  KernelM.get() << "\n";
+    // Copying instead of creating new, in order to preserve required info (metadata)
+    // Remove functions, global variables and aliases
+    undefAndEraseAll(KernelM->global_begin(), KernelM->global_end());
+    undefAndEraseAll(KernelM->begin(), KernelM->end());
+    undefAndEraseAll(KernelM->alias_begin(), KernelM->alias_end());
+
+    changeDataLayout(*KernelM);
+    changeTargetTriple(*KernelM);
+
+    DEBUG(errs() << *KernelM);
+
+  }
+
+  void removeLLVMIntrinsics();
+  void writeKernelsModule();
+};
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls.
+// Loads the visc-rt runtime module, declares the OpenCL runtime entry points
+// in the host module, and inserts context-init / timer-print code at the
+// llvm.visc.init / llvm.visc.cleanup call sites.
+void CGT_SPIR::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
+
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/../build/projects/visc-rt/visc-rt.ll";
+  errs() << "Open file: " << runtimeAPI.str() << "\n";
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  if(runtimeModule == NULL) {
+    DEBUG(errs() << Err.getMessage());
+    // Fail fast: the DECLARE calls below dereference runtimeModule, so
+    // silently continuing with a null module would crash with no diagnostic.
+    assert(false && "Failed to load visc-rt API module!");
+  }
+  else
+    errs() << "Successfully loaded visc-rt API module\n";
+
+  // Get or insert the global declarations for launch/wait functions
+  DECLARE(llvm_visc_ocl_launch);
+  DECLARE(llvm_visc_ocl_wait);
+  DECLARE(llvm_visc_ocl_initContext);
+  DECLARE(llvm_visc_ocl_clearContext);
+  DECLARE(llvm_visc_ocl_argument_shared);
+  DECLARE(llvm_visc_ocl_argument_scalar);
+  DECLARE(llvm_visc_ocl_argument_ptr);
+  DECLARE(llvm_visc_ocl_output_ptr);
+  DECLARE(llvm_visc_ocl_free);
+  DECLARE(llvm_visc_ocl_getOutput);
+  DECLARE(llvm_visc_ocl_executeNode);
+
+  // Get or insert timerAPI functions as well if you plan to use timers
+  initTimerAPI();
+
+  // Insert init context in main
+  DEBUG(errs() << "Gen Code to initialize SPIR Timer\n");
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI != NULL && "Module must contain a call to llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+
+  InitCall = cast<Instruction>(*VI->user_begin());
+  initializeTimerSet(InitCall);
+  switchToTimer(visc_TimerID_INIT_CTX, InitCall);
+  CallInst::Create(llvm_visc_ocl_initContext,
+                  ArrayRef<Value*>(getTargetID(M, visc::SPIR_TARGET)),
+                  "", InitCall);
+  switchToTimer(visc_TimerID_NONE, InitCall);
+
+  // Insert print instruction at visc exit
+  DEBUG(errs() << "Gen Code to print SPIR Timer\n");
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC != NULL && "Module must contain a call to llvm.visc.cleanup");
+  DEBUG(errs() << *VC << "\n");
+  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once");
+
+  CleanupCall = cast<Instruction>(*VC->user_begin());
+  printTimerSet(CleanupCall);
+}
+
+// Generate Code to call the kernel
+// The plan is to replace the internal node with a leaf node. This method is
+// used to generate a function to associate with this leaf node. The function
+// is responsible for all the memory allocation/transfer and invoking the
+// kernel call on the device.
+// N        - internal node being replaced by a host-side launch function
+// K        - kernel descriptor (leaf node, argument maps, work-group sizes)
+// FileName - kernel file name passed to llvm_visc_ocl_launch
+void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before.
+//  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+
+  assert(N->getGenFuncForTarget(visc::SPIR_TARGET) == NULL &&
+         "Code already generated for this node");
+
+  // Useful values
+  Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
+  Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
+
+  // If kernel struct has not been initialized with kernel function, then fail
+  assert(K != NULL && "No kernel found!!");
+
+  DEBUG(errs() << "Generating kernel call code\n");
+
+  Function* F = N->getFuncPointer();
+
+
+  // Create of clone of F with no instructions. Only the type is the same as F
+  // without the extra arguments.
+  Function* F_X86;
+
+  // Clone the function, if we are seeing this function for the first time. We
+  // only need a clone in terms of type.
+  ValueToValueMapTy VMap;
+
+  // Create new function with the same type
+  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+
+  // Loop over the arguments, copying the names of arguments over.
+  Function::arg_iterator dest_iterator = F_X86->arg_begin();
+  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+       i != e; ++i) {
+    dest_iterator->setName(i->getName()); // Copy the name over...
+    // Increment dest iterator
+    ++dest_iterator;
+  }
+
+  // Add a basic block to this empty function
+  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
+  // Placeholder return; replaced with the real output struct at the end.
+  ReturnInst* RI = ReturnInst::Create(M.getContext(),
+                                      UndefValue::get(F_X86->getReturnType()), BB);
+
+  // FIXME: Adding Index and Dim arguments are probably not required except
+  // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
+  // have those arguments)
+
+  // Add Index and Dim arguments except for the root node
+  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+    F_X86 = addIdxDimArgs(F_X86);
+
+  // addIdxDimArgs may have created a fresh function; re-fetch block/terminator.
+  BB = &*F_X86->begin();
+  RI = cast<ReturnInst>(BB->getTerminator());
+
+  //Add the generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::SPIR_TARGET, true);
+
+  // Loop over the arguments, to create the VMap
+  dest_iterator = F_X86->arg_begin();
+  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+       i != e; ++i) {
+    // Add mapping to VMap and increment dest iterator
+    VMap[&*i] = &*dest_iterator;
+    ++dest_iterator;
+  }
+
+  /* TODO: Use this code to verify if this is a good pattern for OCL kernel
+
+  // Sort children in topological order before code generation for kernel call
+  N->getChildGraph()->sortChildren();
+
+  // The DFNode N has the property that it has only one child (leaving Entry
+  // and Exit dummy nodes). This child is the OCL kernel. This simplifies code
+  // generation for kernel calls significantly. All the inputs to this child
+  // node would either be constants or from the parent node N.
+
+  assert(N->getChildGraph()->size() == 3
+         && "Node expected to have just one non-dummy node!");
+
+  DFNode* C;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    C = *ci;
+    // Skip dummy node call
+    if (!C->isDummyNode())
+      break;
+  }
+
+  assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+
+  Function* CF = C->getFuncPointer();
+  */
+  Function* KF = K->KernelLeafNode->getFuncPointer();
+  // Initialize context
+  //DEBUG(errs() << "Initializing context" << "\n");
+  //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
+
+  DEBUG(errs() << "Initializing commandQ" << "\n");
+  // Initialize command queue
+  // Launch-time setup (file/kernel names, graph handle) is emitted at the
+  // llvm.visc.init call site (InitCall), not inside F_X86.
+  switchToTimer(visc_TimerID_SETUP, InitCall);
+  Value* fileStr = getStringPointer(FileName, InitCall, "Filename");
+  DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
+  DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n");
+  Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName");
+
+  Value* LaunchInstArgs[] = {fileStr, kernelStr};
+
+  DEBUG(errs() << "Inserting launch call" << "\n");
+  CallInst* SPIR_Ctx = CallInst::Create(llvm_visc_ocl_launch,
+                                         ArrayRef<Value*>(LaunchInstArgs, 2),
+                                         "graph"+KF->getName(),
+                                         InitCall);
+  DEBUG(errs() << *SPIR_Ctx << "\n");
+  // The graph handle is created once at init time and stashed in a global so
+  // the launch function (and cleanup) can reload it on every invocation.
+  GraphIDAddr = new GlobalVariable(M,
+                                   SPIR_Ctx->getType(),
+                                   false,
+                                   GlobalValue::CommonLinkage,
+                                   Constant::getNullValue(SPIR_Ctx->getType()),
+                                   "graph"+KF->getName()+".addr");
+  DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
+  StoreInst* SI = new StoreInst(SPIR_Ctx, GraphIDAddr, InitCall);
+  DEBUG(errs() << *SI << "\n");
+  switchToTimer(visc_TimerID_NONE, InitCall);
+  switchToTimer(visc_TimerID_SETUP, RI);
+  Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI);
+
+  // Iterate over the required input edges of the node and use the visc-rt API
+  // to set inputs
+  DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
+  std::vector<OutputPtr> OutputPointers;
+  // Vector to hold the device memory object that need to be cleared before we release
+  // context
+  std::vector<Value*> DevicePointers;
+
+  std::map<unsigned, unsigned> kernelInArgMap = K->getInArgMap();
+/*
+  for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
+
+    // The kernel object gives us the mapping of arguments from kernel launch
+    // node function (F_X86) to kernel (kernel->KF)
+    Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]);
+
+*/
+  // kernelInArgMap: kernel-arg position -> F_X86 argument position.
+  for(std::map<unsigned, unsigned>::iterator ib = kernelInArgMap.begin(),
+      ie = kernelInArgMap.end(); ib != ie; ++ib) {
+    unsigned i = ib->first;
+    Value* inputVal = getArgumentAt(F_X86, ib->second);
+    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+
+    // input value has been obtained.
+    // Check if input is a scalar value or a pointer operand
+    // For scalar values such as int, float, etc. the size is simply the size of
+    // type on target machine, but for pointers, the size of data would be the
+    // next integer argument
+    if(inputVal->getType()->isPointerTy()) {
+
+      switchToTimer(visc_TimerID_COPY_PTR, RI);
+      // Pointer Input
+      // CheckAttribute
+      Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False;
+      Value* isInput = ((hasAttribute(KF, i, Attribute::Out))
+                        && !(hasAttribute(KF, i, Attribute::In)))? False : True;
+
+      Argument* A = getArgumentAt(KF, i);
+      // Pointer equality is intentional: True/False are uniqued i1 constants.
+      if(isOutput == True) {
+        DEBUG(errs() << *A << " is an OUTPUT argument\n");
+      }
+      if(isInput == True) {
+        DEBUG(errs() << *A << " is an INPUT argument\n");
+      }
+
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
+                             Type::getInt8PtrTy(M.getContext()),
+                             inputVal->getName()+".i8ptr",
+                             RI);
+
+      // Assert that the pointer argument size (next argument) is in the map
+      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
+
+      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
+
+      assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
+             && "Pointer type input must always be followed by size (integer type)");
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               inputSize,
+                               isInput,
+                               isOutput
+                              };
+      Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr,
+                                      ArrayRef<Value*>(setInputArgs, 6), "", RI);
+      DevicePointers.push_back(d_ptr);
+      // If this has out attribute, store the returned device pointer in
+      // memory to read device memory later
+      if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
+    }
+    else {
+      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+      // Scalar Input
+      // Store the scalar value on stack and then pass the pointer to its
+      // location
+      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI);
+      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+                             Type::getInt8PtrTy(M.getContext()),
+                             inputVal->getName()+".i8ptr",
+                             RI);
+
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               ConstantExpr::getSizeOf(inputVal->getType())
+                              };
+      CallInst::Create(llvm_visc_ocl_argument_scalar,
+                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
+    }
+  }
+
+  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
+
+  // Check to see if all the allocation sizes are constant (determined
+  // statically)
+  bool constSizes = true;
+  for (auto& e: K->getSharedInArgMap()) {
+    constSizes &= isa<Constant>(e.second.first);
+  }
+
+  // If the sizes are all constant
+  if (constSizes) {
+    for (auto& e: K->getSharedInArgMap()) {
+      unsigned argNum = e.first;
+      Value* allocSize = e.second.first;
+
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+
+      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
+        // Shared memory ptr argument - scalar at size position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
+
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_shared,
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
+        // Shared memory size argument - scalar at address position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        // Store the scalar value on stack and then pass the pointer to its
+        // location
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
+                                        allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
+                               Type::getInt8PtrTy(M.getContext()),
+                               allocSize->getName()+".sharedMem.i8ptr",
+                               RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+    }
+  } else {
+
+    // Dynamic sizes: call the allocation node's function at runtime and pull
+    // each size out of its returned struct (sizes sit at odd indices).
+    Function *F_alloc = K->AllocationFunction;
+    StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType());
+    assert(FAllocRetTy && "Allocation node with no struct return type");
+
+    std::vector<Value *> AllocInputArgs;
+    for (unsigned i = 0; i < K->allocInArgMap.size(); i++) {
+      AllocInputArgs.push_back(getArgumentAt(F_X86, K->allocInArgMap.at(i)));
+    }
+
+    CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI);
+    std::vector<ExtractValueInst *> ExtractValueInstVec;
+    for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) {
+      ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI);
+      ExtractValueInstVec.push_back(EI);
+    }
+
+    for (auto& e: K->getSharedInArgMap()) {
+      unsigned argNum = e.first;
+      Value* allocSize = ExtractValueInstVec[e.second.second/2];
+
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+
+      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
+        // Shared memory ptr argument - scalar at size position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_shared,
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
+        // Shared memory size argument - scalar at address position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        // Store the scalar value on stack and then pass the pointer to its
+        // location
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
+                                        allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
+                               Type::getInt8PtrTy(M.getContext()),
+                               allocSize->getName()+".sharedMem.i8ptr",
+                               RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+    }
+  }
+
+
+  DEBUG(errs() << "Setup output edges of node and insert visc api\n");
+
+  // Set output if struct is not an empty struct
+  StructType* OutputTy = K->KernelLeafNode->getOutputType();
+  std::vector<Value*> d_Outputs;
+  if(!OutputTy->isEmptyTy()) {
+    switchToTimer(visc_TimerID_COPY_PTR, RI);
+    // Not an empty struct
+    // Iterate over all elements of the struct and put them in
+    // Output slots come right after the kernel's regular parameters.
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
+      Value* setOutputArgs[] = {GraphID,
+                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
+                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+
+      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
+                                ArrayRef<Value*>(setOutputArgs, 3),
+                                "d_output."+KF->getName(),
+                                RI);
+      d_Outputs.push_back(d_Output);
+    }
+  }
+
+  // Enqueue kernel
+  // Need work dim, localworksize, globalworksize
+  // Allocate size_t[numDims] space on stack. Store the work group sizes and
+  // pass it as an argument to ExecNode
+
+  switchToTimer(visc_TimerID_MISC, RI);
+  Value *workDim, *LocalWGPtr, *GlobalWGPtr;
+  getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
+  switchToTimer(visc_TimerID_KERNEL, RI);
+  Value* ExecNodeArgs[] = {GraphID,
+                           workDim,
+                           LocalWGPtr,
+                           GlobalWGPtr
+                          };
+  CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode,
+                                     ArrayRef<Value*>(ExecNodeArgs, 4),
+                                     "event."+KF->getName(),
+                                     RI);
+  DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
+
+  // Wait for Kernel to Finish
+  CallInst::Create(llvm_visc_ocl_wait,
+                   ArrayRef<Value*>(GraphID),
+                   "",
+                   RI);
+
+  switchToTimer(visc_TimerID_READ_OUTPUT, RI);
+  // Read Output Struct if not empty
+  if(!OutputTy->isEmptyTy()) {
+    std::vector<Value*>h_Outputs;
+    Value* KernelOutput = UndefValue::get(OutputTy);
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      Value* GetOutputArgs[] = {GraphID,
+                              Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+                              d_Outputs[i],
+                              ConstantExpr::getSizeOf(OutputTy->getElementType(i))
+                             };
+      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
+                                          ArrayRef<Value*>(GetOutputArgs, 4),
+                                          "h_output."+KF->getName()+".addr",
+                                          RI);
+      // Read each device pointer listed in output struct
+      // Load the output struct
+      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
+          OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
+
+      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
+                                            KF->getName()+"output", RI);
+    }
+    OutputMap[K->KernelLeafNode] = KernelOutput;
+  }
+
+  // Read all the pointer arguments which had side effects i.e., had out
+  // attribute
+  DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n");
+  // FIXME: Not reading output pointers anymore as we read them when data is
+  // actually requested
+  /*for(auto output: OutputPointers) {
+    DEBUG(errs() << "Read: " << *output.d_ptr << "\n");
+    DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
+    DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
+
+    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
+    CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
+                                    ArrayRef<Value*>(GetOutputArgs, 4),
+                                    "", RI);
+  }*/
+  switchToTimer(visc_TimerID_MEM_FREE, RI);
+  // Clear Context and free device memory
+  DEBUG(errs() << "Clearing context" << "\n");
+  // Free Device Memory
+  for(auto d_ptr: DevicePointers) {
+    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI);
+  }
+  // Context teardown is emitted at the llvm.visc.cleanup site, not in F_X86.
+  switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall);
+  // Clear Context
+  LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall);
+  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall);
+  switchToTimer(visc_TimerID_NONE, CleanupCall);
+
+  switchToTimer(visc_TimerID_MISC, RI);
+  DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+  // Generate code for output bindings
+  // Get Exit node
+  DFNode* C = N->getChildGraph()->getExit();
+  // Get OutputType of this node
+  StructType* OutTy = N->getOutputType();
+  Value *retVal = UndefValue::get(F_X86->getReturnType());
+  // Find the kernel's output arg map, to use instead of the bindings
+  // NOTE(review): reads the member `kernel` rather than parameter K here —
+  // presumably the same object by the time this runs; confirm.
+  std::vector<unsigned> outArgMap = kernel->getOutArgMap();
+  // Find all the input edges to exit node
+  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+    DEBUG(errs() << "Output Edge " << i << "\n");
+    // Find the incoming edge at the requested input port
+    DFEdge* E = C->getInDFEdgeAt(i);
+
+    assert(E && "No Binding for output element!");
+    // Find the Source DFNode associated with the incoming edge
+    DFNode* SrcDF = E->getSourceDF();
+
+    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+
+    // If Source DFNode is a dummyNode, edge is from parent. Get the
+    // argument from argument list of this internal node
+    Value* inputVal;
+    if(SrcDF->isEntryNode()) {
+      inputVal = getArgumentAt(F_X86, i);
+      DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+    }
+    else {
+      // edge is from a internal node
+      // Check - code should already be generated for this source dfnode
+      // FIXME: Since the 2-level kernel code gen has a specific structure, we
+      // can assume the SrcDF is same as Kernel Leaf node.
+      // Use outArgMap to get correct mapping
+      SrcDF = K->KernelLeafNode;
+      assert(OutputMap.count(SrcDF)
+             && "Source node call not found. Dependency violation!");
+
+      // Find Output Value associated with the Source DFNode using OutputMap
+      Value* CI = OutputMap[SrcDF];
+
+      // Extract element at source position from this call instruction
+      std::vector<unsigned> IndexList;
+      // i is the destination of DFEdge E
+      // Use the mapping instead of the bindings
+//      IndexList.push_back(E->getSourcePosition());
+      IndexList.push_back(outArgMap[i]);
+      DEBUG(errs() << "Going to generate ExtractVal inst from "<< *CI <<"\n");
+      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                             "",RI);
+      inputVal = EI;
+    }
+    std::vector<unsigned> IdxList;
+    IdxList.push_back(i);
+    retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+  }
+
+  DEBUG(errs() << "Extracted all\n");
+  switchToTimer(visc_TimerID_NONE, RI);
+  retVal->setName("output");
+  // Swap the placeholder undef return for the assembled output struct.
+  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+  ReplaceInstWithInst(RI, newRI);
+}
+
+
+// Right now, only targeting the one level case. In general, device functions
+// can return values so we don't need to change them.
+// For the kernel-launch node: emit the host-side runtime calls. For any other
+// internal node (the intermediate thread-block node in a 2-level DFG): fold
+// its argument/output/WG-size mappings into the kernel descriptor.
+void CGT_SPIR::codeGen(DFInternalNode* N) {
+  errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n";
+  if(KernelLaunchNode == NULL)
+    errs () << "No kernel launch node\n";
+  else {
+    errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
+  }
+
+
+  // KernelLaunchNode is reset to NULL once host code has been emitted, so any
+  // internal nodes visited afterwards are skipped.
+  if (!KernelLaunchNode) {
+    DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
+    return;
+  }
+
+  if (N == KernelLaunchNode) {
+    DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
+    //TODO
+
+    // Now the remaining nodes to be visited should be ignored
+    KernelLaunchNode = NULL;
+    DEBUG(errs() << "Insert Runtime calls\n");
+    insertRuntimeCalls(N, kernel, getSPIRFilename(M));
+
+  } else {
+    DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
+    // Keep track of the arguments order.
+    // Compose the kernel's in-arg map with this node's map so kernel argument
+    // positions refer to the launch node's (parent's) argument list.
+    std::map<unsigned, unsigned> inmap1 = N->getInArgMap();
+    std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap();
+    // TODO: Structure assumed: one thread node, one allocation node (at most),
+    // TB node
+    std::map<unsigned, unsigned> inmapFinal;
+    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
+         ib != ie; ++ib) {
+      inmapFinal[ib->first] = inmap1[ib->second];
+    }
+
+    kernel->setInArgMap(inmapFinal);
+
+    // Keep track of the output arguments order.
+    std::vector<unsigned> outmap1 = N->getOutArgMap();
+    std::vector<unsigned> outmap2 = kernel->getOutArgMap();
+
+    // TODO: Change when we have incoming edges to the dummy exit node from more
+    // than one nodes. In this case, the number of bindings is the same, but
+    // their destination position, thus the index in outmap1, is not
+    // 0 ... outmap2.size()-1
+    // The limit is the size of outmap2, because this is the number of kernel
+    // output arguments for which the mapping matters
+    // For now, it is reasonable to assume that all the kernel arguments are
+    // returned, maybe plus some others from other nodes, thus
+    // outmap2.size() <= outmap1.size()
+    for (unsigned i = 0; i < outmap2.size(); i++) {
+      outmap1[i] = outmap2[outmap1[i]];
+    }
+    kernel->setOutArgMap(outmap1);
+
+    // Track the source of local dimlimits for the kernel
+    // Dimension limit can either be a constant or an argument of parent
+    // function. Since Internal node would no longer exist, we need to insert the
+    // localWGSize with values from the parent of N.
+    std::vector<Value*> localWGSizeMapped;
+    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if (isa<Constant>(kernel->localWGSize[i])) {
+        // if constant, use as it is
+        localWGSizeMapped.push_back(kernel->localWGSize[i]);
+      }
+      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
+        // if argument, find the argument location in N. Use InArgMap of N to
+        // find the source location in Parent of N. Retrieve the argument from
+        // parent to insert in the vector.
+        unsigned argNum = Arg->getArgNo();
+        // This argument will be coming from the parent node, not the allocation
+        // Node
+        assert(N->getInArgMap().find(argNum) != N->getInArgMap().end());
+
+        unsigned parentArgNum = N->getInArgMap()[argNum];
+        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
+        localWGSizeMapped.push_back(A);
+      }
+      else {
+        assert(false && "LocalWGsize using value which is neither argument nor constant!");
+      }
+    }
+    // Update localWGSize vector of kernel
+    kernel->setLocalWGSize(localWGSizeMapped);
+  }
+
+}
+
+//static bool checkPreferredTarget(DFNode* N, visc::Target T) {
+  //Function* F = N->getFuncPointer();
+  //Module* M = F->getParent();
+  //NamedMDNode* HintNode;
+  //switch (T) {
+    //case visc::GPU_TARGET:
+      //HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+      //break;
+    //case visc::SPIR_TARGET:
+      //HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+      //break;
+    //case visc::CPU_TARGET:
+      //HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+      //break;
+    //default:
+      //llvm_unreachable("Target Not supported yet!");
+  //}
+  //for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    //MDNode* MetaNode = HintNode->getOperand(i);
+    //if(F == MetaNode->getOperand(0))
+      //return true;
+  //}
+  //return false;
+//}
+
+// Kernel code generation for a SPIR leaf node:
+//  (1) pick the kernel-launch ancestor (1-level vs 2-level DFG hierarchy),
+//  (2) clone the node's function into the kernel module as "<name>_spir",
+//  (3) map allocation-node results onto shared-memory arguments,
+//  (4) lower VISC query/barrier/atomic intrinsics to OpenCL builtin calls.
+// Side effects: sets KernelLaunchNode and kernel, mutates KernelM, and
+// registers the generated function on N via addGenFunc.
+void CGT_SPIR::codeGen(DFLeafNode* N) {
+
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Skip code generation if it is an allocation node
+  if(N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
+
+  // Generate code only if it has the right hint
+//  if(!checkPreferredTarget(N, visc::SPIR_TARGET)) {
+//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+//    return;
+//  }
+  if(!preferredTargetIncludes(N, visc::SPIR_TARGET)) {
+    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+    return;
+  }
+
+  // Checking which node is the kernel launch
+  DFNode* PNode = N->getParent();
+  int pLevel = PNode->getLevel();
+  int pReplFactor = PNode->getNumOfDim();
+
+  // Choose parent node as kernel launch if:
+  // (1) Parent is the top level node i.e., Root of DFG
+  //                    OR
+  // (2) Parent does not have multiple instances
+  errs() << "pLevel = " << pLevel << "\n";
+  errs() << "pReplFactor = " << pReplFactor << "\n";
+
+  if (!pLevel || !pReplFactor) {
+    errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n";
+    KernelLaunchNode = PNode;
+    errs() << "Setting Kernel Launch Node\n";
+    // 1-level case: the leaf node's own dimensions define the launch geometry.
+    kernel = new Kernel(NULL,
+                        N,
+                        N->getInArgMap(),
+                        N->getSharedInArgMap(),
+                        N->getOutArgMap(),
+                        N->getNumOfDim(),
+                        N->getDimLimits());
+  }
+  else {
+    // Converting a 2-level DFG to opencl kernel
+    errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n";
+    KernelLaunchNode = PNode->getParent();
+    assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
+    // Contains the instructions generating the kernel configuration parameters
+    kernel = new Kernel(NULL,                 // kernel function
+                        N,                    // kernel leaf node
+                        N->getInArgMap(),     // kernel argument mapping
+                        N->getSharedInArgMap(),
+                        N->getOutArgMap(),    // kernel output mapping from the leaf to the intermediate node
+                        PNode->getNumOfDim(), // gridDim
+                        PNode->getDimLimits(),// grid size
+                        N->getNumOfDim(),     // blockDim
+                        N->getDimLimits());   // block size
+
+  }
+
+  // Intrinsic calls to be deleted after their OpenCL replacements are in place.
+  std::vector<IntrinsicInst *> IItoRemove;
+  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+
+  // Look up if we have visited this function before. If we have, then just
+  // get the cloned function pointer from DFNode. Otherwise, create the cloned
+  // function and add it to the DFNode GenFunc.
+  Function *F_spir = N->getGenFuncForTarget(visc::SPIR_TARGET);
+  assert(F_spir == NULL && "Error: Visiting a node for which code already generated");
+
+  // Clone the function
+  ValueToValueMapTy VMap;
+
+  Twine FName = F->getName();
+  F_spir = CloneFunction(F, VMap);
+  F_spir->setName(FName+"_spir");
+  errs() << "Old Function Name: " << F->getName() << "\n";
+  errs() << "New Function Name: " << F_spir->getName() << "\n";
+
+  // Move the clone out of the host module and into the kernels module.
+  F_spir->removeFromParent();
+
+  // Insert the cloned function into the kernels module
+  KernelM->getFunctionList().push_back(F_spir);
+
+  //TODO: Iterate over all the instructions of F_spir and identify the
+  //callees and clone them into this module.
+  DEBUG(errs() << *F_spir->getType());
+  DEBUG(errs() << *F_spir);
+
+  //Add generated function info to DFNode
+  //N->setGenFunc(F_spir, visc::SPIR_TARGET);
+
+  // Kernels return through out-pointer parameters, not a return struct.
+  F_spir = transformFunctionToVoid(F_spir);
+
+  // Add generated function info to DFNode
+  //N->setGenFunc(F_spir, visc::SPIR_TARGET);
+
+  removeInOutAttributes(F_spir);
+
+  //Add generated function info to DFNode
+  N->addGenFunc(F_spir, visc::SPIR_TARGET, false);
+
+  DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n");
+  F_spir->removeAttributes(AttributeSet::FunctionIndex, F_spir->getAttributes().getFnAttributes());
+  F_spir->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
+
+
+  //FIXME: For now, assume only one allocation node
+  kernel->AllocationNode = NULL;
+
+  // Scan incoming edges: the only non-dummy predecessor of a leaf kernel node
+  // is expected to be its (single) allocation node.
+  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
+       ieb != iee; ++ieb) {
+    DFNode *SrcDFNode = (*ieb)->getSourceDF();
+    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
+    if (!SrcDFNode->isDummyNode()) {
+      assert(SrcDFNode->isAllocationNode());
+      kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
+      kernel->allocInArgMap = SrcDFNode->getInArgMap();
+      break;
+    }
+  }
+
+  // Vector for shared memory arguments
+  std::vector<unsigned> SharedMemArgs;
+
+  // If no allocation node was found, SharedMemArgs is empty
+  if (kernel->AllocationNode) {
+
+    ValueToValueMapTy VMap;
+    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
+    //F_alloc->removeFromParent();
+    // Insert the cloned function into the kernels module
+    //M.getFunctionList().push_back(F_alloc);
+
+    // visc_malloc results become shared-memory pointers supplied by the
+    // OpenCL runtime, so the calls themselves are replaced with null.
+    std::vector<IntrinsicInst *> ViscMallocInstVec;
+    findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
+
+    for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
+      IntrinsicInst *II = ViscMallocInstVec[i];
+      assert(II->hasOneUse() && "visc_malloc result is used more than once");
+      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
+      II->eraseFromParent();
+    }
+    kernel->AllocationFunction = F_alloc;
+
+    // This could be used to check that the allocation node has the appropriate
+    // number of fields in its return struct
+/*
+    ReturnInst *RI = ReturnInstVec[0];
+    Value *RetVal = RI->getReturnValue();
+    Type *RetTy = RetVal->getType();
+    StructType *RetStructTy = dyn_cast<StructType>(RetTy);
+    assert(RetStructTy && "Allocation node does not return a struct type");
+    unsigned numFields = RetStructTy->getNumElements();
+*/
+    // Record, per destination argument position, which allocation-node size
+    // value feeds it. Pointer and its size occupy consecutive positions
+    // (destPos, destPos+1); both map to the size at srcPos+1 in the parent.
+    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
+    AllocationNodeProperty* APN =
+      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
+    for (auto& AllocPair: APN->getAllocationList()) {
+      unsigned destPos = AllocPair.first->getDestPosition();
+      unsigned srcPos = AllocPair.first->getSourcePosition();
+      SharedMemArgs.push_back(destPos);
+      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+    }
+    kernel->setSharedInArgMap(sharedInMap);
+  }
+  // changeArgAddrspace expects the position lists sorted ascending.
+  std::sort(SharedMemArgs.begin(), SharedMemArgs.end());
+
+  // All pointer args which are not shared memory pointers have to be moved to
+  // global address space
+  unsigned argIndex = 0;
+  std::vector<unsigned> GlobalMemArgs;
+  for(auto& Arg: F_spir->getArgumentList()) {
+    if (Arg.getType()->isPointerTy()) {
+      // If the argument is already chosen for shared memory argument list, skip.
+      // Else put it in Global memory argument list
+      if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) {
+        GlobalMemArgs.push_back(argIndex);
+      }
+    }
+    argIndex++;
+  }
+  std::sort(GlobalMemArgs.begin(), GlobalMemArgs.end());
+
+  /* At this point, we assume that checks for the fact that SharedMemArgs only
+     contains pointer arguments to GLOBAL_ADDRSPACE have been performed by the
+     analysis pass */
+
+  F_spir = changeArgAddrspace(F_spir, SharedMemArgs, SHARED_ADDRSPACE);
+  removeAttributeAtArguments(F_spir, SharedMemArgs, Attribute::NoCapture);
+  F_spir = changeArgAddrspace(F_spir, GlobalMemArgs, GLOBAL_ADDRSPACE);
+
+
+  // Go through all the instructions
+  for (inst_iterator i = inst_begin(F_spir), e = inst_end(F_spir); i != e; ++i) {
+    Instruction *I = &(*i);
+    // Leaf nodes should not contain VISC graph intrinsics or launch
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+
+    if (BuildDFG::isViscIntrinsic(I)) {
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      IntrinsicInst* ArgII;
+      DFNode* ArgDFNode;
+
+      /************************ Handle VISC Query intrinsics ************************/
+
+      switch (II->getIntrinsicID()) {
+      /**************************** llvm.visc.getNode() *****************************/
+      case Intrinsic::visc_getNode: {
+        DEBUG(errs() << F_spir->getName() << "\t: Handling getNode\n");
+        // add mapping <intrinsic, this node> to the node-specific map
+        Leaf_HandleToDFNodeMap[II] = N;
+        IItoRemove.push_back(II);
+      }
+      break;
+      /************************* llvm.visc.getParentNode() **************************/
+      case Intrinsic::visc_getParentNode: {
+        DEBUG(errs() << F_spir->getName() << "\t: Handling getParentNode\n");
+        // get the parent node of the arg node
+        // get argument node
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // get the parent node of the arg node
+        // Add mapping <intrinsic, parent node> to the node-specific map
+        // the argument node must have been added to the map, or else the
+        // code could not refer to it
+        Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      /*************************** llvm.visc.getNumDims() ***************************/
+      case Intrinsic::visc_getNumDims: {
+        DEBUG(errs() << F_spir->getName() << "\t: Handling getNumDims\n");
+        // get node from map
+        // get the appropriate field
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        int numOfDim = ArgDFNode->getNumOfDim();
+        DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
+        IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext());
+        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+
+        // Replace the result of the intrinsic with the computed value
+        II->replaceAllUsesWith(numOfDimConstant);
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      /*********************** llvm.visc.getNodeInstanceID() ************************/
+      case Intrinsic::visc_getNodeInstanceID_x:
+      case Intrinsic::visc_getNodeInstanceID_y:
+      case Intrinsic::visc_getNodeInstanceID_z: {
+        DEBUG(errs() << F_spir->getName() << "\t: Handling getNodeInstanceID\n");
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        assert(ArgDFNode && "Arg node is NULL");
+        // A leaf node always has a parent
+        DFNode* ParentDFNode = ArgDFNode->getParent();
+        assert(ParentDFNode && "Parent node of a leaf is NULL");
+
+        // Get the number associated with the required dimension
+        // FIXME: The order is important!
+        // These three intrinsics need to be consecutive x,y,z
+        uint64_t dim = II->getIntrinsicID() -
+                       Intrinsic::visc_getNodeInstanceID_x;
+        assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
+        DEBUG(errs() << "\t  dimension = " << dim << "\n");
+
+        // Argument of the function to be called
+        ConstantInt * DimConstant =
+          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        //ArrayRef<Value *> Args(DimConstant);
+
+        // The following is to find which function to call
+        Function * OpenCLFunction;
+        int parentLevel = N->getParent()->getLevel();
+        int parentReplFactor = N->getParent()->getNumOfDim();
+        DEBUG(errs() << "Parent Level = " << parentLevel << "\n");
+        DEBUG(errs() << "Parent Repl factor = " << parentReplFactor << "\n");
+
+        FunctionType* FT =
+            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                              ArrayRef<Type*>(Type::getInt32Ty(KernelM->getContext())),
+                              false);
+
+        if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) {
+          // We only have one level in the hierarchy or the parent node is not
+          // replicated. This indicates that the parent node is the kernel
+          // launch, so we need to specify a global id.
+          // We can translate this only if the argument is the current node
+          // itself
+          DEBUG(errs() << "Substitute with get_global_id()\n");
+          DEBUG(errs() << *II << "\n");
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(getMangledName("get_global_id"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
+          // We are asking for this node's id with respect to its parent
+          // this is a local id call
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(getMangledName("get_local_id"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
+          // We are asking for this node's parent's id with respect to its
+          // parent: this is a group id call
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(getMangledName("get_group_id"), FT));
+        } else {
+          errs() << N->getFuncPointer()->getName() << "\n";
+          errs() << N->getParent()->getFuncPointer()->getName() << "\n";
+          errs() << *II << "\n";
+
+          assert(false && "Unable to translate getNodeInstanceID intrinsic");
+        }
+
+        // Create call instruction, insert it before the intrinsic and truncate
+        // the output to 32 bits and replace all the uses of the previous
+        // instruction with the new one
+        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        II->replaceAllUsesWith(CI);
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      /********************** llvm.visc.getNumNodeInstances() ***********************/
+      case Intrinsic::visc_getNumNodeInstances_x:
+      case Intrinsic::visc_getNumNodeInstances_y:
+      case Intrinsic::visc_getNumNodeInstances_z: {
+//TODO: think about whether this is the best way to go
+// there are hw specific registers. therefore it is good to have the intrinsic
+// but then, why do we need to keep that info in the graph?
+// (only for the kernel configuration during the call)
+
+        DEBUG(errs() << F_spir->getName() << "\t: Handling getNumNodeInstances\n");
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // A leaf node always has a parent
+        DFNode* ParentDFNode = ArgDFNode->getParent();
+        assert(ParentDFNode && "Parent node of a leaf is NULL");
+
+        // Get the number associated with the required dimension
+        // FIXME: The order is important!
+        // These three intrinsics need to be consecutive x,y,z
+        uint64_t dim = II->getIntrinsicID() -
+                       Intrinsic::visc_getNumNodeInstances_x;
+        assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
+        DEBUG(errs() << "\t  dimension = " << dim << "\n");
+
+        // Argument of the function to be called
+        ConstantInt * DimConstant =
+          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        //ArrayRef<Value *> Args(DimConstant);
+
+        // The following is to find which function to call
+        Function * OpenCLFunction;
+        int parentLevel = ParentDFNode->getLevel();
+        int parentReplFactor = ParentDFNode->getNumOfDim();
+
+        FunctionType* FT =
+            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                              Type::getInt32Ty(KernelM->getContext()),
+                              false);
+        if ((N == ArgDFNode) && (!parentLevel || !parentReplFactor)) {
+          // We only have one level in the hierarchy or the parent node is not
+          // replicated. This indicates that the parent node is the kernel
+          // launch, so the instances are global_size (gridDim x blockDim)
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(getMangledName("get_global_size"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
+          // We are asking for this node's instances
+          // this is a local size (block dim) call
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(getMangledName("get_local_size"), FT));
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
+          // We are asking for this node's parent's instances
+          // this is a (global_size/local_size) (grid dim) call
+          OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(getMangledName("get_num_groups"), FT));
+        } else {
+          assert(false && "Unable to translate getNumNodeInstances intrinsic");
+        }
+
+        // Create call instruction, insert it before the intrinsic and truncate
+        // the output to 32 bits and replace all the uses of the previous
+        // instruction with the new one
+        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        II->replaceAllUsesWith(CI);
+
+        IItoRemove.push_back(II);
+      }
+      break;
+      case Intrinsic::visc_barrier:
+      {
+        DEBUG(errs() << F_spir->getName() << "\t: Handling barrier\n");
+        DEBUG(errs() << "Substitute with barrier()\n");
+        DEBUG(errs() << *II << "\n");
+        // barrier(CLK_LOCAL_MEM_FENCE): the i32 argument 1 is the OpenCL
+        // local-memory fence flag.
+        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()),
+                              std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())),
+                              false);
+        Function* OpenCLFunction = cast<Function>
+                           (KernelM->getOrInsertFunction(getMangledName("barrier"), FT));
+        CallInst* CI = CallInst::Create(OpenCLFunction,
+                               ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)),
+                               "", II);
+        II->replaceAllUsesWith(CI);
+        IItoRemove.push_back(II);
+      }
+      break;
+      case Intrinsic::visc_atomic_cmpxchg:
+      case Intrinsic::visc_atomic_add:
+      case Intrinsic::visc_atomic_sub:
+      case Intrinsic::visc_atomic_xchg:
+      case Intrinsic::visc_atomic_min:
+      case Intrinsic::visc_atomic_umin:
+      case Intrinsic::visc_atomic_max:
+      case Intrinsic::visc_atomic_umax:
+      case Intrinsic::visc_atomic_and:
+      case Intrinsic::visc_atomic_or:
+      case Intrinsic::visc_atomic_xor:
+      case Intrinsic::visc_atomic_inc:
+      case Intrinsic::visc_atomic_dec:
+      {
+        DEBUG(errs() << *II << "\n");
+        // Only have support for i32 atomic intrinsics
+        assert(II->getType() == Type::getInt32Ty(II->getContext())
+            && "Only support i32 atomic intrinsics for now");
+        // Substitute with appropriate atomic builtin
+        assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics");
+
+        Value* Ptr = II->getArgOperand(0);
+        Value* Val = II->getArgOperand(1);
+        assert(Ptr->getType()->isPointerTy()
+            && "First argument of supported atomics is expected to be a pointer");
+        PointerType* PtrTy = cast<PointerType>(Ptr->getType());
+        // Normalize the pointer operand to i32* in its own address space so it
+        // matches the builtin's signature.
+        if(PtrTy != Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) {
+          Ptr = CastInst::CreatePointerCast(Ptr,
+                                            Type::getInt32PtrTy(II->getContext(),
+                                            PtrTy->getAddressSpace()), "", II);
+        }
+
+        StringRef name = getAtomicOpName(II->getIntrinsicID(), PtrTy->getAddressSpace());
+
+        Type* paramTypes[] = { Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()),
+                               Type::getInt32Ty(KernelM->getContext())
+                             };
+        FunctionType* AtomicFT = FunctionType::get(II->getType(),
+                                 ArrayRef<Type*>(paramTypes, 2),
+                                 false);
+        Function* AtomicFunction = cast<Function>
+                                   (KernelM->getOrInsertFunction(name, AtomicFT));
+        Value* atomicArgs[] = { Ptr, Val };
+        CallInst* AtomicInst = CallInst::Create(AtomicFunction,
+                                               ArrayRef<Value*>(atomicArgs, 2),
+                                               "", II);
+
+        DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n");
+        II->replaceAllUsesWith(AtomicInst);
+        IItoRemove.push_back(II);
+      }
+      break;
+      default:
+        assert(false && "Unknown VISC Intrinsic!");
+        break;
+      }
+
+    }
+    else if(CallInst* CI = dyn_cast<CallInst>(I)) {
+      DEBUG(errs() << "Found a call: " << *CI << "\n");
+      Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
+      if(calleeF->isDeclaration()) {
+        // Add the declaration to kernel module
+        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
+        KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
+        if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
+          // Now handle a few specific intrinsics
+          // For now, sin and cos are translated to their libclc equivalent
+          switch(II->getIntrinsicID()) {
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+          case Intrinsic::sqrt:
+          case Intrinsic::floor:
+          case Intrinsic::nvvm_rsqrt_approx_f:
+          {
+            DEBUG(errs() << "Found math function: " << *II << "\n");
+            // Get the builtin function
+            // SPIR uses mangled name for builtin math functions
+            assert(II->getType()->isFloatTy()
+                   && "Only handling sin(float) and cos(float)!");
+            std::string name = getMathFunctionName(II->getIntrinsicID());
+
+            FunctionType* MathFT = FunctionType::get(II->getType(),
+                                   Type::getFloatTy(KernelM->getContext()),
+                                   false);
+            Function* MathFunction = cast<Function>
+                                     (KernelM->getOrInsertFunction(name, MathFT));
+            CallInst* CI = CallInst::Create(MathFunction, II->getArgOperand(0), II->getName(), II);
+
+            II->replaceAllUsesWith(CI);
+            IItoRemove.push_back(II);
+            break;
+          }
+          default:
+            DEBUG(errs() << "[WARNING] Found Intrinsic: " << *II << "\n" );
+          }
+        }
+
+      }
+      else {
+        // Clone the function
+        ValueToValueMapTy VMap;
+        Function* newCalleeF = CloneFunction(calleeF, VMap);
+        newCalleeF->removeFromParent(); //TODO: MARIA check
+        KernelM->getFunctionList().push_back(newCalleeF);
+      }
+      //TODO: how to handle address space qualifiers in load/store
+    }
+
+  }
+
+  // We need to do this explicitly: DCE pass will not remove them because we
+  // have assumed the worst memory behaviour for these function calls
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
+       re = IItoRemove.rend(); ri != re; ++ri)
+    (*ri)->eraseFromParent();
+
+  addCLMetadata(F_spir);
+  kernel->KernelFunction = F_spir;
+  errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
+  DEBUG(errs() << *KernelM);
+
+  return;
+}
+
+// Pass entry point. Runs the SPIR code-generation visitor over every dataflow
+// graph root produced by the BuildDFG analysis, then writes the collected
+// kernels out as a separate module. Always reports the module as modified.
+bool DFG2LLVM_SPIR::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_SPIR PASS\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  //    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+  //    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_SPIR *CGTVisitor = new CGT_SPIR(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+  }
+
+  // This is not required. Intrinsics that do not have a use are not a problem
+  //CGTVisitor->removeLLVMIntrinsics();
+  CGTVisitor->writeKernelsModule();
+
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+  delete CGTVisitor;
+
+  return true;
+}
+
+// Build the on-disk name of the generated kernels module: the host module's
+// identifier with a ".kernels.ll" suffix appended.
+std::string CGT_SPIR::getKernelsModuleName(Module &M) {
+  std::string kernelsFile = M.getModuleIdentifier();
+  kernelsFile += ".kernels.ll";
+  return kernelsFile;
+}
+
+// Rewrite the address space of pointer value V to 'addrspace' in place, then
+// recurse into every user of V that itself produces a pointer in V's original
+// address space (e.g. GEPs and casts), so derived pointers follow the producer.
+// NOTE(review): mutateType changes the type without rebuilding instructions;
+// this relies on the recursion reaching every dependent pointer — confirm for
+// new instruction kinds.
+void CGT_SPIR::fixValueAddrspace(Value* V, unsigned addrspace) {
+  assert(isa<PointerType>(V->getType())
+         && "Value should be of Pointer Type!");
+  PointerType* OldTy = cast<PointerType>(V->getType());
+  PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+  V->mutateType(NewTy);
+  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
+    // Change all uses producing pointer type in same address space to new
+    // addressspace.
+    if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
+      if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+        fixValueAddrspace(*ui, addrspace);
+      }
+    }
+  }
+}
+
+// Move the arguments of F whose positions appear in Args into the given
+// address space, then clone F with the updated signature and substitute the
+// clone for F in its module. Returns the replacement function.
+// Assumes Args is sorted ascending (callers sort before invoking — see the
+// std::sort calls in codeGen); positions are matched with a single forward
+// scan over the argument list.
+Function* CGT_SPIR::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
+  unsigned idx = 0;
+  std::vector<Type*> ArgTypes;
+  for(auto& arg: F->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    unsigned argno = arg.getArgNo();
+    if ((idx < Args.size()) && (argno == Args[idx])) {
+      // Mutates the argument (and its dependent pointers) in place.
+      fixValueAddrspace(&arg, addrspace);
+      idx++;
+    }
+    ArgTypes.push_back(arg.getType());
+  }
+  // Rebuild the function type from the (possibly mutated) argument types.
+  FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
+
+  //F->mutateType(PTy);
+  Function* newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+  DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
+  return newF;
+}
+
+/* Remove the specified attribute (attrKind) from the arguments of F at the
+ * positions listed in Args (assumed sorted ascending). Used to strip NoCapture
+ * from shared-memory pointer arguments after their address space is changed. */
+void CGT_SPIR::removeAttributeAtArguments(Function* F, std::vector<unsigned> &Args, Attribute::AttrKind attrKind) {
+  DEBUG(errs() << "Removing nocapture attribute from shared memory arguments of function " << F->getName() << "\n");
+
+  unsigned cnt = 0, arg_no = 0;
+  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae && arg_no < Args.size(); ++ai, ++cnt) {
+
+    if (Args[arg_no] == cnt) {
+      // Rebuild this argument's attribute set minus attrKind; attribute
+      // indices are offset by 1 because index 0 denotes the return value.
+      AttributeSet AS = F->getAttributes();
+      AttrBuilder AB(AS, ai->getArgNo()+1);
+      AB.removeAttribute(attrKind);
+      AttributeSet argAS = AttributeSet::get(F->getContext(), ai->getArgNo()+1, AB);
+      F->removeAttributes(1+ai->getArgNo(), AS.getParamAttributes(ai->getArgNo() + 1));
+      F->addAttributes(1+ai->getArgNo(), argAS);
+
+      arg_no++;
+    }
+  }
+}
+
+/* Add metadata to module KernelM, for OpenCL kernels */
+// Attaches an "opencl.kernels" named-metadata operand for F, consisting of the
+// function itself plus a "kernel_arg_type" tuple listing the printed type of
+// each argument. This is what marks F as an OpenCL kernel in the SPIR module.
+void CGT_SPIR::addCLMetadata(Function *F) {
+  // TODO: There is additional metadata used by kernel files but we skip them as
+  // they are not mandatory. In future they might be useful to enable
+  // optimizations
+
+  // (Removed an unused `IRBuilder<> Builder(&*F->begin());` local: it was
+  // never referenced and needlessly dereferenced F->begin().)
+
+  // Create node for "kernel_arg_type"
+  SmallVector<Metadata*,8> argTypeNames;
+  argTypeNames.push_back(MDString::get(KernelM->getContext(), "kernel_arg_type"));
+
+  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+      ai++) {
+    argTypeNames.push_back(MDString::get(KernelM->getContext(), printType(ai->getType())));
+  }
+  // All argument type names are in the vector. Create a metadata node
+  // "kernel_arg_type"
+  MDTuple* KernelArgTypes = MDNode::get(KernelM->getContext(), argTypeNames);
+
+  // Create kernel metadata node containing the kernel function and the
+  // "kernel_arg_type" metadata node created above
+  SmallVector<Metadata*,8> KernelMD;
+  KernelMD.push_back(ValueAsMetadata::get(F));
+  KernelMD.push_back(KernelArgTypes);
+  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+
+  // Create metadata node opencl.kernels. It points to the kernel metadata node
+  NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
+  MDN_kernels->addOperand(MDKernelNode);
+
+  //KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+  // TODO: Replace 1 with the number of the kernel.
+  // Add when support for multiple launches is added
+  //KernelMD.push_back(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1));
+  //MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+  //NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+  //MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+
+}
+
+/* Function to remove all remaining declarations of llvm intrinsics,
+ * as they are not supported in SPIR.
+ */
+// Collects every unused "llvm."-prefixed declaration in the kernel module
+// first, then erases them in a second pass so the module iterator is never
+// invalidated mid-walk.
+void CGT_SPIR::removeLLVMIntrinsics() {
+
+  std::vector<Function*> IntrinsicDecls;
+
+  for (Function &Fn : *KernelM) {
+    if (Fn.isDeclaration() && Fn.getName().startswith("llvm.")) {
+      DEBUG(errs() << "Declaration: " << Fn.getName() << " with " << Fn.getNumUses() <<"uses.\n");
+      assert(Fn.hasNUses(0) && "LLVM intrinsic function still in use");
+      IntrinsicDecls.push_back(&Fn);
+    }
+  }
+
+  for (Function *Fn : IntrinsicDecls) {
+    DEBUG(errs() << "Erasing declaration: " << Fn->getName() <<"\n");
+    Fn->replaceAllUsesWith(UndefValue::get(Fn->getType()));
+    Fn->eraseFromParent();
+  }
+
+}
+
+// Serialize the generated kernels module (KernelM) to disk as LLVM IR, using
+// the name computed by getKernelsModuleName(M). On I/O failure the error is
+// reported to stderr and the (empty) output file is discarded by tool_output_file.
+void CGT_SPIR::writeKernelsModule() {
+
+  // In addition to deleting all other functions, we also want to spice it up a
+  // little bit. Do this now.
+  legacy::PassManager Passes;
+
+  std::error_code EC;
+  tool_output_file Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
+  if (EC) {
+    errs() << EC.message() << "\n";
+  }
+
+  Passes.add(
+      createPrintModulePass(Out.os()));
+
+  Passes.run(*KernelM);
+
+  // Declare success. Without keep(), tool_output_file deletes the file on exit.
+  Out.keep();
+}
+
+// Convert F, whose return type must be a struct, into an equivalent void
+// function:
+//  - empty return struct: each 'ret' is simply replaced with 'ret void';
+//  - non-empty struct: one pointer parameter ("ret_arg") is appended per
+//    struct element, each returned element is stored through the matching
+//    pointer right before the return, and the return becomes 'ret void'.
+// Finally F is cloned with the new void function type and the clone replaces
+// F in the module (replaceNodeFunctionInIR); the clone is returned.
+Function* CGT_SPIR::transformFunctionToVoid(Function* F) {
+
+  // FIXME: Maybe do that using the Node?
+  StructType* FRetTy = cast<StructType>(F->getReturnType());
+  // cast<> already asserts on a mismatched type; this restates the invariant.
+  assert(FRetTy && "Return Type must always be a struct");
+
+  // Keeps return statements, because we will need to replace them
+  std::vector<ReturnInst *> RItoRemove;
+  findReturnInst(F, RItoRemove);
+
+
+  // Check for { } return struct, which means that the function returns void
+  if (FRetTy->isEmptyTy()) {
+
+    DEBUG(errs() << "\tFunction output struct is void\n");
+    DEBUG(errs() << "\tNo parameters added\n");
+
+    // Replacing return statements with others returning void
+    for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(),
+         e = RItoRemove.end(); i != e; ++i) {
+      // Insert 'ret void' before the old return, then erase the old one.
+      ReturnInst::Create((F->getContext()), 0, (*i));
+      (*i)->eraseFromParent();
+    }
+    DEBUG(errs() << "\tChanged return statements to return void\n");
+  }
+  else {
+    // The struct has return values, thus needs to be converted to parameter
+
+    // Iterate over all element types of return struct and add arguments to the
+    // function
+    std::vector<Argument*> Args;
+    for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
+      Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+      Args.push_back(RetArg);
+      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+    }
+
+    // NOTE(review): ai/ae are declared but never used in this scope --
+    // presumably left over from an earlier revision.
+    Function::arg_iterator ai, ae;
+
+    DEBUG(errs() << "\tReplacing Return statements\n");
+    // Replace return statements with extractValue and store instructions
+    for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(),
+         rie = RItoRemove.end(); rii != rie; ++rii) {
+      ReturnInst* RI = (*rii);
+      Value* RetVal = RI->getReturnValue();
+      // Extract each struct element from the returned value and store it
+      // through the matching new output-pointer argument, before the return.
+      for(unsigned i = 0; i < Args.size(); i++) {
+        ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
+                                    Args[i]->getName()+".val", RI);
+        new StoreInst(EI, Args[i], RI);
+      }
+      // assert(RetVal && "Return value should not be null at this point");
+      // StructType* RetType = cast<StructType>(RetVal->getType());
+      // assert(RetType && "Return type is not a struct");
+
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+
+    }
+  }
+  DEBUG(errs() << "\tReplaced return statements\n");
+
+  // Create the argument type list with the added argument's type
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+
+  // Adding new arguments to the function argument list, would not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments
+  Type* VoidRetType = Type::getVoidTy(F->getContext());
+  FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+  // Change the function type
+  //F->mutateType(PTy);
+  Function* newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+  return newF;
+}
+
+// Remove the visc in/out attributes from kernel function
+// Strips the custom VISC In/Out/InOut parameter attributes from every
+// argument of F. Attribute indices are 1-based (index 0 is the return
+// value), hence the +1 offsets throughout.
+void CGT_SPIR::removeInOutAttributes(Function* F) {
+  DEBUG(errs() << "Removing visc attributes from argument list of function " << F->getName() << "\n");
+  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ai++) {
+
+    // Rebuild this argument's attribute set without the VISC attributes,
+    // then swap it in: drop the old per-argument set, add the filtered one.
+    AttributeSet AS = F->getAttributes();
+    AttrBuilder AB(AS, ai->getArgNo()+1);
+    AB.removeAttribute(Attribute::In);
+    AB.removeAttribute(Attribute::Out);
+    AB.removeAttribute(Attribute::InOut);
+    AttributeSet argAS = AttributeSet::get(F->getContext(), ai->getArgNo()+1, AB);
+    F->removeAttributes(1+ai->getArgNo(), AS.getParamAttributes(ai->getArgNo() + 1));
+    F->addAttributes(1+ai->getArgNo(), argAS);
+
+  }
+}
+
+/******************************************************************************
+ *                              Helper functions                              *
+ ******************************************************************************/
+
+// Calculate execute node parameters which include: number of dimensions for
+// dynamic instances of the kernel, and local and global work group sizes.
+// Results are returned through the reference parameters workDim, LocalWGPtr
+// and GlobalWGPtr. VMap maps original values into the generated function;
+// IB is the instruction before which any new IR is inserted.
+static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
+                                 &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
+
+  // Assign number of dimensions a constant value
+  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+  // If no local work group size was specified, pass a null i64* instead of
+  // a stack-allocated size array.
+  if(!kernel->hasLocalWG()) {
+    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+  }
+  else {
+    // Sizes that are function arguments must be remapped into the generated
+    // function before being used there.
+    for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if(isa<Argument>(kernel->localWGSize[i]))
+        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+    }
+    LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+  }
+
+  for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+    if(isa<Argument>(kernel->globalWGSize[i]))
+      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+  }
+
+  // For OpenCL, global work group size is the total number of instances in
+  // each dimension. So, multiply local and global dim limits.
+  std::vector<Value*> globalWGSizeInsts;
+  if(kernel->hasLocalWG()) {
+    for (unsigned i = 0; i < kernel->gridDim; i++) {
+      BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
+      globalWGSizeInsts.push_back(MulInst);
+    }
+  }
+  else {
+    globalWGSizeInsts = kernel->globalWGSize;
+  }
+  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+}
+
+// CodeGen for allocating space for Work Group on stack and returning a
+// pointer to its address: emits an alloca of [#dim x i64], stores each
+// dimension's size into it (widening to i64 if needed), and returns an
+// i64* to the first element.
+static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
+  Value* WGPtr;
+  // Get int64_t, for ease of use
+  Type* Int64Ty = Type::getInt64Ty(M.getContext());
+
+  // Work Group type is [#dim x i64]
+  Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
+  // Allocate space of Global work group data on stack and get pointer to
+  // first element.
+  AllocaInst* WG = new AllocaInst(WGTy, WGName, IB);
+  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
+  Value* nextDim = WGPtr;
+  DEBUG(errs() << *WGPtr << "\n");
+
+  // Iterate over the number of dimensions and store the global work group
+  // size in that dimension
+  for(unsigned i=0; i < WGSize.size(); i++) {
+    assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+
+    if(WGSize[i]->getType() != Int64Ty) {
+      // If number of dimensions are mentioned in any other integer format,
+      // generate code to extend it to i64. We need to use the mapped value in
+      // the new generated function, hence the use of VMap
+      // NOTE(review): CreateIntegerCast is called with isSigned=true, i.e.
+      // a *sign* extension, although the DEBUG text says "zero extend" --
+      // confirm which is intended for size values.
+      // FIXME: Why are we changing the kernel WGSize vector here?
+      DEBUG(errs() << "Not i64. Zero extend required.\n");
+      DEBUG(errs() << *WGSize[i] << "\n");
+      CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+      DEBUG(errs() << "Bitcast done.\n");
+      StoreInst* SI = new StoreInst(CI, nextDim, IB);
+      DEBUG(errs() << "Zero extend done.\n");
+      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+    } else {
+      // Store the value representing work group size in ith dimension on
+      // stack
+      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
+
+      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+    }
+    if(i+1 < WGSize.size()) {
+      // Move to next dimension (GEP one i64 forward from the current slot)
+      GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
+                               ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
+                               WG->getName()+"."+Twine(i+1),
+                               IB);
+      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+      nextDim = GEP;
+    }
+  }
+  return WGPtr;
+
+}
+
+// Name of the generated SPIR kernels bitcode file: "<module id>.kernels.bc".
+static std::string getSPIRFilename(const Module& M) {
+  std::string kernelsFile = M.getModuleIdentifier();
+  kernelsFile += ".kernels.bc";
+  return kernelsFile;
+}
+
+// Strip the directory part of the module identifier to obtain the input
+// file name. When no '/' is present, find_last_of yields npos and
+// npos + 1 == 0, so the whole identifier is returned unchanged.
+static std::string getFilenameFromModule(const Module& M) {
+  const std::string moduleID = M.getModuleIdentifier();
+  const std::string::size_type lastSlash = moduleID.find_last_of("/");
+  return moduleID.substr(lastSlash + 1);
+}
+
+// Changes the data layout of the Module to be compiled with SPIR backend
+// TODO: Figure out when to call it, probably after duplicating the modules
+static void changeDataLayout(Module &M) {
+  const std::string spir64LayoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024";
+  M.setDataLayout(StringRef(spir64LayoutStr));
+}
+
+// Retarget the module to the 64-bit SPIR triple.
+static void changeTargetTriple(Module &M) {
+  const std::string spir64Triple = "spir64-unknown-unknown";
+  M.setTargetTriple(StringRef(spir64Triple));
+}
+
+// Helper function: render a Type as its textual LLVM IR representation.
+static std::string printType(Type* ty) {
+  std::string buffer;
+  raw_string_ostream stream(buffer);
+  ty->print(stream);
+  return stream.str();
+}
+
+// Helper function to get the Itanium-mangled name of an OpenCL builtin that
+// takes a single 'unsigned int' ("j") argument, e.g.
+// "get_global_id" -> "_Z13get_global_idj".
+//
+// Returns std::string by value (not StringRef): the original returned a
+// StringRef into the temporary std::string produced by Twine::str(), which
+// dangled as soon as the function returned; it also stored a Twine built
+// from temporaries, which LLVM's Twine documentation forbids. std::string
+// converts implicitly to StringRef, so call sites are unaffected.
+static std::string getMangledName(std::string name) {
+  return "_Z" + std::to_string(name.size()) + name + "j";
+}
+
+
+// Helper function: collect every ReturnInst of F into ReturnInstVec, in
+// instruction-iterator order.
+static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
+  for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+    if (ReturnInst* Ret = dyn_cast<ReturnInst>(&*it))
+      ReturnInstVec.push_back(Ret);
+  }
+}
+
+// Helper function: collect every call to the intrinsic 'IntrinsicID' inside
+// F into IntrinsicInstVec, in instruction-iterator order.
+static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
+  for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+    IntrinsicInst* Candidate = dyn_cast<IntrinsicInst>(&*it);
+    if (Candidate && Candidate->getIntrinsicID() == IntrinsicID)
+      IntrinsicInstVec.push_back(Candidate);
+  }
+}
+
+// Helper function to get mangled names of OpenCL built ins for atomics:
+// a pointer into address space 'addrspace' plus an unsigned int argument,
+// e.g. ("atom_add", 1) -> "_Z8atom_addPU3AS1jj".
+// The 'sign' flag is kept for interface compatibility but is currently
+// unused; the signed ("ii") suffix remains disabled below, as before.
+//
+// Returns std::string by value (not StringRef): the original returned a
+// StringRef into the temporary std::string built by Twine::str(), which
+// dangled on return, and stored a Twine of temporaries (forbidden by LLVM's
+// Twine documentation). std::string converts implicitly to StringRef.
+static std::string getAtomicMangledName(std::string name, unsigned addrspace, bool sign) {
+  return "_Z" + std::to_string(name.size()) + name +
+         "PU3AS" + std::to_string(addrspace) + "jj";
+//       ((sign) ? "ii" : "jj");
+}
+
+// Helper function: returns the mangled OpenCL builtin name corresponding to
+// a VISC atomic intrinsic. Unsigned min/max map to the same atom_min /
+// atom_max builtins with an unsigned mangling; unknown IDs are a compiler
+// bug (llvm_unreachable).
+//
+// Returns std::string by value: the mangled name is a temporary, so the
+// original's StringRef return value dangled as soon as this function
+// returned. std::string converts implicitly to StringRef at call sites.
+static std::string getAtomicOpName(Intrinsic::ID ID, unsigned addrspace) {
+  switch(ID) {
+    case Intrinsic::visc_atomic_cmpxchg:
+      return getAtomicMangledName("atom_cmpxchg", addrspace, true);
+    case Intrinsic::visc_atomic_add:
+      return getAtomicMangledName("atom_add", addrspace, true);
+    case Intrinsic::visc_atomic_sub:
+      return getAtomicMangledName("atom_sub", addrspace, true);
+    case Intrinsic::visc_atomic_min:
+      return getAtomicMangledName("atom_min", addrspace, true);
+    case Intrinsic::visc_atomic_umin:
+      return getAtomicMangledName("atom_min", addrspace, false);
+    case Intrinsic::visc_atomic_max:
+      return getAtomicMangledName("atom_max", addrspace, true);
+    case Intrinsic::visc_atomic_umax:
+      return getAtomicMangledName("atom_max", addrspace, false);
+    case Intrinsic::visc_atomic_inc:
+      return getAtomicMangledName("atom_inc", addrspace, true);
+    case Intrinsic::visc_atomic_dec:
+      return getAtomicMangledName("atom_dec", addrspace, true);
+    case Intrinsic::visc_atomic_xchg:
+      return getAtomicMangledName("atom_xchg", addrspace, true);
+    case Intrinsic::visc_atomic_and:
+      return getAtomicMangledName("atom_and", addrspace, true);
+    case Intrinsic::visc_atomic_or:
+      return getAtomicMangledName("atom_or", addrspace, true);
+    case Intrinsic::visc_atomic_xor:
+      return getAtomicMangledName("atom_xor", addrspace, true);
+    default:
+      llvm_unreachable("Unsupported atomic intrinsic!");
+  };
+}
+
+// Map an LLVM/NVVM math intrinsic to the Itanium-mangled name of the
+// corresponding single-precision OpenCL builtin (trailing 'f' = one float
+// argument). Unknown IDs are a compiler bug (llvm_unreachable).
+static std::string getMathFunctionName(Intrinsic::ID ID) {
+  switch(ID) {
+    case Intrinsic::sin: return "_Z3sinf";
+    case Intrinsic::cos: return "_Z3cosf";
+    case Intrinsic::sqrt: return "_Z4sqrtf";
+    case Intrinsic::floor: return "_Z5floorf";
+    case Intrinsic::nvvm_rsqrt_approx_f: return "_Z5rsqrtf";
+    default:
+      llvm_unreachable("Unsupported math function!");
+  };
+}
+
+} // End of namespace
+
+char DFG2LLVM_SPIR::ID = 0;
+// NOTE(review): RegisterPass's third parameter is 'CFGOnly' (pass only
+// looks at the CFG) and the fourth is 'is_analysis'; the original inline
+// comments had these meanings swapped. As written, the pass is registered
+// as an analysis (true) -- confirm that is intended for a transformation.
+static RegisterPass<DFG2LLVM_SPIR> X("dfg2llvm-spir",
+                                      "Dataflow Graph to LLVM for SPIR Pass",
+                                      false /* CFGOnly */,
+                                      true /* is_analysis */);
+
diff --git a/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports b/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/DFG2LLVM_SPIR/LLVMBuild.txt b/lib/DFG2LLVM_SPIR/LLVMBuild.txt
new file mode 100644
index 0000000000..72c4de9efd
--- /dev/null
+++ b/lib/DFG2LLVM_SPIR/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/DFG2LLVM_SPIR/LLVMBuild.txt -------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_SPIR
+parent = Transforms
diff --git a/lib/DFG2LLVM_WrapperAPI/CMakeLists.txt b/lib/DFG2LLVM_WrapperAPI/CMakeLists.txt
new file mode 100644
index 0000000000..22c219d0a1
--- /dev/null
+++ b/lib/DFG2LLVM_WrapperAPI/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMDFG2LLVM_WrapperAPI
+  DFG2LLVM_WrapperAPI.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
new file mode 100644
index 0000000000..ecec258dfe
--- /dev/null
+++ b/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
@@ -0,0 +1,1532 @@
+//=== DFG2LLVM_WrapperAPI.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "DFG2LLVM_WrapperAPI"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm-c/Core.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h"
+#include <sstream>
+#include <fstream>
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+using namespace inplacedfg;
+
+namespace {
+
+// -quantization-levels-filename=<path>: required input file holding the
+// PROMISE quantization levels used by this pass.
+cl::opt<std::string> QuantizationInputsFilename(
+  "quantization-levels-filename",
+  cl::desc("<PROMISE quantization levels input file (path)>"),
+  cl::value_desc("filename"),
+  cl::Required);
+
+// -configuration-inputs-filename=<path>: required input file holding the
+// autotuner-generated approximation configurations.
+cl::opt<std::string> ConfigurationInputsFilename(
+  "configuration-inputs-filename",
+  cl::desc("<Autotuner configurations input file (path)>"),
+  cl::value_desc("filename"),
+  cl::Required);
+
+// Helper function declarations
+bool isValidOperandForInPlaceOperation(Value *, Function *, DFNode *,
+                                       InPlaceDFGAnalysis::InPlaceDFGParameter &);
+
+// Helper class declarations
+
+// State machine definition for pattern identification
+
+/* An assumption is made for the Wrapper API input:                           *
+ * a leaf node will contain consecutive operations that will map to a         *
+ * single convolution or fully connected layer, or a single tensor operation. *
+ *                                                                            *
+ * FullyConnectedLayer: Multiply, Add, [Activation]                           *
+ * ConvolutionLayer: Convolution, [Add], [Activation], [Pooling]              */
+
+class AbstractState;
+
+// Drives pattern recognition over a leaf node's intrinsic stream: each
+// intrinsic is handed to the current AbstractState via transition(), and
+// the states accumulate the call arguments (Args) and matched intrinsics
+// (IIs) that codeGen() later uses to emit the wrapper-API runtime call.
+class CodeGenStateMachine {
+private:
+  Module *M;   // Module under transformation
+  Module *RtM; // Runtime module declaring the wrapper-API functions
+
+  std::vector<Value*> Args;               // Arguments collected for codegen
+  std::vector<IntrinsicInst*> IIs;        // Intrinsics matched into a pattern
+  std::vector<IntrinsicInst*> IIs_remove; // Intrinsics to remove
+  AbstractState *current;                 // Current FSM state
+
+public:
+  CodeGenStateMachine(Module *, Module *);
+
+  void setCurrent(AbstractState *s) {
+    current = s;
+  }
+
+  // Feed the next intrinsic (nullptr = end of stream) to the current state.
+  void transition(IntrinsicInst *II);
+
+  Module *getModule() {
+    return M;
+  }
+
+  Module *getRtModule() {
+    return RtM;
+  }
+
+  void addArgument(Value *Arg) {
+    Args.push_back(Arg);
+  }
+
+  void addIntrinsicInst(IntrinsicInst *II) {
+    IIs.push_back(II);
+  }
+
+  void addIntrinsicToRemove(IntrinsicInst *II) {
+    IIs_remove.push_back(II);
+  }
+
+  // NOTE(review): no bounds check; callers must ensure idx < IIs.size().
+  IntrinsicInst *getIntrinsicInstAt(unsigned idx) {
+    return IIs[idx];
+  }
+
+  // Emit the runtime call(s) for the recognized pattern (defined elsewhere
+  // in this file).
+  void codeGen(DFNode *, Function * , const StringRef &,
+               InPlaceDFGAnalysis::InPlaceDFGParameter &);
+
+};
+
+// Base class for the pattern-matching FSM states. Each concrete state
+// implements transition(), which inspects the next intrinsic (nullptr
+// marks end of stream), records arguments/intrinsics on the machine, and
+// installs the successor state.
+class AbstractState {
+public:
+  enum ID
+  {
+    INITIAL_STATE,
+    FULLY_CONNECTED_LAYER_1,
+    FULLY_CONNECTED_LAYER_2,
+    FULLY_CONNECTED_LAYER_3,
+    FULLY_CONNECTED_LAYER,
+    CONVOLUTION_LAYER_1,
+    CONVOLUTION_LAYER_2,
+    CONVOLUTION_LAYER_3,
+    CONVOLUTION_LAYER_4,
+    CONVOLUTION_LAYER,
+    SINGLE_TENSOR_OPERATION,
+    NO_PATTERN,
+  };
+
+protected:
+  enum ID StateID; // Identifies the concrete state; set by each constructor
+
+public:
+  enum ID getStateID() {
+    return StateID;
+  }
+
+  // Consume the next intrinsic (or nullptr at end of stream) and install
+  // the successor state on the machine.
+  virtual void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) = 0;
+  virtual ~AbstractState() {}
+};
+
+// FSM start state: no intrinsic consumed yet. A convolution or mul opens a
+// layer pattern; any other HPVM intrinsic starts a single tensor operation.
+class InitialState : public AbstractState {
+public:
+  InitialState() {
+    StateID = ID::INITIAL_STATE;
+    DEBUG(errs() << "new InitialState\n");
+  }
+  ~InitialState() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM state: a visc_tensor_mul has been seen; expecting the bias add of a
+// fully connected layer.
+class FullyConnectedLayer_1 : public AbstractState {
+public:
+  FullyConnectedLayer_1() {
+    StateID = ID::FULLY_CONNECTED_LAYER_1;
+    DEBUG(errs() << "new FullyConnectedLayer_1\n");
+  }
+  ~FullyConnectedLayer_1() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM state: mul + add seen; an optional activation may follow.
+class FullyConnectedLayer_2 : public AbstractState {
+public:
+  FullyConnectedLayer_2() {
+    StateID = ID::FULLY_CONNECTED_LAYER_2;
+    DEBUG(errs() << "new FullyConnectedLayer_2\n");
+  }
+  ~FullyConnectedLayer_2() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM state: mul + add + activation seen; only end of stream completes the
+// fully connected layer.
+class FullyConnectedLayer_3 : public AbstractState {
+public:
+  FullyConnectedLayer_3() {
+    StateID = ID::FULLY_CONNECTED_LAYER_3;
+    DEBUG(errs() << "new FullyConnectedLayer_3\n");
+  }
+  ~FullyConnectedLayer_3() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM accepting state: a complete FullyConnectedLayer pattern was matched.
+class FullyConnectedLayer : public AbstractState {
+public:
+  FullyConnectedLayer() {
+    StateID = ID::FULLY_CONNECTED_LAYER;
+    DEBUG(errs() << "new FullyConnectedLayer\n");
+  }
+  ~FullyConnectedLayer() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM state: a visc_tensor_convolution has been seen; an optional bias add
+// may follow.
+class ConvolutionLayer_1 : public AbstractState {
+public:
+  ConvolutionLayer_1() {
+    StateID = ID::CONVOLUTION_LAYER_1;
+    DEBUG(errs() << "new ConvolutionLayer_1\n");
+  }
+  ~ConvolutionLayer_1() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM state: convolution + add seen; an optional activation and/or pooling
+// op may follow.
+class ConvolutionLayer_2 : public AbstractState {
+public:
+  ConvolutionLayer_2() {
+    StateID = ID::CONVOLUTION_LAYER_2;
+    DEBUG(errs() << "new ConvolutionLayer_2\n");
+  }
+  ~ConvolutionLayer_2() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM state: convolution + add + activation seen (reached from
+// ConvolutionLayer_2 on tanh/relu/clipped_relu).
+class ConvolutionLayer_3 : public AbstractState {
+public:
+  ConvolutionLayer_3() {
+    StateID = ID::CONVOLUTION_LAYER_3;
+    DEBUG(errs() << "new ConvolutionLayer_3\n");
+  }
+  ~ConvolutionLayer_3() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM state: a pooling intrinsic has been seen in the convolution pattern
+// (reached from ConvolutionLayer_2 on pool_max/pool_min/pool_mean).
+class ConvolutionLayer_4 : public AbstractState {
+public:
+  ConvolutionLayer_4() {
+    StateID = ID::CONVOLUTION_LAYER_4;
+    DEBUG(errs() << "new ConvolutionLayer_4\n");
+  }
+  ~ConvolutionLayer_4() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM accepting state: a complete ConvolutionLayer pattern was matched.
+class ConvolutionLayer : public AbstractState {
+public:
+  ConvolutionLayer() {
+    StateID = ID::CONVOLUTION_LAYER;
+    DEBUG(errs() << "new ConvolutionLayer\n");
+  }
+  ~ConvolutionLayer() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM state: exactly one non-layer HPVM tensor intrinsic has been seen;
+// accepting only if the instruction stream ends here.
+class SingleTensorOperation : public AbstractState {
+public:
+  SingleTensorOperation() {
+    StateID = ID::SINGLE_TENSOR_OPERATION;
+    DEBUG(errs() << "new SingleTensorOperation\n");
+  }
+  ~SingleTensorOperation() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+// FSM sink state: the instruction stream matched no supported pattern.
+class NoPattern : public AbstractState {
+public:
+  NoPattern() {
+    StateID = ID::NO_PATTERN;
+    DEBUG(errs() << "new NoPattern\n");
+  }
+  ~NoPattern() {}
+
+  void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override;
+};
+
+  
+// Start-state transition: classify the first intrinsic of the stream.
+void InitialState::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) {
+  // A null II marks end of stream: stay in the initial state.
+  if (!II)
+    return;
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::visc_tensor_convolution: {
+    // A convolution opens a possible ConvolutionLayer pattern.
+    Mch->addIntrinsicInst(II);
+    Mch->addArgument(II->getOperand(0)); // conv input
+    Mch->addArgument(II->getOperand(1)); // conv kernel
+    Mch->setCurrent(new ConvolutionLayer_1());
+    break;
+  }
+  case Intrinsic::visc_tensor_mul: {
+    // A GEMM opens a possible FullyConnectedLayer pattern.
+    Mch->addIntrinsicInst(II);
+    Mch->addArgument(II->getOperand(0)); // 1st gemm input
+    Mch->addArgument(II->getOperand(1)); // 2nd gemm input
+    Mch->setCurrent(new FullyConnectedLayer_1());
+    break;
+  }
+  case Intrinsic::visc_node_id: {
+    DEBUG(errs() << "\t: Handling __visc_node_id \n");
+    // Emit a runtime call to tensor_set_node_id with the uint32 node ID,
+    // then mark the intrinsic itself for removal and keep scanning.
+    Value *NodeIdOp = II->getOperand(0);
+
+    std::vector<Value*> CallArgs;
+    CallArgs.push_back(NodeIdOp);
+
+    Module *M = Mch->getModule();
+    Module *RtM = Mch->getRtModule();
+
+    Constant* visc_node_id_call =
+      M->getOrInsertFunction(StringRef("tensor_set_node_id"),
+          RtM->getFunction(StringRef("tensor_set_node_id"))->getFunctionType());
+
+    CallInst::Create(visc_node_id_call, CallArgs, "", II);
+
+    Mch->addIntrinsicToRemove(II);
+    Mch->setCurrent(new InitialState());
+    break;
+  }
+  default: {
+    // Any other HPVM intrinsic: treat as a standalone tensor operation.
+    Mch->addIntrinsicInst(II);
+    Mch->setCurrent(new SingleTensorOperation());
+    break;
+  }
+  }
+  delete this;
+}
+
+// Any further intrinsic after a lone tensor operation means the node holds
+// more than one op, which matches no supported pattern.
+void SingleTensorOperation::transition(CodeGenStateMachine *Mch,
+                                       IntrinsicInst *II) {
+  if (!II)
+    return; // End of stream: remain in this (accepting) state.
+  Mch->setCurrent(new NoPattern());
+  delete this;
+}
+
+// After the GEMM: only a bias add (whose first operand is the mul's result)
+// continues the FullyConnectedLayer pattern.
+void FullyConnectedLayer_1::transition(CodeGenStateMachine *Mch,
+                                       IntrinsicInst *II) {
+  if (II && II->getIntrinsicID() == Intrinsic::visc_tensor_add) {
+    // The add must consume the preceding mul's output as its 1st operand.
+    IntrinsicInst *MulII = Mch->getIntrinsicInstAt(0);
+    assert((MulII == II->getOperand(0)) &&
+           "Output of mul must be used as 1st operand of add");
+    Mch->addIntrinsicInst(II);
+
+    Mch->addArgument(II->getOperand(1));     // bias
+
+    Mch->setCurrent(new FullyConnectedLayer_2());
+  } else {
+    // Any other intrinsic -- or end of stream -- breaks the pattern.
+    Mch->setCurrent(new NoPattern());
+  }
+  delete this;
+}
+
+// After mul + add: an optional activation (tanh=0, relu=1, clipped relu=2)
+// advances the pattern; end of stream completes the layer with no
+// activation (-1); anything else breaks the pattern.
+void FullyConnectedLayer_2::transition(CodeGenStateMachine *Mch,
+                                       IntrinsicInst *II) {
+  if (!II) { // End of instruction stream: FC layer without activation.
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+
+    Mch->setCurrent(new FullyConnectedLayer());
+    delete this;
+    return;
+  }
+
+  // Encode the activation kind expected by the wrapper runtime.
+  int activationCode;
+  switch (II->getIntrinsicID()) {
+    case Intrinsic::visc_tensor_tanh:         activationCode = 0; break;
+    case Intrinsic::visc_tensor_relu:         activationCode = 1; break;
+    case Intrinsic::visc_tensor_clipped_relu: activationCode = 2; break;
+    default: // An HPVM intrinsic that is not an activation: no pattern.
+      Mch->setCurrent(new NoPattern());
+      delete this;
+      return;
+  }
+
+  Mch->addArgument(ConstantInt::get(
+                   Type::getInt32Ty(Mch->getModule()->getContext()),
+                   activationCode));
+  Mch->addIntrinsicInst(II);
+
+  Mch->setCurrent(new FullyConnectedLayer_3());
+  delete this;
+}
+
+// After mul + add + activation: only a clean end of stream completes the
+// FullyConnectedLayer pattern; any further intrinsic invalidates it.
+void FullyConnectedLayer_3::transition(CodeGenStateMachine *Mch,
+                                       IntrinsicInst *II) {
+  Mch->setCurrent(II ? static_cast<AbstractState*>(new NoPattern())
+                     : static_cast<AbstractState*>(new FullyConnectedLayer()));
+  delete this;
+}
+
+// Accepting state: any extra intrinsic after a complete FC layer means the
+// node holds more than one layer, which is unsupported.
+void FullyConnectedLayer::transition(CodeGenStateMachine *Mch,
+                                     IntrinsicInst *II) {
+  if (!II)
+    return; // End of stream: the matched pattern stands.
+  Mch->setCurrent(new NoPattern());
+  delete this;
+}
+
+// After the convolution: a bias add (consuming the conv output) continues
+// the pattern; end of stream completes a bias-less, pool-less,
+// activation-less ConvolutionLayer; anything else breaks the pattern.
+void ConvolutionLayer_1::transition(CodeGenStateMachine *Mch,
+                                    IntrinsicInst *II) {
+  if (II) { // Not end of instruction stream
+    switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_add:
+        {
+        // The add's first operand must be the preceding conv's result.
+        IntrinsicInst *ConvII = Mch->getIntrinsicInstAt(0);
+        assert((ConvII == II->getOperand(0)) &&
+               "Output of conv must be used as 1st operand of add");
+        Mch->addIntrinsicInst(II);
+
+        Mch->addArgument(II->getOperand(1));     // bias
+
+        // Forward the convolution's pad/stride operands as call arguments.
+        Mch->addArgument(ConvII->getOperand(2)); // 1st numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(3)); // 2nd numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv
+        Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv
+
+        Mch->setCurrent(new ConvolutionLayer_2());
+        }
+        break;
+      default:
+        Mch->setCurrent(new NoPattern());
+        break;
+    }
+  } else {
+    // No addition: null i8* bias pointer.
+    Mch->addArgument(ConstantPointerNull::get(
+                     Type::getInt8PtrTy(Mch->getModule()->getContext())));
+
+    // Zero for all convolution numeric arguments FIXME???
+    IntrinsicInst *ConvII = Mch->getIntrinsicInstAt(0);
+    Mch->addArgument(ConvII->getOperand(2)); // 1st numeric arg of conv
+    Mch->addArgument(ConvII->getOperand(3)); // 2nd numeric arg of conv
+    Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv
+    Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv
+
+    //    Mch->addArgument(ConstantInt::get(
+    //                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    //    Mch->addArgument(ConstantInt::get(
+    //                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    //    Mch->addArgument(ConstantInt::get(
+    //                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    //    Mch->addArgument(ConstantInt::get(
+    //                     Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+
+    // No pooling
+    // 0 for unused pool arguments:
+    // pool_id, pool_size_v, pool_size_h, pool pad_v,
+    // pool_pad_h, pool_stride_v, pool_stride_h
+    for (int i = 0; i < 7; i++) {
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    }
+    // No activation
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+
+    Mch->setCurrent(new ConvolutionLayer());
+  }
+  delete this;
+}
+
+// After conv + add: accept an optional activation (-> ConvolutionLayer_3),
+// an optional pooling op (-> ConvolutionLayer_4), or end of stream
+// (complete layer with neither). Any other intrinsic breaks the pattern.
+void ConvolutionLayer_2::transition(CodeGenStateMachine *Mch,
+                                    IntrinsicInst *II) {
+  if (II) { // Not end of instruction stream
+    switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_tanh:
+        {
+        // Type of activation : TanH
+        // NOTE(review): unlike the FC path, the activation-id argument is
+        // commented out here; presumably supplied later -- confirm.
+        //        Mch->addArgument(ConstantInt::get(
+        //                         Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_3());
+        }
+        break;
+      case Intrinsic::visc_tensor_relu:
+        {
+        // Type of activation : ReLU
+        //        Mch->addArgument(ConstantInt::get(
+        //                         Type::getInt32Ty(Mch->getModule()->getContext()), 1));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_3());
+        }
+        break;
+      case Intrinsic::visc_tensor_clipped_relu:
+        {
+        // Type of activation : Clipped ReLU
+        //        Mch->addArgument(ConstantInt::get(
+        //                         Type::getInt32Ty(Mch->getModule()->getContext()), 2));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_3());
+        }
+        break;
+      case Intrinsic::visc_tensor_pool_max:
+        {
+        // pool max (pool id 0)
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+        // pool_size_v, pool_size_h, pool pad_v,
+        // pool_pad_h, pool_stride_v, pool_stride_h
+        for (int i = 1; i < 7; i++) {
+            Mch->addArgument(II->getOperand(i));
+        }
+        // No activation
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_4());
+        }
+        break;
+      case Intrinsic::visc_tensor_pool_min:
+        {
+        // pool min FIXME: 2: supported?
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 2));
+        // pool_size_v, pool_size_h, pool pad_v,
+        // pool_pad_h, pool_stride_v, pool_stride_h
+        for (int i = 1; i < 7; i++) {
+            Mch->addArgument(II->getOperand(i));
+        }
+        // No activation
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_4());
+        }
+        break;
+      case Intrinsic::visc_tensor_pool_mean:
+        {
+        // pool mean (pool id 1)
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 1));
+        // pool_size_v, pool_size_h, pool pad_v,
+        // pool_pad_h, pool_stride_v, pool_stride_h
+        for (int i = 1; i < 7; i++) {
+            Mch->addArgument(II->getOperand(i));
+        }
+        // No activation
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+        Mch->addIntrinsicInst(II);
+
+        Mch->setCurrent(new ConvolutionLayer_4());
+        }
+        break;
+      default: // No activation, No pooling, but HPVM intrinsic
+        Mch->setCurrent(new NoPattern());
+        break;
+    }
+  } else { // End of instruction stream
+    // No pooling
+    // 0 for unused pool arguments:
+    // pool_id, pool_size_v, pool_size_h, pool pad_v,
+    // pool_pad_h, pool_stride_v, pool_stride_h
+    for (int i = 0; i < 7; i++) {
+        Mch->addArgument(ConstantInt::get(
+                         Type::getInt32Ty(Mch->getModule()->getContext()), 0));
+    }
+    // No activation
+    Mch->addArgument(ConstantInt::get(
+                     Type::getInt32Ty(Mch->getModule()->getContext()), -1));
+
+    Mch->setCurrent(new ConvolutionLayer());
+  }
+  delete this;
+}
+
+// State after conv + bias-add + activation: the next intrinsic may be a
+// pooling op (go to ConvolutionLayer_4) or end of stream (accept without
+// pooling). In both cases the recorded activation's id is appended last.
+void ConvolutionLayer_3::transition(CodeGenStateMachine *Mch,
+                                    IntrinsicInst *II) {
+  Type *Int32Ty = Type::getInt32Ty(Mch->getModule()->getContext());
+
+  // Revisit the activation intrinsic recorded at position 2 and append its
+  // operation id: 0 = TanH, 1 = ReLU, 2 = Clipped ReLU. The previous
+  // state's switch guarantees it is one of these three.
+  auto addActivationArgument = [&]() {
+    IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2);
+    Intrinsic::ID ActIID = ActII->getIntrinsicID();
+    int ActID = 2; // Clipped ReLU
+    if (ActIID == Intrinsic::visc_tensor_tanh)
+      ActID = 0;
+    else if (ActIID == Intrinsic::visc_tensor_relu)
+      ActID = 1;
+    Mch->addArgument(ConstantInt::get(Int32Ty, ActID));
+  };
+
+  if (!II) { // End of instruction stream
+    // No pooling: push 0 for each of the seven unused pool arguments
+    // (pool_id, pool_size_v, pool_size_h, pool_pad_v, pool_pad_h,
+    // pool_stride_v, pool_stride_h).
+    for (unsigned a = 0; a < 7; a++)
+      Mch->addArgument(ConstantInt::get(Int32Ty, 0));
+    addActivationArgument();
+
+    Mch->setCurrent(new ConvolutionLayer());
+    delete this;
+    return;
+  }
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::visc_tensor_pool_max:
+  case Intrinsic::visc_tensor_pool_min:
+  case Intrinsic::visc_tensor_pool_mean: {
+    // Pool ids: 0 = max, 1 = mean, 2 = min (FIXME: is min (2) supported?)
+    int PoolID = 0;
+    if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean)
+      PoolID = 1;
+    else if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_min)
+      PoolID = 2;
+    Mch->addArgument(ConstantInt::get(Int32Ty, PoolID));
+    // pool_size_v, pool_size_h, pool_pad_v,
+    // pool_pad_h, pool_stride_v, pool_stride_h
+    for (unsigned op = 1; op < 7; op++)
+      Mch->addArgument(II->getOperand(op));
+    Mch->addIntrinsicInst(II);
+    addActivationArgument();
+
+    Mch->setCurrent(new ConvolutionLayer_4());
+    break;
+  }
+  default: // No pooling, but HPVM intrinsic
+    Mch->setCurrent(new NoPattern());
+    break;
+  }
+  delete this;
+}
+
+// State after a full conv layer pattern including pooling: only the end of
+// the instruction stream completes the match; any further intrinsic
+// rejects it.
+void ConvolutionLayer_4::transition(CodeGenStateMachine *Mch,
+                                    IntrinsicInst *II) {
+  if (II) { // More intrinsics: sequence no longer fits the pattern
+    Mch->setCurrent(new NoPattern());
+  } else {  // End of instruction stream: pattern accepted
+    Mch->setCurrent(new ConvolutionLayer());
+  }
+  delete this;
+}
+
+// Accepting state for a convolution layer (entered at end of stream).
+// A further intrinsic after acceptance rejects the pattern; otherwise the
+// state persists (it must NOT delete itself, since the machine queries the
+// current state after the stream ends).
+void ConvolutionLayer::transition(CodeGenStateMachine *Mch,
+                                  IntrinsicInst *II) {
+  if (!II)
+    return; // End of instruction stream: remain in this state
+  Mch->setCurrent(new NoPattern());
+  delete this;
+}
+
+// Sink (reject) state: the intrinsic sequence failed to match any supported
+// pattern. All further input, including end of stream, is ignored. Unlike
+// the other states, it does not delete itself, so it remains the machine's
+// current state.
+void NoPattern::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) {}
+
+// Construct a code generation state machine over module _M; _RtM is the
+// tensor runtime module whose wrapper_* functions supply the call
+// signatures. Pattern matching begins in InitialState.
+CodeGenStateMachine::CodeGenStateMachine(Module *_M, Module *_RtM)
+    : M(_M), RtM(_RtM) {
+  current = new InitialState();
+}
+
+// Advance the state machine by one step: delegate to the current state's
+// transition with intrinsic II (nullptr signals end of the intrinsic
+// stream). States install their successor via setCurrent().
+void CodeGenStateMachine::transition(IntrinsicInst *II) {
+  current->transition(this, II);
+}
+
+// Creates a module-level string constant holding strRef and returns a
+// constant GEP expression pointing to its first character. Every wrapper
+// API runtime call takes such a node-name string as its first argument.
+static Constant *createStringGEPConstant(Module *M, const StringRef &strRef) {
+  Constant *ConstArray = ConstantDataArray::getString(M->getContext(),
+                                                      strRef, true);
+  GlobalVariable *GV = new GlobalVariable(*M, ConstArray->getType(),
+                         true, GlobalValue::ExternalLinkage, ConstArray, "");
+  // GEP expression to access the string's first character
+  Constant *Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0);
+  Constant *GEPIndices[] = { Int_0, Int_0 };
+  return ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(),
+                                        GV, GEPIndices);
+}
+
+// Emits the wrapper API runtime call for the matched intrinsic sequence
+// (a fused layer pattern or a single tensor operation), replaces all uses
+// of the pattern's result, and erases the translated intrinsics.
+// N and IPP are currently unused: the in-place operand checks that would
+// use them are disabled (see FIXME comments below).
+void CodeGenStateMachine::codeGen(DFNode *N, Function *F, const StringRef &strRef,
+                                  InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) {
+
+  assert( ( (current->getStateID() == AbstractState::ID::FULLY_CONNECTED_LAYER) ||
+            (current->getStateID() == AbstractState::ID::CONVOLUTION_LAYER)     ||
+            (current->getStateID() == AbstractState::ID::SINGLE_TENSOR_OPERATION) ) &&
+            "Unsupported instruction sequence for the Wrapper API.\n" );
+
+  if ((current->getStateID() == AbstractState::ID::FULLY_CONNECTED_LAYER) ||
+      (current->getStateID() == AbstractState::ID::CONVOLUTION_LAYER)) {
+
+    // Layer Operation.
+    DEBUG(errs() << "Layer Instruction Sequence. Validating ...\n");
+    // We have a valid instruction sequence.
+    // Make sure that the instruction sequence can be translated:
+    // each instruction's result must be used only by the next one in sequence.
+    // Note: p + 1 < IIs.size() (rather than p < IIs.size()-1) avoids
+    // unsigned underflow if IIs is ever empty.
+    for (unsigned p = 0; p + 1 < IIs.size(); p++) {
+      IntrinsicInst *II = IIs[p];
+      assert((II->hasOneUse()) &&
+            "Instruction sequence does not fit pattern: not single use\n");
+
+      Value::user_iterator ui = II->user_begin(); // The only use
+      assert((*ui == IIs[p+1]) &&
+             "Instruction sequence does not fit pattern: not used by next instruction\n");
+    }
+
+    // Create corresponding wrapper API call
+    CallInst *CI;
+    switch (current->getStateID()) {
+      case AbstractState::ID::CONVOLUTION_LAYER:
+        {
+          Constant* wrapper_ConvLayer2 =
+            M->getOrInsertFunction(StringRef("wrapper_ConvLayer2"),
+                   RtM->getFunction(StringRef("wrapper_ConvLayer2"))->getFunctionType());
+
+          DEBUG(errs() << *wrapper_ConvLayer2);
+
+          // FIXME: get last (float) arguments from clipped relu intrinsic. For now, 0
+          Args.push_back(ConstantFP::get(Type::getFloatTy(M->getContext()), (double) 0));
+          Args.push_back(ConstantFP::get(Type::getFloatTy(M->getContext()), (double) 0));
+
+          // Node name string goes first, followed by the collected arguments
+          std::vector<Value*> UpdatedArgs;
+          UpdatedArgs.reserve(Args.size() + 1);
+          UpdatedArgs.push_back(createStringGEPConstant(M, strRef));
+          UpdatedArgs.insert(UpdatedArgs.end(), Args.begin(), Args.end());
+
+          // Create wrapper API function call
+          CI = CallInst::Create(wrapper_ConvLayer2, UpdatedArgs, "");
+        }
+        break;
+      case AbstractState::ID::FULLY_CONNECTED_LAYER:
+        {
+          Constant* wrapper_FCLayer =
+            M->getOrInsertFunction(StringRef("wrapper_FCLayer"),
+                RtM->getFunction(StringRef("wrapper_FCLayer"))->getFunctionType());
+          DEBUG(errs() << *wrapper_FCLayer);
+
+          // FIXME: get last (float) arguments from clipped relu intrinsic. For now, 0
+          Args.push_back(ConstantFP::get(Type::getFloatTy(M->getContext()), (double) 0));
+          Args.push_back(ConstantFP::get(Type::getFloatTy(M->getContext()), (double) 0));
+
+          // Node name string goes first, followed by the collected arguments
+          std::vector<Value*> UpdatedArgs;
+          UpdatedArgs.reserve(Args.size() + 1);
+          UpdatedArgs.push_back(createStringGEPConstant(M, strRef));
+          UpdatedArgs.insert(UpdatedArgs.end(), Args.begin(), Args.end());
+
+          // Create wrapper API function call
+          CI = CallInst::Create(wrapper_FCLayer, UpdatedArgs, "");
+        }
+        break;
+      default:
+        llvm_unreachable("Unexpected CodeGenStateMachine State\n");
+        break;
+    }
+
+    // Insert new call and replace all uses of pattern result with
+    // the wrapper API call
+    IntrinsicInst *IIlast = *(IIs.rbegin());
+    CI->insertBefore(IIlast);
+    IIlast->replaceAllUsesWith(CI);
+
+  }
+  else { // SINGLE_TENSOR_OPERATION
+    assert((IIs.size() == 1) &&
+            "Unexpected size of intrinsics vector in code gen state machine.\n");
+    assert(Args.empty() && "Unexpected arguments found in code gen state machine.\n");
+    IntrinsicInst *TensorII = IIs[0];
+
+    errs() << "TensorII: " << *TensorII << "\n";
+
+    switch (TensorII->getIntrinsicID()) {
+      case Intrinsic::visc_tensor_group_convolution:
+      { /* llvm.visc.tensor.group.conv */
+        // Tensor group conv is not in place.
+        DEBUG(errs() << F->getName() << "\t: Handling tensor group convolution \n");
+
+        // Argument list: node name, operands 0-5, a hard-coded conv mode
+        // of 1, and operand 7.
+        // NOTE(review): operand 6 is not forwarded here - confirm intended.
+        Args.push_back(createStringGEPConstant(M, strRef));
+
+        Args.push_back(TensorII->getOperand(0));
+        Args.push_back(TensorII->getOperand(1));
+        Args.push_back(TensorII->getOperand(2));
+        Args.push_back(TensorII->getOperand(3));
+        Args.push_back(TensorII->getOperand(4));
+        Args.push_back(TensorII->getOperand(5));
+
+        Constant *conv_mode = ConstantInt::get(Type::getInt32Ty(M->getContext()), 1);
+        Args.push_back(conv_mode);
+
+        Args.push_back(TensorII->getOperand(7));
+
+        // Create wrapper API runtime function call
+        Constant* wrapper_tensorGroupConvolution =
+          M->getOrInsertFunction(StringRef("wrapper_tensorGroupConvolution"),
+            RtM->getFunction(StringRef("wrapper_tensorGroupConvolution"))->getFunctionType());
+        CallInst* CI = CallInst::Create(wrapper_tensorGroupConvolution,
+                                        Args, "", TensorII);
+        // Replace the intrinsic's uses with the runtime call result
+        TensorII->replaceAllUsesWith(CI);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_batchnorm:
+      { /* llvm.visc.tensor.batchnorm */
+        // Tensor batchnorm is not in place.
+        // FIXME: Add Check for InPlace Analysis
+        DEBUG(errs() << F->getName() << "\t: Handling tensor batch normalization \n");
+
+        // Argument list: node name, then operands 0-5 forwarded unchanged
+        Args.push_back(createStringGEPConstant(M, strRef));
+
+        Args.push_back(TensorII->getOperand(0));
+        Args.push_back(TensorII->getOperand(1));
+        Args.push_back(TensorII->getOperand(2));
+        Args.push_back(TensorII->getOperand(3));
+        Args.push_back(TensorII->getOperand(4));
+        Args.push_back(TensorII->getOperand(5));
+
+        // Create wrapper API runtime function call
+        Constant* wrapper_tensorBatchNorm =
+          M->getOrInsertFunction(StringRef("wrapper_tensorBatchNorm"),
+            RtM->getFunction(StringRef("wrapper_tensorBatchNorm"))->getFunctionType());
+        CallInst* CI = CallInst::Create(wrapper_tensorBatchNorm,
+                                        Args, "", TensorII);
+        // Replace the intrinsic's uses with the wrapper API call result
+        TensorII->replaceAllUsesWith(CI);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_add:
+      { /* llvm.visc.tensor.add */
+        DEBUG(errs() << F->getName() << "\t: Handling tensorAdd\n");
+
+        // Tensor add(a,b) is in place for argument a.
+        // FIXME: the in-place validity check
+        // (isValidOperandForInPlaceOperation) is currently disabled; the
+        // target is assumed to provide an in-place operation (safe choice).
+
+        // Argument list: node name, then both tensor operands
+        Args.push_back(createStringGEPConstant(M, strRef));
+
+        Args.push_back(TensorII->getOperand(0));
+        Args.push_back(TensorII->getOperand(1));
+
+        // Create wrapper API runtime function call
+        Constant* wrapper_tensorAdd =
+          M->getOrInsertFunction(StringRef("wrapper_tensorAdd"),
+            RtM->getFunction(StringRef("wrapper_tensorAdd"))->getFunctionType());
+        CallInst::Create(wrapper_tensorAdd, Args, "", TensorII);
+        // Due to the in-place operation, the first argument now contains
+        // the result: replace the intrinsic's uses with it
+        TensorII->replaceAllUsesWith(TensorII->getOperand(0));
+      }
+      break;
+
+      case Intrinsic::visc_tensor_pool_max:
+      case Intrinsic::visc_tensor_pool_mean:
+      case Intrinsic::visc_tensor_pool_min:
+      {
+        DEBUG(errs() << F->getName() << "\t: Handling tensor pooling functions\n");
+
+        // Argument list for tensor pooling:
+        // node name, input, poolFunction, window_height, window_width,
+        // vertical_pad, horizontal_pad, vertical_stride, horizontal_stride
+        Args.push_back(createStringGEPConstant(M, strRef));
+
+        Args.push_back(TensorII->getOperand(0));
+
+        // Pool function id: 0 = max, 1 = mean, 2 = min
+        int pool_type = 0;
+        if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean) {
+          pool_type = 1;
+        } else if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_pool_min) {
+          pool_type = 2;
+        }
+
+        Constant *constPoolType =
+          ConstantInt::get(Type::getInt32Ty(M->getContext()), pool_type);
+        Args.push_back(constPoolType);
+
+        Args.push_back(TensorII->getOperand(1));
+        Args.push_back(TensorII->getOperand(2));
+        Args.push_back(TensorII->getOperand(3));
+        Args.push_back(TensorII->getOperand(4));
+        Args.push_back(TensorII->getOperand(5));
+        Args.push_back(TensorII->getOperand(6));
+
+        // Create wrapper API runtime function call
+        Constant* wrapper_tensorPooling =
+          M->getOrInsertFunction(StringRef("wrapper_tensorPooling"),
+            RtM->getFunction(StringRef("wrapper_tensorPooling"))->getFunctionType());
+        DEBUG(errs() << *wrapper_tensorPooling);
+        CallInst* CI = CallInst::Create(wrapper_tensorPooling, Args, "", TensorII);
+
+        // Replacing intrinsic result uses with the result of the tensor runtime operation
+        TensorII->replaceAllUsesWith(CI);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_relu:
+      case Intrinsic::visc_tensor_clipped_relu:
+      case Intrinsic::visc_tensor_tanh:
+      {
+        DEBUG(errs() << F->getName() << "\t: Handling tensor activation functions\n");
+
+        // Tensor relu(a) (and others) is in place for argument a.
+        // FIXME: the in-place validity check
+        // (isValidOperandForInPlaceOperation) is currently disabled; the
+        // target is assumed to provide an in-place operation (safe choice).
+
+        // Argument list: node name, then the input tensor
+        Args.push_back(createStringGEPConstant(M, strRef));
+
+        Args.push_back(TensorII->getOperand(0));
+
+        // Select and create the matching wrapper API runtime function call
+        if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_relu) {
+          Constant* wrapper_tensorRelu =
+            M->getOrInsertFunction(StringRef("wrapper_tensorRelu"),
+              RtM->getFunction(StringRef("wrapper_tensorRelu"))->getFunctionType());
+          DEBUG(errs() << *wrapper_tensorRelu);
+          CallInst::Create(wrapper_tensorRelu, Args, "", TensorII);
+        }
+        else if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu) {
+          Constant* wrapper_tensorClippedRelu =
+            M->getOrInsertFunction(StringRef("wrapper_tensorClippedRelu"),
+              RtM->getFunction(StringRef("wrapper_tensorClippedRelu"))->getFunctionType());
+          DEBUG(errs() << *wrapper_tensorClippedRelu);
+          CallInst::Create(wrapper_tensorClippedRelu, Args, "", TensorII);
+        }
+        else if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_tanh) {
+          Constant* wrapper_tensorTanh =
+            M->getOrInsertFunction(StringRef("wrapper_tensorTanh"),
+              RtM->getFunction(StringRef("wrapper_tensorTanh"))->getFunctionType());
+          DEBUG(errs() << *wrapper_tensorTanh);
+          CallInst::Create(wrapper_tensorTanh, Args, "", TensorII);
+        }
+
+        // Due to the in-place operation, the first argument now contains
+        // the result: replace the intrinsic's uses with it
+        TensorII->replaceAllUsesWith(TensorII->getOperand(0));
+      }
+      break;
+
+      case Intrinsic::visc_tensor_softmax:
+      { /* llvm.visc.tensor.softmax */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor softmax\n");
+
+        // Tensor softmax(a) is in place for argument a.
+        // Argument list: node name, then the input tensor
+        Args.push_back(createStringGEPConstant(M, strRef));
+
+        Args.push_back(TensorII->getOperand(0));
+
+        // Create wrapper API runtime function call
+        Constant* wrapper_tensorSoftmax =
+          M->getOrInsertFunction(StringRef("wrapper_tensorSoftmax"),
+                 RtM->getFunction(StringRef("wrapper_tensorSoftmax"))->getFunctionType());
+        DEBUG(errs() << *wrapper_tensorSoftmax);
+        CallInst::Create(wrapper_tensorSoftmax, Args, "", TensorII);
+        // Due to the in-place operation, the first argument now contains
+        // the result: replace the intrinsic's uses with it
+        TensorII->replaceAllUsesWith(TensorII->getOperand(0));
+      }
+      break;
+
+      default:
+        llvm_unreachable("Unknown VISC Intrinsic!");
+        break;
+    }
+
+  } // No other case exists, since assertion passed
+
+  // Remove the instructions we translated to the runtime call.
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around.
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IIs.rbegin(),
+       re = IIs.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    (*ri)->eraseFromParent();
+  }
+
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IIs_remove.rbegin(),
+       re = IIs_remove.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    (*ri)->eraseFromParent();
+  }
+
+}
+
+// DFG2LLVM_WrapperAPI - Module pass that lowers ApproxHPVM tensor
+// intrinsics to wrapper API tensor-runtime calls.
+
+struct DFG2LLVM_WrapperAPI : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_WrapperAPI() : DFG2LLVM(ID) {}
+
+  // Requires the dataflow graph and the in-place DFG analysis;
+  // invalidates neither.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BuildDFG>();
+    AU.addRequired<InPlaceDFGAnalysisWrapper>();
+    AU.addPreserved<BuildDFG>();
+    AU.addPreserved<InPlaceDFGAnalysisWrapper>();
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_WrapperAPI : public CodeGenTraversal {
+
+private:
+  // Member variables
+  unsigned nodeID; // Used as a node identifier
+
+  // Paths of the quantization-ranges and approximation-configuration
+  // input files, passed to the runtime controller at initialization.
+  std::string QuantizationInputsFilenameStr;
+  std::string ConfigurationInputsFilenameStr;
+
+  // Result of the in-place DFG analysis (not owned by this visitor).
+  InPlaceDFGAnalysis::InPlaceDFGParameter *IPP;
+
+  // VISC Runtime API and Tensor runtime API declarations
+  Constant* llvm_hpvm_initApproxhpvmRt;
+  Constant* llvm_hpvm_cleanupApproxhpvmRt;
+  Constant* hpvm_request_tensor;
+
+  Constant* llvm_hpvm_initializeRuntimeController;
+  Constant* llvm_hpvm_clearRuntimeController;
+
+  // Functions
+
+  // Virtual Functions
+  void init();
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+
+  // Constructor. The initializer list follows member declaration order
+  // (members are always initialized in declaration order; matching it
+  // avoids -Wreorder and any confusion about initialization sequence).
+  CGT_WrapperAPI(Module &_M, BuildDFG &_DFG,
+    InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP,
+    std::string &_QuantizationInputsFilenameStr,
+    std::string &_ConfigurationInputsFilenameStr)
+  : CodeGenTraversal(_M, _DFG), nodeID(0),
+    QuantizationInputsFilenameStr(_QuantizationInputsFilenameStr),
+    ConfigurationInputsFilenameStr(_ConfigurationInputsFilenameStr),
+    IPP(&_IPP) {
+    initRuntimeAPI();
+  }
+
+};
+
+
+// init - declared among the virtual functions of CGT_WrapperAPI
+// (presumably a hook invoked by the CodeGenTraversal framework - confirm
+// against DFG2LLVM.h). Deliberately a no-op for this backend; all real
+// setup happens in initRuntimeAPI().
+void CGT_WrapperAPI::init() {
+  // FIXME: what to do here? If anything?
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+// Loads the tensor-runtime bitcode, declares the runtime entry points used by
+// this backend, and splices runtime init/cleanup calls around the program's
+// visc.init / visc.cleanup markers.
+void CGT_WrapperAPI::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  // The runtime bitcode path is resolved relative to the LLVM source tree,
+  // so the environment must provide LLVM_SRC_ROOT.
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n");
+
+  // FIXME: set correct path
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_runtime.ll";
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  // NOTE(review): on parse failure only a debug message is emitted and
+  // runtimeModule stays null; the DECLARE() uses below would then fail --
+  // confirm whether a hard error is intended here.
+  if(runtimeModule == nullptr)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n");
+
+  // Get or insert Global declarations for
+  // - initialization
+  // - cleanup
+  // - request a tensor
+  DECLARE(llvm_hpvm_initApproxhpvmRt);
+  DECLARE(llvm_hpvm_cleanupApproxhpvmRt);
+  DECLARE(hpvm_request_tensor);
+
+  DECLARE(llvm_hpvm_initializeRuntimeController);
+  DECLARE(llvm_hpvm_clearRuntimeController);
+
+  // Find visc.init and visc.cleanup calls, and add placeholder methods
+  // for initialization and cleanup of the hpvm tensor runtime
+
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n");
+  InitCall = cast<Instruction>(*VI->user_begin());
+  // Runtime init is inserted immediately before the visc.init call.
+  CallInst::Create(llvm_hpvm_initApproxhpvmRt,
+                   ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)),
+                   "", InitCall);
+
+  StringRef QRangesStrRef = StringRef(QuantizationInputsFilenameStr);
+  // Materialize the quantization-ranges filename as a constant global string
+  // so it can be passed to the runtime controller.
+  Constant *ConstArray1 = ConstantDataArray::getString(M.getContext(),
+                                                       QRangesStrRef, true);
+  GlobalVariable *GV1 = new GlobalVariable(M,ConstArray1->getType(),
+                        true, GlobalValue::ExternalLinkage, ConstArray1, "");
+  // Create GEP expression to access it
+  Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+  Constant* GEPIndices[] = { Int_0, Int_0 };
+  Constant* QRangesGEPConst =
+    ConstantExpr::getGetElementPtr(GV1->getType()->getPointerElementType(),
+                                   GV1, GEPIndices);
+
+  StringRef ConfsStrRef = StringRef(ConfigurationInputsFilenameStr);
+  // Same treatment for the configuration-inputs filename.
+  Constant *ConstArray2 = ConstantDataArray::getString(M.getContext(),
+                                                       ConfsStrRef, true);
+  GlobalVariable *GV2 = new GlobalVariable(M,ConstArray2->getType(),
+                        true, GlobalValue::ExternalLinkage, ConstArray2, "");
+  Constant* ConfsGEPConst =
+    ConstantExpr::getGetElementPtr(GV2->getType()->getPointerElementType(),
+                                   GV2, GEPIndices);
+  // Controller takes (configurations file, quantization-ranges file).
+  ArrayRef<Value*> RTCInitArgs = {ConfsGEPConst, QRangesGEPConst};
+  CallInst::Create(llvm_hpvm_initializeRuntimeController, RTCInitArgs, "", InitCall);
+
+  // Mirror the setup: runtime cleanup calls go right before visc.cleanup.
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n");
+  CleanupCall = cast<Instruction>(*VC->user_begin());
+  CallInst::Create(llvm_hpvm_cleanupApproxhpvmRt, ArrayRef<Value*>(), "", CleanupCall);
+  CallInst::Create(llvm_hpvm_clearRuntimeController, ArrayRef<Value*>(), "", CleanupCall);
+
+}
+
+// Internal (non-leaf) nodes produce no code in this backend; just trace them.
+void CGT_WrapperAPI::codeGen(DFInternalNode *N) {
+  StringRef NodeName = N->getFuncPointer()->getName();
+  errs() << "Inside node: " << NodeName << "\n";
+  errs() << "Skipping internal node\n";
+}
+
+// Generate wrapper-API code for a leaf node: clone its node function, strip
+// HPVM argument attributes, insert hpvm_request_tensor calls for all pointer
+// (tensor) arguments, then run the CodeGenStateMachine over the intrinsics.
+void CGT_WrapperAPI::codeGen(DFLeafNode* N) {
+
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Abort code generation if it is an allocation node
+  if(N->isAllocationNode()) {
+    assert(false && "Allocation Node not expected in ApproxHPVM");
+    return;
+  }
+
+
+  // Increment the node ID, for current node.
+  ++nodeID;
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+  errs() << "Node Function: " << *F << "\n";
+  // Look up if we have visited this function before. If we have, then just
+  // get the cloned function pointer from DFNode. Otherwise, create the cloned
+  // function and add it to the DFNode GenFunc.
+  Function *F_wrapper_api = N->getGenFuncForTarget(visc::PROMISE_TARGET);
+
+  // Each node must be visited exactly once by this traversal.
+  assert((F_wrapper_api == NULL) &&
+         "Error: Visiting a node for which code already generated");
+
+  // Clone the function
+  ValueToValueMapTy VMap;
+  std::string FName(F->getName().data());//Twine FName = F->getName();
+
+
+  // The clone is created in F's parent, then moved so it is appended at the
+  // end of this module's function list under the new name.
+  F_wrapper_api = CloneFunction(F, VMap);
+  F_wrapper_api->setName(FName+"_wrapper_api");
+  F_wrapper_api->removeFromParent();
+  M.getFunctionList().push_back(F_wrapper_api);
+
+  N->addGenFunc(F_wrapper_api, visc::PROMISE_TARGET, true);
+
+  /* Removing HPVM in/out/inout function attributes */
+  for(Function::arg_iterator ai = F_wrapper_api->arg_begin(), ae = F_wrapper_api->arg_end();
+      ai != ae; ai++){
+    Argument *Arg = &*ai;
+    if(Arg->hasAttribute(Attribute::In))
+      Arg->removeAttr(Attribute::In);
+    if(Arg->hasAttribute(Attribute::Out))
+      Arg->removeAttr(Attribute::Out);
+    if(Arg->hasAttribute(Attribute::InOut))
+      Arg->removeAttr(Attribute::InOut);
+  }
+
+  // Adding nounwind to generated function : FIXME: needed?
+  DEBUG(errs() << "Adding nounwind to generated function\n");
+  F_wrapper_api->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
+
+  // Add llvm_visc_requestTensor calls for every pointer argument of the function
+  // (they are all expected to be tensors), at the beginning of the function.
+  // This is the first instruction of the function, insert them before this
+  Instruction* FI = &*(F_wrapper_api->getEntryBlock().begin());
+
+  // FIXME: verify that we want 1 as a target device
+  // In this backend, the target device is GPU, represented by i32 1.
+  ConstantInt *TargetDeviceID =
+    ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+
+  for (Function::arg_iterator ai = F_wrapper_api->arg_begin(),
+       ae = F_wrapper_api->arg_end(); ai != ae; ++ai) {
+    Argument* Arg = &*ai;
+    if (Arg->getType()->isPointerTy()) {
+      Value *Args[] = {Arg, TargetDeviceID};
+      CallInst::Create(hpvm_request_tensor,
+                       ArrayRef<Value*>(Args, 2),
+                       "", FI);
+    }
+  }
+
+  // Feed every instruction to the state machine; non-intrinsics arrive as
+  // null (dyn_cast failure) and are handled by the machine's transition().
+  CodeGenStateMachine CGM(&M, runtimeModule.get());
+
+  for (inst_iterator i = inst_begin(F_wrapper_api), e = inst_end(F_wrapper_api);
+       i != e; ++i) {
+    Instruction *I = &(*i);
+    CGM.transition(dyn_cast<IntrinsicInst>(I));
+  }
+
+  // The node's numeric ID (as a string) names this node for the runtime.
+  errs() << "Node ID string: "<< StringRef(std::to_string(nodeID)) << "\n";
+  //CGM.codeGen(N, F_wrapper_api, N->getFuncPointer()->getName(), *IPP);
+  CGM.codeGen(N, F_wrapper_api, StringRef(std::to_string(nodeID)), *IPP);
+
+  return;
+}
+
+// Pass entry point: run wrapper-API code generation over every DFG root.
+bool DFG2LLVM_WrapperAPI::runOnModule(Module &M) {
+
+  errs() << "\nDFG2LLVM_WrapperAPI PASS\n";
+
+  // Prerequisite analyses: the dataflow graph and the in-place-operation
+  // parameters computed for each node.
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+  InPlaceDFGAnalysis::InPlaceDFGParameter IPP =
+    (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP();
+
+  // A single code-generation visitor serves every DFG in the module.
+  CGT_WrapperAPI *Visitor = new CGT_WrapperAPI(M, DFG, IPP,
+                                               QuantizationInputsFilename,
+                                               ConfigurationInputsFilename);
+
+  // Kick off code generation from each root node.
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  for (unsigned i = 0; i < Roots.size(); ++i)
+    Visitor->visit(Roots[i]);
+
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+  delete Visitor;
+
+  return true;
+}
+
+
+/******************************************************************************
+ *                              Helper functions                              *
+ ******************************************************************************/
+
+/* Method needs to be called as part of an analysis pre-step, before code      *
+ * generation is run on a node function, so that the HPVM intrinsics are still *
+ * in place.                                                                   *
+ *                                                                             *
+ * Returns true iff operand Op of node N (whose node function is Fgen) can be  *
+ * the target of an in-place tensor operation:                                 *
+ *  - if Op is a formal argument of Fgen, the in-place analysis result IPP     *
+ *    for that argument position decides;                                      *
+ *  - otherwise Op must be the result of another HPVM intrinsic, i.e. a new    *
+ *    object consumed by the next intrinsic, which is safe to update in place. */
+bool isValidOperandForInPlaceOperation(Value *Op, Function *Fgen, DFNode *N,
+                                       InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) {
+
+  if (Argument *Arg = dyn_cast<Argument>(Op)) {
+    DEBUG(errs() << *Arg << "\t: argument, candidate for in place\n");
+    assert((Arg->getParent() == Fgen) &&
+          "Extra Parameter in body of Function\n");
+    // Candidate parameter is a function argument
+    // In this case, consult the result of in place analysis
+    // Find position in arg list
+    unsigned pos = Arg->getArgNo();
+    // If this parameter cannot be used for in place operation
+    // code gen cannot continue
+    if (IPP.at(N)[pos]) {
+      DEBUG(errs() << *Arg << "\t: argument, suitable for in place\n");
+      return true;
+    } else {
+      DEBUG(errs() << *Arg << "\t: argument, not suitable for in place\n");
+      return false;
+    }
+  }
+  else {
+    // If it is not an argument, then it needs to be the result of
+    // another intrinsic. These are new objects that are allocated,
+    // and consumed by next intrinsic.
+    DEBUG(errs() << *Op << "\t: Test for result of intrinsic operation\n");
+    // BUGFIX: this branch previously printed *Arg, but Arg is null here (the
+    // dyn_cast above failed and its scope extends into this else), so the
+    // debug build dereferenced a null pointer. Print *Op instead.
+    if (isa<IntrinsicInst>(Op)) {
+      DEBUG(errs() << *Op << "\t: local, suitable for in place\n");
+      return true;
+    } else {
+      DEBUG(errs() << *Op << "\t: local, not suitable for in place\n");
+      return false;
+    }
+  }
+}
+
+} // End of namespace
+
+// Register the pass so it is available to opt as -dfg2llvm-wrapperapi.
+char DFG2LLVM_WrapperAPI::ID = 0;
+static RegisterPass<DFG2LLVM_WrapperAPI> X("dfg2llvm-wrapperapi",
+                                           "Dataflow Graph to LLVM for WrapperAPI Pass",
+                                           false /* CFGOnly */,
+                                           true  /* is_analysis.          *
+                                                 * NOTE(review): this is a *
+                                                 * transform pass, so      *
+                                                 * 'true' looks wrong --   *
+                                                 * confirm vs RegisterPass */);
+
diff --git a/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.exports b/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt b/lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt
new file mode 100644
index 0000000000..b4ebb8019d
--- /dev/null
+++ b/lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/DFG2LLVM_WrapperAPI/LLVMBuild.txt -------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_WrapperAPI
+parent = Transforms
diff --git a/lib/DFG2LLVM_X86/CMakeLists.txt b/lib/DFG2LLVM_X86/CMakeLists.txt
new file mode 100644
index 0000000000..6a78066c44
--- /dev/null
+++ b/lib/DFG2LLVM_X86/CMakeLists.txt
@@ -0,0 +1,11 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMDFG2LLVM_X86
+  DFG2LLVM_X86.cpp
+
+  DEPENDS intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp
new file mode 100644
index 0000000000..b693bd0be4
--- /dev/null
+++ b/lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -0,0 +1,2082 @@
+//===-------------------------- DFG2LLVM_X86.cpp --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "DFG2LLVM_X86"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+// VISC Command line option to use timer or not (-visc-timers-x86)
+static cl::opt<bool>
+VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
+// Command line option to enable device abstraction or not
+// (-visc-eda; off by default and hidden from -help)
+static cl::opt<bool>
+DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
+                   cl::desc("Enable visc device abstraction"));
+
+
+namespace {
+
+// Helper Functions
+// True iff I is a call whose (pointer-cast-stripped) callee is named
+// exactly "llvm_visc_policy_getVersion".
+static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI)
+    return false;
+  StringRef Callee = CI->getCalledValue()->stripPointerCasts()->getName();
+  return Callee.equals("llvm_visc_policy_getVersion");
+}
+
+// Returns the first llvm_visc_policy_getVersion call found in F, scanning
+// instructions in order; NULL if the function contains no such call.
+CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
+  for (inst_iterator it = inst_begin(F), end = inst_end(F); it != end; ++it) {
+    if (isVISCCall_llvm_visc_policy_getVersion(&*it))
+      return cast<CallInst>(&*it);
+  }
+  return NULL;
+}
+
+// DFG2LLVM_X86 - The first implementation.
+// Module pass that drives x86/host code generation for the dataflow graph;
+// all real work happens in runOnModule via the CGT_X86 visitor below.
+struct DFG2LLVM_X86 : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_X86() :DFG2LLVM(ID) {}
+
+private:
+  // Member variables
+
+  // Functions
+
+public:
+  bool runOnModule(Module &M);
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+// Generates host-side code for each DFG node: thread/pipeline filters for
+// streaming graphs, plus launch/push/pop/wait entry points for the runtime.
+class CGT_X86 : public CodeGenTraversal {
+
+private:
+  //Member variables
+
+  // Declarations of the visc-rt runtime functions this backend calls; they
+  // are resolved against the loaded runtime module in initRuntimeAPI().
+  Constant* malloc;
+  // VISC Runtime API
+  Constant* llvm_visc_x86_launch;
+  Constant* llvm_visc_x86_wait;
+  Constant* llvm_visc_x86_argument_ptr;
+
+  Constant* llvm_visc_streamLaunch;
+  Constant* llvm_visc_streamPush;
+  Constant* llvm_visc_streamPop;
+  Constant* llvm_visc_streamWait;
+  Constant* llvm_visc_createBindInBuffer;
+  Constant* llvm_visc_createBindOutBuffer;
+  Constant* llvm_visc_createEdgeBuffer;
+  Constant* llvm_visc_createLastInputBuffer;
+  Constant* llvm_visc_createThread;
+  //Constant* llvm_visc_freeThreads;
+  Constant* llvm_visc_bufferPush;
+  Constant* llvm_visc_bufferPop;
+  Constant* llvm_visc_x86_dstack_push;
+  Constant* llvm_visc_x86_dstack_pop;
+  Constant* llvm_visc_x86_getDimLimit;
+  Constant* llvm_visc_x86_getDimInstance;
+
+  //Functions
+  // IR-building helpers (loop construction, argument marshalling) and the
+  // per-node invocation/filter generators; see definitions for details.
+  std::vector<IntrinsicInst*>* getUseList(Value* LI);
+  Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
+  void addDoWhileLoop(Instruction*, Instruction*, Value*);
+  void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
+  Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
+  Argument* getArgumentFromEnd(Function* F, unsigned offset);
+  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                      Instruction* InsertBefore);
+  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  StructType* getArgumentListStructTy(DFNode*);
+  Function* createFunctionFilter(DFNode* C);
+  void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>,
+                      Value*, Value*, Instruction*);
+  Function* createLaunchFunction(DFInternalNode*);
+  Function* createPushFunction(DFInternalNode*);
+  Function* createPopFunction(DFInternalNode*);
+  Function* createWaitFunction(DFInternalNode*);
+
+  // Virtual Functions
+  void init() {
+    VISCTimer = VISCTimer_X86;
+    TargetName = "X86";
+  }
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+  Function* codeGenStreamPush(DFInternalNode* N);
+  Function* codeGenStreamPop(DFInternalNode* N);
+
+public:
+  // Constructor
+  CGT_X86(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
+    init();
+    initRuntimeAPI();
+  }
+
+  // Emit the host-side launch sequence for a root node (streaming or not).
+  void codeGenLaunch(DFInternalNode* Root);
+  void codeGenLaunchStreaming(DFInternalNode* Root);
+};
+
+// Pass entry point: visit every DFG root and then generate the launch code
+// (streaming or plain) that replaces the launch intrinsic.
+bool DFG2LLVM_X86::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_X86 PASS\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  //DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+  // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_X86 *CGTVisitor = new CGT_X86(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+    // Go ahead and replace the launch intrinsic with pthread call, otherwise return now.
+    // TODO: Later on, we might like to do this in a separate pass, which would
+    // allow us the flexibility to switch between complete static code generation
+    // for DFG or having a customized runtime+scheduler
+
+    // Do streaming code generation if root node is streaming. Usual otherwise
+    if(rootNode->isChildGraphStreaming())
+      CGTVisitor->codeGenLaunchStreaming(rootNode);
+    else
+      CGTVisitor->codeGenLaunch(rootNode);
+  }
+
+  delete CGTVisitor;
+  return true;
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+// Loads visc-rt.ll, declares every runtime function used by this backend, and
+// wires timer/policy/device-abstraction setup and teardown around the
+// program's visc.init / visc.cleanup markers.
+void CGT_X86::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  // Runtime bitcode is located relative to the LLVM source tree; the build
+  // must export LLVM_SRC_ROOT.
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
+
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/../build/projects/visc-rt/visc-rt.ll";
+
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+
+  // NOTE(review): a failed parse only logs in debug builds; the DECLARE()
+  // and getFunction() calls below would then operate on a null module.
+  if(runtimeModule == NULL)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  DECLARE(llvm_visc_x86_launch);
+  DECLARE(malloc);
+  DECLARE(llvm_visc_x86_wait);
+  DECLARE(llvm_visc_x86_argument_ptr);
+  DECLARE(llvm_visc_streamLaunch);
+  DECLARE(llvm_visc_streamPush);
+  DECLARE(llvm_visc_streamPop);
+  DECLARE(llvm_visc_streamWait);
+  DECLARE(llvm_visc_createBindInBuffer);
+  DECLARE(llvm_visc_createBindOutBuffer);
+  DECLARE(llvm_visc_createEdgeBuffer);
+  DECLARE(llvm_visc_createLastInputBuffer);
+  DECLARE(llvm_visc_createThread);
+  //DECLARE(llvm_visc_freeThreads);
+  DECLARE(llvm_visc_bufferPush);
+  DECLARE(llvm_visc_bufferPop);
+  DECLARE(llvm_visc_x86_dstack_push);
+  DECLARE(llvm_visc_x86_dstack_pop);
+  DECLARE(llvm_visc_x86_getDimLimit);
+  DECLARE(llvm_visc_x86_getDimInstance);
+
+  // Get or insert timerAPI functions as well if you plan to use timers
+  initTimerAPI();
+
+  // Insert init context in main
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  DEBUG(errs() << "Inserting x86 timer initialization\n");
+  Instruction* I = cast<Instruction>(*VI->user_begin());
+  initializeTimerSet(I);
+  switchToTimer(visc_TimerID_NONE, I);
+  // Insert code for initializing the scheduling policy
+  Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init",
+    runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()));
+  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  DEBUG(errs() << *IPCallInst << "\n");
+
+  // If device abstraction is enabled, we add a runtime call to start the
+  // device status simulation
+  if (DeviceAbstraction) {
+    Function *ID =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_start",
+        runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType()));
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    DEBUG(errs() << *IDCallInst << "\n");
+  }
+
+  // Insert print instruction at visc exit
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
+
+  // Insert code for clearing the scheduling policy
+  I = cast<Instruction>(*VC->user_begin());
+  IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear",
+    runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()));
+  IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  // NOTE(review): unlike the similar dumps above, this print is not wrapped
+  // in DEBUG() -- confirm whether release-build output is intended here.
+  errs() << *IPCallInst << "\n";
+
+  DEBUG(errs() << "Inserting x86 timer print\n");
+  printTimerSet(I);
+
+  // If device abstraction is enabled, we add a runtime call to end the
+  // device status simulation
+  if (DeviceAbstraction) {
+    Function *ID =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_end",
+        runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType()));
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    DEBUG(errs() << *IDCallInst << "\n");
+  }
+
+}
+
+/* Returns vector of all wait instructions
+ * Every user of the graph handle must itself be a VISC intrinsic (e.g. a
+ * wait); any other use of the handle is unsupported and aborts.
+ * Caller owns the returned vector.
+ */
+std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
+  std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>();
+  for (User *U : GraphID->users()) {
+    IntrinsicInst *WaitI = dyn_cast<IntrinsicInst>(U);
+    if (!WaitI)
+      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
+    UseList->push_back(WaitI);
+  }
+  return UseList;
+}
+
+/* Return the argument of F that sits `offset` positions from the end of the
+ * parameter list (offset 1 == last argument). offset must satisfy
+ * 0 < offset <= numParams.
+ */
+Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
+  unsigned NumParams = F->getFunctionType()->getNumParams();
+  assert((NumParams >= offset && offset > 0)
+         && "Invalid offset to access arguments!");
+  // Walk forward from the beginning to the requested position instead of
+  // iterating backwards from the end.
+  Function::arg_iterator ai = F->arg_begin();
+  for (unsigned i = 0; i < NumParams - offset; ++i)
+    ++ai;
+  return &*ai;
+}
+
+/* Wrap a while loop around the region starting at BodyStart:
+ * (1) Split the enclosing block into entry / "condition" (from CondBlockStart)
+ *     / "while.body" (from BodyStart) / "while.end" (from BodyEnd).
+ * (2) Replace the condition block's terminator with a conditional branch that
+ *     exits to while.end when TerminationCond is true and enters while.body
+ *     otherwise.
+ * (3) Make while.body branch unconditionally back to the condition block.
+ */
+void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
+                          Instruction* BodyEnd, Value* TerminationCond) {
+  BasicBlock* Entry = CondBlockStart->getParent();
+  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
+  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
+  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
+
+  // Replace the terminator instruction of conditional with new conditional
+  // branch which goes to while.end if true and branches to while.body otherwise
+  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
+  ReplaceInstWithInst(CondBlock->getTerminator(), BI);
+
+  // While Body should jump to condition block
+  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
+  ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
+
+}
+
+// Create an i64 loop counter for a while loop built by addWhileLoop: a PHI in
+// Cond that starts at 0 (incoming from Entry) and takes cnt+1 from Body on
+// each iteration. Returns the PHI so callers can use the counter value.
+Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+                                          BasicBlock *Body) {
+  Module *M = Entry->getParent()->getParent();
+  Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+  // Insert a PHI instruction at the beginning of the condition block
+  Instruction *IB = Cond->getFirstNonPHI();
+  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
+
+  // cnt_incr = cnt + 1, computed at the end of the body (before its branch).
+  ConstantInt *IConst =
+    ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+  Instruction *CounterIncr =
+    BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                            "cnt_incr", Body->getTerminator());
+
+  // Set incoming values for Phi node
+  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
+  CounterPhi->addIncoming(IConst, Entry);
+  CounterPhi->addIncoming(CounterIncr, Body);
+
+  // Return the pointer to the created PHI node in the corresponding argument
+  return CounterPhi;
+}
+
+/* Wrap a do-while loop around the instructions [From, To):
+ * (1) Split the enclosing block into entry / "for.body" (starting at From) /
+ *     "for.end" (starting at To). From and To must end up in the same block,
+ *     i.e. the region may not contain a terminator.
+ * (2) Replace for.body's terminator with a conditional branch that exits to
+ *     for.end when TerminationCond is true and repeats for.body otherwise --
+ *     so the body always executes at least once.
+ */
+void CGT_X86::addDoWhileLoop(Instruction* From, Instruction* To, Value* TerminationCond) {
+  BasicBlock* Entry = From->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(From, "for.body");
+
+  // To Instruction should also belong to the same basic block as the From basic
+  // block will have a terminator instruction
+  assert(To->getParent() == ForBody
+         && "To Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(To, "for.end");
+
+  // Replace the terminator instruction of for.body with new conditional
+  // branch which exits to for.end if true and repeats the body otherwise
+  BranchInst* BI = BranchInst::Create(ForEnd, ForBody, TerminationCond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+}
+
+/* Add Loop around the instruction I
+ * Algorithm:
+ * (1) Split the basic block of instruction I into three parts, where the
+ * middleblock/body would contain instruction I.
+ * (2) Add phi node before instruction I. Add incoming edge to phi node from
+ * predecessor
+ * (3) Add increment and compare instruction to index variable
+ * (4) Replace terminator/branch instruction of body with conditional branch
+ * which loops over body while index+1 < limit and goes to end otherwise
+ * (5) Update phi node of body
+ * Returns the index PHI so the caller can use the induction variable.
+ */
+Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
+  BasicBlock* Entry = I->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");
+
+  BasicBlock::iterator i(I);
+  ++i;
+  Instruction* NextI = &*i;
+  // Next Instruction should also belong to the same basic block as the basic
+  // block will have a terminator instruction
+  assert(NextI->getParent() == ForBody
+         && "Next Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
+
+
+  // Add Phi Node for index variable
+  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
+                                      2, "index."+indexName, I);
+
+  // Add incoming edge to phi
+  IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
+                        Entry);
+  // Increment index variable
+  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
+                             IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
+                             "index."+indexName+".inc", ForBody->getTerminator());
+
+  // Compare index variable with limit
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
+                                  limit, "cond."+indexName, ForBody->getTerminator());
+
+  // Replace the terminator instruction of for.body with new conditional
+  // branch which loops over body if true and branches to for.end otherwise
+  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+  // Add incoming edge to phi node in body
+  IndexPhi->addIncoming(IndexInc, ForBody);
+  return IndexPhi;
+}
+
+// Returns a packed struct type. The structtype is created by packing the input
+// types, output types and isLastInput buffer type. All the streaming
+// inputs/outputs are converted to i8*, since this is the type of buffer
+// handles.
+// Layout: [inputs...][outputs...][isLastInput i8*], named "struct.thread.<fn>".
+StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
+  std::vector<Type*> TyList;
+  // Input types
+  Function* CF = C->getFuncPointer();
+  for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
+      ai != ae; ++ai) {
+    // Streaming inputs become i8* buffer handles; plain inputs keep their type.
+    if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
+      TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+    else
+      TyList.push_back(ai->getType());
+  }
+  // Output Types
+  StructType* OutStructTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
+    // All outputs of a node are streaming edge
+    assert(C->getOutDFEdgeAt(i)->isStreamingEdge()
+        && "All output edges of child node have to be streaming");
+    TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+  }
+  // isLastInput buffer element
+  TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+
+  // true => packed struct (no padding between elements).
+  StructType* STy = StructType::create(CF->getContext(), TyList,
+                        Twine("struct.thread."+CF->getName()).str(), true);
+  return STy;
+
+}
+
+void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
+                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
+                              Instruction* IB) {
+  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Create a filter/pipeline function for the child node
+  Function* C_Pipeline = createFunctionFilter(C);
+  Function* CF = C->getFuncPointer();
+
+  // Get module context and i32 0 constant, as they would be frequently used in
+  // this function.
+  LLVMContext& Ctx = IB->getParent()->getContext();
+  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  // Marshall arguments
+  // Create a packed struct type with inputs of C followed by outputs and then
+  // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
+  // by i8*
+  //
+  StructType* STy = getArgumentListStructTy(C);
+  // Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
+  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
+                                  C->getFuncPointer()->getName()+".inputs", IB);
+  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
+  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
+  // Insert elements in the struct
+  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Marshall Inputs
+  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
+    // Create constant int (i)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             Struct->getName()+".arg_"+Twine(i),
+                             IB);
+    DFEdge* E = C->getInDFEdgeAt(i);
+    if (E->getSourceDF()->isEntryNode()) {
+      // This is a Bind Input Edge
+      if(E->isStreamingEdge()) {
+        // Streaming Bind Input edge. Get buffer corresponding to it
+        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
+        new StoreInst(EdgeBufferMap[E], GEP, IB);
+      }
+      else {
+        // Non-streaming Bind edge
+        new StoreInst(Args[i], GEP, IB);
+      }
+    }
+    else {
+      // This is an edge between siblings. 
+      // This must be an streaming edge. As it is our assumption that all edges
+      // between two nodes in a DFG are streaming.
+      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
+      new StoreInst(EdgeBufferMap[E], GEP, IB);
+    }
+  }
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
+  // Marshall Outputs
+  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  for(unsigned i = 0; i < numOutputs; i++ ) {
+    // Create constant int (i+numInputs)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             Struct->getName()+".out_"+Twine(i),
+                             IB);
+    DFEdge* E = C->getOutDFEdgeAt(i);
+    assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes");
+    assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!");
+    new StoreInst(EdgeBufferMap[E], GEP, IB);
+  }
+  // Marshall last argument. isLastInput buffer
+  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Create constant int (i+numInputs)
+  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
+  // Get Element pointer instruction
+  Value* GEPIndices[] = { IntZero, Int_index };
+  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                           ArrayRef<Value*>(GEPIndices, 2),
+                           Struct->getName()+".isLastInput", IB);
+  new StoreInst(isLastInputBuffer, GEP, IB);
+
+  // AllocaInst AI points to memory with all the arguments packed
+  // Call runtime to create the thread with these arguments
+  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << *llvm_visc_createThread << "\n");
+  DEBUG(errs() << *graphID->getType() << "\n");
+  DEBUG(errs() << *C_Pipeline->getType() << "\n");
+  DEBUG(errs() << *Struct->getType() << "\n");
+  // Bitcast AI to i8*
+  CastInst* BI  = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
+  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
+  CallInst* CreateThread = CallInst::Create(llvm_visc_createThread,
+                                            ArrayRef<Value*>(CreateThreadArgs, 3),
+                                            "",
+                                            IB);
+
+}
+
// Generate the streaming "launch" function for internal node N.
//
// The generated function has signature void(i8* data, i8* graphID). It
// unpacks the root-node arguments from `data`, asks the runtime to create a
// buffer for every streaming edge in N's child graph (bind-in, bind-out and
// sibling-to-sibling edges, plus one isLastInput buffer per child), and then
// starts one thread per child node via startNodeThread, handing each child
// its marshalled arguments. Returns the newly created llvm::Function.
Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
  DEBUG(errs() << "Generating Streaming Launch Function\n");
  // Get Function associated with Node N
  Function* NF = N->getFuncPointer();

  // Map from Streaming edge to the runtime buffer handle created for it.
  // Filled in step (3) below and consumed by startNodeThread in step (4).
  DenseMap<DFEdge*, Value*> EdgeBufferMap;

  /* Now we have all the necessary global declarations necessary to generate the
  * Launch function, pointer to which can be passed to pthread utils to execute
  * DFG. The Launch function has just one input: i8* data.addr
  * This is the address of the all the input data that needs to be passed to
  * this function. In our case it contains the input arguments of the Root
  * function in the correct order.
  * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
  * (2) Extract each of inputs from data.addr
  * (3) create Buffers for all the streaming edges
  *     - Put buffers in the context
  * (4) Go over each child node
  *     - marshall its arguments together (use buffers in place of streaming
  *       arguments)
  *     - Start the threads
  * (5) The return value from Root is stored in memory, pointer to which is
  * passed to pthread_exit call.
  */
  // (1) Create Launch Function of type void (i8* args, i8* GraphID)
  Type* i8Ty = Type::getInt8Ty(M.getContext());
  Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
  FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()),
                                  ArrayRef<Type*>(ArgTypes, 2), false);
  Function* LaunchFunc = Function::Create(LaunchFuncTy,
                                       NF->getLinkage(),
                                       NF->getName()+".LaunchFunction",
                                       &M);
  DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
  // Give a name to the argument which is used pass data to this thread
  Argument* data = &*LaunchFunc->arg_begin();
  Argument* graphID = &*(++LaunchFunc->arg_begin());
  data->setName("data.addr");
  graphID->setName("graphID");
  // Add a basic block to this empty function and a return null statement to it.
  // All subsequent instructions are inserted before this return (RI).
  DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
                                      BB);

  DEBUG(errs() << "Created Empty Launch Function\n");

  // (2) Extract each of inputs from data.addr.
  // Streaming inputs are represented as i8* buffer handles in the packed
  // argument struct, so their slot type is i8* rather than the declared type.
  std::vector<Type*> TyList;
  std::vector<std::string> names;
  std::vector<Value*> Args;

  for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
      ai != ae; ++ai) {
    if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) {
      TyList.push_back(i8Ty->getPointerTo());
      names.push_back(Twine(ai->getName()+"_buffer").str());
      continue;
    }
    TyList.push_back(ai->getType());
    names.push_back(ai->getName());
  }
  Args = extractElements(data, TyList, names, RI);
  DEBUG(errs() <<  "Launch function for " << NF->getName() << *LaunchFunc << "\n");
  // (3) Create buffers for all the streaming edges.
  // Three cases: bind-in (entry -> child), bind-out (child -> exit), and
  // sibling-to-sibling edges; each gets its own runtime creation call.
  for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
      de = N->getChildGraph()->dfedge_end(); di != de; ++di) {
    DFEdge* Edge = *di;
    DEBUG(errs() << *Edge->getType() << "\n");
    Value* size = ConstantExpr::getSizeOf(Edge->getType());
    Value* CallArgs[] = {graphID, size};
    if (Edge->isStreamingEdge()) {
      CallInst* CI;
      // Create a buffer call
      if(Edge->getSourceDF()->isEntryNode()) {
        // Bind Input Edge: the runtime also needs the argument position.
        Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()),
                                  Edge->getSourcePosition());
        Value* BindInCallArgs[] = {graphID, size, Int_ArgNo};
        CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3),
                              "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(),
                              RI);
      }
      else if(Edge->getDestDF()->isExitNode()) {
        // Bind Output Edge
        CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2),
                              "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(),
                              RI);
      }
      else {
        // Streaming Edge between two sibling nodes
        CI = CallInst::Create(llvm_visc_createEdgeBuffer,
                              ArrayRef<Value*>(CallArgs, 2),
                              Edge->getSourceDF()->getFuncPointer()->getName()+"."
                              +Edge->getDestDF()->getFuncPointer()->getName(),
                              RI);
      }
      EdgeBufferMap[Edge] = CI;
    }
  }
  // Create buffer for isLastInput for all the child nodes.
  // Each child's filter loop reads this i64-sized buffer to decide when to
  // terminate (see createFunctionFilter).
  DFGraph* G = N->getChildGraph();
  DenseMap<DFNode*, Value*> NodeLastInputMap;
  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
    DFNode* child = *ci;
    if(child->isDummyNode())
      continue;
    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
    Value* CallArgs[] = {graphID, size};
    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
                              "BindIn.isLastInput."+child->getFuncPointer()->getName(),
                              RI);
    NodeLastInputMap[child] = CI;
  }
  DEBUG(errs() <<  "Start Each child node filter\n");
  // (4) Marshall arguments for each child node and start the thread with its
  //     pipeline function
  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
    DFNode* C = *ci;
    // Skip dummy node call
    if (C->isDummyNode())
      continue;

    // Marshall all the arguments for this node into an i8*
    // Pass to the runtime to create the thread
    // Start the thread for child node C
    startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI);
  }

  DEBUG(errs() << "Launch function:\n");
  DEBUG(errs() << *LaunchFunc << "\n");

  return LaunchFunc;
}
+
+
+Function* CGT_X86::createPushFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Push function\n");
+  Function* PushFunc;
+  return PushFunc;
+}
+
+Function* CGT_X86::createPopFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Pop function\n");
+  Function* PushFunc;
+  return PushFunc;
+}
+
+Function* CGT_X86::createWaitFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Wait function\n");
+  Function* PushFunc;
+  return PushFunc;
+}
/* This function does the steps necessary to launch a streaming graph.
 * Steps:
 * Create Pipeline/Filter function for each node in child graph of Root
 * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait
 * Modify each of the intrinsics in host code:
 * Launch, Push, Pop, Wait
 */
// Code generation for launching Root's DFG in streaming mode.
// Creates the streaming launch function, inserts a call to the runtime's
// stream-launch entry point in place of the visc launch intrinsic, and then
// rewrites every wait/push/pop intrinsic that uses the returned graph handle
// into the corresponding streaming runtime call.
void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
  IntrinsicInst* LI = Root->getInstruction();
  Function* RootLaunch = createLaunchFunction(Root);
  //Function* RootPush = createPushFunction(Root);
  //Function* RootPop = createPopFunction(Root);
  //Function* RootWait = createWaitFunction(Root);
  // Substitute launch intrinsic main.
  // Arg 0: pointer to the generated launch function; Arg 1: the packed input
  // data pointer taken from the original launch intrinsic.
  DEBUG(errs() <<  "Substitute launch intrinsic\n");
  Value* LaunchInstArgs[] = {RootLaunch,
                             LI->getArgOperand(1)
                            };
  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
                                          ArrayRef<Value*>(LaunchInstArgs,2),
                                          "graph"+Root->getFuncPointer()->getName(), LI);
  //ReplaceInstWithInst(LI, LaunchInst);

  DEBUG(errs() << *LaunchInst << "\n");
  // Replace all wait instructions with x86 specific wait instructions
  DEBUG(errs() <<  "Substitute wait, push, pop intrinsics\n");
  // NOTE(review): getUseList appears to return a heap-allocated vector that
  // is never freed here -- confirm ownership; delete after the loop if owned.
  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
  for(unsigned i=0; i < UseList->size(); ++i) {
    IntrinsicInst* II = UseList->at(i);
    CallInst* CI;
    // PushArgs is built unconditionally; operand 1 is only meaningful for the
    // visc_push case below (the value being pushed).
    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
    switch(II->getIntrinsicID()) {
    case Intrinsic::visc_wait:
      CI = CallInst::Create(llvm_visc_streamWait,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    case Intrinsic::visc_push:
      CI = CallInst::Create(llvm_visc_streamPush,
                            ArrayRef<Value*>(PushArgs, 2),
                            "");
      break;
    case Intrinsic::visc_pop:
      CI = CallInst::Create(llvm_visc_streamPop,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    default:
      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
    };
    DEBUG(errs() << "Replace:\n\t" << *II << "\n");
    ReplaceInstWithInst(II, CI);
    DEBUG(errs() << "\twith " << *CI << "\n");
  }


}
+
// Code generation for launching Root's DFG in non-streaming (one-shot) mode.
// Builds an i8*(i8*) wrapper ("LaunchDataflowGraph") that unpacks the input
// struct, calls the CPU-generated Root function, and stores the returned
// output struct back through the same data pointer. The visc launch intrinsic
// is then replaced by a runtime x86-launch call, and wait/push/pop intrinsics
// on the graph handle are rewritten to their x86 runtime equivalents.
void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
  // TODO: Place an assert to check if the constant passed by launch intrinsic
  // as the number of arguments to DFG is same as the number of arguments of the
  // root of DFG
  DEBUG(errs() << "Generating Launch Function\n");
  // Get Launch Instruction
  IntrinsicInst* LI = Root->getInstruction();
  switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
  DEBUG(errs() << "Generating Launch Function\n");

  /* Now we have all the necessary global declarations necessary to generate the
  * Launch function, pointer to which can be passed to pthread utils to execute
  * DFG. The Launch function has just one input: i8* data.addr
  * This is the address of the all the input data that needs to be passed to
  * this function. In our case it contains the input arguments of the Root
  * function in the correct order.
  * (1) Create an empty Launch function of type i8*(i8*)
  * (2) Extract each of inputs from data.addr and pass them as arguments to the
  * call to Root function
  * (3) The return value from Root is stored in memory, pointer to which is
  * passed to pthread_exit call.
  */
  // Create Launch Function of type i8*(i8*) which calls the root function
  Type* i8Ty = Type::getInt8Ty(M.getContext());
  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
                            ArrayRef<Type*>(i8Ty->getPointerTo()),
                            false);
  Function* AppFunc = Function::Create(AppFuncTy,
                                       Root->getFuncPointer()->getLinkage(),
                                       "LaunchDataflowGraph",
                                       &M);
  DEBUG(errs() << "Generating Launch Function\n");
  // Give a name to the argument which is used pass data to this thread
  Value* data = &*AppFunc->arg_begin();
  data->setName("data.addr");
  // Add a basic block to this empty function and a return null statement to it.
  // All subsequent instructions are inserted before this return (RI).
  BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
                                      Constant::getNullValue(AppFunc->getReturnType()),
                                      BB);
  switchToTimer(visc_TimerID_ARG_UNPACK, RI);

  DEBUG(errs() << "Created Empty Launch Function\n");
  // Find the X86 function generated for Root and
//  Function* RootF_X86 = Root->getGenFunc();
  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
  assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
         "Error: Generated Function for Root node with no x86 wrapper\n");

  // Generate a call to RootF_X86 with null parameters for now.
  // The real arguments are patched in below, once they have been unpacked
  // from data.addr (extractElements must insert its loads before the call).
  std::vector<Value*>Args;
  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
  }
  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);

  // Extract input data from i8* data.addr and patch them to correct argument of
  // call to RootF_X86. For each argument
  std::vector<Type*> TyList;
  std::vector<std::string> names;
  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
      ai != ae; ++ai) {
    TyList.push_back(ai->getType());
    names.push_back(ai->getName());
  }
  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
  // Patch the elements to the call arguments
  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
    CI->setArgOperand(i, elements[i]);

  // Add timers around Call to RootF_X86 function
  switchToTimer(visc_TimerID_COMPUTATION, CI);
  switchToTimer(visc_TimerID_OUTPUT_PACK, RI);

  // Code for returning the output: reinterpret data.addr as a pointer to the
  // output struct type and store the call result through it.
  CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
                             CI->getType()->getPointerTo(),
                             CI->getName()+".addr",
                             RI);
  new StoreInst(CI, OutputAddrCast, RI);
  switchToTimer(visc_TimerID_NONE, RI);

  DEBUG(errs() << "Application specific function:\n");
  DEBUG(errs() << *AppFunc << "\n");

  // Substitute launch intrinsic main
  Value* LaunchInstArgs[] = {AppFunc,
                             LI->getArgOperand(1)
                            };
  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
                                          ArrayRef<Value*>(LaunchInstArgs,2),
                                          "graph"+Root->getFuncPointer()->getName(), LI);
  //ReplaceInstWithInst(LI, LaunchInst);

  DEBUG(errs() << *LaunchInst << "\n");
  // Replace all wait instructions with x86 specific wait instructions.
  // NOTE(review): getUseList appears to return a heap-allocated vector that
  // is never freed here -- confirm ownership; delete after the loop if owned.
  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
  for(unsigned i=0; i < UseList->size(); ++i) {
    IntrinsicInst* II = UseList->at(i);
    CallInst* CI;
    switch(II->getIntrinsicID()) {
    case Intrinsic::visc_wait:
      CI = CallInst::Create(llvm_visc_x86_wait,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    case Intrinsic::visc_push:
      CI = CallInst::Create(llvm_visc_bufferPush,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    case Intrinsic::visc_pop:
      CI = CallInst::Create(llvm_visc_bufferPop,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    default:
      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
    };
    ReplaceInstWithInst(II, CI);
    DEBUG(errs() << *CI << "\n");
  }

}
+
+Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
+  // TODO: Assumption is that each input port of a node has just one
+  // incoming edge. May change later on.
+
+  // Find the incoming edge at the requested input port
+  DFEdge* E = Child->getInDFEdgeAt(i);
+  assert(E && "No incoming edge or binding for input element!");
+  // Find the Source DFNode associated with the incoming edge
+  DFNode* SrcDF = E->getSourceDF();
+
+  // If Source DFNode is a dummyNode, edge is from parent. Get the
+  // argument from argument list of this internal node
+  Value* inputVal;
+  if(SrcDF->isEntryNode()) {
+    inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
+    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+  }
+  else {
+    // edge is from a sibling
+    // Check - code should already be generated for this source dfnode
+    assert(OutputMap.count(SrcDF)
+           && "Source node call not found. Dependency violation!");
+
+    // Find CallInst associated with the Source DFNode using OutputMap
+    Value* CI = OutputMap[SrcDF];
+
+    // Extract element at source position from this call instruction
+    std::vector<unsigned> IndexList;
+    IndexList.push_back(E->getSourcePosition());
+    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                           "", InsertBefore);
+    inputVal = EI;
+  }
+  return inputVal;
+}
+
+void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
+                              ValueToValueMapTy &VMap,Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+//  Function* CF_X86 = C->getGenFunc();
+  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(CF_X86 != NULL
+         && "Found leaf node for which code generation has not happened yet!\n");
+  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "The generated function to be called from x86 backend is not an x86 function\n");
+  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
+
+  std::vector<Value*> Args;
+  // Create argument list to pass to call instruction
+  // First find the correct values using the edges
+  // The remaing six values are inserted as constants for now.
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(getInValueAt(C, i, F_X86, IB));
+  }
+
+  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
+  for(unsigned j=0; j<6; j++)
+    Args.push_back(I64Zero);
+
+  errs() << "Gen Function type: " << *CF_X86->getType() << "\n";
+  errs() << "Node Function type: " << *CF->getType() << "\n";
+  errs() << "Arguments: " << Args.size() << "\n";
+
+  // Call the F_X86 function associated with this node
+  CallInst* CI = CallInst::Create(CF_X86, Args,
+                                  CF_X86->getName()+"_output",
+                                  IB);
+  DEBUG(errs() << *CI << "\n");
+  OutputMap[C] = CI;
+
+  // Find num of dimensions this node is replicated in.
+  // Based on number of dimensions, insert loop instructions
+  std::string varNames[3] = {"x", "y", "z"};
+  unsigned numArgs = CI->getNumArgOperands();
+  for(unsigned j=0; j < C->getNumOfDim(); j++) {
+    Value* indexLimit = NULL;
+    // Limit can either be a constant or an arguement of the internal node.
+    // In case of constant we can use that constant value directly in the
+    // new F_X86 function. In case of an argument, we need to get the mapped
+    // value using VMap
+    if(isa<Constant>(C->getDimLimits()[j])) {
+      indexLimit = C->getDimLimits()[j];
+      DEBUG(errs() << "In Constant case:\n"
+             << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    else {
+      indexLimit = VMap[C->getDimLimits()[j]];
+      DEBUG(errs() << "In VMap case:"
+             <<"  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    assert(indexLimit && "Invalid dimension limit!");
+    // Insert loop
+    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
+    DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
+    // Insert index variable and limit arguments
+    CI->setArgOperand(numArgs-6+j, indexVar);
+    CI->setArgOperand(numArgs-3+j, indexLimit);
+  }
+  // Insert call to runtime to push the dim limits and instanceID on the depth
+  // stack
+  Value* args[] = {
+    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
+    CI->getArgOperand(numArgs-3+0), // limitX
+    CI->getArgOperand(numArgs-6+0), // iX
+    CI->getArgOperand(numArgs-3+1), // limitY
+    CI->getArgOperand(numArgs-6+1), // iY
+    CI->getArgOperand(numArgs-3+2), // limitZ
+    CI->getArgOperand(numArgs-6+2)  // iZ
+  };
+
+  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
+  DEBUG(errs() << "Push on stack: " << *Push << "\n");
+  // Insert call to runtime to pop the dim limits and instanceID from the depth
+  // stack
+  BasicBlock::iterator i(CI);
+  ++i;
+  Instruction* NextI = &*i;
+  // Next Instruction should also belong to the same basic block as the basic
+  // block will have a terminator instruction
+  assert(NextI->getParent() == CI->getParent()
+         && "Next Instruction should also belong to the same basic block!");
+
+  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
+  DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
+  DEBUG(errs() << *CI->getParent()->getParent());
+}
+
/* This function takes a DFNode and creates a filter function for it. By filter
 * function we mean a function that repeatedly reads input from its input
 * buffers, applies the node's function to those inputs, and pushes the results
 * onto its output buffers.
 */
+// Create a function with void* (void*) type.
+// Create a new basic block
+// Add a return instruction to the basic block
+// extract arguments from the aggregate data input. Type list would be
+// Replace the streaming inputs with i8* types signifying handle to
+// corresponding buffers
+// Add a boolean argument isLastInput
+// Add runtime API calls to get input for each of the streaming inputs
+// Add a call to the generated function of the child node
+// Add runtime API calls to push output for each of the streaming outputs
+// Add loop around the basic block, which exits the loop if isLastInput is false
+
+Function* CGT_X86::createFunctionFilter(DFNode* C) {
+  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");
+
+  /* Create a function with same argument list as child.*/
+  DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
+  // Get the generated function for child node
+  Function* CF = C->getFuncPointer();
+  // Create Filter Function of type i8*(i8*) which calls the root function
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
+                                ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                false);
+  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
+                          CF->getLinkage(),
+                          CF->getName()+"_Pipeline",
+                          &M);
+  DEBUG(errs() << "Generating Pipline Function\n");
+  // Give a name to the argument which is used pass data to this thread
+  Value* data = &*CF_Pipeline->arg_begin();
+  data->setName("data.addr");
+  // Create a new basic block
+  DEBUG(errs() << "\tCreate new BB and add a return function\n");
+  // Add a basic block to this empty function
+  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
+  // Add a return instruction to the basic block
+  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
+                                      UndefValue::get(CF_Pipeline->getReturnType()), BB);
+
+
+  /* Extract the elements from the aggregate argument to the function.
+   * Replace the streaming inputs with i8* types signifying handle to
+   * corresponding buffers
+   * Add outputs to the list as well
+   * Add isLastInput to the list
+   */
+  DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
+  // These Args will be used when passing arguments to the generated function
+  // inside loop, and reading outputs as well.
+  std::vector<Value*> Args;
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  // Adding inputs
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back((Twine(i->getName())+"_buffer").str());
+    }
+    else {
+      TyList.push_back(i->getType());
+      names.push_back(i->getName());
+    }
+  }
+  // Adding outputs. FIXME: Since we assume all outputs to be streaming edges,
+  // because we get there buffer handles
+  StructType* RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
+    TyList.push_back(i8Ty->getPointerTo());
+    names.push_back("out");
+  }
+  /* Add a boolean argument isLastInput */
+  DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n");
+  TyList.push_back(i8Ty->getPointerTo());
+  names.push_back("isLastInput_buffer");
+
+  // Extract the inputs, outputs and
+  Args = extractElements(data, TyList, names, RI);
+  for(unsigned i=0; i<Args.size(); i++) {
+    DEBUG(errs() << *Args[i] << "\n");
+  }
+
+  // Split the Args vector into, input output and isLastInput
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = RetTy->getNumElements();
+  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
+  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
+
+  /* Add runtime API calls to get input for each of the streaming input edges */
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
+  // First read the termination condition variable islastInput
+  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
+                                        ArrayRef<Value*>(isLastInput),
+                                        "",
+                                        RI);
+
+  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
+                 Type::getInt64Ty(CF_Pipeline->getContext()),
+                 false,
+                 "isLastInput",
+                 RI);
+  isLastInput = BI;
+  // Create a loop termination condition
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
+      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
+      RI);
+
+  // Get input from buffers of all the incoming streaming edges
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop,
+                                            ArrayRef<Value*>(InputArgs[i->getArgNo()]),
+                                            "",
+                                            RI);
+      CastInst* BI;
+      if(i->getType()->isPointerTy()) {
+        BI = CastInst::Create(CastInst::IntToPtr,
+                              bufferIn,
+                              i->getType(),
+                              i->getName()+".addr",
+                              RI);
+      }
+      else if(i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn,
+                                    i->getType(),
+                                    i->getName()+".addr",
+                                    RI);
+      }
+      else {
+        BI = CastInst::CreateIntegerCast(bufferIn,
+                                         i->getType(),
+                                         false,
+                                         i->getName()+".addr",
+                                         RI);
+      }
+      // Replace the argument in Args vector. We would be using the vector as
+      // parameters passed to the call
+      InputArgs[i->getArgNo()] = BI;
+    }
+  }
+  /* Add a call to the generated function of the child node */
+  DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
+//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+//                                  C->getGenFunc()->getName()+".output", RI);
+  Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
+  DEBUG(errs() << "Type: "
+               << *CGenF->getType()
+               << "\n");
+  CallInst* CI = CallInst::Create(CGenF,
+                                  InputArgs,
+                                  CGenF->getName()+".output",
+                                  RI);
+
+  /* Add runtime API calls to push output for each of the streaming outputs */
+  // FIXME: Assumption
+  // All edges between siblings are streaming edges
+  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n");
+  for (unsigned i=0; i< numOutputs; i++) {
+    // Extract output
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i),
+                           "",RI);
+    // Convert to i64
+    CastInst* BI;
+    if(EI->getType()->isPointerTy())
+      BI = CastInst::Create(CastInst::PtrToInt,EI,
+                            Type::getInt64Ty(CF_Pipeline->getContext()),
+                            "",
+                            RI);
+    else
+      BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()),
+                                       false, "", RI);
+    // Push to Output buffer
+    Value* bufferOutArgs[] = {OutputArgs[i], BI};
+    CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush,
+                                           ArrayRef<Value*>(bufferOutArgs, 2),
+                                           "",
+                                           RI);
+  }
+
+  // Add loop around the basic block, which exits the loop if isLastInput is false
+  //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond);
+//  addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(),
+//                RI, Cond);
+
+  // Add loop around the basic block, which exits the loop if isLastInput is false
+  // Pointers to keep the created loop structure
+  BasicBlock *EntryBB, *CondBB, *BodyBB;
+  Instruction *CondStartI = cast<Instruction>(isLastInputPop);
+  Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
+  EntryBB = CondStartI->getParent();
+
+  addWhileLoop(CondStartI, BodyStartI, RI, Cond);
+  CondBB = CondStartI->getParent();
+  BodyBB = CI->getParent();
+  Instruction *CntI = NULL;
+  CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF);
+
+  // If the node function calls the visc runtime call to get policy, we update
+  // it with the counter information. This means we need to pass an additional
+  // argument to the generated function, that is the iteration number, and then
+  // use it as an argument to the policy_getVersion call 
+  if (GetPolicyCI) {
+    CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB);
+    assert(CntI && "Counter instruction not found\n");
+
+    // Create new function type (with additional argument for iteration number)
+    Type *NewRetTy = CGenF->getFunctionType()->getReturnType();
+    std::vector<Type*> NewArgTypes;
+    for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end();
+         ai != ae ; ++ai) {
+      NewArgTypes.push_back(ai->getType());
+    }
+    NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
+    FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false);
+    Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false);
+    // At least one (the last) argument exists (we added it)
+    Function::arg_iterator ae = NewCGenF->arg_end();
+    --ae;
+    Argument *CntArg = &*ae;
+    CntArg->setName("iteration");
+    // Replace the old cpu gen func with this one
+    C->addGenFunc(NewCGenF, visc::CPU_TARGET, true);
+
+    // Add counter to the actual parameter list, to create the new call
+    InputArgs.push_back(CntI);
+    CallInst* newCI = CallInst::Create(NewCGenF,
+                                       InputArgs,
+                                       NewCGenF->getName()+".output");
+    ReplaceInstWithInst(CI, newCI);
+
+    // Set second operand of the policy_getVersion call to the last function
+    // argument
+    GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF);
+    GetPolicyCI->setArgOperand(1, CntArg);
+  }
+
+  // Return the Function pointer
+  DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
+  DEBUG(errs() << *CF_Pipeline << "\n");
+  return CF_Pipeline;
+}
+
+void CGT_X86::codeGen(DFInternalNode* N) {
+  // Generate the x86/CPU host function for internal dataflow node N, in two
+  // phases:
+  //   (1) If every non-dummy child already has a CPU x86 function, emit a CPU
+  //       function for N that calls each child in topological order and
+  //       assembles N's output struct from the exit node's incoming edges.
+  //   (2) Collapse the per-target generated functions (CPU/GPU/SPIR) into a
+  //       single CPU_TARGET entry; when several target versions exist, emit a
+  //       wrapper that dispatches at runtime via llvm_visc_policy_getVersion.
+  //
+  // Check if N is root node and its graph is streaming. We do not do codeGen
+  // for Root in such a case
+  if(N->isRoot() && N->isChildGraphStreaming())
+    return;
+
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+//  if(N->getGenFunc() != NULL)
+//    return;
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
+
+  // Sort children in topological order before code generation
+  N->getChildGraph()->sortChildren();
+
+  // Only process if all children have a CPU x86 function
+  // Otherwise skip to end
+  bool codeGen = true;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+
+    if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
+      errs() << "No CPU x86 version for child node "
+             << C->getFuncPointer()->getName()
+             << "\n  Skip code gen for parent node "
+             << N->getFuncPointer()->getName() << "\n";
+      codeGen = false;
+    }
+  }
+
+  if (codeGen) {
+    Function* F = N->getFuncPointer();
+    // Create a clone of F with no instructions. Only the type is the same as F
+    // without the extra arguments.
+    Function* F_X86;
+  
+    // Clone the function, if we are seeing this function for the first time. We
+    // only need a clone in terms of type.
+    ValueToValueMapTy VMap;
+  
+    // Create new function with the same type
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+
+    // Loop over the arguments, copying the names of arguments over.
+    Function::arg_iterator dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName()); // Copy the name over...
+      // Increment dest iterator
+      ++dest_iterator;
+    }
+
+    // Add a basic block to this empty function
+    BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
+    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
+                                        UndefValue::get(F_X86->getReturnType()), BB);
+
+    // Add Index and Dim arguments except for the root node and the child graph of
+    // parent node is not streaming
+    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+      F_X86 = addIdxDimArgs(F_X86);
+
+    // addIdxDimArgs may have recreated the function; refresh the entry block
+    // and return instruction pointers before emitting into them.
+    BB = &*F_X86->begin();
+    RI = cast<ReturnInst>(BB->getTerminator());
+  
+    //Add generated function info to DFNode
+//    N->setGenFunc(F_X86, visc::CPU_TARGET);
+    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+    // Loop over the arguments, to create the VMap.
+    dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      // Add mapping and increment dest iterator
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+
+    // Iterate over children in topological order
+    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+      DFNode* C = *ci;
+      // Skip dummy node call
+      if (C->isDummyNode())
+        continue;
+  
+      // Create calls to CPU function of child node
+      invokeChild_X86(C, F_X86, VMap, RI);
+  
+    }
+ 
+    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+    // Generate code for output bindings
+    // Get Exit node
+    DFNode* C = N->getChildGraph()->getExit();
+    // Get OutputType of this node
+    StructType* OutTy = N->getOutputType();
+    Value *retVal = UndefValue::get(F_X86->getReturnType());
+    // Find all the input edges to exit node
+    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+      DEBUG(errs() << "Output Edge " << i << "\n");
+      // Find the incoming edge at the requested input port
+      DFEdge* E = C->getInDFEdgeAt(i);
+  
+      assert(E && "No Binding for output element!");
+      // Find the Source DFNode associated with the incoming edge
+      DFNode* SrcDF = E->getSourceDF();
+  
+      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+  
+      // If Source DFNode is a dummyNode, edge is from parent. Get the
+      // argument from argument list of this internal node
+      Value* inputVal;
+      if(SrcDF->isEntryNode()) {
+        inputVal = getArgumentAt(F_X86, i);
+        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+      }
+      else {
+        // edge is from a internal node
+        // Check - code should already be generated for this source dfnode
+        assert(OutputMap.count(SrcDF)
+               && "Source node call not found. Dependency violation!");
+  
+        // Find Output Value associated with the Source DFNode using OutputMap
+        Value* CI = OutputMap[SrcDF];
+  
+        // Extract element at source position from this call instruction
+        std::vector<unsigned> IndexList;
+        IndexList.push_back(E->getSourcePosition());
+        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                               "",RI);
+        inputVal = EI;
+      }
+      std::vector<unsigned> IdxList;
+      IdxList.push_back(i);
+      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+    }
+    DEBUG(errs() << "Extracted all\n");
+    retVal->setName("output");
+    // Replace the placeholder "return undef" with a return of the fully
+    // assembled output struct.
+    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReplaceInstWithInst(RI, newRI);
+
+  }
+
+  //-------------------------------------------------------------------------//
+  // Here, we need to check if this node (N) has more than one versions
+  // If so, we query the policy and have a call to each version
+  // If not, we see which version exists, check that it is in fact an x86
+  // function and save it as the CPU_TARGET function
+
+  // TODO: visc_id per node, so we can use this for id for policies
+  // For now, use node function name and change it later
+  Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+  Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+  Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+  bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+  bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+  bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+  errs() << "Node: " << N->getFuncPointer()->getName()
+                     << " with tag " << N->getTag() << "\n";
+  errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
+  errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
+  errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
+
+
+  if (N->getTag() == visc::None) {
+    // No code is available for this node. This (usually) means that this
+    // node is a node that
+    // - from the accelerator backends has been mapped to an intermediate
+    // node, and thus they have not produced a genFunc
+    // - a child node had no CPU hint, thus no code gen for CPU could 
+    // take place
+    errs() << "No GenFunc - Skipping CPU code generation for node "
+           << N->getFuncPointer()->getName() << "\n";
+  } else if (viscUtils::isSingleTargetTag(N->getTag())) {
+    // There is a single version for this node according to code gen hints.
+    // Therefore, we do not need to check the policy, we simply use the
+    // available implementation, whichever target it is for.
+
+    // Sanity check - to be removed TODO
+    switch (N->getTag()) {
+      case visc::CPU_TARGET:
+        assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+        break;
+      case visc::GPU_TARGET:
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+        break;
+      case visc::SPIR_TARGET:
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && "");
+        break;
+      default:
+        assert(false && "Unreachable: we checked that tag was single target!\n");
+        break;
+    }
+
+    // If device abstraction is enabled, then we may need to edit the node 
+    // function. In case this is a GPU or SPIR gen func, we issue a call to
+    // the runtime that waits for the device to be available
+    if (DeviceAbstraction) {
+      Function *NodeGenFunc = NULL;
+      switch (N->getTag()) {
+        case visc::GPU_TARGET:
+          NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
+          break;
+        case visc::SPIR_TARGET:
+          NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET);
+          break;
+        default:
+          break;
+      }
+
+      if (NodeGenFunc) {
+        // If we found a function to edit, we add the call to the runtime as
+        // its first statement
+        BasicBlock *BB = &*NodeGenFunc->begin();
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI());
+      }
+
+    }
+
+    // Re-register the single available implementation as the CPU_TARGET
+    // function and drop the other target slots, so downstream passes only
+    // ever see a CPU version for this node.
+    Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::None);
+    N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+    N->setTag(visc::CPU_TARGET);
+
+    // Sanity checks - to be removed TODO
+    CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+    GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+    CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+    GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+    errs() << "After editing\n";
+    errs() << "Node: " << N->getFuncPointer()->getName()
+                       << " with tag " << N->getTag() << "\n";
+    errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
+    errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
+    errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
+
+    //  assert(false && "got to the point where we have to select\n");
+  } else {
+    // We have more than one targets
+    
+    errs() << "Node Name (for policy) : "
+           << N->getFuncPointer()->getName() << "\n";
+
+    Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+    Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+    bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+    bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+    // These assertions express what we can support with the current runtime.
+    // Code generation works the same way even for other target combinations.
+    // For now, we want either CPU and GPU, or CPU and SPIR
+    assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n");
+    assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) &&
+           "Generated functions without appropriate x86 wrapper\n");
+
+    FunctionType *FT = CF->getFunctionType();
+    if (GF)
+      assert(FT == GF->getFunctionType() &&
+             "Type mismatch between generated functions for GPU and CPU targets.\n");
+    if (SF)
+      assert(FT == SF->getFunctionType() &&
+             "Type mismatch between generated functions for SPIR and CPU targets.\n");
+
+    // Code generation of wrapper function
+    Function *F_wrapper;
+    ValueToValueMapTy VMap;
+    F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M);
+
+    // Copy argument names over
+    Function::arg_iterator dest_iterator = F_wrapper->arg_begin();
+    for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName());
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+    // Gather all arguments of wrapper in a vector, to prepare the call to
+    // the individual gen functions
+    std::vector<Value *> GenFuncCallArgs;
+    for (Function::arg_iterator i = F_wrapper->arg_begin(), e = F_wrapper->arg_end();
+         i != e; ++i) {
+      GenFuncCallArgs.push_back(&*i);
+    }
+
+    BasicBlock *BBcurrent, *BBtrue, *BBfalse;
+
+    BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper);
+
+    // Materialize the node function's name as a stack-allocated,
+    // null-terminated C string; it identifies this node in the call to
+    // llvm_visc_policy_getVersion below.
+    StringRef FName = N->getFuncPointer()->getName();
+    size_t nameSize = FName.size()+1;
+    std::vector<Constant *> NameV;
+    for (char c: FName) {
+      NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c));
+    }
+    NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0'));
+    ArrayType *NameType =
+      ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize);
+    AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent);
+    Constant *NameConst = ConstantArray::get(NameType, NameV);
+    StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent);
+    CastInst *BI = BitCastInst::CreatePointerCast(AI,
+                     Type::getInt8PtrTy(M.getContext()), "", BBcurrent);
+    std::vector<Value *> Args;
+    Args.push_back(BI);
+    // NOTE(review): the -1 second argument presumably means "no iteration
+    // count available" for the policy query -- confirm against the runtime's
+    // llvm_visc_policy_getVersion signature.
+    Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true));
+    Function *RTF =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion",
+      runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType()));
+    CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent);
+
+    // The policy return value selects the version: 0 = CPU, 1 = GPU, 2 = SPIR.
+    ConstantInt *CmpConst =
+      ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true);
+    CmpInst *CmpI = CmpInst::Create(Instruction::ICmp,
+                                    CmpInst::ICMP_EQ,
+                                    RTFInst, CmpConst,
+                                    "", BBcurrent);
+
+    BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper);
+    BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper);
+    BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+    CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue);
+    ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (GF) {
+      // We have a GPU version. Generate policy check and call
+      CmpConst =
+         ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue =  BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+      
+      GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+      if (DeviceAbstraction) {
+        // Prepare arguments and function for call to wait for device runtime call
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+      }
+    }
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (SF) {
+      // We have a SPIR version. Generate policy check and call
+      CmpConst =
+         ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue =  BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+      
+      GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+      if (DeviceAbstraction) {
+        // Prepare arguments and function for call to wait for device runtime call
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+      }
+    }
+
+    // No version matched the policy value: fall through returning undef.
+    RI = ReturnInst::Create(M.getContext(),
+                            UndefValue::get(FT->getReturnType()), BBfalse);
+
+    // Now, make the node cpu gen func to be this one
+    // Remove all other versions and update the tag
+    N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::CPU_TARGET);
+
+    // assert(false && "got to the point where we have to combine\n");
+  }
+
+}
+
+// Code generation for leaf nodes
+void CGT_X86::codeGen(DFLeafNode* N) {
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // At this point, the X86 backend does not support code generation for
+  // the case where allocation node is used, so we skip. This means that a
+  // CPU version will not be created, and therefore code generation will
+  // only succeed if another backend (nvptx or spir) has been invoked to
+  // generate a node function for the node including the allocation node.
+  if (N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
+
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+//  if(N->getGenFunc() != NULL)
+//    return;
+
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
+
+    errs() << "Check for cudnn or promise hint for node "
+           << N->getFuncPointer()->getName() <<  "\n";
+
+    switch (N->getTag()) {
+       case visc::CUDNN_TARGET: {
+          errs() << "CUDNN hint found. Store CUDNN function as CPU funtion.\n";
+         // Make sure there is a generated x86 function for cudnn
+         assert(N->getGenFuncForTarget(visc::CUDNN_TARGET) && "");
+         assert(N->hasX86GenFuncForTarget(visc::CUDNN_TARGET) && "");
+         // Store the CUDNN x86 function as the CPU generated function
+         Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+         // after adding the required number of arguments
+         if (!N->getParent()->isChildGraphStreaming())
+           Ftmp = addIdxDimArgs(Ftmp);
+
+         N->removeGenFuncForTarget(visc::CUDNN_TARGET);
+         N->setTag(visc::None);
+         N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+         N->setTag(visc::CPU_TARGET);
+         break;
+         }
+       case visc::PROMISE_TARGET: {
+          errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n";
+         // Make sure there is a generated x86 function for promise
+         assert(N->getGenFuncForTarget(visc::PROMISE_TARGET) && "");
+         assert(N->hasX86GenFuncForTarget(visc::PROMISE_TARGET) && "");
+         // Store the PROMISE x86 function as the CPU generated function
+         Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+         // after adding the required number of arguments
+         if (!N->getParent()->isChildGraphStreaming())
+           Ftmp = addIdxDimArgs(Ftmp);
+
+         N->setTag(visc::None);
+         N->removeGenFuncForTarget(visc::PROMISE_TARGET);
+         N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+         N->setTag(visc::CPU_TARGET);
+         break;
+         }
+       case visc::GPU_TARGET:
+         // A leaf node should not have an x86 function for GPU
+         // by design of DFG2LLVM_NVPTX backend
+         assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+         break;
+       case visc::SPIR_TARGET:
+         // A leaf node should not have an x86 function for SPIR
+         // by design of DFG2LLVM_SPIR backend
+         assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+         break;
+       default:
+         break;
+    }
+
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
+
+  std::vector<IntrinsicInst *> IItoRemove;
+  std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
+  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+
+  // Clone the function, if we are seeing this function for the first time.
+  Function *F_X86;
+  ValueToValueMapTy VMap;
+  F_X86 = CloneFunction(F, VMap);
+  F_X86->removeFromParent();
+  // Insert the cloned function into the module
+  M.getFunctionList().push_back(F_X86);
+
+  // Add the new argument to the argument list. Add arguments only if the child
+  // graph of parent node is not streaming
+  if(!N->getParent()->isChildGraphStreaming())
+    F_X86 = addIdxDimArgs(F_X86);
+
+  // Add generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+  // Go through the arguments, and any pointer arguments with in attribute need
+  // to have x86_argument_ptr call to get the x86 ptr of the argument
+  // Insert these calls in a new BB which would dominate all other BBs
+  // Create new BB
+  BasicBlock* EntryBB = &*F_X86->begin();
+  BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
+  BranchInst* Terminator = BranchInst::Create(EntryBB, BB);
+  // Insert calls
+  for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
+        ai != ae; ++ai) {
+    if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) {
+      assert(ai->getType()->isPointerTy()
+          && "Only pointer arguments can have visc in/out attributes ");
+      Function::arg_iterator aiNext = ai;
+      ++aiNext;
+      Argument* size = &*aiNext;
+      assert(size->getType() == Type::getInt64Ty(M.getContext())
+          && "Next argument after a pointer should be an i64 type");
+      CastInst* BI = BitCastInst::CreatePointerCast(&*ai,
+                                                    Type::getInt8PtrTy(M.getContext()),
+                                                    ai->getName()+".i8ptr",
+                                                    Terminator);
+      Value* ArgPtrCallArgs[] = {BI, size};
+      CallInst::Create(llvm_visc_x86_argument_ptr,
+                                              ArrayRef<Value*>(ArgPtrCallArgs, 2),
+                                              "",
+                                              Terminator);
+
+    }
+  }
+  errs() << *BB << "\n";
+
+  // Go through all the instructions
+  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
+    Instruction *I = &(*i);
+    DEBUG(errs() << *I << "\n");
+    // Leaf nodes should not contain VISC graph intrinsics or launch
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+
+    if (BuildDFG::isViscQueryIntrinsic(I)) {
+      IntrinsicInst* II = cast<IntrinsicInst>(I);
+      IntrinsicInst* ArgII;
+      DFNode* ArgDFNode;
+
+      /***********************************************************************
+      *                        Handle VISC Query intrinsics                  *
+      ***********************************************************************/
+      switch (II->getIntrinsicID()) {
+      /**************************** llvm.visc.getNode() *******************/
+      case Intrinsic::visc_getNode: {
+        // add mapping <intrinsic, this node> to the node-specific map
+        Leaf_HandleToDFNodeMap[II] = N;
+        IItoRemove.push_back(II);
+        break;
+      }
+      /************************* llvm.visc.getParentNode() ****************/
+      case Intrinsic::visc_getParentNode: {
+        // get the parent node of the arg node
+        // get argument node
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        // get the parent node of the arg node
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // Add mapping <intrinsic, parent node> to the node-specific map
+        // the argument node must have been added to the map, or else the
+        // code could not refer to it
+        Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
+        IItoRemove.push_back(II);
+        break;
+      }
+      /*************************** llvm.visc.getNumDims() *****************/
+      case Intrinsic::visc_getNumDims: {
+        // get node from map
+        // get the appropriate field
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim();
+        IntegerType* IntTy = Type::getInt32Ty(M.getContext());
+        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+
+        II->replaceAllUsesWith(numOfDimConstant);
+        IItoRemove.push_back(II);
+        break;
+      }
+      /*********************** llvm.visc.getNodeInstanceID() **************/
+      case Intrinsic::visc_getNodeInstanceID_x:
+      case Intrinsic::visc_getNodeInstanceID_y:
+      case Intrinsic::visc_getNodeInstanceID_z: {
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+
+        // The dfnode argument should be an ancestor of this leaf node or
+        // the leaf node itself
+        int parentLevel = N->getAncestorHops(ArgDFNode);
+        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
+               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+
+        // Get specified dimension
+        // (dim = 0) => x
+        // (dim = 1) => y
+        // (dim = 2) => z
+        int dim = (int) (II->getIntrinsicID() -
+                         Intrinsic::visc_getNodeInstanceID_x);
+        assert((dim >= 0) && (dim < 3)
+               && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!");
+
+        // For immediate ancestor, use the extra argument introduced in
+        // F_X86
+        int numParamsF = F->getFunctionType()->getNumParams();
+        int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
+        assert((numParamsF_X86 - numParamsF == 6)
+               && "Difference of arguments between function and its clone is not 6!");
+
+        if(parentLevel == 0) {
+          // Case when the query is for this node itself
+          unsigned offset = 3 + (3-dim);
+          // Traverse argument list of F_X86 in reverse order to find the
+          // correct index or dim argument.
+          Argument* indexVal = getArgumentFromEnd(F_X86, offset);
+          assert(indexVal && "Index argument not found. Invalid offset!");
+
+          DEBUG(errs() << *II << " replaced with " << *indexVal << "\n");
+
+          II->replaceAllUsesWith(indexVal);
+          IItoRemove.push_back(II);
+        }
+        else {
+          // Case when query is for an ancestor
+          Value* args[] = {
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
+          };
+          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance,
+                                          ArrayRef<Value*>(args, 2),
+                                          "nodeInstanceID", II);
+          DEBUG(errs() << *II << " replaced with " << *CI << "\n");
+          II->replaceAllUsesWith(CI);
+          IItoRemove.push_back(II);
+        }
+        break;
+      }
+      /********************** llvm.visc.getNumNodeInstances() *************/
+      case Intrinsic::visc_getNumNodeInstances_x:
+      case Intrinsic::visc_getNumNodeInstances_y:
+      case Intrinsic::visc_getNumNodeInstances_z: {
+
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+
+        // The dfnode argument should be an ancestor of this leaf node or
+        // the leaf node itself
+        int parentLevel = N->getAncestorHops(ArgDFNode);
+        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
+               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+
+        // Get specified dimension
+        // (dim = 0) => x
+        // (dim = 1) => y
+        // (dim = 2) => z
+        int dim = (int) (II->getIntrinsicID() -
+                         Intrinsic::visc_getNumNodeInstances_x);
+        assert((dim >= 0) && (dim < 3)
+               && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!");
+
+        // For immediate ancestor, use the extra argument introduced in
+        // F_X86
+        int numParamsF = F->getFunctionType()->getNumParams();
+        int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
+        assert((numParamsF_X86 - numParamsF == 6)
+               && "Difference of arguments between function and its clone is not 6!");
+
+        if(parentLevel == 0) {
+          // Case when the query is for this node itself
+          unsigned offset = 3 - dim;
+          // Traverse argument list of F_X86 in reverse order to find the
+          // correct index or dim argument.
+          Argument* limitVal = getArgumentFromEnd(F_X86, offset);
+          assert(limitVal && "Limit argument not found. Invalid offset!");
+
+          DEBUG(errs() << *II << " replaced with " <<  *limitVal << "\n");
+
+          II->replaceAllUsesWith(limitVal);
+          IItoRemove.push_back(II);
+        }
+        else {
+          // Case when query is from the ancestor
+          Value* args[] = {
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
+          };
+          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit,
+                                          ArrayRef<Value*>(args, 2),
+                                          "numNodeInstances", II);
+          DEBUG(errs() << *II << " replaced with " << *CI << "\n");
+          II->replaceAllUsesWith(CI);
+          IItoRemove.push_back(II);
+        }
+
+        break;
+      }
+      default:
+        DEBUG(errs() << "Found unknown intrinsic with ID = " <<
+              II->getIntrinsicID() << "\n");
+        assert(false && "Unknown VISC Intrinsic!");
+        break;
+      }
+
+    } else {
+      //TODO: how to handle address space qualifiers in load/store
+    }
+
+  }
+
+  //TODO:
+  // When to replace the uses?
+  // In which order is it safe to replace the instructions in
+  // IItoReplace?
+  // Probably in the reverse order in the vectors
+  // It is a good idea to have them in one vector and chech the type
+  // using dyn_cast in order to determine if we replace with inst or value
+
+
+  //TODO: maybe leave these instructions to be removed by a later DCE pass
+  for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
+       i != IItoRemove.end(); ++i) {
+    (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType()));
+    (*i)->eraseFromParent();
+  }
+
+  DEBUG(errs() << *F_X86);
+}
+
+} // End of namespace
+
+// Pass registration for the X86 backend lowering pass (invoked as
+// `opt -dfg2llvm-x86`).
+char DFG2LLVM_X86::ID = 0;
+static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86",
+                                    "Dataflow Graph to LLVM for X86 backend",
+                                    false /* CFGOnly: pass looks at more than the CFG */,
+                                    true /* is_analysis -- NOTE(review): the original
+                                            comment called this "transformation, not just
+                                            analysis", but this RegisterPass argument marks
+                                            the pass as an analysis. Verify intent. */);
+
diff --git a/lib/DFG2LLVM_X86/DFG2LLVM_X86.exports b/lib/DFG2LLVM_X86/DFG2LLVM_X86.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/DFG2LLVM_X86/LLVMBuild.txt b/lib/DFG2LLVM_X86/LLVMBuild.txt
new file mode 100644
index 0000000000..1e82065bf0
--- /dev/null
+++ b/lib/DFG2LLVM_X86/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/DFG2LLVM_X86/LLVMBuild.txt -------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_X86
+parent = Transforms
diff --git a/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt b/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt
new file mode 100644
index 0000000000..75569addda
--- /dev/null
+++ b/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Build the DFG2LLVM_X86_dsoc codegen pass as a loadable `opt` plugin.
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( DFG2LLVM_X86_dsoc 
+  DFG2LLVM_X86_dsoc.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
new file mode 100644
index 0000000000..fbe5e4f6bd
--- /dev/null
+++ b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
@@ -0,0 +1,2128 @@
+//===-------------------------- DFG2LLVM_X86.cpp --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "DFG2LLVM_X86"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+// -visc-timers-x86: instrument the generated x86 host code with VISC timers.
+static cl::opt<bool>
+VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
+// -visc-eda (hidden, default off): enable the device abstraction layer, which
+// adds runtime calls that simulate device status (see initRuntimeAPI).
+static cl::opt<bool>
+DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
+                   cl::desc("Enable visc device abstraction"));
+
+
+namespace {
+
+// Helper Functions
+// Returns true iff I is a call whose callee (after stripping pointer casts)
+// is named "llvm_visc_policy_getVersion".
+static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
+  CallInst *Call = dyn_cast<CallInst>(I);
+  if (Call == nullptr)
+    return false;
+  StringRef CalleeName = Call->getCalledValue()->stripPointerCasts()->getName();
+  return CalleeName.equals("llvm_visc_policy_getVersion");
+}
+
+// Scans F in instruction order and returns the first call to
+// llvm_visc_policy_getVersion, or NULL when F contains no such call.
+CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
+  for (Instruction &Inst : instructions(F)) {
+    if (isVISCCall_llvm_visc_policy_getVersion(&Inst))
+      return cast<CallInst>(&Inst);
+  }
+  return NULL;
+}
+
+// DFG2LLVM_X86 - The first implementation.
+// Module pass that lowers a VISC dataflow graph to LLVM IR targeting the
+// x86 host runtime (dsoc build). All real work happens in runOnModule via
+// the CGT_X86 visitor below.
+struct DFG2LLVM_X86 : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_X86() :DFG2LLVM(ID) {}
+
+private:
+  // Member variables
+
+  // Functions
+
+public:
+  // Entry point: generates host code for every DFG root in M.
+  // Always returns true (the module is modified).
+  bool runOnModule(Module &M);
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+//
+// CGT_X86 walks the dataflow graph and emits x86 host code per node, wiring
+// the generated functions to the visc-rt runtime: thread creation, stream
+// buffers, the dimension stack, and timers.
+class CGT_X86 : public CodeGenTraversal {
+
+private:
+  //Member variables
+
+  // Declaration of libc malloc; used to heap-allocate packed argument structs.
+  Constant* malloc;
+  // VISC Runtime API
+  Constant* llvm_visc_x86_launch;
+  Constant* llvm_visc_x86_wait;
+  Constant* llvm_visc_x86_argument_ptr;
+
+  // Streaming-graph runtime entry points (launch/push/pop/wait and the
+  // buffer/thread creation helpers used by the pipeline codegen).
+  Constant* llvm_visc_streamLaunch;
+  Constant* llvm_visc_streamPush;
+  Constant* llvm_visc_streamPop;
+  Constant* llvm_visc_streamWait;
+  Constant* llvm_visc_createBindInBuffer;
+  Constant* llvm_visc_createBindOutBuffer;
+  Constant* llvm_visc_createEdgeBuffer;
+  Constant* llvm_visc_createLastInputBuffer;
+  Constant* llvm_visc_createThread;
+  //Constant* llvm_visc_freeThreads;
+  Constant* llvm_visc_bufferPush;
+  Constant* llvm_visc_bufferPop;
+  // Dimension-stack helpers that answer instance-ID / dim-limit queries for
+  // ancestor nodes at runtime.
+  Constant* llvm_visc_x86_dstack_push;
+  Constant* llvm_visc_x86_dstack_pop;
+  Constant* llvm_visc_x86_getDimLimit;
+  Constant* llvm_visc_x86_getDimInstance;
+
+  //Functions
+  std::vector<IntrinsicInst*>* getUseList(Value* LI);
+  // IR-surgery helpers that wrap regions of generated code in loops.
+  Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
+  void addDoWhileLoop(Instruction*, Instruction*, Value*);
+  void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
+  Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
+  Argument* getArgumentFromEnd(Function* F, unsigned offset);
+  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                      Instruction* InsertBefore);
+  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  StructType* getArgumentListStructTy(DFNode*);
+  Function* createFunctionFilter(DFNode* C);
+  void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>,
+                      Value*, Value*, Instruction*);
+  Function* createLaunchFunction(DFInternalNode*);
+  Function* createPushFunction(DFInternalNode*);
+  Function* createPopFunction(DFInternalNode*);
+  Function* createWaitFunction(DFInternalNode*);
+
+  // Virtual Functions
+  void init() {
+    VISCTimer = VISCTimer_X86;
+    TargetName = "X86";
+  }
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+  Function* codeGenStreamPush(DFInternalNode* N);
+  Function* codeGenStreamPop(DFInternalNode* N);
+
+public:
+  // Constructor
+  CGT_X86(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
+    init();
+    initRuntimeAPI();
+  }
+
+  // Lower the launch intrinsic for a (non-)streaming root graph into calls
+  // into the visc-rt runtime.
+  void codeGenLaunch(DFInternalNode* Root);
+  void codeGenLaunchStreaming(DFInternalNode* Root);
+};
+
+// Drive code generation: visit every DFG root in the module, then lower its
+// launch intrinsic to the matching runtime call (streaming or plain).
+bool DFG2LLVM_X86::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_X86 PASS\n";
+
+  // Results of the BuildDFG analysis: the dataflow graphs plus the maps from
+  // i8* handles to DFNode/DFEdge objects.
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // One code-generation visitor is shared across all DFGs in the module.
+  CGT_X86 *Visitor = new CGT_X86(M, DFG);
+
+  for (DFInternalNode *Root : Roots) {
+    // Emit code for every node reachable from this root.
+    Visitor->visit(Root);
+    // Replace the launch intrinsic with pthread-based runtime calls now.
+    // TODO: doing this in a separate pass would allow switching between fully
+    // static codegen and a customized runtime+scheduler.
+    if (Root->isChildGraphStreaming())
+      Visitor->codeGenLaunchStreaming(Root);
+    else
+      Visitor->codeGenLaunch(Root);
+  }
+
+  delete Visitor;
+  return true;
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls.
+//
+// Loads the visc-rt runtime module (to borrow function signatures), declares
+// every runtime entry point this backend emits calls to, initializes the
+// timer API, and splices scheduling-policy / device-abstraction setup and
+// teardown around the llvm.visc.init and llvm.visc.cleanup call sites.
+void CGT_X86::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
+
+  // FIXME: hardcoded path to 'build_dsoc' - should probably be an environment
+  // variable
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/../build_dsoc/projects/visc-rt/visc-rt.ll";
+
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+
+  if(runtimeModule == NULL)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  DECLARE(llvm_visc_x86_launch);
+  DECLARE(malloc);
+  DECLARE(llvm_visc_x86_wait);
+  DECLARE(llvm_visc_x86_argument_ptr);
+  DECLARE(llvm_visc_streamLaunch);
+  DECLARE(llvm_visc_streamPush);
+  DECLARE(llvm_visc_streamPop);
+  DECLARE(llvm_visc_streamWait);
+  DECLARE(llvm_visc_createBindInBuffer);
+  DECLARE(llvm_visc_createBindOutBuffer);
+  DECLARE(llvm_visc_createEdgeBuffer);
+  DECLARE(llvm_visc_createLastInputBuffer);
+  DECLARE(llvm_visc_createThread);
+  //DECLARE(llvm_visc_freeThreads);
+  DECLARE(llvm_visc_bufferPush);
+  DECLARE(llvm_visc_bufferPop);
+  DECLARE(llvm_visc_x86_dstack_push);
+  DECLARE(llvm_visc_x86_dstack_pop);
+  DECLARE(llvm_visc_x86_getDimLimit);
+  DECLARE(llvm_visc_x86_getDimInstance);
+
+  // Get or insert timerAPI functions as well if you plan to use timers
+  initTimerAPI();
+
+  // Insert init context in main. Guard against a module that never calls
+  // __visc__init before dereferencing the lookup result.
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI != NULL && "Module must contain a call to __visc__init!");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  DEBUG(errs() << "Inserting x86 timer initialization\n");
+  Instruction* I = cast<Instruction>(*VI->user_begin());
+  initializeTimerSet(I);
+  switchToTimer(visc_TimerID_NONE, I);
+  // Insert code for initializing the scheduling policy
+  Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init",
+    runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()));
+  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  DEBUG(errs() << *IPCallInst << "\n");
+
+  // If device abstraction is enabled, we add a runtime call to start the
+  // device status simulation
+  if (DeviceAbstraction) {
+    Function *ID =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_start",
+        runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType()));
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    DEBUG(errs() << *IDCallInst << "\n");
+  }
+
+  // Insert print instruction at visc exit
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC != NULL && "Module must contain a call to __visc__cleanup!");
+  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
+
+  // Insert code for clearing the scheduling policy
+  I = cast<Instruction>(*VC->user_begin());
+  IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear",
+    runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()));
+  IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  // Fix: this trace was an unconditional errs() print; wrap it in DEBUG for
+  // consistency with every other runtime-call trace in this function.
+  DEBUG(errs() << *IPCallInst << "\n");
+
+  DEBUG(errs() << "Inserting x86 timer print\n");
+  printTimerSet(I);
+
+  // If device abstraction is enabled, we add a runtime call to end the
+  // device status simulation
+  if (DeviceAbstraction) {
+    Function *ID =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_end",
+        runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType()));
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    DEBUG(errs() << *IDCallInst << "\n");
+  }
+
+}
+
+/* Collect every intrinsic instruction that uses GraphID (e.g. wait calls).
+ * Any non-intrinsic user of a graph handle is unsupported and aborts.
+ * The caller owns the returned vector.
+ */
+std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
+  std::vector<IntrinsicInst*>* Uses = new std::vector<IntrinsicInst*>();
+  for (User *U : GraphID->users()) {
+    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(U);
+    if (Intr == nullptr)
+      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
+    Uses->push_back(Intr);
+  }
+  return Uses;
+}
+
+/* Walk the argument list of F backwards and return the argument `offset`
+ * positions from the end (offset == 1 yields the last argument).
+ * offset must satisfy 0 < offset <= number of parameters.
+ */
+Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
+  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0)
+         && "Invalid offset to access arguments!");
+  // arg_end() is past-the-end; step back `offset` arguments.
+  Function::arg_iterator it = F->arg_end();
+  while (offset != 0) {
+    --it;
+    --offset;
+  }
+  return &*it;
+}
+
+/* Wrap the region starting at BodyStart (up to BodyEnd) in a while loop.
+ * Algorithm:
+ * (1) Split CondBlockStart's block into entry / condition / while.body /
+ *     while.end, with the body holding [BodyStart, BodyEnd).
+ * (2) End the condition block with a conditional branch on TerminationCond:
+ *     true -> while.end (loop exits), false -> while.body.
+ * (3) End the body with an unconditional back-edge to the condition block.
+ * (The original header here was copy-pasted from addLoop and described a
+ * different algorithm.)
+ */
+void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
+                          Instruction* BodyEnd, Value* TerminationCond) {
+  BasicBlock* Entry = CondBlockStart->getParent();
+  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
+  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
+  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
+
+  // Replace the terminator of the condition block with a conditional branch
+  // that exits to while.end when TerminationCond is true and enters
+  // while.body otherwise (BranchInst::Create takes the true successor first).
+  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
+  ReplaceInstWithInst(CondBlock->getTerminator(), BI);
+
+  // While Body should jump to condition block
+  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
+  ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
+
+}
+
+/* Insert an i64 trip counter for a while loop shaped Entry -> Cond -> Body:
+ * a phi ("cnt") at the top of Cond starting at 0 from Entry, incremented by 1
+ * ("cnt_incr") at the end of Body. Returns the phi node.
+ */
+Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+                                          BasicBlock *Body) {
+  LLVMContext &Ctx = Entry->getParent()->getParent()->getContext();
+  Type *I64Ty = Type::getInt64Ty(Ctx);
+
+  // The counter phi lives before the first non-phi instruction of Cond.
+  PHINode *Counter = PHINode::Create(I64Ty, 2, "cnt", Cond->getFirstNonPHI());
+
+  // cnt_incr = cnt + 1 (nsw), computed just before Body's terminator.
+  Instruction *Incr = BinaryOperator::CreateNSW(
+      Instruction::BinaryOps::Add, Counter,
+      ConstantInt::get(I64Ty, 1, true), "cnt_incr", Body->getTerminator());
+
+  // Entry contributes 0; the back-edge from Body contributes the increment.
+  Counter->addIncoming(ConstantInt::get(I64Ty, 0, true), Entry);
+  Counter->addIncoming(Incr, Body);
+
+  return Counter;
+}
+
+/* Wrap the region [From, To) in a do-while loop.
+ * Algorithm:
+ * (1) Split From's block so that [From, To) becomes for.body and To starts
+ *     for.end.
+ * (2) Replace for.body's terminator with a conditional branch on
+ *     TerminationCond: true -> for.end (loop exits), false -> for.body
+ *     (loop repeats). The body always executes at least once.
+ * (The original header here was copy-pasted from addLoop and described a
+ * different algorithm.)
+ */
+void CGT_X86::addDoWhileLoop(Instruction* From, Instruction* To, Value* TerminationCond) {
+  BasicBlock* Entry = From->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(From, "for.body");
+
+  // To Instruction should also belong to the same basic block as the From basic
+  // block will have a terminator instruction
+  assert(To->getParent() == ForBody
+         && "To Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(To, "for.end");
+
+  // Replace the terminator of for.body with a conditional branch that exits
+  // to for.end when TerminationCond is true and repeats for.body otherwise
+  // (BranchInst::Create takes the true successor first).
+  BranchInst* BI = BranchInst::Create(ForEnd, ForBody, TerminationCond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+}
+
+/* Add a loop around instruction I that iterates `limit` times.
+ * Algorithm:
+ * (1) Split I's basic block so I becomes the first instruction of for.body
+ *     and the instruction after I starts for.end.
+ * (2) Add an i64 index phi ("index.<name>") before I, entering at 0 from the
+ *     predecessor block.
+ * (3) Add increment (index+1) and unsigned compare (index+1 <u limit)
+ *     instructions before for.body's terminator.
+ * (4) Replace for.body's terminator with a conditional branch that loops
+ *     over for.body while the compare holds and goes to for.end otherwise.
+ * (5) Add the back-edge incoming value to the phi.
+ * Returns the index phi node.
+ */
+Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
+  BasicBlock* Entry = I->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");
+
+  BasicBlock::iterator i(I);
+  ++i;
+  Instruction* NextI = &*i;
+  // Next Instruction should also belong to the same basic block as the basic
+  // block will have a terminator instruction
+  assert(NextI->getParent() == ForBody
+         && "Next Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
+
+
+  // Add Phi Node for index variable
+  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
+                                      2, "index."+indexName, I);
+
+  // Add incoming edge to phi
+  IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
+                        Entry);
+  // Increment index variable
+  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
+                             IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
+                             "index."+indexName+".inc", ForBody->getTerminator());
+
+  // Compare index variable with limit (unsigned less-than)
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
+                                  limit, "cond."+indexName, ForBody->getTerminator());
+
+  // Replace the terminator instruction of for.body with new conditional
+  // branch which loops over body if true and branches to for.end otherwise
+  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+  // Add incoming edge to phi node in body
+  IndexPhi->addIncoming(IndexInc, ForBody);
+  return IndexPhi;
+}
+
+// Build the packed struct type used to pass a child node's data to its
+// pipeline thread: C's inputs, then its outputs, then one trailing i8* for
+// the isLastInput buffer. Streaming inputs and all outputs are represented
+// as i8* buffer handles.
+StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
+  Function* CF = C->getFuncPointer();
+  Type* BufPtrTy = Type::getInt8PtrTy(CF->getContext());
+  std::vector<Type*> Fields;
+
+  // Inputs: streaming edges become buffer handles, others keep their type.
+  for (Argument &Arg : CF->args()) {
+    if (C->getInDFEdgeAt(Arg.getArgNo())->isStreamingEdge())
+      Fields.push_back(BufPtrTy);
+    else
+      Fields.push_back(Arg.getType());
+  }
+
+  // Outputs: every output edge of a node must be streaming.
+  StructType* RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned idx = 0, e = RetTy->getNumElements(); idx != e; ++idx) {
+    assert(C->getOutDFEdgeAt(idx)->isStreamingEdge()
+        && "All output edges of child node have to be streaming");
+    Fields.push_back(BufPtrTy);
+  }
+
+  // Trailing isLastInput buffer handle.
+  Fields.push_back(BufPtrTy);
+
+  return StructType::create(CF->getContext(), Fields,
+                            Twine("struct.thread."+CF->getName()).str(), true);
+}
+
+/* Spawn a runtime thread that executes child node C's pipeline function.
+ * Packs C's inputs (streaming edges as i8* buffer handles taken from
+ * EdgeBufferMap, non-streaming bind inputs from Args), all outputs (always
+ * streaming, hence buffer handles), and the isLastInputBuffer handle into a
+ * heap-allocated packed struct, then calls
+ * llvm_visc_createThread(graphID, pipeline, struct). All generated IR is
+ * inserted before IB.
+ */
+void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
+                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
+                              Instruction* IB) {
+  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Create a filter/pipeline function for the child node
+  Function* C_Pipeline = createFunctionFilter(C);
+  Function* CF = C->getFuncPointer();
+
+  // Get module context and i32 0 constant, as they would be frequently used in
+  // this function.
+  LLVMContext& Ctx = IB->getParent()->getContext();
+  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  // Marshall arguments
+  // Create a packed struct type with inputs of C followed by outputs and then
+  // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
+  // by i8*
+  //
+  StructType* STy = getArgumentListStructTy(C);
+  // Allocate the struct on heap *NOT* stack (the thread outlives this frame)
+  // and bitcast i8* to STy*
+  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
+                                  C->getFuncPointer()->getName()+".inputs", IB);
+  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
+  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
+  // Insert elements in the struct
+  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Marshall Inputs: field i of the struct receives input i.
+  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
+    // Create constant int (i)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             Struct->getName()+".arg_"+Twine(i),
+                             IB);
+    DFEdge* E = C->getInDFEdgeAt(i);
+    if (E->getSourceDF()->isEntryNode()) {
+      // This is a Bind Input Edge
+      if(E->isStreamingEdge()) {
+        // Streaming Bind Input edge. Get buffer corresponding to it
+        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
+        new StoreInst(EdgeBufferMap[E], GEP, IB);
+      }
+      else {
+        // Non-streaming Bind edge
+        new StoreInst(Args[i], GEP, IB);
+      }
+    }
+    else {
+      // This is an edge between siblings. 
+      // This must be an streaming edge. As it is our assumption that all edges
+      // between two nodes in a DFG are streaming.
+      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
+      new StoreInst(EdgeBufferMap[E], GEP, IB);
+    }
+  }
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
+  // Marshall Outputs: fields numInputs..numInputs+numOutputs-1.
+  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  for(unsigned i = 0; i < numOutputs; i++ ) {
+    // Create constant int (i+numInputs)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             Struct->getName()+".out_"+Twine(i),
+                             IB);
+    DFEdge* E = C->getOutDFEdgeAt(i);
+    assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes");
+    assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!");
+    new StoreInst(EdgeBufferMap[E], GEP, IB);
+  }
+  // Marshall last argument. isLastInput buffer goes in the final field.
+  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Create constant int (numInputs+numOutputs)
+  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
+  // Get Element pointer instruction
+  Value* GEPIndices[] = { IntZero, Int_index };
+  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                           ArrayRef<Value*>(GEPIndices, 2),
+                           Struct->getName()+".isLastInput", IB);
+  new StoreInst(isLastInputBuffer, GEP, IB);
+
+  // The malloc'd struct now holds all the arguments packed.
+  // Call runtime to create the thread with these arguments
+  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << *llvm_visc_createThread << "\n");
+  DEBUG(errs() << *graphID->getType() << "\n");
+  DEBUG(errs() << *C_Pipeline->getType() << "\n");
+  DEBUG(errs() << *Struct->getType() << "\n");
+  // Bitcast the struct pointer to i8* for the runtime call
+  CastInst* BI  = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
+  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
+  // NOTE(review): the returned CallInst is unused within this function.
+  CallInst* CreateThread = CallInst::Create(llvm_visc_createThread,
+                                            ArrayRef<Value*>(CreateThreadArgs, 3),
+                                            "",
+                                            IB);
+
+}
+
+/* Generates the launch function used when the Root node's child graph is
+ * streaming. The generated function has type void(i8* data, i8* graphID):
+ * it unpacks the Root arguments from `data`, creates runtime buffers for
+ * every streaming edge, and starts one thread per child node running that
+ * node's pipeline (filter) function.
+ */
+Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Streaming Launch Function\n");
+  // Get Function associated with Node N
+  Function* NF = N->getFuncPointer();
+
+  // Map from Streaming edge to buffer 
+  DenseMap<DFEdge*, Value*> EdgeBufferMap;
+
+  /* Now we have all the necessary global declarations necessary to generate the
+  * Launch function, pointer to which can be passed to pthread utils to execute
+  * DFG. The Launch function has just one input: i8* data.addr
+  * This is the address of the all the input data that needs to be passed to
+  * this function. In our case it contains the input arguments of the Root
+  * function in the correct order.
+  * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
+  * (2) Extract each of inputs from data.addr
+  * (3) create Buffers for all the streaming edges
+  *     - Put buffers in the context
+  * (4) Go over each child node
+  *     - marshall its arguments together (use buffers in place of streaming
+  *       arguments)
+  *     - Start the threads
+  * (5) The return value from Root is stored in memory, pointer to which is
+  * passed to pthread_exit call.
+  */
+  // (1) Create Launch Function of type void (i8* args, i8* GraphID)
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
+  FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()),
+                                  ArrayRef<Type*>(ArgTypes, 2), false);
+  Function* LaunchFunc = Function::Create(LaunchFuncTy,
+                                       NF->getLinkage(),
+                                       NF->getName()+".LaunchFunction",
+                                       &M);
+  DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
+  // Give a name to the argument which is used pass data to this thread
+  Argument* data = &*LaunchFunc->arg_begin();
+  Argument* graphID = &*(++LaunchFunc->arg_begin());
+  data->setName("data.addr");
+  graphID->setName("graphID");
+  // Add a basic block to this empty function and a return null statement to it.
+  // All subsequent instructions are inserted before this return (RI).
+  DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
+  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
+  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
+                                      BB);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
+
+  // (2) Extract each of inputs from data.addr. Streaming inputs are replaced
+  // by an i8* buffer handle; non-streaming inputs keep their original type.
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  std::vector<Value*> Args;
+
+  for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
+      ai != ae; ++ai) {
+    if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back(Twine(ai->getName()+"_buffer").str());
+      continue;
+    }
+    TyList.push_back(ai->getType());
+    names.push_back(ai->getName());
+  }
+  Args = extractElements(data, TyList, names, RI);
+  DEBUG(errs() <<  "Launch function for " << NF->getName() << *LaunchFunc << "\n");
+  // (3) Create buffers for all the streaming edges. Edges from the entry node
+  // become BindIn buffers, edges to the exit node become BindOut buffers, and
+  // sibling-to-sibling edges become plain edge buffers.
+  for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
+      de = N->getChildGraph()->dfedge_end(); di != de; ++di) {
+    DFEdge* Edge = *di;
+    DEBUG(errs() << *Edge->getType() << "\n");
+    Value* size = ConstantExpr::getSizeOf(Edge->getType());
+    Value* CallArgs[] = {graphID, size};
+    if (Edge->isStreamingEdge()) {
+      CallInst* CI;
+      // Create a buffer call
+      if(Edge->getSourceDF()->isEntryNode()) {
+        // Bind Input Edge
+        Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()),
+                                  Edge->getSourcePosition());
+        Value* BindInCallArgs[] = {graphID, size, Int_ArgNo};
+        CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3),
+                              "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      else if(Edge->getDestDF()->isExitNode()) {
+        // Bind Output Edge
+        CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2),
+                              "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      else {
+        // Streaming Edge
+        CI = CallInst::Create(llvm_visc_createEdgeBuffer,
+                              ArrayRef<Value*>(CallArgs, 2),
+                              Edge->getSourceDF()->getFuncPointer()->getName()+"."
+                              +Edge->getDestDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      EdgeBufferMap[Edge] = CI;
+    }
+  }
+  // Create buffer for isLastInput for all the child nodes. Each child's filter
+  // loop pops this i64-sized buffer to learn when the stream has ended.
+  DFGraph* G = N->getChildGraph();
+  DenseMap<DFNode*, Value*> NodeLastInputMap;
+  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
+    DFNode* child = *ci;
+    if(child->isDummyNode())
+      continue;
+    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
+    Value* CallArgs[] = {graphID, size};
+    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
+                              "BindIn.isLastInput."+child->getFuncPointer()->getName(),
+                              RI);
+    NodeLastInputMap[child] = CI;
+  }
+  DEBUG(errs() <<  "Start Each child node filter\n");
+  // (4) Marshall arguments for each child node and start the thread with its
+  //     pipeline funtion
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+    
+    // Marshall all the arguments for this node into an i8*
+    // Pass to the runtime to create the thread
+    // Start the thread for child node C
+    startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI);
+  }
+
+  DEBUG(errs() << "Launch function:\n");
+  DEBUG(errs() << *LaunchFunc << "\n");
+
+  return LaunchFunc;
+}
+
+
+// Creates the DFGPush function for a streaming graph.
+// FIXME: Not implemented yet. The previous version returned an
+// *uninitialized* pointer, which is undefined behavior; return nullptr
+// explicitly so callers can detect the missing implementation.
+Function* CGT_X86::createPushFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Push function\n");
+  return nullptr;
+}
+
+// Creates the DFGPop function for a streaming graph.
+// FIXME: Not implemented yet. The previous version returned an
+// *uninitialized* pointer (copy-pasted local was even misnamed `PushFunc`),
+// which is undefined behavior; return nullptr explicitly instead.
+Function* CGT_X86::createPopFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Pop function\n");
+  return nullptr;
+}
+
+// Creates the DFGWait function for a streaming graph.
+// FIXME: Not implemented yet. The previous version returned an
+// *uninitialized* pointer (copy-pasted local was even misnamed `PushFunc`),
+// which is undefined behavior; return nullptr explicitly instead.
+Function* CGT_X86::createWaitFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Wait function\n");
+  return nullptr;
+}
+/* This function does the steps necessary to launch a streaming graph
+ * Steps
+ * Create Pipeline/Filter function for each node in child graph of Root
+ * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait
+ * Modify each of the intrinsics in host code
+ * Launch, Push, Pop, Wait
+ */
+void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
+  IntrinsicInst* LI = Root->getInstruction();
+  Function* RootLaunch = createLaunchFunction(Root);
+  //Function* RootPush = createPushFunction(Root);
+  //Function* RootPop = createPopFunction(Root);
+  //Function* RootWait = createWaitFunction(Root);
+  // Substitute launch intrinsic main
+  DEBUG(errs() <<  "Substitute launch intrinsic\n");
+  // Args: the generated launch function and the packed-argument pointer
+  // (operand 1 of the visc launch intrinsic).
+  Value* LaunchInstArgs[] = {RootLaunch,
+                             LI->getArgOperand(1)
+                            };
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
+  //ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait instructions with x86 specific wait instructions
+  DEBUG(errs() <<  "Substitute wait, push, pop intrinsics\n");
+  // NOTE(review): getUseList appears to return a heap-allocated vector that is
+  // never freed here — a small leak per launch intrinsic. TODO confirm/free.
+  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
+  for(unsigned i=0; i < UseList->size(); ++i) {
+    IntrinsicInst* II = UseList->at(i);
+    CallInst* CI;
+    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
+    switch(II->getIntrinsicID()) {
+    case Intrinsic::visc_wait:
+      // Created detached (no insert point); ReplaceInstWithInst below splices
+      // it into place of the intrinsic.
+      CI = CallInst::Create(llvm_visc_streamWait,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_push:
+      CI = CallInst::Create(llvm_visc_streamPush,
+                            ArrayRef<Value*>(PushArgs, 2),
+                            "");
+      break;
+    case Intrinsic::visc_pop:
+      CI = CallInst::Create(llvm_visc_streamPop,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    default:
+      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+    };
+    DEBUG(errs() << "Replace:\n\t" << *II << "\n");
+    ReplaceInstWithInst(II, CI);
+    DEBUG(errs() << "\twith " << *CI << "\n");
+  }
+
+
+}
+
+// Generates the non-streaming launch path: builds a "LaunchDataflowGraph"
+// wrapper of type i8*(i8*) that unpacks the Root arguments from the packed
+// data pointer, calls the CPU-generated Root function, stores its return
+// value back through the same pointer, and then rewrites the visc launch /
+// wait / push / pop intrinsics into x86 runtime calls.
+void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
+  // TODO: Place an assert to check if the constant passed by launch intrinsic
+  // as the number of arguments to DFG is same as the number of arguments of the
+  // root of DFG
+  DEBUG(errs() << "Generating Launch Function\n");
+  // Get Launch Instruction
+  IntrinsicInst* LI = Root->getInstruction();
+  switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
+  DEBUG(errs() << "Generating Launch Function\n");
+
+  /* Now we have all the necessary global declarations necessary to generate the
+  * Launch function, pointer to which can be passed to pthread utils to execute
+  * DFG. The Launch function has just one input: i8* data.addr
+  * This is the address of the all the input data that needs to be passed to
+  * this function. In our case it contains the input arguments of the Root
+  * function in the correct order.
+  * (1) Create an empty Launch function of type i8*(i8*)
+  * (2) Extract each of inputs from data.addr and pass them as arguments to the
+  * call to Root function
+  * (3) The return value from Root is stored in memory, pointer to which is
+  * passed to pthread_exit call.
+  */
+  // Create Launch Function of type i8*(i8*) which calls the root function
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
+                            ArrayRef<Type*>(i8Ty->getPointerTo()),
+                            false);
+  Function* AppFunc = Function::Create(AppFuncTy,
+                                       Root->getFuncPointer()->getLinkage(),
+                                       "LaunchDataflowGraph",
+                                       &M);
+  DEBUG(errs() << "Generating Launch Function\n");
+  // Give a name to the argument which is used pass data to this thread
+  Value* data = &*AppFunc->arg_begin();
+  data->setName("data.addr");
+  // Add a basic block to this empty function and a return null statement to it.
+  // All subsequent instructions are inserted before this return (RI).
+  BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
+  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
+                                      Constant::getNullValue(AppFunc->getReturnType()),
+                                      BB);
+  switchToTimer(visc_TimerID_ARG_UNPACK, RI);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
+  // Find the X86 function generated for Root and
+//  Function* RootF_X86 = Root->getGenFunc();
+  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
+  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "Error: Generated Function for Root node with no x86 wrapper\n");
+
+  // Generate a call to RootF_X86 with null parameters for now; the real
+  // arguments are patched in after they are extracted below.
+  std::vector<Value*>Args;
+  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
+  }
+  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);
+
+  // Extract input data from i8* data.addr and patch them to correct argument of
+  // call to RootF_X86. For each argument
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
+      ai != ae; ++ai) {
+    TyList.push_back(ai->getType());
+    names.push_back(ai->getName());
+  }
+  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
+  // Patch the elements to the call arguments
+  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
+    CI->setArgOperand(i, elements[i]);
+
+  // Add timers around Call to RootF_X86 function
+  switchToTimer(visc_TimerID_COMPUTATION, CI);
+  switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
+
+  // Code for returning the output: the Root call's result is stored back
+  // through data.addr, reinterpreted as a pointer to the output struct type.
+  CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
+                             CI->getType()->getPointerTo(),
+                             CI->getName()+".addr",
+                             RI);
+  new StoreInst(CI, OutputAddrCast, RI);
+  switchToTimer(visc_TimerID_NONE, RI);
+
+  DEBUG(errs() << "Application specific function:\n");
+  DEBUG(errs() << *AppFunc << "\n");
+
+  // Substitute launch intrinsic main
+  Value* LaunchInstArgs[] = {AppFunc,
+                             LI->getArgOperand(1)
+                            };
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
+  //ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait instructions with x86 specific wait instructions
+  // NOTE(review): getUseList appears to return a heap-allocated vector that is
+  // never freed here — a small leak per launch intrinsic. TODO confirm/free.
+  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
+  for(unsigned i=0; i < UseList->size(); ++i) {
+    IntrinsicInst* II = UseList->at(i);
+    CallInst* CI;
+    switch(II->getIntrinsicID()) {
+    case Intrinsic::visc_wait:
+      // Created detached (no insert point); ReplaceInstWithInst below splices
+      // it into place of the intrinsic.
+      CI = CallInst::Create(llvm_visc_x86_wait,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_push:
+      // NOTE(review): only the graph handle is passed here, while the
+      // streaming path passes (handle, value) to its push call — confirm
+      // llvm_visc_bufferPush's expected arity for this non-streaming path.
+      CI = CallInst::Create(llvm_visc_bufferPush,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_pop:
+      CI = CallInst::Create(llvm_visc_bufferPop,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    default:
+      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+    };
+    ReplaceInstWithInst(II, CI);
+    DEBUG(errs() << *CI << "\n");
+  }
+
+}
+
+// Returns the LLVM value feeding input port `i` of node `Child`. The value
+// either comes straight from the parent function's argument list (edge from
+// the entry node) or is extracted from the struct returned by a previously
+// emitted sibling call (looked up via OutputMap).
+Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
+  // TODO: Assumption is that each input port of a node has just one
+  // incoming edge. May change later on.
+
+  // Locate the edge arriving at the requested input port.
+  DFEdge* InEdge = Child->getInDFEdgeAt(i);
+  assert(InEdge && "No incoming edge or binding for input element!");
+  DFNode* Src = InEdge->getSourceDF();
+
+  // An edge whose source is the entry (dummy) node means the value is an
+  // argument of the parent function itself.
+  if (Src->isEntryNode()) {
+    Value* ArgVal = getArgumentAt(ParentF_X86, InEdge->getSourcePosition());
+    DEBUG(errs() << "Argument "<< i<< " = "  << *ArgVal << "\n");
+    return ArgVal;
+  }
+
+  // Otherwise the edge comes from a sibling; its call must already have been
+  // generated (children are visited in topological order).
+  assert(OutputMap.count(Src)
+         && "Source node call not found. Dependency violation!");
+  Value* SrcCall = OutputMap[Src];
+
+  // Extract the element at the edge's source position from the sibling's
+  // returned aggregate.
+  DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *SrcCall <<"\n");
+  unsigned SrcPos = InEdge->getSourcePosition();
+  return ExtractValueInst::Create(SrcCall, SrcPos, "", InsertBefore);
+}
+
+// Emits a call to the CPU-generated function of child node C inside its
+// parent's generated function F_X86, before instruction IB. The child's
+// generated function takes six extra i64 trailing parameters (index x/y/z
+// then limit x/y/z); for each replicated dimension the call is wrapped in a
+// loop and the corresponding index/limit operands are patched in. The call is
+// bracketed by runtime depth-stack push/pop so instances can query their ID.
+void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
+                              ValueToValueMapTy &VMap,Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+//  Function* CF_X86 = C->getGenFunc();
+  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(CF_X86 != NULL
+         && "Found leaf node for which code generation has not happened yet!\n");
+  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "The generated function to be called from x86 backend is not an x86 function\n");
+  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
+
+  std::vector<Value*> Args;
+  // Create argument list to pass to call instruction
+  // First find the correct values using the edges
+  // The remaing six values are inserted as constants for now.
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(getInValueAt(C, i, F_X86, IB));
+  }
+
+  // Placeholder zeros for the six trailing index/limit parameters; the real
+  // values are patched below for each replicated dimension.
+  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
+  for(unsigned j=0; j<6; j++)
+    Args.push_back(I64Zero);
+
+  // Fix: these diagnostics were raw errs() calls, unconditionally emitted in
+  // release builds; wrap them in DEBUG() like the rest of this file.
+  DEBUG(errs() << "Gen Function type: " << *CF_X86->getType() << "\n");
+  DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n");
+  DEBUG(errs() << "Arguments: " << Args.size() << "\n");
+
+  // Call the F_X86 function associated with this node
+  CallInst* CI = CallInst::Create(CF_X86, Args,
+                                  CF_X86->getName()+"_output",
+                                  IB);
+  DEBUG(errs() << *CI << "\n");
+  // Record the call so sibling nodes can extract their inputs from it.
+  OutputMap[C] = CI;
+
+  // Find num of dimensions this node is replicated in.
+  // Based on number of dimensions, insert loop instructions
+  std::string varNames[3] = {"x", "y", "z"};
+  unsigned numArgs = CI->getNumArgOperands();
+  for(unsigned j=0; j < C->getNumOfDim(); j++) {
+    Value* indexLimit = NULL;
+    // Limit can either be a constant or an arguement of the internal node.
+    // In case of constant we can use that constant value directly in the
+    // new F_X86 function. In case of an argument, we need to get the mapped
+    // value using VMap
+    if(isa<Constant>(C->getDimLimits()[j])) {
+      indexLimit = C->getDimLimits()[j];
+      DEBUG(errs() << "In Constant case:\n"
+             << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    else {
+      indexLimit = VMap[C->getDimLimits()[j]];
+      DEBUG(errs() << "In VMap case:"
+             <<"  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    assert(indexLimit && "Invalid dimension limit!");
+    // Insert loop around the call for this dimension; indexVar is the
+    // loop's induction variable.
+    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
+    DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
+    // Insert index variable and limit arguments: indices occupy slots
+    // [numArgs-6, numArgs-4], limits occupy [numArgs-3, numArgs-1].
+    CI->setArgOperand(numArgs-6+j, indexVar);
+    CI->setArgOperand(numArgs-3+j, indexLimit);
+  }
+  // Insert call to runtime to push the dim limits and instanceID on the depth
+  // stack
+  Value* args[] = {
+    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
+    CI->getArgOperand(numArgs-3+0), // limitX
+    CI->getArgOperand(numArgs-6+0), // iX
+    CI->getArgOperand(numArgs-3+1), // limitY
+    CI->getArgOperand(numArgs-6+1), // iY
+    CI->getArgOperand(numArgs-3+2), // limitZ
+    CI->getArgOperand(numArgs-6+2)  // iZ
+  };
+
+  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
+  DEBUG(errs() << "Push on stack: " << *Push << "\n");
+  // Insert call to runtime to pop the dim limits and instanceID from the depth
+  // stack
+  BasicBlock::iterator i(CI);
+  ++i;
+  Instruction* NextI = &*i;
+  // Next Instruction should also belong to the same basic block as the basic
+  // block will have a terminator instruction
+  assert(NextI->getParent() == CI->getParent()
+         && "Next Instruction should also belong to the same basic block!");
+
+  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
+  DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
+  DEBUG(errs() << *CI->getParent()->getParent());
+}
+
+/* This function takes a DFNode, and creates a filter function for it. By filter
+ * function we mean a function which keeps on getting input from input buffers,
+ * applying the function on the inputs and then pushes data on output buffers
+ */
+// Create a function with void* (void*) type.
+// Create a new basic block
+// Add a return instruction to the basic block
+// extract arguments from the aggregate data input. Type list would be
+// Replace the streaming inputs with i8* types signifying handle to
+// corresponding buffers
+// Add a boolean argument isLastInput
+// Add runtime API calls to get input for each of the streaming inputs
+// Add a call to the generated function of the child node
+// Add runtime API calls to push output for each of the streaming outputs
+// Add loop around the basic block, which exits the loop if isLastInput is false
+
+Function* CGT_X86::createFunctionFilter(DFNode* C) {
+  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");
+
+  /* Create a function with same argument list as child.*/
+  DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
+  // Get the generated function for child node
+  Function* CF = C->getFuncPointer();
+  // Create Filter Function of type i8*(i8*) which calls the root function
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
+                                ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                false);
+  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
+                          CF->getLinkage(),
+                          CF->getName()+"_Pipeline",
+                          &M);
+  DEBUG(errs() << "Generating Pipline Function\n");
+  // Give a name to the argument which is used pass data to this thread
+  Value* data = &*CF_Pipeline->arg_begin();
+  data->setName("data.addr");
+  // Create a new basic block
+  DEBUG(errs() << "\tCreate new BB and add a return function\n");
+  // Add a basic block to this empty function
+  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
+  // Add a return instruction to the basic block. All body instructions are
+  // inserted before this return (RI).
+  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
+                                      UndefValue::get(CF_Pipeline->getReturnType()), BB);
+
+
+  /* Extract the elements from the aggregate argument to the function.
+   * Replace the streaming inputs with i8* types signifying handle to
+   * corresponding buffers
+   * Add outputs to the list as well
+   * Add isLastInput to the list
+   */
+  DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
+  // These Args will be used when passing arguments to the generated function
+  // inside loop, and reading outputs as well.
+  std::vector<Value*> Args;
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  // Adding inputs
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back((Twine(i->getName())+"_buffer").str());
+    }
+    else {
+      TyList.push_back(i->getType());
+      names.push_back(i->getName());
+    }
+  }
+  // Adding outputs. FIXME: Since we assume all outputs to be streaming edges,
+  // because we get there buffer handles
+  // The cast below assumes the child node returns a struct type.
+  StructType* RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
+    TyList.push_back(i8Ty->getPointerTo());
+    names.push_back("out");
+  }
+  /* Add a boolean argument isLastInput */
+  DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n");
+  TyList.push_back(i8Ty->getPointerTo());
+  names.push_back("isLastInput_buffer");
+
+  // Extract the inputs, outputs and
+  Args = extractElements(data, TyList, names, RI);
+  for(unsigned i=0; i<Args.size(); i++) {
+    DEBUG(errs() << *Args[i] << "\n");
+  }
+
+  // Split the Args vector into, input output and isLastInput
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = RetTy->getNumElements();
+  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
+  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
+
+  /* Add runtime API calls to get input for each of the streaming input edges */
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
+  // First read the termination condition variable islastInput
+  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
+                                        ArrayRef<Value*>(isLastInput),
+                                        "",
+                                        RI);
+
+  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
+                 Type::getInt64Ty(CF_Pipeline->getContext()),
+                 false,
+                 "isLastInput",
+                 RI);
+  isLastInput = BI;
+  // Create a loop termination condition: the loop keeps running while the
+  // popped isLastInput value is non-zero (see addWhileLoop below).
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
+      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
+      RI);
+
+  // Get input from buffers of all the incoming streaming edges. Popped values
+  // are i64; cast each back to the child's expected parameter type.
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop,
+                                            ArrayRef<Value*>(InputArgs[i->getArgNo()]),
+                                            "",
+                                            RI);
+      CastInst* BI;
+      if(i->getType()->isPointerTy()) {
+        BI = CastInst::Create(CastInst::IntToPtr,
+                              bufferIn,
+                              i->getType(),
+                              i->getName()+".addr",
+                              RI);
+      }
+      else if(i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn,
+                                    i->getType(),
+                                    i->getName()+".addr",
+                                    RI);
+      }
+      else {
+        BI = CastInst::CreateIntegerCast(bufferIn,
+                                         i->getType(),
+                                         false,
+                                         i->getName()+".addr",
+                                         RI);
+      }
+      // Replace the argument in Args vector. We would be using the vector as
+      // parameters passed to the call
+      InputArgs[i->getArgNo()] = BI;
+    }
+  }
+  /* Add a call to the generated function of the child node */
+  DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
+//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+//                                  C->getGenFunc()->getName()+".output", RI);
+  Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
+  DEBUG(errs() << "Type: "
+               << *CGenF->getType()
+               << "\n");
+  CallInst* CI = CallInst::Create(CGenF,
+                                  InputArgs,
+                                  CGenF->getName()+".output",
+                                  RI);
+
+  /* Add runtime API calls to push output for each of the streaming outputs */
+  // FIXME: Assumption
+  // All edges between siblings are streaming edges
+  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n");
+  for (unsigned i=0; i< numOutputs; i++) {
+    // Extract output
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i),
+                           "",RI);
+    // Convert to i64 (the buffer element type) before pushing.
+    CastInst* BI;
+    if(EI->getType()->isPointerTy())
+      BI = CastInst::Create(CastInst::PtrToInt,EI,
+                            Type::getInt64Ty(CF_Pipeline->getContext()),
+                            "",
+                            RI);
+    else
+      BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()),
+                                       false, "", RI);
+    // Push to Output buffer
+    Value* bufferOutArgs[] = {OutputArgs[i], BI};
+    CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush,
+                                           ArrayRef<Value*>(bufferOutArgs, 2),
+                                           "",
+                                           RI);
+  }
+
+  // Add loop around the basic block, which exits the loop if isLastInput is false
+  //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond);
+//  addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(),
+//                RI, Cond);
+
+  // Add loop around the basic block, which exits the loop if isLastInput is false
+  // Pointers to keep the created loop structure: the condition region starts
+  // at the isLastInput pop, and the body starts right after the compare.
+  BasicBlock *EntryBB, *CondBB, *BodyBB;
+  Instruction *CondStartI = cast<Instruction>(isLastInputPop);
+  Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
+  EntryBB = CondStartI->getParent();
+
+  addWhileLoop(CondStartI, BodyStartI, RI, Cond);
+  CondBB = CondStartI->getParent();
+  BodyBB = CI->getParent();
+  Instruction *CntI = NULL;
+  CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF);
+
+  // If the node function calls the visc runtime call to get policy, we update
+  // it with the counter information. This means we need to pass an additional
+  // argument to the generated function, that is the iteration number, and then
+  // use it as an argument to the policy_getVersion call 
+  if (GetPolicyCI) {
+    CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB);
+    assert(CntI && "Counter instruction not found\n");
+
+    // Create new function type (with additional argument for iteration number)
+    Type *NewRetTy = CGenF->getFunctionType()->getReturnType();
+    std::vector<Type*> NewArgTypes;
+    for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end();
+         ai != ae ; ++ai) {
+      NewArgTypes.push_back(ai->getType());
+    }
+    NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
+    FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false);
+    Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false);
+    // At least one (the last) argument exists (we added it)
+    Function::arg_iterator ae = NewCGenF->arg_end();
+    --ae;
+    Argument *CntArg = &*ae;
+    CntArg->setName("iteration");
+    // Replace the old cpu gen func with this one
+    C->addGenFunc(NewCGenF, visc::CPU_TARGET, true);
+
+    // Add counter to the actual parameter list, to create the new call
+    InputArgs.push_back(CntI);
+    CallInst* newCI = CallInst::Create(NewCGenF,
+                                       InputArgs,
+                                       NewCGenF->getName()+".output");
+    ReplaceInstWithInst(CI, newCI);
+
+    // Set second operand of the policy_getVersion call to the last function
+    // argument
+    GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF);
+    GetPolicyCI->setArgOperand(1, CntArg);
+  }
+
+  // Return the Function pointer
+  DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
+  DEBUG(errs() << *CF_Pipeline << "\n");
+  return CF_Pipeline;
+}
+
+void CGT_X86::codeGen(DFInternalNode* N) {
+  // Check if N is root node and its graph is streaming. We do not do codeGen
+  // for Root in such a case
+  if(N->isRoot() && N->isChildGraphStreaming())
+    return;
+
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+//  if(N->getGenFunc() != NULL)
+//    return;
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
+
+  // Sort children in topological order before code generation
+  N->getChildGraph()->sortChildren();
+
+  // Only process if all children have a CPU x86 function
+  // Otherwise skip to end
+  bool codeGen = true;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+
+    if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
+      errs() << "No CPU x86 version for child node "
+             << C->getFuncPointer()->getName()
+             << "\n  Skip code gen for parent node "
+             << N->getFuncPointer()->getName() << "\n";
+      codeGen = false;
+    }
+  }
+
+  if (codeGen) {
+    Function* F = N->getFuncPointer();
+    // Create of clone of F with no instructions. Only the type is the same as F
+    // without the extra arguments.
+    Function* F_X86;
+  
+    // Clone the function, if we are seeing this function for the first time. We
+    // only need a clone in terms of type.
+    ValueToValueMapTy VMap;
+  
+    // Create new function with the same type
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+
+    // Loop over the arguments, copying the names of arguments over.
+    Function::arg_iterator dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName()); // Copy the name over...
+      // Increment dest iterator
+      ++dest_iterator;
+    }
+
+    // Add a basic block to this empty function
+    BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
+    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
+                                        UndefValue::get(F_X86->getReturnType()), BB);
+
+    // Add Index and Dim arguments except for the root node and the child graph of
+    // parent node is not streaming
+    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+      F_X86 = addIdxDimArgs(F_X86);
+
+    BB = &*F_X86->begin();
+    RI = cast<ReturnInst>(BB->getTerminator());
+  
+    //Add generated function info to DFNode
+//    N->setGenFunc(F_X86, visc::CPU_TARGET);
+    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+    // Loop over the arguments, to create the VMap.
+    dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      // Add mapping and increment dest iterator
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+
+    // Iterate over children in topological order
+    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+      DFNode* C = *ci;
+      // Skip dummy node call
+      if (C->isDummyNode())
+        continue;
+  
+      // Create calls to CPU function of child node
+      invokeChild_X86(C, F_X86, VMap, RI);
+  
+    }
+ 
+    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+    // Generate code for output bindings
+    // Get Exit node
+    DFNode* C = N->getChildGraph()->getExit();
+    // Get OutputType of this node
+    StructType* OutTy = N->getOutputType();
+    Value *retVal = UndefValue::get(F_X86->getReturnType());
+    // Find all the input edges to exit node
+    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+      DEBUG(errs() << "Output Edge " << i << "\n");
+      // Find the incoming edge at the requested input port
+      DFEdge* E = C->getInDFEdgeAt(i);
+  
+      assert(E && "No Binding for output element!");
+      // Find the Source DFNode associated with the incoming edge
+      DFNode* SrcDF = E->getSourceDF();
+  
+      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+  
+      // If Source DFNode is a dummyNode, edge is from parent. Get the
+      // argument from argument list of this internal node
+      Value* inputVal;
+      if(SrcDF->isEntryNode()) {
+        inputVal = getArgumentAt(F_X86, i);
+        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+      }
+      else {
+        // edge is from a internal node
+        // Check - code should already be generated for this source dfnode
+        assert(OutputMap.count(SrcDF)
+               && "Source node call not found. Dependency violation!");
+  
+        // Find Output Value associated with the Source DFNode using OutputMap
+        Value* CI = OutputMap[SrcDF];
+  
+        // Extract element at source position from this call instruction
+        std::vector<unsigned> IndexList;
+        IndexList.push_back(E->getSourcePosition());
+        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                               "",RI);
+        inputVal = EI;
+      }
+      std::vector<unsigned> IdxList;
+      IdxList.push_back(i);
+      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+    }
+    DEBUG(errs() << "Extracted all\n");
+    retVal->setName("output");
+    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReplaceInstWithInst(RI, newRI);
+
+  }
+
+  //-------------------------------------------------------------------------//
+  // Here, we need to check if this node (N) has more than one versions
+  // If so, we query the policy and have a call to each version
+  // If not, we see which version exists, check that it is in fact an x86
+  // function and save it as the CPU_TARGET function
+
+  // TODO: visc_id per node, so we can use this for id for policies
+  // For now, use node function name and change it later
+  Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+  Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+  Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+  bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+  bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+  bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+  errs() << "Node: " << N->getFuncPointer()->getName()
+                     << " with tag " << N->getTag() << "\n";
+  errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
+  errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
+  errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
+
+
+  if (N->getTag() == visc::None) {
+    // No code is available for this node. This (usually) means that this
+    // node is a node that
+    // - from the accelerator backends has been mapped to an intermediate
+    // node, and thus they have not produced a genFunc
+    // - a child node had no CPU hint, thus no code gen for CPU could 
+    // take place
+    errs() << "No GenFunc - Skipping CPU code generation for node "
+           << N->getFuncPointer()->getName() << "\n";
+  } else if (viscUtils::isSingleTargetTag(N->getTag())) {
+    // There is a single version for this node according to code gen hints.
+    // Therefore, we do not need to check the policy, we simply use the
+    // available implementation, whichever target it is for.
+
+    // Sanity check - to be removed TODO
+    switch (N->getTag()) {
+      case visc::CPU_TARGET:
+        assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+        break;
+      case visc::GPU_TARGET:
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+        break;
+      case visc::SPIR_TARGET:
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && "");
+        break;
+      default:
+        assert(false && "Unreachable: we checked that tag was single target!\n");
+        break;
+    }
+
+    // If device abstraction is enabled, then we may need to edit the node 
+    // function. In case this is a GPU or SPIR gen func, we issue a call to
+    // the runtime that waits for the device to be available
+    if (DeviceAbstraction) {
+      Function *NodeGenFunc = NULL;
+      switch (N->getTag()) {
+        case visc::GPU_TARGET:
+          NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
+          break;
+        case visc::SPIR_TARGET:
+          NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET);
+          break;
+        default:
+          break;
+      }
+
+      if (NodeGenFunc) {
+        // If we found a function to edit, we add the call to the runtime as
+        // its first statement
+        BasicBlock *BB = &*NodeGenFunc->begin();
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI());
+      }
+
+    }
+
+    Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::None);
+    N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+    N->setTag(visc::CPU_TARGET);
+
+    // Sanity checks - to be removed TODO
+    CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+    GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+    CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+    GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+    errs() << "After editing\n";
+    errs() << "Node: " << N->getFuncPointer()->getName()
+                       << " with tag " << N->getTag() << "\n";
+    errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
+    errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
+    errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
+
+    //  assert(false && "got to the point where we have to select\n");
+  } else {
+    // We have more than one targets
+    
+    errs() << "Node Name (for policy) : "
+           << N->getFuncPointer()->getName() << "\n";
+
+    Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+    Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+    bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+    bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+    // These assertions express what we can support with the current runtime.
+    // Code generation works the same way even for other target combinations.
+    // For now, we want either CPU and GPU, or CPU and SPIR
+    assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n");
+    assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) &&
+           "Generated functions without appropriate x86 wrapper\n");
+
+    FunctionType *FT = CF->getFunctionType();
+    if (GF)
+      assert(FT == GF->getFunctionType() &&
+             "Type mismatch between generated functions for GPU and CPU targets.\n");
+    if (SF)
+      assert(FT == SF->getFunctionType() &&
+             "Type mismatch between generated functions for SPIR and CPU targets.\n");
+
+    // Code generation of wrapper function
+    Function *F_wrapper;
+    ValueToValueMapTy VMap;
+    F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M);
+
+    // Copy argument names over
+    Function::arg_iterator dest_iterator = F_wrapper->arg_begin();
+    for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName());
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+    // Gather all arguments of wrapper in a vector, to prepare the call to
+    // the individual gen functions
+    std::vector<Value *> GenFuncCallArgs;
+    for (Function::arg_iterator i = F_wrapper->arg_begin(), e = F_wrapper->arg_end();
+         i != e; ++i) {
+      GenFuncCallArgs.push_back(&*i);
+    }
+
+    BasicBlock *BBcurrent, *BBtrue, *BBfalse;
+
+    BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper);
+
+    StringRef FName = N->getFuncPointer()->getName();
+    size_t nameSize = FName.size()+1;
+    std::vector<Constant *> NameV;
+    for (char c: FName) {
+      NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c));
+    }
+    NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0'));
+    ArrayType *NameType =
+      ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize);
+    AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent);
+    Constant *NameConst = ConstantArray::get(NameType, NameV);
+    StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent);
+    CastInst *BI = BitCastInst::CreatePointerCast(AI,
+                     Type::getInt8PtrTy(M.getContext()), "", BBcurrent);
+    std::vector<Value *> Args;
+    Args.push_back(BI);
+    Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true));
+    Function *RTF =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion",
+      runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType()));
+    CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent);
+
+    ConstantInt *CmpConst =
+      ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true);
+    CmpInst *CmpI = CmpInst::Create(Instruction::ICmp,
+                                    CmpInst::ICMP_EQ,
+                                    RTFInst, CmpConst,
+                                    "", BBcurrent);
+
+    BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper);
+    BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper);
+    BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+    CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue);
+    ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (GF) {
+      // We have a GPU version. Generate policy check and call
+      CmpConst =
+         ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue =  BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+      
+      GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+      if (DeviceAbstraction) {
+        // Prepare arguments and function for call to wait for device runtime call
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+      }
+    }
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (SF) {
+      // We have a GPU version. Generate policy check and call
+      CmpConst =
+         ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue =  BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+      
+      GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+      if (DeviceAbstraction) {
+        // Prepare arguments and function for call to wait for device runtime call
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+      }
+    }
+
+    RI = ReturnInst::Create(M.getContext(),
+                            UndefValue::get(FT->getReturnType()), BBfalse);
+
+    // Now, make the node cpu gen func to be this one
+    // Remove all other versions and update the tag
+    N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::CPU_TARGET);
+
+    // assert(false && "got to the point where we have to combine\n");
+  }
+
+}
+
+// Code generation for leaf nodes: clone the node function, lower VISC query
+// intrinsics (getNode/getParentNode/getNumDims/getNodeInstanceID_[xyz]/
+// getNumNodeInstances_[xyz]) to the extra index/dim arguments or to runtime
+// calls, and register the clone as the node's CPU (x86) gen function.
+void CGT_X86::codeGen(DFLeafNode* N) {
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // At this point, the X86 backend does not support code generation for
+  // the case where allocation node is used, so we skip. This means that a
+  // CPU version will not be created, and therefore code generation will
+  // only succeed if another backend (nvptx or spir) has been invoked to
+  // generate a node function for the node including the allocation node.
+  if (N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
+
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+//  if(N->getGenFunc() != NULL)
+//    return;
+
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
+
+    errs() << "Check for cudnn or promise hint for node "
+           << N->getFuncPointer()->getName() <<  "\n";
+
+    switch (N->getTag()) {
+       case visc::CUDNN_TARGET: {
+          errs() << "CUDNN hint found. Store CUDNN function as CPU funtion.\n";
+         // Make sure there is a generated x86 function for cudnn
+         assert(N->getGenFuncForTarget(visc::CUDNN_TARGET) && "");
+         assert(N->hasX86GenFuncForTarget(visc::CUDNN_TARGET) && "");
+         // Store the CUDNN x86 function as the CPU generated function
+         Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+         // after adding the required number of arguments
+         if (!N->getParent()->isChildGraphStreaming())
+           Ftmp = addIdxDimArgs(Ftmp);
+
+         N->removeGenFuncForTarget(visc::CUDNN_TARGET);
+         N->setTag(visc::None);
+         N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+         N->setTag(visc::CPU_TARGET);
+         break;
+         }
+       case visc::PROMISE_TARGET: {
+          errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n";
+         // Make sure there is a generated x86 function for promise
+         assert(N->getGenFuncForTarget(visc::PROMISE_TARGET) && "");
+         assert(N->hasX86GenFuncForTarget(visc::PROMISE_TARGET) && "");
+         // Store the PROMISE x86 function as the CPU generated function
+         Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+         // after adding the required number of arguments
+         if (!N->getParent()->isChildGraphStreaming())
+           Ftmp = addIdxDimArgs(Ftmp);
+
+         N->setTag(visc::None);
+         N->removeGenFuncForTarget(visc::PROMISE_TARGET);
+         N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+         N->setTag(visc::CPU_TARGET);
+         break;
+         }
+       case visc::GPU_TARGET:
+         // A leaf node should not have an x86 function for GPU
+         // by design of DFG2LLVM_NVPTX backend
+         assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+         break;
+       case visc::SPIR_TARGET:
+         // A leaf node should not have an x86 function for SPIR
+         // by design of DFG2LLVM_SPIR backend
+         assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+         break;
+       default:
+         break;
+    }
+
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
+
+  std::vector<IntrinsicInst *> IItoRemove;
+  std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
+  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+
+  // Clone the function, if we are seeing this function for the first time.
+  Function *F_X86;
+  ValueToValueMapTy VMap;
+  F_X86 = CloneFunction(F, VMap);
+  F_X86->removeFromParent();
+  // Insert the cloned function into the module
+  M.getFunctionList().push_back(F_X86);
+
+  // Add the new argument to the argument list. Add arguments only if the child
+  // graph of parent node is not streaming
+  if(!N->getParent()->isChildGraphStreaming())
+    F_X86 = addIdxDimArgs(F_X86);
+
+  // Add generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+  /*** FIXME: HACK FOR DSSOC DEMO -- BEGIN ***/
+  /* This part of the code is meant to handle turning the CPU backend into an
+   "accelerator" backend for ApproxHPVM. For this reason, the HPVM runtime
+   needs to be essentially deactivated.                                      */
+
+  /* We look into the leaf node's function for function call starting from
+   "tensor". These are functions with which we replaced the ApproxHPVM
+   intrinsics, and for which we have LLVM implementations. If found, it means
+   we are dealing with an AproxHPVM program.                                 */
+  bool isApproxHPVMnode = false;
+  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
+    Instruction *I = &(*i);
+    DEBUG(errs() << *I << "\n");
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      // NOTE(review): getCalledFunction() returns null for indirect calls —
+      // this assumes all calls in the leaf function are direct; confirm.
+      if ((CI->getCalledFunction()->getName()).startswith("tensor")) {
+        isApproxHPVMnode = true;
+        break;
+      }
+    }
+  }
+
+  /*As in CUDNN backend, we remove the in out attributes of tensor operations,
+   aiming to deactivate the HPVM runtime calls. This has been tested through
+   CUDNN backend for the internal node codegen, and should ensure that code
+   does not insert llvm_visc_x86_argument_ptr in the generated function for
+   leaf node codegen as well.                                                */
+
+  /* Removing HPVM in/out/inout function attributes */
+  if (isApproxHPVMnode) {
+    for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); ai != ae; ai++) {
+      Argument *Arg = &*ai;
+      if(Arg->hasAttribute(Attribute::In))
+        Arg->removeAttr(Attribute::In);
+      if(Arg->hasAttribute(Attribute::Out))
+        Arg->removeAttr(Attribute::Out);
+      if(Arg->hasAttribute(Attribute::InOut))
+        Arg->removeAttr(Attribute::InOut);    
+    }
+  }else{
+    printf("****** NO REMOVEAL *** \n\n");
+  }
+
+  /*** FIXME: HACK FOR DSSOC DEMO -- END ***/
+
+  // Go through the arguments, and any pointer arguments with in attribute need
+  // to have x86_argument_ptr call to get the x86 ptr of the argument
+  // Insert these calls in a new BB which would dominate all other BBs
+  // Create new BB
+  BasicBlock* EntryBB = &*F_X86->begin();
+  BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
+  BranchInst* Terminator = BranchInst::Create(EntryBB, BB);
+  // Insert calls
+  for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
+        ai != ae; ++ai) {
+    // +1: parameter attribute indices are 1-based (index 0 is the return value)
+    if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) {
+      assert(ai->getType()->isPointerTy()
+          && "Only pointer arguments can have visc in/out attributes ");
+      Function::arg_iterator aiNext = ai;
+      ++aiNext;
+      Argument* size = &*aiNext;
+      assert(size->getType() == Type::getInt64Ty(M.getContext())
+          && "Next argument after a pointer should be an i64 type");
+      CastInst* BI = BitCastInst::CreatePointerCast(&*ai,
+                                                    Type::getInt8PtrTy(M.getContext()),
+                                                    ai->getName()+".i8ptr",
+                                                    Terminator);
+      Value* ArgPtrCallArgs[] = {BI, size};
+      CallInst::Create(llvm_visc_x86_argument_ptr,
+                                              ArrayRef<Value*>(ArgPtrCallArgs, 2),
+                                              "",
+                                              Terminator);
+
+    }
+  }
+  errs() << *BB << "\n";
+
+  // Go through all the instructions
+  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
+    Instruction *I = &(*i);
+    DEBUG(errs() << *I << "\n");
+    // Leaf nodes should not contain VISC graph intrinsics or launch
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+
+    if (BuildDFG::isViscQueryIntrinsic(I)) {
+      IntrinsicInst* II = cast<IntrinsicInst>(I);
+      IntrinsicInst* ArgII;
+      DFNode* ArgDFNode;
+
+      /***********************************************************************
+      *                        Handle VISC Query intrinsics                  *
+      ***********************************************************************/
+      switch (II->getIntrinsicID()) {
+      /**************************** llvm.visc.getNode() *******************/
+      case Intrinsic::visc_getNode: {
+        // add mapping <intrinsic, this node> to the node-specific map
+        Leaf_HandleToDFNodeMap[II] = N;
+        IItoRemove.push_back(II);
+        break;
+      }
+      /************************* llvm.visc.getParentNode() ****************/
+      case Intrinsic::visc_getParentNode: {
+        // get the parent node of the arg node
+        // get argument node
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        // get the parent node of the arg node
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // Add mapping <intrinsic, parent node> to the node-specific map
+        // the argument node must have been added to the map, or else the
+        // code could not refer to it
+        Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
+        IItoRemove.push_back(II);
+        break;
+      }
+      /*************************** llvm.visc.getNumDims() *****************/
+      case Intrinsic::visc_getNumDims: {
+        // get node from map
+        // get the appropriate field
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim();
+        IntegerType* IntTy = Type::getInt32Ty(M.getContext());
+        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+
+        // The number of dimensions is a compile-time constant, so fold it
+        II->replaceAllUsesWith(numOfDimConstant);
+        IItoRemove.push_back(II);
+        break;
+      }
+      /*********************** llvm.visc.getNodeInstanceID() **************/
+      case Intrinsic::visc_getNodeInstanceID_x:
+      case Intrinsic::visc_getNodeInstanceID_y:
+      case Intrinsic::visc_getNodeInstanceID_z: {
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+
+        // The dfnode argument should be an ancestor of this leaf node or
+        // the leaf node itself
+        int parentLevel = N->getAncestorHops(ArgDFNode);
+        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
+               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+
+        // Get specified dimension
+        // (dim = 0) => x
+        // (dim = 1) => y
+        // (dim = 2) => z
+        int dim = (int) (II->getIntrinsicID() -
+                         Intrinsic::visc_getNodeInstanceID_x);
+        assert((dim >= 0) && (dim < 3)
+               && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!");
+
+        // For immediate ancestor, use the extra argument introduced in
+        // F_X86
+        int numParamsF = F->getFunctionType()->getNumParams();
+        int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
+        assert((numParamsF_X86 - numParamsF == 6)
+               && "Difference of arguments between function and its clone is not 6!");
+
+        if(parentLevel == 0) {
+          // Case when the query is for this node itself
+          // Of the 6 appended args, skip the 3 trailing dim-limit args, then
+          // pick the index arg for this dimension (counting from the end) —
+          // assumes layout idx_x,idx_y,idx_z,dim_x,dim_y,dim_z; see
+          // addIdxDimArgs.
+          unsigned offset = 3 + (3-dim);
+          // Traverse argument list of F_X86 in reverse order to find the
+          // correct index or dim argument.
+          Argument* indexVal = getArgumentFromEnd(F_X86, offset);
+          assert(indexVal && "Index argument not found. Invalid offset!");
+
+          DEBUG(errs() << *II << " replaced with " << *indexVal << "\n");
+
+          II->replaceAllUsesWith(indexVal);
+          IItoRemove.push_back(II);
+        }
+        else {
+          // Case when query is for an ancestor
+          Value* args[] = {
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
+          };
+          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance,
+                                          ArrayRef<Value*>(args, 2),
+                                          "nodeInstanceID", II);
+          DEBUG(errs() << *II << " replaced with " << *CI << "\n");
+          II->replaceAllUsesWith(CI);
+          IItoRemove.push_back(II);
+        }
+        break;
+      }
+      /********************** llvm.visc.getNumNodeInstances() *************/
+      case Intrinsic::visc_getNumNodeInstances_x:
+      case Intrinsic::visc_getNumNodeInstances_y:
+      case Intrinsic::visc_getNumNodeInstances_z: {
+
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+
+        // The dfnode argument should be an ancestor of this leaf node or
+        // the leaf node itself
+        int parentLevel = N->getAncestorHops(ArgDFNode);
+        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
+               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+
+        // Get specified dimension
+        // (dim = 0) => x
+        // (dim = 1) => y
+        // (dim = 2) => z
+        int dim = (int) (II->getIntrinsicID() -
+                         Intrinsic::visc_getNumNodeInstances_x);
+        assert((dim >= 0) && (dim < 3)
+               && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!");
+
+        // For immediate ancestor, use the extra argument introduced in
+        // F_X86
+        int numParamsF = F->getFunctionType()->getNumParams();
+        int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
+        assert((numParamsF_X86 - numParamsF == 6)
+               && "Difference of arguments between function and its clone is not 6!");
+
+        if(parentLevel == 0) {
+          // Case when the query is for this node itself
+          // Pick the dim-limit arg for this dimension from the end of the
+          // argument list (the 3 trailing appended args are the dim limits).
+          unsigned offset = 3 - dim;
+          // Traverse argument list of F_X86 in reverse order to find the
+          // correct index or dim argument.
+          Argument* limitVal = getArgumentFromEnd(F_X86, offset);
+          assert(limitVal && "Limit argument not found. Invalid offset!");
+
+          DEBUG(errs() << *II << " replaced with " <<  *limitVal << "\n");
+
+          II->replaceAllUsesWith(limitVal);
+          IItoRemove.push_back(II);
+        }
+        else {
+          // Case when query is from the ancestor
+          Value* args[] = {
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
+          };
+          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit,
+                                          ArrayRef<Value*>(args, 2),
+                                          "numNodeInstances", II);
+          DEBUG(errs() << *II << " replaced with " << *CI << "\n");
+          II->replaceAllUsesWith(CI);
+          IItoRemove.push_back(II);
+        }
+
+        break;
+      }
+      default:
+        DEBUG(errs() << "Found unknown intrinsic with ID = " <<
+              II->getIntrinsicID() << "\n");
+        assert(false && "Unknown VISC Intrinsic!");
+        break;
+      }
+
+    } else {
+      //TODO: how to handle address space qualifiers in load/store
+    }
+
+  }
+
+  //TODO:
+  // When to replace the uses?
+  // In which order is it safe to replace the instructions in
+  // IItoReplace?
+  // Probably in the reverse order in the vectors
+  // It is a good idea to have them in one vector and chech the type
+  // using dyn_cast in order to determine if we replace with inst or value
+
+
+  //TODO: maybe leave these instructions to be removed by a later DCE pass
+  for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
+       i != IItoRemove.end(); ++i) {
+    (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType()));
+    (*i)->eraseFromParent();
+  }
+
+  DEBUG(errs() << *F_X86);
+}
+
+} // End of namespace
+
+char DFG2LLVM_X86::ID = 0;
+// Register the pass under the command-line name -dfg2llvm-x86-dsoc.
+// NOTE(review): RegisterPass' third argument is CFGOnly and the fourth is
+// is_analysis; the original inline comments mislabeled them. is_analysis is
+// set to true although this pass transforms the module — confirm intent.
+static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86-dsoc",
+                                    "Dataflow Graph to LLVM for X86 backend (DSOCC version)",
+                                    false /* CFGOnly */,
+                                    true /* is_analysis */);
+
diff --git a/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt b/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt
new file mode 100644
index 0000000000..a6c4de9537
--- /dev/null
+++ b/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt ---------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_X86_dsoc
+parent = Transforms
+
diff --git a/lib/ExtractHPVMLeafNodes/CMakeLists.txt b/lib/ExtractHPVMLeafNodes/CMakeLists.txt
new file mode 100644
index 0000000000..6421b528d7
--- /dev/null
+++ b/lib/ExtractHPVMLeafNodes/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( ExtractHPVMLeafNodes
+  ExtractHPVMLeafNodes.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.cpp b/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.cpp
new file mode 100644
index 0000000000..cd7ead9f6c
--- /dev/null
+++ b/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.cpp
@@ -0,0 +1,246 @@
+//===---------------------- ExtractHPVMLeafNodes.cpp ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ExtractHPVMLeafNodes"
+
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Pass.h"
+#include "llvm/SupportVISC/DFGTreeTraversal.h"
+#include "llvm/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/FileSystem.h"
+
+using namespace llvm;
+using namespace builddfg;
+using namespace extracthpvmleaf;
+using namespace dfg2llvm;
+
+namespace {
+
+// DFG visitor that, for every non-dummy leaf node, moves the x86-generated
+// node function into a standalone module written to disk and rewires the
+// original module to call it as an external function (see the two process()
+// overloads defined below in this file).
+class PrintLeafNodes : public DFGTreeTraversal {
+  public:
+  virtual void process(DFInternalNode* N) override;
+  virtual void process(DFLeafNode* N) override;
+
+  // Constructor
+  PrintLeafNodes(Module &_M, BuildDFG &_DFG) : DFGTreeTraversal(_M, _DFG) {}
+
+};
+
+}
+
+// Internal nodes have no extracted code of their own; the traversal only
+// logs them and moves on.
+void PrintLeafNodes::process(DFInternalNode* N) {
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
+}
+
+// Extract one leaf node: clone its x86-generated function into a fresh
+// module, write that module to ./build/<name>_module.ll, then rewrite the
+// parent's generated function to call the (now external) clone and erase
+// the original function from this module.
+void PrintLeafNodes::process(DFLeafNode* N) {
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
+  // Dummy (entry/exit) nodes carry no generated code -- nothing to extract.
+  if((N->isDummyNode())) {
+    DEBUG(errs() << "Skipping Dummy Node: " << N->getFuncPointer()->getName() << "\n");
+    return;
+  }
+
+  // Find function generated for node
+  Function *F = N->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(F != NULL
+         && "This pass is invoked after code generation for x86 is completed.\nFound leaf node for which code generation has not happened!\n");
+  assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "The generated function from x86 pass is not an x86 function\n");
+
+  // Destination path for the per-node module. Assumes ./build exists; the
+  // write below only reports (does not abort on) failure.
+  std::string module_name = std::string("./build/") + std::string(F->getName().str().c_str()) + std::string("_module.ll");
+  Twine tw(module_name);
+  // Create a new module for the node function
+  //Twine tw = Twine(F->getName()).concat(Twine("_module.ll"));
+  Module *m = new Module(tw.str(), F->getParent()->getContext());
+  // Create a new function for F. It will be written to a new module.
+  ValueToValueMapTy VMap;
+  Function *ClonedF = CloneFunction(F, VMap);
+  // Remove it from current module
+  ClonedF->removeFromParent();
+  // Insert it to the newly created module for it
+  m->getFunctionList().push_back(ClonedF);
+
+  std::vector<Instruction*> ItoRemove;
+
+  // Within the clone, re-create every HPVM-runtime/tensor call against a
+  // declaration inserted into the new module, so the clone has no call
+  // references back into the original module.
+  for (inst_iterator i = inst_begin(ClonedF), e = inst_end(ClonedF); i != e; ++i) {
+    Instruction *I = &(*i);
+    errs() << *I << "\n";
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      errs() << "Found call instruction\n";
+
+      // NOTE(review): getCalledFunction() returns null for indirect calls;
+      // this dereference assumes all calls in generated leaf code are direct
+      // -- confirm against the x86 codegen pass.
+      Function *CalledF = CI->getCalledFunction();
+      StringRef CallName = CalledF->getName();
+      errs() << "CallName: " << CallName << "\n";
+
+//      if (CallName.startswith("llvm_visc")) { //TODO
+      if ((CallName.startswith("llvm_visc")) || (CallName.startswith("tensor"))) { //TODO
+//        errs() << "This is an HPVM runtime call. Include its declaration.\n";
+        errs() << "This is an HPVM runtime call or tensor. Include its declaration.\n";
+
+        FunctionType *CalledFType = CalledF->getFunctionType();
+
+        std::vector<Value*> Fargs;
+        for (unsigned argno = 0; argno < CI->getNumArgOperands(); argno++) {
+          Fargs.push_back(CI->getArgOperand(argno));
+        }
+        Function *FDecl = cast<Function>(m->getOrInsertFunction(CallName, CalledFType));
+        CallInst *NewCI = CallInst::Create(CalledFType, FDecl, Fargs, CallName, CI);
+        errs() << "NewCI: " << *NewCI << "\n";
+        CI->replaceAllUsesWith(NewCI);
+        ItoRemove.push_back(CI);
+      }
+    }
+  }
+
+  // Erase replaced calls only after the walk, so the instruction iterator
+  // above is never invalidated.
+  for (unsigned i = 0; i < ItoRemove.size() ; i++) {
+    ItoRemove[i]->eraseFromParent();
+  }
+
+  ItoRemove.clear();
+
+  // Print new module
+  legacy::PassManager Passes;
+
+  errs() << "Writing to File --- " << tw.str() << "\n";
+  std::error_code EC;
+  tool_output_file Out(tw.str(), EC, sys::fs::F_None);
+  if (EC) {
+    errs() << EC.message() << '\n';
+  }
+
+  Passes.add(createPrintModulePass(Out.os()));
+  Passes.run(*m);
+  // Declare success.
+  Out.keep();
+
+  // Any call that is to F, needs to call the new external function
+  // Edit initial module to do so
+  // This is the name with which the function is called now
+  StringRef FName = ClonedF->getName();
+  FunctionType *FType = F->getFunctionType();
+
+  // This is a node function, so it is only called through the dataflow graph
+  assert(F->hasOneUse() && "F is an HPVM node function\n");
+
+/*
+  errs() << "F uses: " << F->getNumUses()  << "\n" ;
+  for(Value::user_iterator ui = F->user_begin(),
+      ue = F->user_end(); ui!=ue; ++ui) {
+    errs() << "use : "<< **ui << "\n";
+  }
+*/
+
+  // Get the parent node's generated x86 function
+  DFInternalNode *ParentNode = N->getParent();
+  Function *PGenF = ParentNode->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(PGenF != NULL
+         && "This pass is invoked after code generation for x86 is completed.\nFound node for which code generation has not happened!\n");
+  assert(ParentNode->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "The generated function from x86 pass is not an x86 function\n");
+
+  // Replace calls to F inside the parent's generated function with calls to
+  // a same-typed declaration named after the clone (external after erasure).
+  for (inst_iterator i = inst_begin(PGenF), e = inst_end(PGenF); i != e; ++i) {
+    Instruction *I = &(*i);
+    errs() << *I << "\n";
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      errs() << "Found call instruction\n";
+
+      StringRef CallName = CI->getCalledFunction()->getName();
+      errs() << "CallName: " << CallName << "\n";
+      errs() << "F->getName(): " << F->getName() << "\n";
+
+      if (CallName == F->getName()) {
+        // Found the call to the leaf node function we moved to the other module.
+        // Replace the call
+        std::vector<Value*> Fargs;
+        for (unsigned argno = 0; argno < CI->getNumArgOperands(); argno++) {
+          Fargs.push_back(CI->getArgOperand(argno));
+        }
+        // M: presumably the module member inherited from DFGTreeTraversal
+        // (the original module) -- confirm in DFGTreeTraversal.h.
+        Function *FDecl = cast<Function>(M.getOrInsertFunction(FName, FType));
+        CallInst *NewCI = CallInst::Create(FType, FDecl, Fargs, FName, CI);
+        errs() << "NewCI: " << *NewCI << "\n";
+        CI->replaceAllUsesWith(NewCI);
+        ItoRemove.push_back(CI);
+      }
+    }
+  }
+  
+  for (unsigned i = 0; i < ItoRemove.size() ; i++) {
+    ItoRemove[i]->eraseFromParent();
+  }
+
+  // Clean up: drop the clone from the side module (it was already written
+  // out), free the module, then erase the original node function.
+  ClonedF->eraseFromParent();
+  delete m;
+
+  F->replaceAllUsesWith(UndefValue::get(F->getType()));
+  F->eraseFromParent();
+
+  return;
+}
+
+/* Driver for leaf-node extraction: walks every DFG rooted in the module and
+ * lets the PrintLeafNodes visitor move each x86-generated leaf function into
+ * its own module (see PrintLeafNodes::process above).
+ * - M:   module containing the HPVM dataflow graphs
+ * - DFG: BuildDFG analysis results (provides the graph roots)               */
+void ExtractHPVMLeafNodeFunctions::run(Module &M, BuildDFG &DFG) {
+
+  errs() << "\nEXTRACT HPVM LEAF NODE FUNCTIONS PASS\n";
+
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Visitor for Graph Traversal. Stack allocation (RAII) replaces the
+  // original new/delete pair, which would have leaked if visit() threw.
+  PrintLeafNodes LeafVisitor(M, DFG);
+
+  // Iterate over all the DFGs, extracting the leaf node functions of each
+  for (auto rootNode: Roots) {
+    LeafVisitor.visit(rootNode);
+  }
+}
+
+namespace {
+// Legacy ModulePass shim: fetches the BuildDFG analysis and delegates the
+// actual work to ExtractHPVMLeafNodeFunctions::run.
+struct ExtractHPVMLeafNodeGenFunctionsWrapper : public ModulePass {
+  static char ID; // Pass identification (its address serves as the unique ID)
+  ExtractHPVMLeafNodeGenFunctionsWrapper() : ModulePass(ID) {}
+
+  bool runOnModule(Module &) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+} // end anonymous namespace
+
+// This pass consumes the dataflow graph built by BuildDFG and keeps that
+// analysis valid for later passes.
+void ExtractHPVMLeafNodeGenFunctionsWrapper::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<BuildDFG>();
+  AU.addRequired<BuildDFG>();
+}
+
+// Legacy PassManager entry point: runs the extraction over the module using
+// the dataflow graph computed by BuildDFG.
+bool ExtractHPVMLeafNodeGenFunctionsWrapper::runOnModule(Module &M) {
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  ExtractHPVMLeafNodeFunctions ELNF;
+  ELNF.run(M, DFG);
+
+  // The extraction erases node functions and rewrites call sites, so the
+  // module IS modified. The original returned false here, which would let
+  // the pass manager keep stale results for non-preserved analyses.
+  return true;
+}
+
+char ExtractHPVMLeafNodeGenFunctionsWrapper::ID = 0;
+// Registers the wrapper under "hpvm-extract-leaf-gen" so it can run via opt.
+// NOTE(review): RegisterPass's trailing parameters are (CFGOnly, is_analysis);
+// the inline comments below do not describe those parameters -- verify the
+// intended flags against LLVM's "Writing an LLVM Pass" documentation.
+static RegisterPass<ExtractHPVMLeafNodeGenFunctionsWrapper> X(
+         "hpvm-extract-leaf-gen",
+         "Pass to extract leaf nodes to modules in HPVM",
+         false /* does not modify the CFG */,
+true /* transformation, not just analysis */);
diff --git a/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.exports b/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/ExtractHPVMLeafNodes/LLVMBuild.txt b/lib/ExtractHPVMLeafNodes/LLVMBuild.txt
new file mode 100644
index 0000000000..9862f559e5
--- /dev/null
+++ b/lib/ExtractHPVMLeafNodes/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/ExtractHPVMLeafNodes/LLVMBuild.txt ------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ExtractHPVMLeafNodes
+parent = Transforms
+
diff --git a/lib/FuseHPVMTensorNodes/CMakeLists.txt b/lib/FuseHPVMTensorNodes/CMakeLists.txt
new file mode 100644
index 0000000000..374f3b26f1
--- /dev/null
+++ b/lib/FuseHPVMTensorNodes/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMFuseHPVMTensorNodes
+  FuseHPVMTensorNodes.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp b/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
new file mode 100644
index 0000000000..541efe4e1d
--- /dev/null
+++ b/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
@@ -0,0 +1,1007 @@
+//===                        FuseHPVMTensorNodes.cpp                       ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "FuseTensorNodes"
+
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#include "llvm/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/SupportVISC/VISCUtils.h"
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+using namespace viscUtils;
+
+namespace tensorfuse {
+/***                                Classes                                 ***/
+
+/***                            Helper Functions                            ***/
+
+/* Extract the compile-time integer held by V. V must be a ConstantInt;      *
+ * its zero-extended value is returned.                                       */
+static unsigned getNumericValue(Value* V) {
+  ConstantInt* CInt = dyn_cast<ConstantInt>(V);
+  assert(CInt
+         && "Value indicating the number of arguments should be a constant integer");
+  return CInt->getZExtValue();
+}
+
+/* Query the kind of edge described by a createEdge intrinsic IIe             *
+ * with respect to node handle IIn                                            */
+/* True iff the createEdge intrinsic IIe describes an edge whose destination  *
+ * is the node created by intrinsic IIn (i.e. an edge incoming to IIn).       */
+static bool isIncomingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn) {
+  // Operand 1 of llvm.visc.createEdge is the destination node handle.
+  Value* Dst = IIe->getArgOperand(1);
+  // dyn_cast (not cast): cast<> asserts internally, which made the original
+  // assert below unreachable; dyn_cast lets it fire with a useful message.
+  IntrinsicInst* ArgII = dyn_cast<IntrinsicInst>(Dst);
+  assert(ArgII && "Destination argument of createEdge is not an intrinsic");
+  return (ArgII == IIn);
+}
+/* True iff the createEdge intrinsic IIe describes an edge whose source is    *
+ * the node created by intrinsic IIn (i.e. an edge outgoing from IIn).        */
+static bool isOutgoingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn) {
+  // Operand 0 of llvm.visc.createEdge is the source node handle.
+  Value* Src = IIe->getArgOperand(0);
+  // dyn_cast (not cast) so the assert below is actually reachable.
+  IntrinsicInst* ArgII = dyn_cast<IntrinsicInst>(Src);
+  assert(ArgII && "First argument of createEdge is not an intrinsic");
+  return (ArgII == IIn);
+}
+
+/* Collect into EdgeList every llvm.visc.createEdge intrinsic whose           *
+ * destination operand is the node handle II (all edges flowing into II).     */
+static void getIncomingEdgeIntrinsicList(IntrinsicInst *II,
+                                        std::vector<IntrinsicInst*> &EdgeList) {
+  for (User *U : II->users()) {
+    IntrinsicInst* useI = dyn_cast<IntrinsicInst>(U);
+    assert(useI &&
+           "HPVM graph intrinsic used in non HPVM intrinsic instruction\n");
+    if (useI->getIntrinsicID() != Intrinsic::visc_createEdge)
+      continue; // Skip all non edge intrinsics
+
+    // Operand 1 of createEdge is the destination: keep edges targeting II.
+    if (useI->getOperand(1) == II) {
+      EdgeList.push_back(useI);
+    }
+  }
+}
+
+/* Returns true if the formal argument at position argno receives its value   *
+ * from one of the dataflow edges in EdgeList.                                */
+static bool isIncomingEdgeArgument(unsigned argno,
+                                   std::vector<IntrinsicInst*> &EdgeList) {
+  for (std::vector<IntrinsicInst*>::iterator it = EdgeList.begin(),
+       ie = EdgeList.end(); it != ie; ++it) {
+    // Operand 4 of createEdge holds the destination port number.
+    if (getNumericValue((*it)->getOperand(4)) == argno)
+      return true;
+  }
+  return false;
+}
+
+  
+// Check that this is a valid HPVM Tensor Node (contains HPVM intrinsics).
+// Scans the node function, logging every llvm.visc.tensor.* intrinsic found,
+// and returns the last intrinsic encountered -- or nullptr when the function
+// contains no intrinsic call at all. (The original declared II uninitialized
+// and could return an indeterminate pointer in that case: undefined behavior.)
+static IntrinsicInst *isValidHPVMTensorNode(DFNode *N) {
+
+  Function *F = N->getFuncPointer();
+
+  IntrinsicInst *II = nullptr; // initialized: no-intrinsic case is now defined
+  for (auto I = inst_begin(F), E = inst_end(F); I != E; I++) {
+    if (IntrinsicInst *CandII = dyn_cast<IntrinsicInst>(&*I)) {
+      II = CandII;
+      if ((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")) {
+        errs()<<"** Tensor Intrinsic = " << *II << "\n";
+      }
+    }
+  }
+
+  //assert(II &&
+  //        "HPVM tensor intrinsic expected as first instruction of HPVM tensor node\n");
+
+  //assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") &&
+  //        "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+
+  return II;
+}
+  
+
+// Returns the next node in a node sequence, or NULL if it does not exist.
+// We consider two nodes a sequence if SrcN has a single successor, DstN,
+// and DstN a single predeccessor, SrcN (other than the Root node)
+static DFNode *findNextNodeInSequence(DFNode *SrcN) {
+
+  DFNode *DstN = NULL;
+
+  // Find the unique non-dummy successor of SrcN, if any.
+  for (DFNode::successor_iterator si = SrcN->successors_begin(),
+       se = SrcN->successors_end(); si != se; ++si) {
+    DFNode *N = *si;
+    if (N->isDummyNode()) {
+      continue;
+    }
+    if (!DstN)
+      DstN = N;
+    if (DstN != N) {
+      errs() << "Found different destination nodes: no node sequence.\n";
+      return NULL;
+    }
+  }
+
+  // No non-dummy successor at all: no sequence. (The original dereferenced
+  // DstN unconditionally below, crashing for nodes whose successors are all
+  // dummy nodes.)
+  if (!DstN)
+    return NULL;
+
+  // If we reach this point, DstN is the unique successor of SrcN
+
+  // Now, test that the DstN has a single predeccessor except Root (dummy)
+  for (DFNode::indfedge_iterator eb = DstN->indfedge_begin(),
+       ee = DstN->indfedge_end(); eb != ee; ++eb) {
+    DFNode *SN = (*eb)->getSourceDF();
+    if ((SN != SrcN) && (!(SN->isDummyNode()))) {
+      // Does not satisfy requirement
+      return NULL;
+    }
+  }
+
+  return DstN;
+}
+
+/***                                Methods                                 ***/
+
+/* Create an identical bind (in or out, depending on the argument intrinsic)  *
+ * with a different src (srcport == true) or dst (srcport == false) port.     */
+IntrinsicInst* FuseHPVMTensorNodes::createIdenticalBindWithDifferentPort(
+                               IntrinsicInst* II, unsigned port, bool srcport) {
+  // The replacement port constant, substituted for exactly one of the two
+  // port operands of the original bind.
+  ConstantInt* NewPort =
+    ConstantInt::get(Type::getInt32Ty(II->getContext()), port);
+
+  Value* BindArgs[4];
+  BindArgs[0] = II->getArgOperand(0);                     // copied unchanged
+  BindArgs[1] = srcport ? NewPort : II->getArgOperand(1); // source port
+  BindArgs[2] = srcport ? II->getArgOperand(2) : NewPort; // destination port
+  BindArgs[3] = II->getArgOperand(3);                     // copied unchanged
+
+  // Reuse the original bind's callee; the call is created detached (no
+  // insertion point) and placed by the caller.
+  CallInst* BindInst = CallInst::Create(II->getCalledFunction(),
+                                        ArrayRef<Value*>(BindArgs, 4),
+                                        "");
+  return dyn_cast<IntrinsicInst>(BindInst);
+}
+
+/* Given two createNode intrinsics describing connected nodes, this function  *
+ * returns the argument list type of the fused function                       */
+void FuseHPVMTensorNodes::createArgTypes(IntrinsicInst* II1,
+                                         IntrinsicInst* II2,
+                                         std::vector<Type*> &ArgTypes) {
+  // Operand 0 of createNode is a (possibly bitcast) pointer to the node
+  // function.
+  Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts());
+  Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts());
+
+  // Arguments of the first node are simply added
+  for(auto& arg: F1->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    ArgTypes.push_back(arg.getType());
+  }
+
+  // Arguments of the second node are added only if they are not the output of
+  // the previous node
+
+  // Find all incoming edges.
+  std::vector<IntrinsicInst *> IncomingEdgeList;
+  getIncomingEdgeIntrinsicList(II2, IncomingEdgeList);
+
+  // Their source must be the first fusion node, otherwise they would not have
+  // been fusion candidates
+  for (IntrinsicInst *ii : IncomingEdgeList) {
+    assert((ii->getOperand(0) == II1) && "Unexpected source operand\n");
+  }
+
+  // Add argument type to the new function only if it is not incoming from
+  // an edge 
+  for(auto& arg: F2->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    unsigned inport = arg.getArgNo();
+    if (isIncomingEdgeArgument(inport, IncomingEdgeList))
+      continue;
+    ArgTypes.push_back(arg.getType());
+  }
+}
+
+/* Get the return type of the function for fused node II1-II2. Per the HPVM   *
+ * tensor-node patterns supported here, the fused node returns exactly what   *
+ * the second node returns.                                                   */
+StructType* FuseHPVMTensorNodes::createReturnType(IntrinsicInst* II1,
+                                                  IntrinsicInst* II2) {
+  Function* NodeF1 = cast<Function>((II1->getOperand(0))->stripPointerCasts());
+  Function* NodeF2 = cast<Function>((II2->getOperand(0))->stripPointerCasts());
+
+  // Both node functions must return structs (HPVM leaf-node convention);
+  // only the second node's type propagates to the fused function.
+  StructType* RetTy1 = dyn_cast<StructType>(NodeF1->getReturnType());
+  assert(RetTy1 && "Return Type must always be a struct");
+  StructType* RetTy2 = dyn_cast<StructType>(NodeF2->getReturnType());
+  assert(RetTy2 && "Return Type must always be a struct");
+
+  return RetTy2;
+}
+
+/* Copy argument names, from functions of II1 and II2 to F                    */
+void FuseHPVMTensorNodes::copyArgumentNames(IntrinsicInst* II1,
+                                            IntrinsicInst* II2,
+                                            Function* F) {
+  Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts());
+  Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts());
+
+  Function::arg_iterator dest_it = F->arg_begin();
+
+  // Argument names of the first node are simply copied, prefixed "s_"
+  // (marking the source node of the fused pair).
+  for(auto& arg: F1->getArgumentList()) {
+    dest_it->setName("s_" + arg.getName());
+    dest_it++;
+  }
+
+  // For the second node, we ignore those arguments that are incoming edges
+  // (from II1)
+  // Find all incoming edges.
+  std::vector<IntrinsicInst *> IncomingEdgeList;
+  getIncomingEdgeIntrinsicList(II2, IncomingEdgeList);
+
+  // Their source must be the first fusion node, otherwise they would not have
+  // been fusion candidates
+  for (IntrinsicInst *ii : IncomingEdgeList) {
+    assert((ii->getOperand(0) == II1) && "Unexpected source operand\n");
+  }
+
+  // Copy argument name to the new function only if it is not incoming from
+  // an edge; the "d_" prefix marks the destination (second) node's arguments.
+  for(auto& arg: F2->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    unsigned inport = arg.getArgNo();
+    if (isIncomingEdgeArgument(inport, IncomingEdgeList))
+      continue;
+
+    dest_it->setName("d_" + arg.getName());
+    dest_it++;
+  }
+  assert((dest_it == F->arg_end()) &&
+         "Argument list of fused function not fully traversed\n");
+  return;
+}
+
+/* Copy attributes, from functions of II1 and II2 to F                        */
+void FuseHPVMTensorNodes::copyAttrList(IntrinsicInst* II1,
+                                       IntrinsicInst* II2,
+                                       Function* F) {
+  Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts());
+  Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts());
+
+  Function::arg_iterator f1_ai = F1->arg_begin(), f1_ae = F1->arg_end();
+  Function::arg_iterator f2_ai = F2->arg_begin(), f2_ae = F2->arg_end();
+  Function::arg_iterator f_ai = F->arg_begin(), f_ae = F->arg_end();
+
+  // For the second node, we have to ignore the arguments that are incoming
+  // edges (from II1)
+  // Find all incoming edges.
+  std::vector<IntrinsicInst *> IncomingEdgeList;
+  getIncomingEdgeIntrinsicList(II2, IncomingEdgeList);
+
+  // Their source must be the first fusion node, otherwise they would not have
+  // been fusion candidates
+  for (IntrinsicInst *ii : IncomingEdgeList) {
+    assert((ii->getOperand(0) == II1) && "Unexpected source operand\n");
+  }
+
+  // Copy attributes of F1
+  // (the +1 offsets below account for AttributeSet indexing in this LLVM
+  // version, where index 0 is reserved for the return value)
+  for(; f1_ai != f1_ae && f_ai != f_ae; ++f1_ai, ++f_ai) {
+    AttributeSet AS = F1->getAttributes();
+    DEBUG(errs() << "Copying attributes from "
+                 << F1->getName() << " at " << f1_ai->getArgNo() << "\n");
+    AttrBuilder AB(AS, f1_ai->getArgNo()+1);
+    AttributeSet argAS = AttributeSet::get(F1->getContext(),
+                                           f_ai->getArgNo()+1, AB);
+    F->addAttributes(f_ai->getArgNo()+1, argAS);
+  }
+
+  // Copy needed attributes of F2, skipping arguments fed by incoming edges;
+  // note f_ai only advances when an attribute set is actually copied.
+  for(; f2_ai != f2_ae && f_ai != f_ae; ++f2_ai) {
+    unsigned inport = f2_ai->getArgNo();
+    if (isIncomingEdgeArgument(inport, IncomingEdgeList))
+      continue;
+
+    AttributeSet AS = F2->getAttributes();
+    DEBUG(errs() << "Copying attributes from "
+                 << F2->getName() << " at " << f2_ai->getArgNo() << "\n");
+    AttrBuilder AB(AS, f2_ai->getArgNo()+1);
+    AttributeSet argAS = AttributeSet::get(F2->getContext(),
+                                           f_ai->getArgNo()+1, AB);
+    F->addAttributes(f_ai->getArgNo()+1, argAS);
+    ++f_ai;;
+  }
+  return;
+}
+
+/* Creates and inserts an empty function of the right type for the fused node */
+Function* FuseHPVMTensorNodes::createEmptyDFNodeFunction(IntrinsicInst* II1,
+                                                         IntrinsicInst* II2,
+                                                         Module &M) {
+  Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts());
+  Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts());
+
+  DEBUG(errs () << "Constructing argument list\n");
+ // Construct argument list
+  std::vector<Type*> ArgTypes;
+  createArgTypes(II1, II2, ArgTypes);
+
+  DEBUG(errs () << "Constructing return type\n");
+  // Construct return type
+  StructType* FRetTy = createReturnType(II1, II2);
+
+  FunctionType* FTy = FunctionType::get(FRetTy, ArgTypes, false);
+  // Create a function with the new type, named "<F1>_<F2>", inheriting F1's
+  // linkage, and insert it into module M.
+  Function* F = Function::Create(FTy, F1->getLinkage(),
+                                 F1->getName()+"_"+F2->getName(), &M);
+
+  DEBUG(errs () << "Copying argument names\n");
+  // Copy argument names from original functions
+  copyArgumentNames(II1, II2, F);
+  // Copy argument attributes from original functions
+  copyAttrList(II1, II2, F);
+
+   return F;
+}
+
+/* Inline first node function, updating required mappings                     *
+ * - F1: first node function                                                  *
+ * - M:  module containing the node function                                  *
+ * - Ffused: fused node function                                              *
+ * - VMap: maps values used in the body of F1 to those that mst be used in    *
+           the body of the fused function instead                             *
+ * OutVs: This maps the output struct field index to the stored value         */
+void FuseHPVMTensorNodes::inlineFirstNodeFunction(Module &M, Function *F1,
+                                                  Function *Ffused,
+                                                  ValueMap<Value*, Value*> &VMap,
+                                                  std::vector<Value*> &OutVs) {
+
+  // All copied instructions are inserted before Ffused's single return.
+  ReturnInst *RI = cast<ReturnInst>(Ffused->getEntryBlock().getTerminator());
+
+  inst_iterator f1_i = inst_begin(F1);
+  // First, we copy the HPVM intrinsics of F1 into Ffused, applying the mapping
+  for (inst_iterator f1_e = inst_end(F1); f1_i != f1_e; ++f1_i) {
+    Instruction *I = &(*f1_i);
+    if (!(BuildDFG::isViscIntrinsic(I))) {
+      // We are done with the node computation
+      break;
+    }
+
+    IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+    assert ( ((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+	      || (II->getCalledFunction()->getName()).startswith("llvm.visc.node.id") )
+	     && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+
+    // Rebuild the operand list, translating non-constant operands through
+    // VMap (constants are shared and can be reused directly).
+    std::vector<Value*> Args;
+    for(unsigned i = 0; i < II->getNumArgOperands(); i++) {
+      Value *V = II->getArgOperand(i);
+      if (isa<Constant>(V)) { // Constants can be reused
+        Args.push_back(V);
+      } else {
+        assert((VMap.find(V) != VMap.end()) &&
+              "Attempted to use value without existing mapping in VMap");
+        Args.push_back(VMap[V]);
+      }
+    }
+    
+    Function *F = Intrinsic::getDeclaration(&M, II->getIntrinsicID());
+    CallInst* CI =
+      CallInst::Create(F, Args,
+                       F->getReturnType()->isVoidTy()? "" : "s_"+II->getName(), RI);
+    // Update the map with the newly created value
+    VMap[II] = CI;
+  }
+
+  // We continue with gathering information about the return values
+  // (f1_i intentionally resumes where the intrinsic loop above stopped).
+  for (inst_iterator f1_e = inst_end(F1); f1_i != f1_e; ++f1_i) {
+    Instruction *I = &(*f1_i);
+    InsertValueInst* IV = dyn_cast<InsertValueInst>(I);
+    if (!IV) {
+      // End of insertvalue instructions. This should be a return statement
+      assert((dyn_cast<ReturnInst>(I)) && "Unexpected Instruction\n");
+      break; // Done processing this function
+    }
+    OutVs.push_back(IV->getOperand(1));
+  }
+  return;
+}
+
+/* Inline second node function, updating required mappings                    *
+ * - F2: second node function                                                 *
+ * - M:  module containing the node function                                  *
+ * - Ffused: fused node function                                              *
+ * - VMap: maps values used in the body of F2 to those that mst be used in    *
+           the body of the fused function instead                             */
+void FuseHPVMTensorNodes::inlineSecondNodeFunction(Module &M, Function *F2,
+                             Function *Ffused, ValueMap<Value*, Value*> &VMap) {
+
+  // Copied instructions are inserted before Ffused's placeholder return,
+  // which F2's own return eventually replaces (ReplaceInstWithInst below).
+  ReturnInst *RI = cast<ReturnInst>(Ffused->getEntryBlock().getTerminator());
+
+  // Copy the body of F2 into Ffused, applying the mapping
+  inst_iterator f2_i = inst_begin(F2);
+  for (inst_iterator f2_e = inst_end(F2); f2_i != f2_e; ++f2_i) {
+    Instruction *I = &(*f2_i);
+    if ((BuildDFG::isViscIntrinsic(I))) {
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      assert( ((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+	     || (II->getCalledFunction()->getName()).startswith("llvm.visc.node.id"))
+        && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+
+      if ( (II->getCalledFunction()->getName()).startswith("llvm.visc.node.id")) {
+	continue; // Skip adding visc.node.id calls in nodes other than first node
+      }
+						
+      // Rebuild the operand list, translating non-constants through VMap.
+      std::vector<Value*> Args;
+      for(unsigned i = 0; i < II->getNumArgOperands(); i++) {
+        Value *V = II->getArgOperand(i);
+        if (isa<Constant>(V)) { // Constants can be reused 
+          Args.push_back(V);
+        } else {
+          assert((VMap.find(V) != VMap.end()) &&
+                "Attempted to use value without existing mapping in VMap");
+          Args.push_back(VMap[V]);
+        }
+      }
+      Function *F = Intrinsic::getDeclaration(&M, II->getIntrinsicID());
+      CallInst* CI =
+        CallInst::Create(F, Args,
+                         F->getReturnType()->isVoidTy()? "" : II->getName(),
+                         RI);
+      // Update the map with the newly created value
+      VMap[II] = CI;
+    } else if (InsertValueInst* IV = dyn_cast<InsertValueInst>(I)) {
+      // Rebuild the return-struct construction chain inside Ffused.
+      Value *AggOp = IV->getAggregateOperand();
+      Value *InsOp = IV->getInsertedValueOperand();
+      assert(((VMap.find(AggOp) != VMap.end()) ||
+              (isa<Constant>(AggOp)) ) &&
+            "Attempted to use value without existing mapping in VMap");
+      assert(((VMap.find(InsOp) != VMap.end()) ||
+             (isa<Constant>(InsOp))) &&
+            "Attempted to use value without existing mapping in VMap");
+      InsertValueInst* IVI = InsertValueInst::Create(
+        (isa<Constant>(AggOp)) ? AggOp : VMap[AggOp],
+        (isa<Constant>(InsOp)) ? InsOp : VMap[InsOp],
+        IV->getIndices(),
+        IV->getName(),
+        RI);
+      // Update the map with the newly created value
+      VMap[IV] = IVI;
+    } else {
+      // The only remaining legal instruction is F2's return: splice its
+      // (mapped) operand into Ffused by replacing the placeholder return.
+      ReturnInst* RetI = dyn_cast<ReturnInst>(I);
+      assert(RetI && "Unexpected Instruction\n");
+      Value *RetVal = RetI->getOperand(0);
+      ReturnInst *newRI = ReturnInst::Create(Ffused->getContext(),
+                                             VMap[RetVal]);
+      ReplaceInstWithInst(RI, newRI);
+    }
+  }
+  return;
+}
+
+/* Create function of leaf node after fusion                                  *
+ * - create type                                                              *
+ * - create empty function of the type                                        *
+ * - inline body of first function (applying and updating appropriate         *
+ *   mappings)                                                                *
+ * - inline body of second function (applying and updating appropriate        *
+ *   mappings)                                                                *
+ * Returns the newly created fused function.                                  */
+Function* FuseHPVMTensorNodes::createLeafDFNodeFunction(IntrinsicInst* II1,
+                                                        IntrinsicInst* II2,
+                                                        Module &M) {
+  DEBUG(errs () << "Creating function signature\n");
+
+  /* Create empty node function of the correct type */
+  Function* Ffused = createEmptyDFNodeFunction(II1, II2, M);
+
+  // Get return type, needed for building the assignments to the return struct
+  StructType* FfusedRetTy = cast<StructType>(Ffused->getReturnType());
+
+  /* Mapping information required for using the correct values in the body of *
+   * the fused node function                                                  */
+
+  // This map maps the values used in the original function bodies with
+  // the ones that need to be used in the fused function body.
+  ValueMap<Value*, Value*> FusedValueMap;
+
+  // Intermediate information saved for return values of first node function.
+  // OutValues[i] is the value F1 returns through output port i (filled in by
+  // inlineFirstNodeFunction and consumed below via PortMap).
+  std::vector<Value*> OutValues;
+
+  DEBUG(errs () << "Creating function body\n");
+
+  // Add a basic block to the new, empty function, terminated by returning an
+  // undef of the fused return type; inlining replaces this placeholder.
+  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", Ffused);
+  ReturnInst::Create(M.getContext(), UndefValue::get(FfusedRetTy), BB);
+
+  // Get the node functions (operand 0 of createNode is the function pointer)
+  Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts());
+  Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts());
+
+  // Initially, update FusedValueMap: it is populated with the arguments of F1,
+  // which map one-to-one onto the leading arguments of Ffused.
+  Function::arg_iterator fused_arg_it = Ffused->arg_begin();
+  // Argument names of the first node are simply copied
+  for(auto& arg: F1->getArgumentList()) {
+    FusedValueMap[&arg] = &*fused_arg_it;
+    ++fused_arg_it;
+  }
+
+
+  //  for(const auto& v: FusedValueMap) {
+  //    errs() << "key = " << *(v.first) << "\t";
+  //    errs() << "value = " << *(v.second) << "\n";
+  //  }
+
+  // Invoke function that inlines F1 into Ffused, using and updating mappings
+  inlineFirstNodeFunction(M, F1, Ffused, FusedValueMap, OutValues);
+
+  // Compute mapping between inputs of F2 and outputs of F1:
+  // PortMap[dstPort] = srcPort, where operands 3 and 4 of a createEdge
+  // intrinsic are the source and destination port numbers respectively.
+  std::vector<IntrinsicInst *> IncomingEdgeList;
+  getIncomingEdgeIntrinsicList(II2, IncomingEdgeList);
+  std::vector<unsigned> PortMap(IncomingEdgeList.size(), 0);
+  for (IntrinsicInst * ii : IncomingEdgeList) {
+    unsigned srcPort = getNumericValue(ii->getOperand(3));
+    unsigned dstPort = getNumericValue(ii->getOperand(4));
+    PortMap[dstPort] = srcPort;
+  }
+
+  // FusedValueMap is now populated with the arguments of F2 as well
+  for(auto& arg: F2->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    unsigned inport = arg.getArgNo();
+    if (isIncomingEdgeArgument(inport, IncomingEdgeList)) {
+      // Get the mappings of the return values of F1 if incoming edge argument
+      Value *V = OutValues[PortMap[inport]];
+      FusedValueMap[&arg] = (isa<Constant>(V)) ? V: FusedValueMap[V];
+    }
+    else {
+      // Get new argument otherwise
+      FusedValueMap[&arg] = &*fused_arg_it;
+      ++fused_arg_it;
+    }
+  }
+
+  // Invoke function that inlines F2 into Ffused, using and updating mappings
+  inlineSecondNodeFunction(M, F2, Ffused, FusedValueMap);
+
+  // Done with fused node function
+  return Ffused;
+}
+
+/* Updates parent of fused nodes to use the new node intrinsic: removes the  *
+ * internal II1->II2 edges, rewrites node 2's bind.in port numbers for the   *
+ * fused argument list, and redirects remaining uses of II1/II2 to IInew.    */
+void FuseHPVMTensorNodes::updateParentNodeFunction(IntrinsicInst* II1,
+                                                   IntrinsicInst* II2,
+                                                   IntrinsicInst* IInew) {
+
+  // Compute the required shifting of positions for edges/binds to the second
+  // fusion node. No shifting is required for the first fusion node.
+  Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts());
+  Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts());
+  // ShiftMap[i] = position in the fused argument list of F2's argument i
+  // (only filled in for arguments that are not fed by an incoming edge).
+  std::vector<unsigned> ShiftMap(F2->getFunctionType()->getNumParams(), 0);
+  // F2's non-edge arguments are appended after all of F1's arguments.
+  unsigned shiftCount = F1->getFunctionType()->getNumParams();
+
+  // Find all incoming edges.
+  std::vector<IntrinsicInst *> IncomingEdgeList;
+  getIncomingEdgeIntrinsicList(II2, IncomingEdgeList);
+  // Their source must be the first fusion node, otherwise they would not have
+  // been fusion candidates
+  for (IntrinsicInst *ii : IncomingEdgeList) {
+    assert((ii->getOperand(0) == II1) && "Unexpected source operand\n");
+  }
+
+  // Compute shift map for n2: maps position in F2 arg list to Ffused arg list
+  for(auto& arg: F2->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    unsigned inport = arg.getArgNo();
+    if (isIncomingEdgeArgument(inport, IncomingEdgeList))
+      continue;
+
+    ShiftMap[inport] = shiftCount;
+    shiftCount++;
+  }
+
+  std::vector<IntrinsicInst*> IItoRemove;
+
+  // First, iterate over uses of the first node's createNode intrinsic
+  for (Value::user_iterator i = II1->user_begin(), ie = II1->user_end();
+       i != ie; ++i) {
+    Instruction *VI = dyn_cast<Instruction>(*i);
+    IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI);
+    assert(II && "Use of a node handle outside of a visc intrinsic\n");
+
+    switch(II->getIntrinsicID()) {
+      case Intrinsic::visc_createEdge:
+        {
+        // Outgoing edges of node 1 connect the two fused nodes and must be
+        // removed; erasure is deferred so the use-list iteration stays valid.
+        if (isOutgoingEdgeIntrinsic(II,II1)) {
+          assert(isIncomingEdgeIntrinsic(II,II2) &&
+                 "Outgoing edge of node 1 should only go to node 2\n");
+          IItoRemove.push_back(II);
+        }
+        }
+        break;
+      case Intrinsic::visc_bind_input:
+        {
+        // Nothing to do: node 1's argument positions are unchanged in the
+        // fused function; the handle is redirected to IInew below.
+        }
+        break;
+      case Intrinsic::visc_bind_output:
+        {
+          assert(false &&
+                 "Source node of node fusion not expected in bind.out\n");
+        }
+        break;
+      default:
+        llvm_unreachable("Unknown use of HPVM createNode handle\n");
+        break;
+    }
+  }
+
+  // Delete gathered instructions - they are the edges between n1-n2
+  for (std::vector<IntrinsicInst *>::iterator ib = IItoRemove.begin(),
+       ie = IItoRemove.end(); ib != ie; ++ib) {
+    DEBUG(errs() << "Erasing: " << **ib << "\n");
+    (*ib)->eraseFromParent();
+  }
+  II1->replaceAllUsesWith(IInew);
+  II1->eraseFromParent();
+
+  IItoRemove.clear();
+
+  // Then, iterate over uses of the second node's createNode intrinsic
+  for (Value::user_iterator i = II2->user_begin(), ie = II2->user_end();
+       i != ie; ++i) {
+    Instruction *VI = dyn_cast<Instruction>(*i);
+    IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI);
+    assert(II && "Use of a node handle outside of a visc intrinsic\n");
+
+    switch(II->getIntrinsicID()) {
+      case Intrinsic::visc_createEdge:
+        {
+        assert(isOutgoingEdgeIntrinsic(II,II2) &&
+               "Node 2 is expected to have only outgoing edges at this point\n");
+        }
+        break;
+      case Intrinsic::visc_bind_input:
+        {
+        /* The index must be updated to the matching argument position of *
+         * the fused function, using ShiftMap                             */
+        unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
+        IntrinsicInst *newII =
+          createIdenticalBindWithDifferentPort(II,
+                                               ShiftMap[dstPos],
+                                               false);
+        newII->insertBefore(II);
+        IItoRemove.push_back(II);
+        }
+        break;
+      case Intrinsic::visc_bind_output:
+        {
+          assert(false &&
+                 "Source node of node fusion not expected in bind.out\n");
+        }
+        break;
+      default:
+        llvm_unreachable("Unknown use of HPVM createNode handle\n");
+        break;
+    }
+  }
+
+  // Delete gathered instructions - they are the old bindings of n2
+  for (std::vector<IntrinsicInst *>::iterator ib = IItoRemove.begin(),
+       ie = IItoRemove.end(); ib != ie; ++ib) {
+    DEBUG(errs() << "Erasing: " << **ib << "\n");
+    (*ib)->eraseFromParent();
+  }
+
+  II2->replaceAllUsesWith(IInew);
+  II2->eraseFromParent();
+
+  return;
+}
+
+/* Performs all operations required at the IR level for fusion of HPVM tensor *
+ * nodes with intrinsic instructions II1 and II2                              *
+ * - Creates fused node function                                              *
+ * - Creates createNode intrinsic for it and returns it                       *
+ * - Updates parent function:                                                 *
+ * - - adds new intrinsic                                                     *
+ * - - edges and binds consistently use the new intrinsic                     *
+ * - Removes old functions                                                    */
+IntrinsicInst* FuseHPVMTensorNodes::FuseHPVMTensorNodesStep(IntrinsicInst* II1,
+                                                            IntrinsicInst* II2,
+                                                            Module &M) {
+  // Get the node functions (operand 0 of createNode is the function pointer)
+  Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts());
+  Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts());
+
+  // Create fused node function; it inherits the target hint of the first node
+  Function *Ffused = createLeafDFNodeFunction(II1, II2, M);
+  addHint(Ffused, getPreferredTarget(F1));
+
+  // FIX PARENT DFNode'S FUNCTION
+
+  // Generate createNode Intrinsic for fused node and insert it before II1
+  Function* CreateNodeF = Intrinsic::getDeclaration(&M,
+                                                    Intrinsic::visc_createNode);
+  Constant* Fp = ConstantExpr::getPointerCast(Ffused,
+                                          Type::getInt8PtrTy(M.getContext()));
+  CallInst *CI = CallInst::Create(CreateNodeF,
+                                  ArrayRef<Value*>(Fp),
+                                  Ffused->getName()+".node");
+  IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI);
+  CreateNodeII->insertBefore(II1);
+
+  // By the assumptions about the fusion pattern structure, all edges that have
+  // II1 as source will have II2 as destination and vice versa.
+  // We can simply delete them.
+
+  // All createEdge intrinsics with destination argument = II1 need to use
+  // CreateNodeII instead.
+  // Similarly with bind.in
+
+  // All createEdge intrinsics with source argument = II1 need to use
+  // CreateNodeII instead
+  // Similarly with bind.out
+
+  // By the assumptions about the fusion pattern structure, the first node
+  // cannot be the argument of a bind.out
+  // The second node can be the argument of a bind.in.
+  // For the bind.in, we need to adjust the destination port.
+  updateParentNodeFunction(II1, II2, CreateNodeII);
+
+  // Remove old node functions: strip their hints, replace any remaining uses
+  // with undef, then erase them from the module.
+  removeHint(F1, getPreferredTarget(F1));
+  removeHint(F2, getPreferredTarget(F2));
+  F1->replaceAllUsesWith(UndefValue::get(F1->getType()));
+  F1->eraseFromParent();
+  F2->replaceAllUsesWith(UndefValue::get(F2->getType()));
+  F2->eraseFromParent();
+
+  return CreateNodeII;
+}
+
+/* Fuse the node sequence described by the createNode intrinsics in IIs.     *
+ * The contents of IIs are cleared on return.                                 */
+void FuseHPVMTensorNodes::FuseHPVMTensorNodeSequence(
+                                  std::vector<IntrinsicInst*> &IIs, Module &M) {
+  // Every entry must be a createNode intrinsic.
+  for (IntrinsicInst *II : IIs) {
+    assert((II->getIntrinsicID() == Intrinsic::visc_createNode) &&
+           "Expected createNode intrinsic in fuse intrinsic sequence\n");
+  }
+
+  if (IIs.size() < 2) {
+    errs() << "Warning: Attempted to fuse fewer than 2 nodes\n";
+    return;
+  }
+
+  // Fold the sequence pairwise: fuse the first two nodes, then fuse the
+  // resulting node with the next one, and so on.
+  unsigned idx = 0;
+  while (idx + 1 < IIs.size()) {
+    IIs[idx + 1] = FuseHPVMTensorNodesStep(IIs[idx], IIs[idx + 1], M);
+    ++idx;
+  }
+  IIs.clear();
+}
+
+/* Entry point of the FuseHPVMTensorNodes class: fuses every node sequence   *
+ * recorded in the FusionTargets argument.                                    */
+void FuseHPVMTensorNodes::run(Module &M, FusionTargets &FTs) {
+  for (auto &Sequence : FTs) {
+    FuseHPVMTensorNodeSequence(Sequence, M);
+  }
+}
+
+// Dump the detected fusion targets to errs(). Each target is a vector of the
+// createNode intrinsics whose nodes are to be fused; for each one we print
+// the node function (operand 0 of the intrinsic).
+void FuseHPVMTensorNodes::printFusionTargets(FusionTargets &FTs) {
+  errs() << "Print Fusion Targets\n";
+  errs() << "Found " << FTs.size() << " targets\n";
+  for (auto &NodeVec : FTs) {
+    errs() << "Target:\n";
+    for (IntrinsicInst *NodeII : NodeVec) {
+      errs() << "\t" << *(NodeII->getOperand(0)) << "\n";
+    }
+  }
+}
+
+/* Internal-node visitor: fusion patterns are only detected at leaf nodes,    *
+ * so internal nodes are logged and skipped.                                  */
+void FindFusionTargetsTraversal::codeGen(DFInternalNode *N) {
+  DEBUG(errs() << "Skipping Internal Node: "
+               << N->getFuncPointer()->getName() << "\n");
+  return;
+}
+
+  
+/* Leaf-node visitor: checks whether a fusable tensor-node pattern begins at  *
+ * node N and, if so, records the sequence of createNode intrinsics in FTs.   *
+ * Patterns matched (every node must share N's preferred target hint):        *
+ * - conv -> add (bias) [-> relu/clipped_relu/tanh] [-> pool max/min/mean]    *
+ * - mul (gemm) -> add (bias) [-> relu/clipped_relu/tanh]                     */
+void FindFusionTargetsTraversal::codeGen(DFLeafNode *N) {
+  DEBUG(errs() << "Inside leaf node: "
+               << N->getFuncPointer()->getName() << "\n");
+
+  // Skip fusion check if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+
+  if(!preferredTargetIncludes(N, visc::PROMISE_TARGET)) {
+    // Only fuse if we plan to target PROMISE/Layers API
+    // The CUDNN backend would be able to generate calls for the fused node,
+    // but not the other way around
+    DEBUG(errs() << "No PROMISE hint. Skipping node: "
+                 << N->getFuncPointer()->getName() << "\n");
+    return;
+  }
+
+  visc::Target StartNodePreferredTarget = getPreferredTarget(N);
+  // Make sure that this is a valid HPVM Tensor Node
+  // Find first instruction, and check that it is an HPVM tensor intrinsic
+  IntrinsicInst *II = isValidHPVMTensorNode(N);
+
+  // Accumulates the createNode intrinsics of the pattern found so far;
+  // "break" keeps the sequence collected up to that point, "return" discards.
+  std::vector<IntrinsicInst*> CurrentNodeSequence;
+
+  switch(II->getIntrinsicID()) {
+
+    /*case Intrinsic::visc_node_id:
+    { // Found beginning of pattern conv-bias-activation-pooling.
+
+    }
+    break;
+    */
+    
+    case Intrinsic::visc_tensor_convolution:
+      { // Found beginning of pattern conv-bias-activation-pooling.
+        // Look for the rest
+        CurrentNodeSequence.push_back(N->getInstruction());
+
+        // Look for bias
+        DFNode *SN = findNextNodeInSequence(N);
+        if (!SN) {
+          return; // Did not find a node sequence starting at N. Simply return.
+        }
+        if (getPreferredTarget(SN) != StartNodePreferredTarget) {
+          return; // Node in sequence has different hint. Simply return.
+        }
+        IntrinsicInst *SII = isValidHPVMTensorNode(SN);
+        if (SII->getIntrinsicID() != Intrinsic::visc_tensor_add) {
+          // Successor is not the bias operation, thus does not fit the pattern.
+          return;
+        }
+        // Otherwise, push this node to the current sequence
+        CurrentNodeSequence.push_back(SN->getInstruction());
+
+        // This is a valid sequence.
+        // We still need to fuse activation and/or pooling if we find them
+        // Continue with next node, looking for activation (relu, clipped relu, tanh)
+        SN = findNextNodeInSequence(SN);
+        if (!SN) {
+          // Did not find a node sequence starting at N. Use current sequence.
+          break;
+        }
+        if (getPreferredTarget(SN) != StartNodePreferredTarget) {
+          break; // Node in sequence has different hint. Use current sequence.
+        }
+        SII = isValidHPVMTensorNode(SN);
+
+        if ((SII->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu) ||
+            (SII->getIntrinsicID() == Intrinsic::visc_tensor_relu) ||
+            (SII->getIntrinsicID() == Intrinsic::visc_tensor_tanh)) {
+          // Successor is activation. Push this node to the current sequence.
+          CurrentNodeSequence.push_back(SN->getInstruction());
+
+          // Will continue, looking for pooling in the next node
+          SN = findNextNodeInSequence(SN);
+          if (!SN) {
+            break; // No node in sequence. Use currently found sequence.
+          }
+          if (getPreferredTarget(SN) != StartNodePreferredTarget) {
+            break; // Node in sequence has different hint. Use current sequence.
+          }
+          SII = isValidHPVMTensorNode(SN);
+        } //else {} // Look for pooling in this node
+
+        if ((SII->getIntrinsicID() == Intrinsic::visc_tensor_pool_max) ||
+            (SII->getIntrinsicID() == Intrinsic::visc_tensor_pool_min) ||
+            (SII->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean)) {
+          // Successor is a pool operation. Use currently found sequence.
+          CurrentNodeSequence.push_back(SN->getInstruction());
+        }
+      }
+      break;
+    case Intrinsic::visc_tensor_mul:
+      { // Found beginning of pattern gemm-bias-activation. Look for the rest
+        CurrentNodeSequence.push_back(N->getInstruction());
+        // Look for bias
+        DFNode *SN = findNextNodeInSequence(N);
+        if (!SN) {
+          return; // Did not find a node sequence starting at N. Simply return.
+        }
+        if (getPreferredTarget(SN) != StartNodePreferredTarget) {
+          return; // Node in sequence has different hint. Simply return.
+        }
+        IntrinsicInst *SII = isValidHPVMTensorNode(SN);
+        if (SII->getIntrinsicID() != Intrinsic::visc_tensor_add) {
+          // Successor is not the bias operation, thus does not fit the pattern.
+          return;
+        }
+        // Otherwise, push this node to the current sequence
+        CurrentNodeSequence.push_back(SN->getInstruction());
+        // This is a possible fuse target, gemm-add.
+        // We need to reach the end of the function, where the found sequence
+        // is added.
+
+        // If the next operation is activation, we fuse that as well.
+        // Continue with next node, looking for activation (relu, clipped relu, tanh)
+        SN = findNextNodeInSequence(SN);
+        if (SN) {
+          if (getPreferredTarget(SN) == StartNodePreferredTarget) {
+            SII = isValidHPVMTensorNode(SN);
+            if ((SII->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu) ||
+                (SII->getIntrinsicID() == Intrinsic::visc_tensor_relu) ||
+                (SII->getIntrinsicID() == Intrinsic::visc_tensor_tanh)) {
+              // We found activation in sequence. Push in vector as well.
+              CurrentNodeSequence.push_back(SN->getInstruction());
+            }
+          }
+        }
+      }
+      break;
+    default:
+      DEBUG(errs() << "No pattern begins at this node\n");
+      break;
+  }
+
+  if (CurrentNodeSequence.size() != 0) {
+    // A sequence was found. Store the node sequence in FTs.
+    FTs.push_back(CurrentNodeSequence);
+  }
+
+  return;
+}
+
+/* Pass entry point: detect fusable HPVM tensor node sequences in every DFG  *
+ * of the module and fuse them. Always returns true (the module is mutated). */
+bool FuseHPVMTensorNodesWrapper::runOnModule(Module &M) {
+
+  errs() << "\nFUSE HPVM TENSOR NODES PASS\n";
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Visitor for Fuse Target Detection Graph Traversal. Stack-allocated so it
+  // is released on every exit path (the original heap allocation with a
+  // manual delete leaked if anything below threw).
+  FindFusionTargetsTraversal FTTVisitor(M, DFG);
+
+  errs() << "Find targets\n";
+  // Iterate over all the DFGs and record fusable sequences for each of them
+  for (auto rootNode: Roots) {
+    // Initiate fusion-target detection for this root DFNode
+    FTTVisitor.visit(rootNode);
+  }
+
+  FuseHPVMTensorNodes::FusionTargets &FTs = FTTVisitor.getFusionTargets();
+
+  FuseHPVMTensorNodes Fuse;
+  //  Fuse.printFusionTargets(FTs);
+
+  Fuse.run(M, FTs);
+
+  return true;
+}
+
+char FuseHPVMTensorNodesWrapper::ID = 0;
+// Register the pass with opt as "hpvm-fuse".
+// RegisterPass<>'s trailing parameters are (CFGOnly, is_analysis). This is a
+// transformation pass that rewrites the module, so both must be false; the
+// original code passed is_analysis = true, contradicting its own comment.
+static RegisterPass<FuseHPVMTensorNodesWrapper> X("hpvm-fuse",
+  "Fuse HPVM Tensor Nodes Pass",
+  false /* CFGOnly: pass does more than look at the CFG */,
+  false /* is_analysis: this is a transformation, not an analysis */);
+
+} // End of namespace
+
diff --git a/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.exports b/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/FuseHPVMTensorNodes/LLVMBuild.txt b/lib/FuseHPVMTensorNodes/LLVMBuild.txt
new file mode 100644
index 0000000000..55a6ee5150
--- /dev/null
+++ b/lib/FuseHPVMTensorNodes/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/FuseHPVMTensorNodes/LLVMBuild.txt ----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = FuseHPVMTensorNodes
+parent = Transforms
diff --git a/lib/GenVISC/CMakeLists.txt b/lib/GenVISC/CMakeLists.txt
new file mode 100644
index 0000000000..710e8f2729
--- /dev/null
+++ b/lib/GenVISC/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMGenVISC
+  GenVISC.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/GenVISC/GenVISC.cpp b/lib/GenVISC/GenVISC.cpp
new file mode 100644
index 0000000000..a4d9f2c2a4
--- /dev/null
+++ b/lib/GenVISC/GenVISC.cpp
@@ -0,0 +1,1590 @@
+//===-- GenVISC.cpp - Lowers __visc__* Dummy Calls to VISC Intrinsics ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "genvisc"
+#include "llvm/GenVISC/GenVISC.h"
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/SupportVISC/VISCUtils.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/SupportVISC/VISCUtils.h"
+
+
+using namespace llvm;
+using namespace viscUtils;
+
+
+namespace genvisc {
+
+// Helper Functions
+
+static Function* transformReturnTypeToStruct(Function* F);
+static Type* getReturnTypeFromReturnInst(Function* F);
+
+// Check if the dummy function call is a __visc__<callName> call.
+// Expands to a predicate isVISCCall_<callName>(Instruction*) that returns
+// true iff the instruction is a CallInst whose callee (looking through
+// pointer casts) is named exactly "__visc__<callName>".
+#define IS_VISC_CALL(callName) \
+  static bool isVISCCall_##callName(Instruction* I) { \
+    if(!isa<CallInst>(I)) \
+      return false; \
+    CallInst* CI = cast<CallInst>(I); \
+    return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \
+  }
+
+// Replace the dummy-function call I with a call to the VISC intrinsic
+// IntrinsicID, forwarding I's arguments. All uses of I are rewired to the new
+// call. If Erase is non-null, I is appended to it so the caller can erase the
+// replaced call later (deferring erasure keeps the caller's iterators valid).
+static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) {
+  // Check if the instruction is Call Instruction
+  assert(isa<CallInst>(I) && "Expecting CallInst");
+  CallInst* CI = cast<CallInst>(I);
+  DEBUG(errs() << "Found call: " << *CI << "\n");
+
+  // Find the correct intrinsic call
+  Module* M = CI->getParent()->getParent()->getParent();
+  Function* F;
+  std::vector<Type*> ArgTypes;
+  std::vector<Value*> args;
+  if(Intrinsic::isOverloaded(IntrinsicID)) {
+    // This is an overloaded intrinsic. The types must exactly match. Get the
+    // argument types
+    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+      ArgTypes.push_back(CI->getArgOperand(i)->getType());
+      args.push_back(CI->getArgOperand(i));
+    }
+    F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes);
+    DEBUG(errs() << *F << "\n");
+  }
+  else { // Non-overloaded intrinsic
+    F = Intrinsic::getDeclaration(M, IntrinsicID);
+    FunctionType* FTy = F->getFunctionType();
+    DEBUG(errs() << *F << "\n");
+
+    // Create argument list
+    assert(CI->getNumArgOperands() == FTy->getNumParams()
+        && "Number of arguments of call do not match with Intrinsic");
+    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+      Value* V = CI->getArgOperand(i);
+      // Either the type should match or both should be of pointer type
+      assert((V->getType() == FTy->getParamType(i) ||
+          (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
+          && "Dummy function call argument does not match with Intrinsic argument!");
+      // If the types do not match, then both must be pointer type and pointer
+      // cast needs to be performed
+      if(V->getType() != FTy->getParamType(i)) {
+        V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
+      }
+      args.push_back(V);
+    }
+  }
+  // Insert call instruction (unnamed if the intrinsic returns void)
+  CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
+
+  DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
+
+  CI->replaceAllUsesWith(Inst);
+  // If the previous instruction needs to be erased, insert it in the vector
+  // Erase
+  if(Erase != NULL)
+    Erase->push_back(CI);
+}
+
+// Instantiate the isVISCCall_<name> predicates for every recognized
+// __visc__* dummy call.
+IS_VISC_CALL(launch) /* Exists but not required */
+IS_VISC_CALL(edge) /* Exists but not required */
+IS_VISC_CALL(createNodeND)
+//IS_VISC_CALL(createNode)
+//IS_VISC_CALL(createNode1D)
+//IS_VISC_CALL(createNode2D)
+//IS_VISC_CALL(createNode3D)
+IS_VISC_CALL(bindIn)
+IS_VISC_CALL(bindOut)
+IS_VISC_CALL(push)
+IS_VISC_CALL(pop)
+IS_VISC_CALL(getNode)
+IS_VISC_CALL(getParentNode)
+IS_VISC_CALL(barrier)
+IS_VISC_CALL(malloc)
+IS_VISC_CALL(return)
+IS_VISC_CALL(getNodeInstanceID_x)
+IS_VISC_CALL(getNodeInstanceID_y)
+IS_VISC_CALL(getNodeInstanceID_z)
+IS_VISC_CALL(getNumNodeInstances_x)
+IS_VISC_CALL(getNumNodeInstances_y)
+IS_VISC_CALL(getNumNodeInstances_z)
+// Atomics
+IS_VISC_CALL(atomic_cmpxchg)
+IS_VISC_CALL(atomic_add)
+IS_VISC_CALL(atomic_sub)
+IS_VISC_CALL(atomic_xchg)
+IS_VISC_CALL(atomic_inc)
+IS_VISC_CALL(atomic_dec)
+IS_VISC_CALL(atomic_min)
+IS_VISC_CALL(atomic_max)
+IS_VISC_CALL(atomic_umin)
+IS_VISC_CALL(atomic_umax)
+IS_VISC_CALL(atomic_and)
+IS_VISC_CALL(atomic_or)
+IS_VISC_CALL(atomic_xor)
+// Misc Fn
+IS_VISC_CALL(floor)
+IS_VISC_CALL(rsqrt)
+IS_VISC_CALL(sqrt)
+IS_VISC_CALL(sin)
+IS_VISC_CALL(cos)
+
+
+// Runtime / host-side calls
+IS_VISC_CALL(init)
+IS_VISC_CALL(node)
+IS_VISC_CALL(cleanup)
+IS_VISC_CALL(wait)
+IS_VISC_CALL(trackMemory)
+IS_VISC_CALL(untrackMemory)
+IS_VISC_CALL(requestMemory)
+IS_VISC_CALL(attributes)
+IS_VISC_CALL(hint)
+
+// Tensor Operators
+IS_VISC_CALL(tensor_mul)
+IS_VISC_CALL(tensor_convolution)
+IS_VISC_CALL(tensor_group_convolution)
+IS_VISC_CALL(tensor_batchnorm)
+IS_VISC_CALL(tensor_add)
+IS_VISC_CALL(tensor_pool_max)
+IS_VISC_CALL(tensor_pool_min)
+IS_VISC_CALL(tensor_pool_mean)
+IS_VISC_CALL(tensor_relu)
+IS_VISC_CALL(tensor_clipped_relu)
+IS_VISC_CALL(tensor_tanh)
+IS_VISC_CALL(tensor_sigmoid)
+IS_VISC_CALL(tensor_softmax)
+
+IS_VISC_CALL(node_id)
+
+
+// Extract the (zero-extended) integer held by the constant value V.
+// V must be a ConstantInt.
+static unsigned getNumericValue(Value* V) {
+  ConstantInt* CInt = dyn_cast<ConstantInt>(V);
+  assert(CInt
+         && "Value indicating the number of arguments should be a constant integer");
+  return CInt->getZExtValue();
+}
+
+
+
+// Add <numArgs> i64 arguments to the argument list of Function <F>. Because
+// a function's type cannot be changed in place, the function is cloned with
+// the extended type and the clone replaces <F> in the IR.
+// The names for the added arguments come from the <names> array, which must
+// contain at least <numArgs> entries; the i-th added argument gets names[i].
+// Returns the new function; <F> has been replaced and must not be used.
+static Function* addArgs(Function* F, unsigned numArgs, std::string names[]) {
+  if(numArgs == 0) return F; // Return if no arguments are to be added.
+
+  // Create the argument type list with added argument types
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+  // Adding new arguments to the function argument list would not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments, which are always i64.
+  for(unsigned i = 0; i < numArgs; ++i) {
+    ArgTypes.push_back(Type::getInt64Ty(F->getContext()));
+  }
+  FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+
+  // Clone the function with the extended type
+  Function* newF = cloneFunction(F, newFT, false);
+
+  // Name the added arguments: added argument i (0 <= i < numArgs) is named
+  // names[i]. The original code indexed names with "% names->size()", i.e.
+  // modulo the character count of the FIRST string (std::string::size()),
+  // which neither matched the array length nor guaranteed in-bounds access.
+  unsigned numOldArgs = F->getFunctionType()->getNumParams();
+  for(Function::arg_iterator ai = newF->arg_begin(), ae = newF->arg_end();
+      ai != ae; ++ai) {
+    if (ai->getArgNo() < numOldArgs)
+      continue;
+    ai->setName(names[ai->getArgNo() - numOldArgs]);
+  }
+
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+  return newF;
+}
+
+
+// Take the __visc__return instruction and generate code for combining the
+// values being returned into a struct and returning it.
+// The first operand is the number of returned values; the remaining operands
+// are the values themselves. Returns the final InsertValueInst holding the
+// fully-populated return struct (inserted before CI).
+static Value* genCodeForReturn(CallInst* CI) {
+  LLVMContext& Ctx = CI->getContext();
+  assert(isVISCCall_return(CI)
+      && "__visc__return instruction expected!");
+
+  // Parse the dummy function call here
+  assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n");
+  unsigned numRetVals = getNumericValue(CI->getArgOperand(0));
+
+  // NOTE(review): the message says "too few", but the check requires the
+  // argument count to match numRetVals exactly.
+  assert(CI->getNumArgOperands()-1 == numRetVals &&
+         "Too few arguments for __visc_return call!\n");
+  DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n");
+
+  // Build a packed struct type from the types of the returned values
+  std::vector<Type*> ArgTypes;
+  for(unsigned i=1; i < CI->getNumArgOperands(); i++) {
+    ArgTypes.push_back(CI->getArgOperand(i)->getType());
+  }
+  Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName();
+  StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true);
+
+  // Insert the first returned value into an undef struct...
+  InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy),
+                                                CI->getArgOperand(1),
+                                                0,
+                                                "returnStruct",
+                                                CI);
+  DEBUG(errs() << "Code generation for return:\n");
+  DEBUG(errs() << *IV << "\n");
+
+  // ...then chain insertvalue instructions for the remaining values
+  for(unsigned i=2; i < CI->getNumArgOperands(); i++) {
+    IV = InsertValueInst::Create(IV,
+                                 CI->getArgOperand(i),
+                                 i-1,
+                                 IV->getName(),
+                                 CI);
+    DEBUG(errs() << *IV << "\n");
+  }
+  
+  return IV;
+}
+
+// The visc launch intrinsic requires all the input parameters to the kernel
+// function be placed in contiguous memory and pointer to that input be passed
+// as the second argument to the launch intrinsic. This generates code to bring
+// together all the input and dimension arguments in one packed struct
+// <InStruct>. First pack the arguments to the kernel function and then add the
+// dimension arguments depending on the hierarchy of DFG user wants to generate.
+static void marshallArguments(unsigned levels, unsigned numArgs, unsigned argOffset, unsigned numDims, unsigned dimOffset, Value* InStruct, CallInst* CI, Function* KernelF) {
+  DEBUG(errs() << "Kernel Function = " << KernelF->getName() << "\n");
+
+  // Get module context and i32 0 constant, as they would be frequently used in
+  // this function.
+  LLVMContext& Ctx = CI->getParent()->getContext();
+  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  // Find the arguments to be passed to kernel function and pack them in a
+  // struct. Specifically first generate a GEP instruction to find the correct
+  // memory location in InStruct and then generate Store instruction to store
+  // the argument in that location.
+  Function::arg_iterator ai = KernelF->arg_begin();
+  Function::arg_iterator ae = KernelF->arg_end();
+
+  for(unsigned i = 0; i < numArgs && ai != ae; i++, ai++) {
+    Value* arg = CI->getArgOperand(i+argOffset);
+    DEBUG(errs() << "Argument: " << ai->getName() << "\n");
+    DEBUG(errs() << "Passing: " << *arg << "\n");
+    // Create constant int (i)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, InStruct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             InStruct->getName()+"."+ai->getName(),
+                             CI);
+    // Store instruction
+    if(GEP->getType()->getPointerElementType() != arg->getType()) {
+      // Arguments type might not match with the kernel function definition
+      // One reason might be because of default argument promotions, where all
+      // arguments of type float are always promoted to double and types char,
+      // short int are promoted to int.
+      // LLVM 4.0 also promotes pointers to i8*. In case both are pointer types,
+      // we just issue a warning and cast it to appropriate type
+      if(arg->getType() == Type::getDoubleTy(Ctx)) {
+        DEBUG(errs() << "Cast from " << *arg->getType() << " To " <<
+            *GEP->getType()->getPointerElementType() << "\n");
+        CastInst* CastI = BitCastInst::CreateFPCast(arg,
+            GEP->getType()->getPointerElementType(), GEP->getName()+".cast",
+            CI);
+        new StoreInst(CastI, GEP, CI);
+      } else if (arg->getType() == Type::getInt32Ty(Ctx)) {
+        CastInst* CastI = BitCastInst::CreateIntegerCast(arg,
+            GEP->getType()->getPointerElementType(), false,
+            GEP->getName()+".cast", CI);
+        new StoreInst(CastI, GEP, CI);
+      } else if (arg->getType()->isPointerTy() && GEP->getType()->getPointerElementType()->isPointerTy()) {
+        errs() << "WARNING: Argument type mismatch between kernel and __visc__node call. Forcing cast\n";
+        CastInst* CastI = CastInst::CreatePointerCast(arg,
+            GEP->getType()->getPointerElementType(), GEP->getName()+".cast",
+            CI);
+        new StoreInst(CastI, GEP, CI);
+      } else {
+        errs() << "Error: Mismatch in argument types\n";
+        errs() << "__visc__node call: " << *CI << "\n";
+        errs() << "Argument: " << *arg << "\n";
+        errs() << "Expected: " << *ai << "\n";
+        llvm_unreachable("Mismatch in argument types of kernel function and __visc__node call");
+      }
+    } else {
+      new StoreInst(arg, GEP, CI);
+    }
+  }
+
+  // Based on the hierarchy of the DFG we want, we need to pass the dimension
+  // for each level. The number of dimensions we need to pass to the launch
+  // intrinsic is the product of the number of levels and dimesions at each
+  // level.
+  // Marshall dim arguments
+  DEBUG(errs() << *CI << "\n");
+  std::string names[] = {"dimX", "dimY", "dimZ"};
+  for(unsigned i=0; i< numDims*levels; i++) {
+    Value* arg = CI->getArgOperand(i+dimOffset);
+    DEBUG(errs() << "Passing: " << *arg << "\n");
+    // Create constant int (i)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numArgs);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, InStruct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             InStruct->getName()+"."+names[i%numDims]+Twine(i/levels),
+                             CI);
+    // Store instruction
+    DEBUG(errs() << *arg << " " << *GEP << "\n");
+    StoreInst* SI = new StoreInst(arg, GEP, CI);
+    DEBUG(errs() << *SI << "\n");
+
+  }
+}
+
+// Returns vector of all wait instructions, waiting on the passed graphID value
+static std::vector<CallInst*>* getWaitList(Value* GraphID) {
+  DEBUG(errs() << "Getting Uses of: " << *GraphID << "\n");
+  std::vector<CallInst*>* WaitList = new std::vector<CallInst*>();
+  // It must have been loaded from memory somewhere
+  for(Value::user_iterator ui = GraphID->user_begin(),
+      ue = GraphID->user_end(); ui!=ue; ++ui) {
+    if(CallInst* waitI = dyn_cast<CallInst>(*ui)) {
+      DEBUG(errs() << "Use: " << *waitI << "\n");
+      assert(isVISCCall_wait(waitI)
+             && "GraphID can only be used by __visc__wait call");
+      WaitList->push_back(waitI);
+    }
+    //else if (PHINode* PN = dyn_cast<PHINode>(*ui)){
+      //errs() << "Found PhiNode use of graphID\n";
+      //std::vector<CallInst*>* phiWaitList  = getWaitList(PN);
+      //WaitList->insert(WaitList->end(), phiWaitList->begin(), phiWaitList->end());
+      //free(phiWaitList);
+    //}
+    else {
+      DEBUG(errs() << *(*ui) << "\n");
+      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
+    }
+  }
+  return WaitList;
+}
+
+// Analyse the attribute call for this function. Add the in and out
+// attributes to pointer parameters.
+static void handleVISCAttributes(Function* F, CallInst* CI) {
+  DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n");
+  // Parse the dummy function call here
+  unsigned offset = 0;
+  // Find number of In pointers
+  assert(CI->getNumArgOperands() > offset
+         && "Too few arguments for __visc__attributes call!");
+  unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset));
+  DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n");
+
+  for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) {
+    Value* V = CI->getArgOperand(i);
+    if(Argument* arg = dyn_cast<Argument>(V)) {
+      F->addAttribute(1+arg->getArgNo(), Attribute::In);
+    }
+    else {
+      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
+      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
+    }
+  }
+  // Find number of Out Pointers
+  offset += 1 + numInPtrs;
+  assert(CI->getNumArgOperands() > offset
+         && "Too few arguments for __visc__attributes call!");
+  unsigned numOutPtrs = getNumericValue(CI->getOperand(offset));
+  DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n");
+  for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) {
+    Value* V = CI->getArgOperand(i);
+    if(Argument* arg = dyn_cast<Argument>(V)) {
+      F->addAttribute(1+arg->getArgNo(), Attribute::Out);
+    }
+    else {
+      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
+      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
+    }
+  }
+  DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n");
+}
+
// Recursively generate internal nodes for all the levels. Node at each level
// will create the appropriate instances of the child node at that level using
// the visc createNode intrinsic, and pass on the remaining dimensions to the
// child node.
//
// KernelF  - the leaf (kernel) function the hierarchy is built around.
// level    - levels of internal nodes still to generate; recursion bottoms
//            out at level == 1, whose child node is KernelF itself.
// numDims  - dimensions (0..3) used to instantiate the child node per level.
// numArgs, dimOffset, CI - forwarded unchanged through the recursion.
// Returns the newly created internal-node function for this level.
static Function* genInternalNode(Function* KernelF, unsigned level,
                                 unsigned numArgs, unsigned numDims, unsigned dimOffset, CallInst* CI) {
  // Create new function with the same type
  Module* module = KernelF->getParent();
  Function* ChildNodeF;

  // Recursively generate node for lower level
  if(level > 1) {
    ChildNodeF = genInternalNode(KernelF, level-1, numArgs, numDims, dimOffset, CI);
    // The generated child inherits the kernel's preferred target hint.
    addHint(ChildNodeF, getPreferredTarget(KernelF));
//    Internal nodes always get a CPU hint. If code geneation for them is not
//     needed and can be skipped, this is handled by the accelerator backends
//    addHint(ChildNodeF, visc::CPU_TARGET);
  } else {
    ChildNodeF = KernelF;
  }

  // Generate Internal node for current level
  Function* InternalF = Function::Create(ChildNodeF->getFunctionType(),
                                         ChildNodeF->getLinkage(),
                                         KernelF->getName()+"Internal_level"+Twine(level),
                                         module);
  // Create a basic block in this function; its terminator returns an undef
  // value of the (struct) return type and also serves as the insertion point
  // for all intrinsics generated below.
  BasicBlock *BB = BasicBlock::Create(InternalF->getContext(), "entry", InternalF);
  ReturnInst* RI = ReturnInst::Create(InternalF->getContext(),
                                      UndefValue::get(InternalF->getReturnType()), BB);
  // Copy correct attributes
  InternalF->setAttributes(ChildNodeF->getAttributes());
  // Loop over the arguments, copying the names of arguments over.
  Function::arg_iterator dest_iterator = InternalF->arg_begin();
  for (Function::const_arg_iterator i = ChildNodeF->arg_begin(), e = ChildNodeF->arg_end();
       i != e; ++i, ++dest_iterator) {
    DEBUG(errs() << "Copying argument: " << i->getName() << "\n");
    dest_iterator->setName(i->getName()); // Copy the name over...
    DEBUG(errs() << "New Argument: " << *dest_iterator << "\n");
  }

  // Add extra dimesnion arguments
  std::string dimNames[] = {"dimX", "dimY", "dimZ"};
  DEBUG(errs() << "Adding extra args to function Function:\n" << *InternalF << "\n");
  // addArgs re-creates the function with the appended parameters, so
  // InternalF is replaced and the cached terminator must be re-fetched.
  InternalF = addArgs(InternalF, numDims, dimNames);
  // update RI
  RI = cast<ReturnInst>(InternalF->getEntryBlock().getTerminator());
  DEBUG(errs() << "After Adding extra args to function Function:\n" << *InternalF << "\n");

  // Insert createNode intrinsic
  // First generate constant expression to bitcast the function pointer to
  // internal node to i8*
  Value* NodeF = ConstantExpr::getPointerCast(ChildNodeF, Type::getInt8PtrTy(module->getContext()));

  // Use args vectors to get the arguments for visc createNode
  // intrinsic
  std::vector<Value*> args;

  // Push the i8* pointer to internal node into the args vector
  args.push_back(NodeF);

  // Traverse the argument list of internal node function in reverse to get the
  // dimesnions to be used to create instances of child node at this level
  // (the numDims dims appended by addArgs are the last numDims parameters).
  Function::arg_iterator ai = InternalF->arg_end();
  for(unsigned i=0; i<numDims; i++, ai--);
  DEBUG(errs() << "Iterator at: " << *ai << "\n");

  // ai now points to the first dimension argument to be passed to the
  // createNode intrinsic. Follow it to push the dim argument into
  // the args vector
  for(unsigned i=0; i < numDims; i++, ai++) {
    args.push_back(&*ai);
  }

  // Based on the number of dimensions choose the appropriate visc createNode
  // intrinsic
  DEBUG(errs() << "Number of dims = " << numDims << "\n");
  Intrinsic::ID createNodeXD;
  switch(numDims) {
  case 0:
    createNodeXD = Intrinsic::visc_createNode;
    break;
  case 1:
    createNodeXD = Intrinsic::visc_createNode1D;
    break;
  case 2:
    createNodeXD = Intrinsic::visc_createNode2D;
    break;
  case 3:
    createNodeXD = Intrinsic::visc_createNode3D;
    break;
  default:
    llvm_unreachable("Invalid number of dimensions!");
    break;
  };

  // Generate the visc createNode intrinsic, using the args vector as parameter
  Function* CreateNodeF = Intrinsic::getDeclaration(module, createNodeXD);
  DEBUG(errs() << "Function chosen:\n" << *CreateNodeF << "\n");
  CallInst *CreateNodeCall = CallInst::Create(CreateNodeF, args, ChildNodeF->getName()+".node", RI);
  DEBUG(errs() << "Generate call: " << *CreateNodeCall << "\n");

  // Generate Bind intrinsics: bind input i of this internal node to input i
  // of the child node (last flag = isStreaming = false).
  // NOTE(review): only visc_bind_input is emitted here; no bind_output is
  // generated for the internal node -- confirm outputs are handled elsewhere.
  Function* bindInputF = Intrinsic::getDeclaration(module, Intrinsic::visc_bind_input);
  DEBUG(errs() << "Generating input binding:\n" << *bindInputF << "\n");
  for(unsigned i=0; i < ChildNodeF->getArgumentList().size(); i++) {
    std::vector<Value*> bindArgs;
    bindArgs.push_back(CreateNodeCall);
    bindArgs.push_back(ConstantInt::get(Type::getInt32Ty(module->getContext()), i));
    bindArgs.push_back(ConstantInt::get(Type::getInt32Ty(module->getContext()), i));
    bindArgs.push_back(ConstantInt::getFalse(module->getContext()));
    CallInst* bindInputCall = CallInst::Create(bindInputF, bindArgs, "", RI);
    DEBUG(errs() << *bindInputCall << "\n");
  }

  // Print the generated internal node for debugging
  DEBUG(errs() << "Generated Function:\n" << *InternalF << "\n");

  return InternalF;
}
+
+// Change the OpenCL query function calls with visc intrinsics in function F.
+static void replaceOpenCLCallsWithVISCIntrinsics(Function *F) {
+  Module* module = F->getParent();
+  std::vector<CallInst *> IItoRemove;
+
+  // Get first instruction
+  inst_iterator i = inst_begin(F);
+  Instruction *FI = &(*i);
+
+  // Insert getNode intrinsic
+  Intrinsic::ID getNodeID = Intrinsic::visc_getNode;
+  Function* GetNodeF = Intrinsic::getDeclaration(module, getNodeID);
+  std::vector<Value*> args;
+  CallInst *GetNodeCall = CallInst::Create(GetNodeF, args, F->getName()+".node", FI);
+  DEBUG(errs() << "Generate getNode intrinsic: " << *GetNodeCall << "\n");
+
+  // Insert getParentNode intrinsic
+  Intrinsic::ID getParentNodeID = Intrinsic::visc_getParentNode;
+  Function* GetParentNodeF = Intrinsic::getDeclaration(module, getParentNodeID);
+  args.push_back(GetNodeCall);
+  CallInst *GetParentNodeCall = CallInst::Create(GetParentNodeF, args, F->getName()+".parentNode", FI);
+  DEBUG(errs() << "Generate getParentNode intrinsic: " << *GetParentNodeCall << "\n");
+
+  // Iterate through all instructions
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    CallInst *CI;
+
+    // Find OpenCL function calls
+    if ((CI = dyn_cast<CallInst>(I))) {
+      if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_global_id")) {
+        DEBUG(errs() << "Found get_global_id call: " << *CI << "\n");
+        CallSite OpenCLCallSite(CI);
+        Value *arg0 = OpenCLCallSite.getArgument(0);
+        // Find the intrinsic function to be called
+        unsigned dim = getNumericValue(arg0);
+        Intrinsic::ID getNodeInstanceID;
+        Intrinsic::ID getNumNodeInstancesID;
+        switch (dim) {
+        case 0:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_x;
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_x;
+          break;
+        case 1:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_y;
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_y;
+          break;
+        case 2:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_z;
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_z;
+          break;
+        default:
+          assert(false && "Invalid dimension from valid OpenCL source!");
+          break;
+        }
+
+
+        // Creating getNodeInstanceID intrinsic for parent node
+        ArrayRef<Value *> Args0(GetParentNodeCall);
+        Function* GetNodeInstanceIDF = Intrinsic::getDeclaration(module, getNodeInstanceID);
+        CallInst* ParentIDIntrinsic = CallInst::Create(GetNodeInstanceIDF, Args0, "", CI);
+
+        // Creating getNumNodeInstances intrinsic for this node
+        ArrayRef<Value *> Args1(GetNodeCall);
+        Function* GetNumNodeInstancesF = Intrinsic::getDeclaration(module, getNumNodeInstancesID);
+        CallInst* InstancesIntrinsic = CallInst::Create(GetNumNodeInstancesF, Args1, "", CI);
+        // Creating mul instruction
+        BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul,
+                                  ParentIDIntrinsic,
+                                  InstancesIntrinsic,
+                                  "", CI);
+        // Creating getNodeInstanceID intrinsic for this node
+        CallInst* LocalIDIntrinsic = CallInst::Create(GetNodeInstanceIDF, Args1, "", CI);
+        // Creating add instruction
+        BinaryOperator* AddInst = BinaryOperator::Create(Instruction::Add,
+                                  MulInst,
+                                  LocalIDIntrinsic,
+                                  "", CI);
+        CI->replaceAllUsesWith(AddInst);
+        IItoRemove.push_back(CI);
+      }
+      if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_local_id")) {
+        DEBUG(errs() << "Found get_local_id call: " << *CI << "\n");
+        // Value *arg0 = CI->getOperand(0);
+        CallSite OpenCLCallSite(CI);
+        Value *arg0 = OpenCLCallSite.getArgument(0);
+
+        // Argument of the function to be called
+        ArrayRef<Value *> Args(GetNodeCall);
+
+        // Find the intrinsic function to be called
+        unsigned dim = getNumericValue(arg0);
+        Intrinsic::ID getNodeInstanceID;
+        switch (dim) {
+        case 0:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_x;
+          break;
+        case 1:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_y;
+          break;
+        case 2:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_z;
+          break;
+        default:
+          assert(false && "Invalid dimension from valid OpenCL source!");
+          break;
+        }
+        Function* GetNodeInstanceIDF = Intrinsic::getDeclaration(module, getNodeInstanceID);
+        CallInst* VI = CallInst::Create(GetNodeInstanceIDF, Args, "", CI);
+        CI->replaceAllUsesWith(VI);
+        IItoRemove.push_back(CI);
+      }
+      if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_group_id")) {
+        DEBUG(errs() << "Found get_group_id call: " << *CI << "\n");
+        // Value *arg0 = CI->getOperand(0);
+        CallSite OpenCLCallSite(CI);
+        Value *arg0 = OpenCLCallSite.getArgument(0);
+
+        // Argument of the function to be called
+        ArrayRef<Value *> Args(GetParentNodeCall);
+
+        // Find the intrinsic function to be called
+        unsigned dim = getNumericValue(arg0);
+        Intrinsic::ID getNodeInstanceID;
+        switch (dim) {
+        case 0:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_x;
+          break;
+        case 1:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_y;
+          break;
+        case 2:
+          getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_z;
+          break;
+        default:
+          assert(false && "Invalid dimension from valid OpenCL source!");
+          break;
+        }
+        Function* GetNodeInstanceIDF = Intrinsic::getDeclaration(module, getNodeInstanceID);
+        CallInst* VI = CallInst::Create(GetNodeInstanceIDF, Args, "", CI);
+        CI->replaceAllUsesWith(VI);
+        IItoRemove.push_back(CI);
+      }
+      if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_global_size")) {
+        DEBUG(errs() << "Found get_global_size call: " << *CI << "\n");
+        CallSite OpenCLCallSite(CI);
+        Value *arg0 = OpenCLCallSite.getArgument(0);
+        // Find the intrinsic function to be called
+        unsigned dim = getNumericValue(arg0);
+        Intrinsic::ID getNumNodeInstancesID;
+        switch (dim) {
+        case 0:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_x;
+          break;
+        case 1:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_y;
+          break;
+        case 2:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_z;
+          break;
+        default:
+          assert(false && "Invalid dimension from valid OpenCL source!");
+          break;
+        }
+
+
+        // Creating getNumNodeInstances intrinsic for parent node
+        ArrayRef<Value *> Args0(GetParentNodeCall);
+        Function* GetNumNodeInstancesF = Intrinsic::getDeclaration(module, getNumNodeInstancesID);
+        CallInst* ParentInstancesIntrinsic = CallInst::Create(GetNumNodeInstancesF, Args0, "", CI);
+        // Creating getNumNodeInstances intrinsic for this node
+        ArrayRef<Value *> Args1(GetNodeCall);
+        CallInst* InstancesIntrinsic = CallInst::Create(GetNumNodeInstancesF, Args1, "", CI);
+        // Creating mul instruction
+        BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul,
+                                  ParentInstancesIntrinsic,
+                                  InstancesIntrinsic,
+                                  "", CI);
+        CI->replaceAllUsesWith(MulInst);
+        IItoRemove.push_back(CI);
+
+      }
+      if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_local_size")) {
+        DEBUG(errs() << "Found get_local_size call: " << *CI << "\n");
+        CallSite OpenCLCallSite(CI);
+        Value *arg0 = OpenCLCallSite.getArgument(0);
+
+        // Argument of the function to be called
+        ArrayRef<Value *> Args(GetNodeCall);
+
+        // Find the intrinsic function to be called
+        unsigned dim = getNumericValue(arg0);
+        Intrinsic::ID getNumNodeInstancesID;
+        switch (dim) {
+        case 0:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_x;
+          break;
+        case 1:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_y;
+          break;
+        case 2:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_z;
+          break;
+        default:
+          assert(false && "Invalid dimension from valid OpenCL source!");
+          break;
+        }
+        Function* GetNumNodeInstancesF = Intrinsic::getDeclaration(module, getNumNodeInstancesID);
+        CallInst* VI = CallInst::Create(GetNumNodeInstancesF, Args, "", CI);
+        CI->replaceAllUsesWith(VI);
+        IItoRemove.push_back(CI);
+      }
+      if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_num_groups")) {
+        DEBUG(errs() << "Found get_num_groups call: " << *CI << "\n");
+        CallSite OpenCLCallSite(CI);
+        Value *arg0 = OpenCLCallSite.getArgument(0);
+
+        // Argument of the function to be called
+        ArrayRef<Value *> Args(GetParentNodeCall);
+
+        // Find the intrinsic function to be called
+        unsigned dim = getNumericValue(arg0);
+        Intrinsic::ID getNumNodeInstancesID;
+        switch (dim) {
+        case 0:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_x;
+          break;
+        case 1:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_y;
+          break;
+        case 2:
+          getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_z;
+          break;
+        default:
+          assert(false && "Invalid dimension from valid OpenCL source!");
+          break;
+        }
+        Function* GetNumNodeInstancesF = Intrinsic::getDeclaration(module, getNumNodeInstancesID);
+        CallInst* VI = CallInst::Create(GetNumNodeInstancesF, Args, "", CI);
+        CI->replaceAllUsesWith(VI);
+        IItoRemove.push_back(CI);
+      }
+    }
+  }
+
+  for (std::vector<CallInst *>::reverse_iterator ri = IItoRemove.rbegin(),
+       re = IItoRemove.rend(); ri != re; ++ri)
+    (*ri)->eraseFromParent();
+
+}
+
+
+// Public Functions of GenVISC pass
+bool GenVISC::runOnModule(Module &M) {
+  errs() << "\nGENVISC PASS\n";
+  this->M = &M;
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  // Insert init context in main
+  DEBUG(errs() << "Locate __visc__init()\n");
+  Function* VI = M.getFunction("__visc__init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  Instruction* I = cast<Instruction>(*VI->user_begin());
+
+  // Insert print instruction at visc exit
+  DEBUG(errs() << "Locate __visc__cleanup()\n");
+  Function* VC = M.getFunction("__visc__cleanup");
+  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
+  I = cast<Instruction>(*VC->user_begin());
+
+  DEBUG(errs() << "-------- Searching for launch sites ----------\n");
+
+  std::vector<Instruction*> toBeErased;
+  std::vector<Function*> functions;
+
+  for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) {
+    Function* f = &*mi;
+    functions.push_back(f);
+  }
+
+  // Iterate over all functions in the module
+  for (unsigned i = 0; i < functions.size(); i++) {
+    Function* f = functions[i];
+    DEBUG(errs() << "Function: " << f->getName() << "\n");
+
+    // List with the required additions in the function's return type
+    std::vector<Type*> FRetTypes;
+
+    enum mutateTypeCause {
+      mtc_None,
+      mtc_BIND,
+      mtc_RETURN,
+      mtc_NUM_CAUSES
+    } bind;
+    bind = mutateTypeCause::mtc_None;
+
+    // Iterate over all the instructions in this function
+    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) {
+      Instruction* I = &*i; // Grab pointer to Instruction
+      // If not a call instruction, move to next instruction
+      if(!isa<CallInst>(I))
+        continue;
+
+      CallInst* CI = cast<CallInst>(I);
+      LLVMContext& Ctx = CI->getContext();
+      // If __visc__node call found, generate the test case
+
+      if(isVISCCall_node(I)) {
+        errs() << "Found visc node call in Function: " << f->getName() << "\n";
+        assert(CI->getNumArgOperands() >= 5
+               && "__visc__node call should have atleast 5 arguments!");
+        generateTest(CI);
+        // Place this call in the list of instructions to be erased.
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_init(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased);
+      }
+      if(isVISCCall_cleanup(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased);
+      }
+      if(isVISCCall_wait(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased);
+      }
+      if(isVISCCall_trackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased);
+      }
+      if(isVISCCall_untrackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased);
+      }
+      if(isVISCCall_requestMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased);
+      }
+      if(isVISCCall_hint(I)) {
+        assert(isa<ConstantInt>(CI->getArgOperand(0))
+               && "Argument to hint must be constant integer!");
+        ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0));
+
+        visc::Target t = (visc::Target) hint->getZExtValue();
+        addHint(CI->getParent()->getParent(), t);
+        DEBUG(errs() << "Found visc hint call: " << *CI << "\n");
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_launch(I)) {
+        Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
+        DEBUG(errs() << *LaunchF << "\n");
+        // Get i8* cast to function pointer
+        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
+        graphFunc = transformReturnTypeToStruct(graphFunc);
+        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0));
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
+
+        Value* LaunchArgs[] = {F, CI->getArgOperand(2), isStreaming};
+        CallInst* LaunchInst = CallInst::Create(LaunchF,
+                                                ArrayRef<Value*>(LaunchArgs, 3),
+                                                "graphID", CI);
+        DEBUG(errs() << "Found visc launch call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
+        CI->replaceAllUsesWith(LaunchInst);
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_push(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased);
+      }
+      if(isVISCCall_pop(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased);
+      }
+      if(isVISCCall_createNodeND(I)) {
+        assert(CI->getNumArgOperands() > 0 &&
+               "Too few arguments for __visc__createNodeND call");
+        unsigned numDims = getNumericValue(CI->getArgOperand(0));
+        // We need as meny dimension argments are there are dimensions
+        assert(CI->getNumArgOperands()-2 == numDims &&
+              "Too few arguments for __visc_createNodeND call!\n");
+
+        Function* CreateNodeF;
+        switch (numDims) {
+        case 0:
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
+          break;
+        case 1:
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
+          break;
+        case 2:
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
+          break;
+        case 3:
+          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
+          break;
+        default:
+          llvm_unreachable("Unsupported number of dimensions\n");
+          break;
+        }
+        DEBUG(errs() << *CreateNodeF << "\n");
+        DEBUG(errs() << *I << "\n");
+        DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n");
+
+        // Get i8* cast to function pointer
+        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
+        graphFunc = transformReturnTypeToStruct(graphFunc);
+        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+
+        CallInst* CreateNodeInst;
+        switch (numDims) {
+        case 0:
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(F),
+                                            graphFunc->getName()+".node", CI);
+          break;
+        case 1:
+          {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)};
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(CreateNodeArgs, 2),
+                                            graphFunc->getName()+".node", CI);
+          }
+          break;
+        case 2:
+          {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 3, expected to be i64\n");
+          Value* CreateNodeArgs[] = {F,
+                                     CI->getArgOperand(2),
+                                     CI->getArgOperand(3)};
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(CreateNodeArgs, 3),
+                                            graphFunc->getName()+".node", CI);
+          }
+          break;
+        case 3:
+          {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 3, expected to be i64\n");
+          assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 4, expected to be i64\n");
+          Value* CreateNodeArgs[] = {F,
+                                     CI->getArgOperand(2),
+                                     CI->getArgOperand(3),
+                                     CI->getArgOperand(4)};
+          CreateNodeInst = CallInst::Create(CreateNodeF,
+                                            ArrayRef<Value*>(CreateNodeArgs, 4),
+                                            graphFunc->getName()+".node", CI);
+          }
+          break;
+        default:
+          llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n");
+          break;
+        }
+
+        DEBUG(errs() << "Found visc createNode call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n");
+        CI->replaceAllUsesWith(CreateNodeInst);
+        toBeErased.push_back(CI);
+      }
+
+      if(isVISCCall_edge(I)) {
+        Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
+        DEBUG(errs() << *EdgeF << "\n");
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5));
+        ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2));
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
+        Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx)
+                                                : ConstantInt::getTrue(Ctx);
+        Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                             isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4),
+                             isStreaming
+                            };
+        CallInst* EdgeInst = CallInst::Create(EdgeF,
+                                              ArrayRef<Value*>(EdgeArgs, 6),
+                                              "output", CI);
+        DEBUG(errs() << "Found visc edge call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n");
+        CI->replaceAllUsesWith(EdgeInst);
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_bindIn(I)) {
+        Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
+        DEBUG(errs() << *BindInF << "\n");
+        // Check if this is a streaming bind or not
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
+        Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), isStreaming
+                              };
+        CallInst* BindInInst = CallInst::Create(BindInF,
+                                                ArrayRef<Value*>(BindInArgs, 4),
+                                                "", CI);
+        DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n");
+        CI->replaceAllUsesWith(BindInInst);
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_bindOut(I)) {
+        Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
+        DEBUG(errs() << *BindOutF << "\n");
+        // Check if this is a streaming bind or not
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
+        Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                                CI->getArgOperand(2), isStreaming
+                               };
+        CallInst* BindOutInst = CallInst::Create(BindOutF,
+                                ArrayRef<Value*>(BindOutArgs, 4),
+                                "", CI);
+        DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n");
+
+        DEBUG(errs() << "Fixing the return type of the function\n");
+        // FIXME: What if the child node function has not been visited already.
+        // i.e., it's return type has not been fixed.
+        Function* F = I->getParent()->getParent();
+        DEBUG(errs() << F->getName() << "\n";);
+        IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0));
+        DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
+        Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
+        DEBUG(errs() << ChildF->getName() << "\n";);
+        int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
+        int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
+        StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType());
+
+        Type* ReturnType = F->getReturnType();
+        DEBUG(errs() << *ReturnType << "\n";);
+        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType))
+            && "Return type should either be a struct or void type!");
+
+        FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos));
+        assert(((bind == mutateTypeCause::mtc_BIND) ||
+                (bind == mutateTypeCause::mtc_None)) &&
+                "Both bind_out and visc_return detected");
+        bind = mutateTypeCause::mtc_BIND;
+
+        CI->replaceAllUsesWith(BindOutInst);
+        toBeErased.push_back(CI);
+      }
+      if(isVISCCall_attributes(I)) {
+        Function* F = CI->getParent()->getParent();
+        handleVISCAttributes(F, CI);
+        toBeErased.push_back(CI);
+      }
+      if (isVISCCall_getNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNode, &toBeErased);
+      }
+      if (isVISCCall_getParentNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getParentNode, &toBeErased);
+      }
+      if (isVISCCall_barrier(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_barrier, &toBeErased);
+      }
+      if (isVISCCall_malloc(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased);
+      }
+      if (isVISCCall_return(I)) {
+        DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n");
+        // The operands to this call are the values to be returned by the node
+        Value* ReturnVal = genCodeForReturn(CI);
+        DEBUG(errs() << *ReturnVal << "\n");
+        Type* ReturnType = ReturnVal->getType();
+        assert(isa<StructType>(ReturnType)
+               && "Return type should be a struct type!");
+
+        assert(((bind == mutateTypeCause::mtc_RETURN) ||
+                (bind == mutateTypeCause::mtc_None)) &&
+                "Both bind_out and visc_return detected");
+
+        if (bind == mutateTypeCause::mtc_None) {
+          // If this is None, this is the first __visc__return
+          // instruction we have come upon. Place the return type of the
+          // function in the return type vector
+          bind = mutateTypeCause::mtc_RETURN;
+          StructType* ReturnStructTy = cast<StructType>(ReturnType);
+          for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++)
+            FRetTypes.push_back(ReturnStructTy->getElementType(i));
+        } else { // bind == mutateTypeCause::mtc_RETURN
+          // This is not the first __visc__return
+          // instruction we have come upon. 
+          // Check that the return types are the same
+          assert((ReturnType == FRetTypes[0])
+                 && "Multiple returns with mismatching types");
+        }
+
+        ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal);
+        DEBUG(errs() << "Found visc return call: " << *CI << "\n");
+        Instruction* oldReturn = CI->getParent()->getTerminator();
+        assert(isa<ReturnInst>(oldReturn)
+                && "Expecting a return to be the terminator of this BB!");
+        DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
+        DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
+        //CI->replaceAllUsesWith(RetInst);
+        toBeErased.push_back(CI);
+        ReplaceInstWithInst(oldReturn, RetInst);
+        DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n");
+
+      }
+
+      if (isVISCCall_getNodeInstanceID_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased);
+      }
+      if (isVISCCall_getNodeInstanceID_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased);
+      }
+      if (isVISCCall_getNodeInstanceID_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased);
+      }
+      if (isVISCCall_getNumNodeInstances_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased);
+      }
+      if (isVISCCall_getNumNodeInstances_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased);
+      }
+      if (isVISCCall_getNumNodeInstances_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased);
+      }
+      if (isVISCCall_atomic_cmpxchg(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg, &toBeErased);
+      }
+      if (isVISCCall_atomic_add(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased);
+      }
+      if (isVISCCall_atomic_sub(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_sub, &toBeErased);
+      }
+      if (isVISCCall_atomic_xchg(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xchg, &toBeErased);
+      }
+      if (isVISCCall_atomic_inc(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_inc, &toBeErased);
+      }
+      if (isVISCCall_atomic_dec(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_dec, &toBeErased);
+      }
+      if (isVISCCall_atomic_min(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_min, &toBeErased);
+      }
+      if (isVISCCall_atomic_umin(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_umin, &toBeErased);
+      }
+      if (isVISCCall_atomic_max(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_max, &toBeErased);
+      }
+      if (isVISCCall_atomic_umax(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_umax, &toBeErased);
+      }
+      if (isVISCCall_atomic_and(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_and, &toBeErased);
+      }
+      if (isVISCCall_atomic_or(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_or, &toBeErased);
+      }
+      if (isVISCCall_atomic_xor(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased);
+      }
+      if (isVISCCall_floor(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::floor, &toBeErased);
+      }
+      if (isVISCCall_rsqrt(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::nvvm_rsqrt_approx_f, &toBeErased);
+      }
+      if (isVISCCall_sqrt(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::sqrt, &toBeErased);
+      }
+      if (isVISCCall_sin(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased);
+      }
+      if (isVISCCall_cos(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased);
+      }
+      if (isVISCCall_tensor_convolution(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_convolution, &toBeErased);
+      }
+      if (isVISCCall_tensor_group_convolution(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_group_convolution, &toBeErased);
+      }
+      if (isVISCCall_tensor_add(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_add, &toBeErased);
+      }
+      if (isVISCCall_tensor_batchnorm(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_batchnorm, &toBeErased);
+      }
+      if (isVISCCall_tensor_mul(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_mul, &toBeErased);
+      }
+      if (isVISCCall_tensor_pool_max(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_pool_max, &toBeErased);
+      }
+      if (isVISCCall_tensor_pool_min(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_pool_min, &toBeErased);
+      }
+      if (isVISCCall_tensor_pool_mean(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_pool_mean, &toBeErased);
+      }
+      if (isVISCCall_tensor_relu(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_relu, &toBeErased);
+      }
+      if (isVISCCall_tensor_tanh(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_tanh, &toBeErased);
+      }
+      if (isVISCCall_tensor_clipped_relu(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_clipped_relu, &toBeErased);
+      }
+      if (isVISCCall_tensor_softmax(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_softmax, &toBeErased);
+      }
+
+      // New Intrinsic to set Node ID
+      if (isVISCCall_node_id(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_node_id, &toBeErased);
+      }
+      
+    }
+
+    // Erase the __visc__node calls
+    DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
+    for(auto I: toBeErased) {
+      DEBUG(errs() << *I << "\n");
+    }
+    while(!toBeErased.empty()) {
+      Instruction* I = toBeErased.back(); 
+      DEBUG(errs() << "\tErasing " << *I << "\n");
+      I->eraseFromParent();
+      toBeErased.pop_back(); 
+    }
+
+    if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) {
+        DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
+        // Argument type list.
+        std::vector<Type*> FArgTypes;
+        for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
+            ai != ae; ++ai) {
+          FArgTypes.push_back(ai->getType());
+        }
+
+        // Find new return type of function
+        Type* NewReturnTy;
+        if(bind == mutateTypeCause::mtc_BIND) {
+
+          std::vector<Type*> TyList;
+          for (unsigned i = 0; i < FRetTypes.size(); i++)
+            TyList.push_back(FRetTypes[i]);
+
+          NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true);
+        }
+        else {
+          NewReturnTy = getReturnTypeFromReturnInst(f);
+          assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
+        }
+
+        FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
+
+        // Change the function type
+        Function* newF = cloneFunction(f, FTy, false);
+        DEBUG(errs() << *newF << "\n");
+
+        if (bind == mutateTypeCause::mtc_BIND) {
+          // This is certainly an internal node, and hence just one BB with one
+          // return terminator instruction. Change return statement
+          ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator());
+          ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy));
+          ReplaceInstWithInst(RI, newRI);        
+        }
+        if (bind == mutateTypeCause::mtc_RETURN) {
+          // Nothing
+        }
+        replaceNodeFunctionInIR(*f->getParent(), f, newF);
+        DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
+    }
+
+
+  }
+  return false; //TODO: What does returning "false" mean?
+}
+
+// Generate Code for declaring a constant string [L x i8] and return a pointer
+// to the start of it.
+// - S:    string contents; stored null-terminated in a new private global.
+// - IB:   instruction before which the address-computing GEP is inserted.
+// - Name: base name used for both the global and the returned pointer value.
+// Returns a pointer to element 0 of the constant array (an i8*).
+Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
+  // Constant [L x i8] array holding S plus a trailing '\0' (AddNull = true).
+  Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true);
+  // isConstant = true, internal linkage: a module-private read-only string.
+  Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true,
+                                      GlobalValue::InternalLinkage, SConstant, Name);
+  Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
+  // GEP indices {0, 0}: through the global's pointer, then to element 0.
+  Value* GEPArgs[] = {Zero, Zero};
+  GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal,
+                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
+  return SPtr;
+}
+
+
+
+// Generate the test case using the dummy __visc__node call CI.
+// First parse the arguments to find the kernel function, num of levels,
+// dimensions, arguments, inputs and outputs. Pass this information to genKernel
+// and genInternalNode functions to generate the test case.
+// Expected operand layout of the dummy call:
+//   0: kernel function, 1: levels, 2: numDims,
+//   3 .. 3+numDims*levels-1: dimension values,
+//   next: numArgs, then the kernel arguments, then numOutputs.
+void GenVISC::generateTest(CallInst* CI) {
+  // Parse the dummy function call here
+  LLVMContext& Ctx = CI->getParent()->getContext();
+
+  unsigned offset = 1; // argument at offset 1 is the number of levels
+  // Find number of levels
+  assert(CI->getNumArgOperands() > offset
+         && "Too few arguments for __visc__node call!");
+  unsigned levels = getNumericValue(CI->getArgOperand(offset));
+  errs() << "\tNum of levels = " << levels << "\n";
+
+  // Find number of dimensions
+  offset += 1;
+  assert(CI->getNumArgOperands() > offset
+         && "Too few arguments for __visc__node call!");
+  // NOTE(review): getOperand is used here where the surrounding code uses
+  // getArgOperand; equivalent for argument indices, but inconsistent.
+  unsigned numDims = getNumericValue(CI->getOperand(offset));
+  errs() << "\tNum of dimensions = " << numDims << "\n";
+
+
+  // Find number of arguments
+  offset += numDims*levels + 1; // skip the dimensions
+  assert(CI->getNumArgOperands() > offset
+         && "Too few arguments for __visc__node call!");
+  unsigned numArgs = getNumericValue(CI->getArgOperand(offset));
+  errs() << "\tNum of kernel arguments = " << numArgs << "\n";
+
+  // Find number of outputs
+  offset += numArgs + 1; // skip the kernel arguments
+  assert(CI->getNumArgOperands() > offset
+         && "Too few arguments for __visc__node call!");
+  unsigned numOutputs = getNumericValue(CI->getArgOperand(offset));
+  errs() << "\tNum of kernel outputs = " << numOutputs << "\n";
+
+  // Find return struct type
+  assert(numOutputs == 0 && "Not handled case where number of outputs is non-zero!");
+  // This is always zero. One should look at the number of struct elements of
+  // kernel function
+  StructType* RetTy = StructType::create(Ctx, None, "rtype");
+
+  // Rewrite the kernel, then generate the host-side launch/wait code.
+  Function* KernelF = genKernel(cast<Function>(CI->getArgOperand(0)->stripPointerCasts()), CI, RetTy);
+  genHost(CI, KernelF, levels, numDims, numArgs, numOutputs, RetTy);
+}
+
+
+
+// Make all the required changes to the kernel function. This would include
+// changing the function signature by adding any extra arguments required.
+// Changing the return type. Changing all the OpenCL query intrinsics with the
+// visc intrinsics.
+// - KernelF: original kernel; a rewritten clone replaces its uses in the IR.
+// - CI:      the dummy __visc__node call site (not used in this function).
+// - RetTy:   struct type the cloned kernel must return.
+// Returns the cloned, rewritten kernel function.
+Function* GenVISC::genKernel(Function* KernelF, CallInst* CI, StructType* RetTy) {
+  // Make changes to kernel here
+  DEBUG(errs() << "Modifying Node Function: " << KernelF->getName() << "\n");
+
+  // Find dummy __visc__attribute call in this function and add visc attributes
+  // in/out to pointer arguments. Only the first such call is handled.
+  for (inst_iterator i = inst_begin(KernelF), e = inst_end(KernelF); i != e; ++i) {
+    Instruction *I = &(*i);
+    if(isVISCCall_attributes(I)) {
+      handleVISCAttributes(KernelF, cast<CallInst>(I));
+      //I->eraseFromParent();
+      break;
+    }
+  }
+
+  // Change arguments and types
+  // Create the argument type list with added argument types
+  //Function::ArgumentListType& argList = KernelF->getArgumentList();
+  std::vector<Type*> argTypes;
+  // Insert an i64 argument after every pointer argument. However adding an
+  // argument does not change the attribute list of function and so the
+  // arguments need to be shifted accordingly.
+  //bool shiftAttr = false;
+  for(Function::arg_iterator ai = KernelF->arg_begin(), ae = KernelF->arg_end();
+      ai != ae; ++ai) {
+
+    argTypes.push_back(ai->getType());
+    if(ai->getType()->isPointerTy()) {
+      // If it is a pointer argument, add an i64 type next
+      argTypes.push_back(Type::getInt64Ty(KernelF->getContext()));
+    }
+
+  }
+  // Adding new arguments to the function argument list, would not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments
+  FunctionType* newFT = FunctionType::get(RetTy, argTypes, KernelF->isVarArg());
+
+  // Change the function type
+  SmallVector<ReturnInst*, 8> Returns;
+  Function* newKernelF = cloneFunction(KernelF, newFT, true, &Returns);
+  DEBUG(errs() << *newKernelF << "\n");
+
+  // Replace ret void instruction with ret %RetTy undef
+  for(auto RI: Returns) {
+    DEBUG(errs() << "Found return inst: "<< *RI << "\n");
+    ReturnInst* newRI = ReturnInst::Create(KernelF->getContext(), UndefValue::get(RetTy));
+    ReplaceInstWithInst(RI, newRI);
+  }
+
+  // Redirect all IR uses of the old kernel to the clone.
+  replaceNodeFunctionInIR(*KernelF->getParent(), KernelF, newKernelF);
+  // Replace opencl query intrinsics with visc query intrinsics
+  replaceOpenCLCallsWithVISCIntrinsics(newKernelF);
+  return newKernelF;
+}
+
+// Generate the code replacing the dummy __visc__node call with visc launch
+// intrinsic and also generate the internal nodes required at each level
+// depending on the hierarchy of DFG needed. This would also involve marshalling
+// all the input arguments to the kernel function in memory. Replacing CI with
+// launch intrinsic, and all the dummy __visc__wait calls with the visc wait
+// intrinsic.
+// - CI:         the dummy __visc__node call site being lowered.
+// - KernelF:    the (already rewritten) kernel function.
+// - levels/numDims/numArgs/numOutputs: parsed from CI by generateTest.
+// - RetTy:      return struct type of the kernel.
+void GenVISC::genHost(CallInst* CI, Function* KernelF, unsigned levels, unsigned numDims, unsigned numArgs, unsigned numOutputs, StructType* RetTy) {
+  // Make host code changes here
+  DEBUG(errs() << "Modifying Host code for __visc__node call site: " << *CI << "\n");
+  DEBUG(errs() << "Kernel Function: " << KernelF->getName() << "\n");
+  LLVMContext& Ctx = CI->getParent()->getContext();
+
+  // Create a root function which has this as internal node
+  Function* Root = genInternalNode(KernelF, levels, numArgs, numDims, 3, CI);
+
+  // Add hint to compile root for CPU. This is always true.
+  addHint(Root, visc::CPU_TARGET);
+
+  // Generate argument struct type (All arguments followed by return struct type)
+  std::vector<Type*> ArgList;
+  // First CI operand carrying a kernel argument (past function, levels,
+  // numDims, the dimension values, and the numArgs count).
+  unsigned offset = numDims*levels + 2 + 1 + 1;
+  for(Function::arg_iterator ai=KernelF->arg_begin(), ae=KernelF->arg_end();
+      ai!=ae; ai++) {
+    Type* Ty = ai->getType();
+    ArgList.push_back(Ty);
+  }
+  // Add the dimensions arguments
+  for(unsigned i=0; i<numDims*levels; i++) {
+//    ArgList.push_back(Type::getInt32Ty(Ctx));
+    ArgList.push_back(Type::getInt64Ty(Ctx));
+  }
+  ArgList.push_back(RetTy);
+  StructType* ArgStructTy = StructType::create(ArgList, "struct.arg", true);
+  DEBUG(errs() << *ArgStructTy << "\n");
+
+  // Insert alloca inst for this argument struct type
+  AllocaInst* AI = new AllocaInst(ArgStructTy, "in.addr", CI);
+
+  // Marshall all input arguments and dimension arguments into argument struct
+  // type
+  marshallArguments(levels, numArgs, offset, numDims, 3, AI, CI, KernelF);
+
+  // Type cast argument struct to i8*
+  CastInst* BI = BitCastInst::CreatePointerCast(AI,
+                 Type::getInt8PtrTy(Ctx),
+                 "args",
+                 CI);
+
+  // Bitcast Root function to i8*
+  Constant* Root_i8ptr = ConstantExpr::getPointerCast(Root, Type::getInt8PtrTy(Ctx));
+  // Replace CI with launch call to a Root function
+  Function* LaunchF = Intrinsic::getDeclaration(Root->getParent(), Intrinsic::visc_launch);
+  DEBUG(errs() << "Intrinsic for launch: " << *LaunchF << "\n");
+
+  // {root node, packed args, isStreaming = false}
+  Value* LaunchInstArgs[] = {Root_i8ptr, BI, ConstantInt::getFalse(Ctx)};
+  CallInst* LaunchInst = CallInst::Create(LaunchF,
+                                          ArrayRef<Value*>(LaunchInstArgs,3),
+                                          "graph"+Root->getName(), CI);
+  //ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Add wait call
+  // Replace all wait instructions with visc wait intrinsic instructions
+  Function* WaitF = Intrinsic::getDeclaration(Root->getParent(), Intrinsic::visc_wait);
+  std::vector<CallInst*>* WaitList = getWaitList(CI);
+  for(unsigned i=0; i < WaitList->size(); ++i) {
+    CallInst* waitCall = WaitList->at(i);
+    // NOTE(review): the new wait intrinsic is inserted before CI, not at the
+    // position of waitCall, and waitCall's uses are not redirected before it
+    // is erased — assumes the dummy wait calls return void and have no users;
+    // confirm against the __visc__wait declaration.
+    CallInst* waitInst = CallInst::Create(WaitF,
+                                          ArrayRef<Value*>(LaunchInst),
+                                          "", CI);
+    DEBUG(errs() << *waitInst << "\n");
+    waitCall->eraseFromParent();
+  }
+
+  // Get result (optional)
+}
+
+// Clone F so that its return type becomes an empty struct ("emptyStruct").
+// A function already returning a struct is returned unchanged; anything other
+// than a void return type asserts. Every 'ret void' in the clone is replaced
+// by 'ret %emptyStruct undef', and all IR uses of F are redirected to the
+// clone. Returns the clone (or F itself if no change was needed).
+static Function* transformReturnTypeToStruct(Function* F) {
+  // Currently only works for void return types
+  DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
+
+  if (isa<StructType>(F->getReturnType())) {
+    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n");
+    return F;
+  }
+
+  assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n");
+
+  // Create the argument type list with added argument types
+  std::vector<Type*> ArgTypes;
+  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+      ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+  
+  // isPacked = true, matching the other struct types created by this pass.
+  StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true);
+  FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg());
+  
+  SmallVector<ReturnInst*, 8> Returns;
+  Function* newF = cloneFunction(F, FTy, false, &Returns);
+  // Replace ret void instruction with ret %RetTy undef
+  for(auto RI: Returns) {
+    DEBUG(errs() << "Found return inst: "<< *RI << "\n");
+    ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy));
+    ReplaceInstWithInst(RI, newRI);
+  }
+
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+  return newF;
+}
+
+// Find the return type of F by inspecting its ReturnInst terminators.
+// Returns the type of the first return *value* found. Bare 'ret void'
+// terminators are skipped (their getReturnValue() is null, which the old code
+// would have dereferenced in the DEBUG print). If no value-returning
+// ReturnInst exists, this is a caller error: the previous code simply fell
+// off the end of a non-void function (undefined behavior); now we abort with
+// a diagnostic instead.
+static Type* getReturnTypeFromReturnInst(Function* F) {
+  for(BasicBlock &BB: *F) {
+    if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+      // Skip 'ret void' -- there is no value whose type we could report.
+      if (RI->getReturnValue() == nullptr)
+        continue;
+      DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() << "\n");
+      return RI->getReturnValue()->getType();
+    }
+  }
+  llvm_unreachable("Function has no return instruction returning a value!");
+}
+
+
+// Pass ID anchor (address used as the unique pass identifier) and static
+// registration, exposing the pass to opt as -genvisc. The two 'false' flags
+// mark it as neither CFG-only nor an analysis-only pass.
+char genvisc::GenVISC::ID = 0;
+static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false);
+
+} // End of namespace genvisc
+
+
diff --git a/lib/GenVISC/GenVISC.exports b/lib/GenVISC/GenVISC.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/GenVISC/LLVMBuild.txt b/lib/GenVISC/LLVMBuild.txt
new file mode 100644
index 0000000000..9266b2c597
--- /dev/null
+++ b/lib/GenVISC/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/GenVISC/LLVMBuild.txt ------------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = GenVISC
+parent = Transforms
diff --git a/lib/InPlaceDFG/CMakeLists.txt b/lib/InPlaceDFG/CMakeLists.txt
new file mode 100644
index 0000000000..d034ae4976
--- /dev/null
+++ b/lib/InPlaceDFG/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMInPlaceDFGAnalysis
+  InPlaceDFGAnalysis.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/InPlaceDFG/InPlaceDFGAnalysis.cpp b/lib/InPlaceDFG/InPlaceDFGAnalysis.cpp
new file mode 100644
index 0000000000..a45e6e3645
--- /dev/null
+++ b/lib/InPlaceDFG/InPlaceDFGAnalysis.cpp
@@ -0,0 +1,318 @@
+//===------------------------ InPlaceDFGAnalysis.cpp ----------------------===//
+//
+//
+//
+//                     The LLVM Compiler Infrastructure
+//
+//
+//
+// This file is distributed under the University of Illinois Open Source
+//
+// License. See LICENSE.TXT for details.
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "InPlaceDFGAnalysis"
+
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+namespace inplacedfg {
+
+/***                                Classes                                 ***/
+
+// Visitor for Code generation traversal (tree traversal for now)
+// Visitor for Code generation traversal (tree traversal for now).
+// NOTE: despite the codeGen() naming inherited from the shared traversal
+// framework, this visitor emits no code — its hooks only update the
+// in-place eligibility map IPP while walking the DFG.
+class AT_OCL : public CodeGenTraversal {
+
+private:
+  //Member variables
+  // Result map being built: DFNode -> one flag per function parameter,
+  // true while the parameter is still considered eligible for an in-place
+  // operation. Non-owning pointer to the caller's map.
+  InPlaceDFGAnalysis::InPlaceDFGParameter *IPP;
+
+  //Functions
+
+  // Virtual Functions
+  void init() {}           // nothing to set up for a pure analysis
+  void initRuntimeAPI() {} // no runtime API calls are emitted
+  void codeGen(DFInternalNode* N); // analyse an internal node's out-edges
+  void codeGen(DFLeafNode* N);     // analyse a leaf node's outputs/out-edges
+
+public:
+  // Constructor
+  AT_OCL(Module &_M, BuildDFG &_DFG, InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP) :
+    CodeGenTraversal(_M, _DFG), IPP(&_IPP) {
+
+  }
+};
+
+/***                            Helper Functions                            ***/
+
+// Create an entry in InPlaceDFGParameter IPP for node N if it does not exist
+void initializeDFNodeIPPVector(DFNode *N,
+                               InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) {
+  if (IPP.find(N) == IPP.end()) {
+    // Find the node function
+    Function *F = N->getFuncPointer();
+    // Create a vector initialized to true
+    IPP[N] = std::vector<bool>(F->getFunctionType()->getNumParams(), true);
+    // Every scalar parameter is not eligible for an in place operation
+    for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+         ai != ae; ++ai) {
+      Argument *Arg = &*ai;
+      if (!(Arg->getType()->isPointerTy())) {
+        IPP[N][Arg->getArgNo()] = false;
+      }
+    }
+  }
+}
+
+// Update InPlaceDFGParameter IPP based on the outgoing edges of node N.
+// If two or more outgoing edges read the SAME source position, that value
+// is consumed more than once, so none of the consumers may overwrite it:
+// every destination argument fed by a duplicated source is marked false.
+void checkOutputEdgeSources(DFNode* N, InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) {
+  // Iterate over all outgoing edges.
+  for (DFNode::outdfedge_iterator oe_it = N->outdfedge_begin(),
+       oeEnd = N->outdfedge_end(); oe_it != oeEnd; ++oe_it) {
+    // For every edge, look through all subsequent edges.
+    // If, for some edge, have the same source position, then the output is not
+    // eligible for an in place operation. (Quadratic scan; per-node edge
+    // counts are expected to be small.)
+    DFNode::outdfedge_iterator oeNext = oe_it;
+
+    unsigned srcPos = (*oe_it)->getSourcePosition();
+    for (++oeNext ; oeNext != oeEnd; ++oeNext) {
+      DFEdge *E = *oeNext;
+      // If we find edges with the same source position
+      if (E->getSourcePosition() == srcPos) {
+        // Find node and destination positions, and make the respective
+        // arguments not eligible for in place operations.
+        // (The first edge's target may be re-marked on repeated matches;
+        // that is redundant but harmless.)
+        DFNode *DN = (*oe_it)->getDestDF();
+        unsigned dstPos = (*oe_it)->getDestPosition();
+        initializeDFNodeIPPVector(DN, IPP);
+        IPP[DN][dstPos] = false;
+
+        DN = E->getDestDF();
+        dstPos = E->getDestPosition();
+        initializeDFNodeIPPVector(DN, IPP);
+        IPP[DN][dstPos] = false;
+      }
+    }
+  }
+
+}
+
+// Print InPlaceDFGParameter DFG
+void printInPlaceDFGParameter(InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) {
+
+  errs() << "----------------------------\n";
+  errs() << "In Place DFG Analysis Result\n";
+  for (InPlaceDFGAnalysis::InPlaceDFGParameter::iterator it = IPP.begin(),
+       ie = IPP.end(); it != ie; ++it) {
+    DFNode *N = it->first;
+    if (N->isDummyNode()) {
+      errs() << "(dummy) ";
+    }
+    errs() << "Node: " << N->getFuncPointer()->getName() << "\n\tMap:";
+    for (unsigned i = 0; i < it->second.size() ; i++) {
+      errs() << " " << (it->second[i] ? "true " : "false");
+    }
+    errs() << "\n";
+  }
+  errs() << "----------------------------\n";
+
+}
+
+/***                                Methods                                 ***/
+
+/*** Methods of InPlaceDFGAnalysisWrapper ***/
+// Accessor for the result map computed by runOnModule(); only meaningful
+// after this wrapper pass has run.
+const InPlaceDFGAnalysis::InPlaceDFGParameter
+  &InPlaceDFGAnalysisWrapper::getIPP() {
+    return IPP;
+}
+
+// This analysis consumes the dataflow graph built by BuildDFG and leaves
+// it (and the IR) untouched.
+void InPlaceDFGAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<BuildDFG>();
+  AU.addPreserved<BuildDFG>();
+}
+
+// Pass entry point: delegate to InPlaceDFGAnalysis, storing the result in
+// the member map IPP (exposed through getIPP()).
+bool InPlaceDFGAnalysisWrapper::runOnModule(Module &M) {
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  InPlaceDFGAnalysis IPA;
+  IPA.run(M, DFG, IPP);
+
+  // Pure analysis: the module is never modified.
+  return false;
+}
+
+/*** Methods of InPlaceDFGAnalysis ***/
+void InPlaceDFGAnalysis::run(Module &M, BuildDFG &DFG, InPlaceDFGParameter &IPP) {
+
+  errs() << "\nIN PLACE ANALYSIS PASS\n";
+
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Visitor for Graph Traversal
+  AT_OCL *ATVisitor = new AT_OCL(M, DFG, IPP);
+
+  // Iterate over all the DFGs
+  // Analyse the edges for parameters that are valid to be used in place
+  for (auto rootNode: Roots) {
+    // Initiate analysis for root DFNode
+    IPP[rootNode] =
+      std::vector<bool>(rootNode->getFuncPointer()->getFunctionType()->getNumParams(),
+                        false);
+    // All inputs from the host are marked as not in place - the host does not
+    // expect these values to change unpredictably.
+    ATVisitor->visit(rootNode);
+    // The analysis is optimistic, assuming everything is eligible for in place
+    // unless found otherwise. This happens if two edges have the same source
+    // node and port. Then the targets of these edges are not eligible for
+    // in place operations.
+
+    /* TODO:
+    To enforce that host values are marked as false, we need a second pass over
+    the graph that does the following:
+    - push root in a vector:
+    - while the vector is not empty:
+    - - pop the last node, N:
+    - - if internal node:
+    - - - find its entry dummy node (easily done by isDummyNode() and iterating
+          over outedges of dummy, exit ummy has not outedges)
+    - - - for all successors of the dymmy node,
+    - - - - if the edge carries a false annotated value (if the source position
+            is marked as false in the N vector), mark as such at the successor
+            and push successor in the vector
+    - - if leaf node
+    - - - return
+
+    For now, this is not required, as there is only one level in the graph.
+    Thus I simply iterate over outedges of entry dummy ,and mark targets as
+    false, at the end of codegen for leaf node.
+    */
+
+  }
+
+//  printInPlaceDFGParameter(IPP);
+
+  delete ATVisitor;
+  return;
+}
+
+/*** Methods of AT_OCL ***/
+
+/*** Analysis of internal node ***/
+void AT_OCL::codeGen(DFInternalNode* N) {
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
+
+//  errs() << "Internal node: before initializing this node's vector\n";
+//  printInPlaceDFGParameter(*IPP);
+  // If a vector has not been created for this node,
+  // create one initialized to true
+  initializeDFNodeIPPVector(N, *IPP);
+
+//  errs() << "Internal node: after initializing this node's vector, before its check edges\n";
+//  printInPlaceDFGParameter(*IPP);
+  // Check its output edges, for same destination node and port.
+
+  checkOutputEdgeSources(N, *IPP);
+//  errs() << "Internal node: after this node's check edges\n";
+//  printInPlaceDFGParameter(*IPP);
+}
+
+/*** Analysis of leaf node ***/
+// Analyse a leaf node: verify that its outputs are HPVM tensor-intrinsic
+// results, invalidate eligibility for duplicated output edges, and (for the
+// entry dummy node) mark all values arriving from the host as not-in-place.
+void AT_OCL::codeGen(DFLeafNode* N) {
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
+
+  if(N->isAllocationNode()) {
+    DEBUG(errs() << "Analysis does not expect allocation node\n");
+    assert(false && "Allocation nodes not expected in approxHPVM");
+    return;
+  }
+
+  // If a vector has not been created for this node,
+  // create one initialized to true
+  initializeDFNodeIPPVector(N, *IPP);
+
+  // Skip internal checks if it is a dummy node
+  if(!(N->isDummyNode())) {
+    // Check that all outputs should be results of HPVM tensor intrinsics.
+    // NOTE(review): this early return also skips checkOutputEdgeSources for
+    // nodes with an empty output struct — presumably such nodes have no data
+    // out-edges; confirm.
+    if (N->getOutputType()->isEmptyTy())
+      return;
+
+    unsigned numOutputs = N->getOutputType()->getNumElements();
+
+    Function *F = N->getFuncPointer();
+    BasicBlock& BB = F->getEntryBlock();
+    assert(isa<ReturnInst>(BB.getTerminator())
+        && "ApproxHPVM Nodes have a single BB\n");
+    ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator());
+    // Find the returned struct
+    Value* rval = RI->getReturnValue();
+
+    // Walk the insertvalue chain that builds the returned struct. The chain
+    // is traversed last-insert-first, hence the numOutputs-1-i indexing.
+    std::vector<Value*> OutValues(numOutputs, NULL);
+    for (unsigned i = 0; i < numOutputs; i++) {
+      if(InsertValueInst* IV = dyn_cast<InsertValueInst>(rval)) {
+        DEBUG(errs() << "Value at out edge" << numOutputs-1-i << ": " << *rval << "\n");
+        OutValues[numOutputs-1-i] = IV->getOperand(1);
+        rval = IV->getOperand(0);
+      }
+      else {
+        DEBUG(errs() << "Unexpected value at out edge: " << *rval << "\n");
+        llvm_unreachable("Expecting InsertValue instruction. Error!");
+      }
+    }
+
+    // Look through all outputs: every returned pointer must come straight
+    // from an HPVM tensor intrinsic call.
+    for (unsigned i = 0; i < numOutputs; i++) {
+      if (OutValues[i]->getType()->isPointerTy()) {
+        // All returned pointers should be results of HPVM tensor intrinsics
+        CallInst *CI = dyn_cast<CallInst>(OutValues[i]);
+        assert(CI &&
+          "Expected return value to be the result of a call instruction\n");
+        assert ((CI->getCalledFunction()->getName()).startswith("llvm.visc.tensor") &&
+          "Node output must be the result of an HPVM tensor intrinsic\n");
+      }
+    }
+
+  }
+
+  // Check its output edges, for same destination node and port.
+  checkOutputEdgeSources(N, *IPP);
+
+  // Mark host values as false, explained in run: edges leaving the entry
+  // dummy node carry host inputs, which must never be updated in place.
+  if((N->isDummyNode())) {
+    for (DFNode::outdfedge_iterator oe_it = N->outdfedge_begin(),
+        oeEnd = N->outdfedge_end(); oe_it != oeEnd; ++oe_it) {
+      DFNode *DN = (*oe_it)->getDestDF();
+      unsigned dstPos = (*oe_it)->getDestPosition();
+      initializeDFNodeIPPVector(DN, *IPP);
+      (*IPP)[DN][dstPos] = false;
+    }
+  }
+
+}
+
+char InPlaceDFGAnalysisWrapper::ID = 0;
+// Register with the legacy pass manager so `opt -inplace` invokes the pass.
+static RegisterPass<InPlaceDFGAnalysisWrapper> X("inplace",
+  "Pass to identifying candidates for in place operations in HPVM",
+  false /* does not modify the CFG */,
+  false /* not transformation, just analysis */);
+
+} // End of namespace
+
diff --git a/lib/InPlaceDFG/InPlaceDFGAnalysis.exports b/lib/InPlaceDFG/InPlaceDFGAnalysis.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/InPlaceDFG/LLVMBuild.txt b/lib/InPlaceDFG/LLVMBuild.txt
new file mode 100644
index 0000000000..b78912b9c4
--- /dev/null
+++ b/lib/InPlaceDFG/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/InPlaceDFG/LLVMBuild.txt ----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = InPlaceDFGAnalysis
+parent = Transforms
diff --git a/lib/InlineTensorCalls/CMakeLists.txt b/lib/InlineTensorCalls/CMakeLists.txt
new file mode 100644
index 0000000000..51f321884f
--- /dev/null
+++ b/lib/InlineTensorCalls/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( InlineTensorCalls
+  InlineTensorCalls.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/lib/InlineTensorCalls/InlineTensorCalls.cpp b/lib/InlineTensorCalls/InlineTensorCalls.cpp
new file mode 100644
index 0000000000..d31434341c
--- /dev/null
+++ b/lib/InlineTensorCalls/InlineTensorCalls.cpp
@@ -0,0 +1,77 @@
+//=== InlineApproxHPVMCalls.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "INLINE_APPROXHPVM_CALLS"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+#include "llvm/IR/InstIterator.h"
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/InlineCost.h"
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/ADT/SetVector.h"
+#include <sstream>
+
+using namespace llvm;
+
+
+namespace {
+
+  struct InlineApproxHPVMCalls : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    InlineApproxHPVMCalls() : ModulePass(ID) {}
+
+    bool runOnModule(Module &M) override {
+
+      InlineFunctionInfo IFI;
+      SmallSetVector<CallSite, 16> Calls;
+      bool Changed = false;
+      SmallVector<Function *, 16> InlinedFunctions;
+      for (Function &F : M){
+	if (!F.isDeclaration() && F.getName().startswith("tensor") ) {
+	  //errs()<<"Function = "<<*&F<<"\n";
+	  Calls.clear();
+
+	  for (User *U : F.users())
+	    if (auto CS = CallSite(U))
+	      if (CS.getCalledFunction() == &F)
+		Calls.insert(CS);
+
+	  for (CallSite CS : Calls)
+	    // FIXME: We really shouldn't be able to fail to inline at this point!
+	    // We should do something to log or check the inline failures here.
+	    Changed |= InlineFunction(CS, IFI);
+
+	}
+      }
+
+      return true;
+    }
+
+  };
+
+
+} // End of namespace
+
+char InlineApproxHPVMCalls::ID = 0;
+// Register with the legacy pass manager: `opt -inline-tensor-calls`.
+static RegisterPass<InlineApproxHPVMCalls> X("inline-tensor-calls",
+					     "Inline ApproxHPVM tensor library function calls (CPU version)",
+					     true /* modifies the CFG */,
+					     true /* transformation,   *
+						   * not just analysis */);
+
diff --git a/lib/InlineTensorCalls/InlineTensorCalls.exports b/lib/InlineTensorCalls/InlineTensorCalls.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/InlineTensorCalls/LLVMBuild.txt b/lib/InlineTensorCalls/LLVMBuild.txt
new file mode 100644
index 0000000000..8fff7891af
--- /dev/null
+++ b/lib/InlineTensorCalls/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/InlineTensorCalls/LLVMBuild.txt ---------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = InlineTensorCalls
+parent = Transforms
+
diff --git a/lib/InsertApproxInfo/CMakeLists.txt b/lib/InsertApproxInfo/CMakeLists.txt
new file mode 100644
index 0000000000..2b6d41bd70
--- /dev/null
+++ b/lib/InsertApproxInfo/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( InsertApproxInfo
+  InsertApproxInfo.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/InsertApproxInfo/InsertApproxInfo.cpp b/lib/InsertApproxInfo/InsertApproxInfo.cpp
new file mode 100644
index 0000000000..bde4ef8907
--- /dev/null
+++ b/lib/InsertApproxInfo/InsertApproxInfo.cpp
@@ -0,0 +1,498 @@
+//===------------------------ InsertApproxInfo.cpp ------------------------===//
+//
+//
+//
+//                     The LLVM Compiler Infrastructure
+//
+//
+//
+// This file is distributed under the University of Illinois Open Source
+//
+// License. See LICENSE.TXT for details.
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "InsertApproxInfo"
+
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/IR/InstrTypes.h"
+#include <unordered_map>
+#include <dirent.h>
+#include <stdio.h>
+#include <sstream>
+#include <fstream>
+
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+using namespace inplacedfg;
+
+
+namespace {
+
+static cl::opt<std::string> dir_name("results-dir", cl::desc(" Name of directory with Autotuner results "));
+
+
+// One autotuner measurement for a single tensor operation: identifies the
+// operation and records the error metrics observed at an approximation level.
+struct ApproxMetrics{
+  std::string op_name;  // runtime op name, e.g. "tensorConv" (see intrinsics_map)
+  std::string category; // result-set category: "linear", "log" or "quad"
+  unsigned int rank; // rank given by autotuner
+  double approx_level;
+  // Relative L-norm metrics
+  double relative_l1;
+  double relative_l2;
+  double relative_linf;
+  // Mean L-norm metrics
+  double mean_l1;
+  double mean_l2;
+  double mean_linf;
+};
+
+  
+  
+// Legacy PassManager wrapper: fetches the DFG from BuildDFG and runs the
+// InsertApproxInfo traversal over it.
+struct InsertApproxInfoWrapperPass : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  InsertApproxInfoWrapperPass() : ModulePass(ID) {}
+
+public:
+  // Functions
+  bool runOnModule(Module &M);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+};
+
+
+// Visitor for Code generation traversal (tree traversal for now)
+// Visitor that walks the DFG and attaches autotuner approximation metrics to
+// every HPVM tensor intrinsic as call-site operand bundles.
+class InsertApproxInfo : public CodeGenTraversal {
+
+private:
+  // Virtual Functions
+  void init() {}           // no setup needed
+  void initRuntimeAPI() {} // no runtime declarations are emitted
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+  // Load autotuner results from <dir_path>/high_confidence/{linear,log,quad}.
+  void loadTrainedApproxMetrics(std::string dir_path);
+  void loadMetricsFromFile(std::string dir_path, std::string file_path, std::string category);
+  void loadMetricsFromDir(std::string dir_path, std::string category);
+  // Parse one whitespace-separated result-file line into *approx_metrics.
+  void readApproxValues(const std::string line, ApproxMetrics* approx_metrics);
+  void initIntrinsicNames();
+  void initGlobalStrings();
+
+  // private data
+  // intrinsic name ("llvm.visc.tensor.add") -> runtime op name ("tensorAdd")
+  std::unordered_map<std::string, std::string> intrinsics_map;
+  // "<runtime op name><sequence id>" -> all measured configurations for it
+  std::unordered_map<std::string, std::vector<ApproxMetrics*>> operation_metrics;
+  // Module-level constant strings naming each metric in the operand bundles.
+  GlobalVariable* rank_str;
+  GlobalVariable* category_str;
+  GlobalVariable* mean_l1_str;
+  GlobalVariable* mean_l2_str;
+  GlobalVariable* mean_linf_str;
+  GlobalVariable* rel_l1_str;
+  GlobalVariable* rel_l2_str;
+  GlobalVariable* rel_linf_str;
+
+  // Tracks the id of the tensor op processed
+  unsigned int currentID;
+
+public:
+  // Constructor
+  InsertApproxInfo(Module &_M, BuildDFG &_DFG);
+
+  //void run(Module &M, BuildDFG &DFG);
+  void run(std::string dir_path);
+
+};
+
+
+
+void InsertApproxInfo::initIntrinsicNames(){
+
+  intrinsics_map["llvm.visc.tensor.convolution"] = "tensorConv";
+  intrinsics_map["llvm.visc.tensor.mul"] = "tensorGemm";
+  intrinsics_map["llvm.visc.tensor.add"] = "tensorAdd";
+  intrinsics_map["llvm.visc.tensor.pool.max"] = "tensorPooling";
+  intrinsics_map["llvm.visc.tensor.tanh"] = "tensorTanh";  
+}
+
+
+void InsertApproxInfo::initGlobalStrings(){
+
+ /**** Creating global constant strings for each approximation metric type *******/
+
+  std::string rank_string = "rank";
+  Constant* stringConst = ConstantDataArray::getString(M.getContext(), StringRef(rank_string.c_str()), true);
+  rank_str = new GlobalVariable(M, stringConst->getType(), true,
+				GlobalValue::ExternalLinkage, stringConst, "");
+
+  std::string category_string = "category";
+  stringConst = ConstantDataArray::getString(M.getContext(), StringRef(category_string.c_str()), true);
+  category_str = new GlobalVariable(M, stringConst->getType(), true,
+				   GlobalValue::ExternalLinkage, stringConst, "");
+
+  // Mean l-norm metrics
+  std::string metric_string = "mean_l1";
+  stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true);
+  mean_l1_str = new GlobalVariable(M, stringConst->getType(), true,
+				   GlobalValue::ExternalLinkage, stringConst, "");
+
+  metric_string = "mean_l2";
+  stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true);
+  mean_l2_str = new GlobalVariable(M, stringConst->getType(), true,
+				   GlobalValue::ExternalLinkage, stringConst, "");
+
+  metric_string = "mean_linf";
+  stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true);
+  mean_linf_str = new GlobalVariable(M, stringConst->getType(), true,
+				     GlobalValue::ExternalLinkage, stringConst, "");
+
+  // Relative l-norm metrics
+  metric_string = "rel_l1";
+  stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true);
+  rel_l1_str = new GlobalVariable(M, stringConst->getType(), true,
+				   GlobalValue::ExternalLinkage, stringConst, "");
+
+  metric_string = "rel_l2";
+  stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true);
+  rel_l2_str = new GlobalVariable(M, stringConst->getType(), true,
+				   GlobalValue::ExternalLinkage, stringConst, "");
+
+  metric_string = "rel_linf";
+  stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true);
+  rel_linf_str = new GlobalVariable(M, stringConst->getType(), true,
+				     GlobalValue::ExternalLinkage, stringConst, "");
+
+}
+
+  
+// Build the traversal: seed the intrinsic->runtime-name table and create the
+// global metric-name strings used in the operand bundles.
+InsertApproxInfo::InsertApproxInfo(Module &_M, BuildDFG &_DFG) :
+    CodeGenTraversal(_M, _DFG){
+
+  // Op ids start at 1, matching the 1-based line count used when building
+  // unique_op_name keys in loadMetricsFromFile.
+  currentID = 1;
+
+  initIntrinsicNames();
+  initGlobalStrings();
+}
+
+
+// Requires the dataflow graph from BuildDFG; the graph itself is preserved.
+void InsertApproxInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<BuildDFG>();
+  AU.addPreserved<BuildDFG>();
+}
+
+    
+bool InsertApproxInfoWrapperPass::runOnModule(Module &M) {
+  
+  std::string dir_path = dir_name.getValue();
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  InsertApproxInfo IApprox(M, DFG);
+  IApprox.run(dir_path);
+
+  return false;
+}
+
+
+void InsertApproxInfo::readApproxValues(const std::string line, ApproxMetrics* approx_metrics){
+ 
+  std::istringstream in(line);
+  std::string op_name;
+
+  float approx_level;
+
+  float mean_l1;
+  float mean_l2;
+  float mean_linf;
+
+  float relative_l1;
+  float relative_l2;
+  float relative_linf;
+
+  in >> op_name;
+  in >> approx_level;
+  
+  in >> mean_l1;
+  in >> mean_l2;
+  in >> mean_linf;
+
+  in >> relative_l1;
+  in >> relative_l2;
+  in >> relative_linf;
+    
+  printf("\n *** op_name = %s \n", op_name.c_str());
+  printf("approx_level = %f \n", approx_level);
+  printf("relative_l1 = %f \n", relative_l1);
+  printf("relative_l2 = %f \n", relative_l2);
+  printf("relative_linf = %f \n", relative_linf);
+  printf("mean_l1 = %f \n", mean_l1);
+  printf("mean_l2 = %f \n", mean_l2);
+  printf("mean_linf = %f \n", mean_linf);
+
+  approx_metrics->op_name = op_name;
+  approx_metrics->approx_level = approx_level;
+  approx_metrics->mean_l1 = mean_l1;
+  approx_metrics->mean_l2 = mean_l2;
+  approx_metrics->mean_linf = mean_linf;
+  approx_metrics->relative_l1 = relative_l1;
+  approx_metrics->relative_l2 = relative_l2;
+  approx_metrics->relative_linf = relative_linf;
+   
+}
+
+
// Extract the autotuner rank from a result-file name of the form
// "<prefix>_<rank>": the rank is the numeric token after the LAST '_'
// (the whole name is parsed when no '_' is present, matching the old
// strtok loop). Returns rank + 1 so ranks are 1-based.
//
// Fixes over the original:
//  - no strcpy into a fixed char[100] (overflow for long paths)
//  - no use of an uninitialized `last_pch` pointer
//  - no non-reentrant strtok
unsigned int getFileRank(std::string file_path){

  // Token after the last underscore; the whole name if there is none.
  std::string::size_type pos = file_path.find_last_of('_');
  std::string last_tok =
      (pos == std::string::npos) ? file_path : file_path.substr(pos + 1);

  printf("NOTE: ****** last_pch = %s \n", last_tok.c_str());

  size_t sz;
  int rank = std::stoi(last_tok, &sz);

  return rank + 1; // NOTE: Adding 1 to start ranks with '1'
}
+
+  
+  
+void InsertApproxInfo::loadMetricsFromFile(std::string dir_path, std::string file_path, std::string category){
+
+  std::string full_path = dir_path + "/" + file_path;
+  printf("full_path = %s \n", full_path.c_str());
+  std::ifstream infile(full_path.c_str());
+  std::string line;
+
+  unsigned int it_count = 0;
+  while(std::getline(infile, line)){
+
+    // Skip first line with confidence information
+    if(it_count > 0){
+      std::vector<float> approx_values;
+      ApproxMetrics* approx_metrics = new ApproxMetrics;
+      readApproxValues(line, approx_metrics);
+      
+      approx_metrics->category = category;
+      unsigned int rank = getFileRank(file_path);
+      approx_metrics->rank = rank; 
+
+      std::string unique_op_name = approx_metrics->op_name + std::to_string(it_count);
+      operation_metrics[unique_op_name].push_back(approx_metrics);
+      printf("\n ** unique_op_name = %s \n", unique_op_name.c_str());     
+    }
+    
+    it_count++;
+  }
+  
+}
+
+
+  
+void InsertApproxInfo::loadMetricsFromDir(std::string dir_path, std::string category){
+
+  struct dirent* entry;
+  dir_path = dir_path + category;
+
+  DIR* dir = opendir(dir_path.c_str());
+  if(dir == NULL){
+    printf("Directory %s not found . Aborting ... \n\n ", dir_path.c_str());
+    abort();
+  }
+
+  while((entry = readdir(dir)) != NULL){
+    printf("f_name = %s \n", entry->d_name);
+    std::string f_name = entry->d_name;
+    loadMetricsFromFile(dir_path, f_name, category);
+  }
+}
+
+  
+  
+void InsertApproxInfo::loadTrainedApproxMetrics(std::string dir_path){
+  
+  std::string root_path = dir_path + "/high_confidence/";
+  loadMetricsFromDir(root_path, "linear");
+  loadMetricsFromDir(root_path, "log");
+  loadMetricsFromDir(root_path, "quad");
+}
+
+  
+/*** Methods of InPlaceDFGAnalysis ***/
+void InsertApproxInfo::run(std::string dir_path) {
+
+  loadTrainedApproxMetrics(dir_path);
+
+  errs() << "\n NOTE: ApproxInfo INSERTION TRANSFORM \n";
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Iterate over all the DFGs
+  // Analyse the edges for parameters that are valid to be used in place
+  for (auto rootNode: Roots) {
+    //ATVisitor->visit(rootNode);
+   this->visit(rootNode);
+  }
+
+  //delete ATVisitor;
+  return;
+}
+
+/*** Analysis of internal node ***/
+// Internal nodes contain no tensor intrinsics; nothing to annotate here.
+void InsertApproxInfo::codeGen(DFInternalNode* N) {
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
+}
+
+/*** Analysis of leaf node ***/
+// Annotate leaf node N: each HPVM tensor intrinsic call is replaced by an
+// identical call carrying one operand bundle ("config_<k>") per measured
+// autotuner configuration, each bundle a flat list of
+// (metric-name-string, value) pairs.
+void InsertApproxInfo::codeGen(DFLeafNode* N) {
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
+
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Abort code generation if it is an allocation node
+  if(N->isAllocationNode()) {
+    assert(false && "Allocation Node not expected in ApproxHPVM");
+    return;
+  }
+
+  Function *F = N->getFuncPointer();
+  Module* M = F->getParent();
+  // Replaced intrinsics; erased after the walk so iteration stays valid.
+  std::vector<IntrinsicInst *> IItoRemove;
+
+
+  /**** Adding operand bundles for each tensor operation in the HPVM DFG Leaf Node ****/
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    errs()<<*I<<"\n";
+
+
+    if (BuildDFG::isViscIntrinsic(I)) {
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+        && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+
+      // Key into operation_metrics: "<runtime name><sequence id>", matching
+      // the unique_op_name scheme built in loadMetricsFromFile.
+      std::string intrinsic_id = std::string(II->getCalledFunction()->getName().data());
+      std::string runtime_func_name = intrinsics_map[intrinsic_id];
+      std::string unique_name = runtime_func_name + std::to_string(currentID);
+      printf("\n ---- unique_name = %s \n ", unique_name.c_str());
+      std::vector<ApproxMetrics*> approx_metrics;
+      if(operation_metrics.find(unique_name) != operation_metrics.end()){
+         approx_metrics = operation_metrics[unique_name];
+      }
+      else{
+	errs()<<"Intrinsic Name NOT found in the map - Unexpected Error. Aborting ... \n\n";
+        abort();
+      }
+
+      // Build one bundle per measured configuration.
+      unsigned int num_configs = approx_metrics.size();
+      std::vector<OperandBundleDef> conf_bundles;
+      for(unsigned int i = 0; i < num_configs; i++){
+	std::vector<Value*> norm_vals;
+
+	norm_vals.push_back(category_str);
+	Constant* categoryConst = ConstantDataArray::getString(M->getContext(), StringRef(approx_metrics[i]->category.c_str()), true);
+        GlobalVariable* category_value = new GlobalVariable(*M, categoryConst->getType(), true,
+							    GlobalValue::ExternalLinkage, categoryConst, "");
+	norm_vals.push_back(category_value);
+
+	norm_vals.push_back(rank_str);
+	Constant* constIntVal = ConstantInt::get(Type::getInt32Ty(M->getContext()), approx_metrics[i]->rank);
+	norm_vals.push_back(constIntVal);
+
+	// Adding mean l-norm metrics
+	norm_vals.push_back(mean_l1_str);
+	Constant* constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->mean_l1);
+	norm_vals.push_back(constFPVal);
+
+	norm_vals.push_back(mean_l2_str);
+	constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->mean_l2);
+	norm_vals.push_back(constFPVal);
+
+	norm_vals.push_back(mean_linf_str);
+	constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->mean_linf);
+	norm_vals.push_back(constFPVal);
+
+        // Relative l-norm Metrics
+	norm_vals.push_back(rel_l1_str);
+	constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->relative_l1);
+	norm_vals.push_back(constFPVal);
+
+	norm_vals.push_back(rel_l2_str);
+	constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->relative_l2);
+	norm_vals.push_back(constFPVal);
+
+	norm_vals.push_back(rel_linf_str);
+	constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->relative_linf);
+	norm_vals.push_back(constFPVal);
+
+
+	std::string config_name = "config_" + std::to_string(i+1);
+	OperandBundleDef norm_bundle(config_name, norm_vals);
+
+	conf_bundles.push_back(norm_bundle);
+      }
+
+      ArrayRef<OperandBundleDef> bundle_arr(conf_bundles);
+
+      /*** Creating new Intrinsic call with Operand Bundles attached **/
+      Function* calledFunction = II->getCalledFunction();
+      unsigned num_args = II->getNumArgOperands();
+      std::vector<Value*> args;
+      for(unsigned i = 0; i < num_args; i++){
+        Value* argValue = II->getArgOperand(i);
+	args.push_back(argValue);
+      }
+
+      // New call is inserted immediately before II, then takes over its uses.
+      CallInst* CI = CallInst::Create(calledFunction,
+                 		      args, bundle_arr, "", II);
+
+      errs()<<"NOTE: New CallInst = "<<*CI<<"\n";
+
+      II->replaceAllUsesWith(CI);
+      // Mark to remove at the end
+      IItoRemove.push_back(II);
+
+      // Increment counter of op processed
+      currentID++;
+    }
+  }
+
+
+  // Erase the replaced intrinsics in reverse order (users before definers).
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
+       re = IItoRemove.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    errs() << "Erasing: " << **ri << "\n";
+    (*ri)->eraseFromParent();
+  }
+
+
+}
+
+char InsertApproxInfoWrapperPass::ID = 0;
+// Register with the legacy pass manager: `opt -insert-approxinfo`.
+static RegisterPass<InsertApproxInfoWrapperPass> X("insert-approxinfo",
+  "Pass to add approximation information (l-norm metrics) in the ApproxHPVM DFG",
+  false /* does not modify the CFG */,
+  false /* not transformation, just analysis */);
+
+
+
+
+  
+} // End of namespace
+
diff --git a/lib/InsertApproxInfo/LLVMBuild.txt b/lib/InsertApproxInfo/LLVMBuild.txt
new file mode 100644
index 0000000000..e9cf5afd4a
--- /dev/null
+++ b/lib/InsertApproxInfo/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/InsertApproxInfo/LLVMBuild.txt ----------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = InsertApproxInfo
+parent = Transforms
diff --git a/lib/LocalMem/CMakeLists.txt b/lib/LocalMem/CMakeLists.txt
new file mode 100644
index 0000000000..fa91332594
--- /dev/null
+++ b/lib/LocalMem/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMLocalMem
+  LocalMem.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/LocalMem/LLVMBuild.txt b/lib/LocalMem/LLVMBuild.txt
new file mode 100644
index 0000000000..629f9caaa9
--- /dev/null
+++ b/lib/LocalMem/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/LocalMem/LLVMBuild.txt ------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = LocalMem
+parent = Transforms
diff --git a/lib/LocalMem/LocalMem.cpp b/lib/LocalMem/LocalMem.cpp
new file mode 100644
index 0000000000..896c3f382a
--- /dev/null
+++ b/lib/LocalMem/LocalMem.cpp
@@ -0,0 +1,224 @@
+//===-------------------------- LocalMem.cpp --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "LocalMem"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+namespace {
+// Helper Functions
+
+static AllocationNodeProperty* isAllocationNode(DFLeafNode* N);
+
+// LocalMem - ModulePass that scans the HPVM dataflow graph and marks leaf
+struct LocalMem : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  LocalMem() : ModulePass(ID) {}
+
+private:
+  // Member variables
+
+  // Functions
+
+public:
+  bool runOnModule(Module &M); // entry point; visits every DFG root (defined below)
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<BuildDFG>(); // needs the dataflow graph produced by BuildDFG
+    AU.addPreserved<BuildDFG>(); // only annotates nodes; the DFG stays valid
+  }
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class AT_OCL : public CodeGenTraversal {
+
+private:
+  //Member variables
+
+  //Functions
+
+  // Virtual Functions
+  void init() {}           // no setup needed for this analysis-style visitor
+  void initRuntimeAPI() {} // no runtime API declarations needed either
+  void codeGen(DFInternalNode* N); // internal-node visit (log only)
+  void codeGen(DFLeafNode* N);     // leaf-node visit (allocation-node detection)
+
+public:
+  // Constructor
+  AT_OCL(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
+    //init();
+    //initRuntimeAPI();
+  }
+
+};
+
+bool LocalMem::runOnModule(Module &M) {
+  errs() << "\nLOCALMEM PASS\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  //DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+  // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  AT_OCL *ATVisitor = new AT_OCL(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    ATVisitor->visit(rootNode);
+    // NOTE(review): no launch-intrinsic replacement happens in this pass; comment inherited from DFG2LLVM backends — confirm.
+    // TODO: Later on, we might like to do this in a separate pass, which would
+    // allow us the flexibility to switch between complete static code generation
+    // for DFG or having a customized runtime+scheduler
+  }
+
+  delete ATVisitor;
+  return true;
+}
+
+void AT_OCL::codeGen(DFInternalNode* N) { // internal nodes: nothing to mark, log only
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
+}
+
+// Leaf nodes: detect allocation nodes and attach the Allocation property.
+void AT_OCL::codeGen(DFLeafNode* N) {
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Check and mark as allocation node
+  AllocationNodeProperty* ANP = isAllocationNode(N);
+  if(ANP != NULL) {
+    // set Properties of the allocation node
+    N->setProperty(DFNode::Allocation, ANP);
+    AllocationNodeProperty* anp = (AllocationNodeProperty*) N->getProperty(DFNode::Allocation); // read back to log what was recorded
+    AllocationNodeProperty::AllocationListType AL = anp->getAllocationList();
+    DEBUG(errs() << "Total allocations = " << AL.size() << "\n");
+    for(auto P: AL) { // P: (out-edge, size value) pair as stored by insertAllocation
+      DEBUG(errs() << " EdgePort: " << P.first->getDestPosition());
+      DEBUG(errs() << " Size: " << *P.second << "\n");
+    } 
+
+  }
+}
+
+// Return pointer to property if this leaf node matches the conditions for being an allocation
+// node.
+// Conditions
+// 1. No incoming memory pointer. No in/out attribute on a pointer argument
+// 2. Uses visc malloc intrinsic to allocate memory
+// 3. Sends it out
+// 4. (TODO:) Whether the allocated pointer escapes the parent node
+AllocationNodeProperty* isAllocationNode(DFLeafNode* N) {
+  // Allocation node must be free from side-effects
+  if(N->hasSideEffects())
+    return NULL;
+
+  // Allocation node must have some outgoing edges
+  if(N->getOutputType()->isEmptyTy())
+    return NULL;
+
+  Function* F = N->getFuncPointer();
+
+  // Allocation node must use visc malloc intrinsic
+  bool usesVISCMalloc = false;
+  for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) {
+    Instruction* I = &*i;
+    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) {
+      if(II->getIntrinsicID() == Intrinsic::visc_malloc) {
+        usesVISCMalloc = true;
+        break;
+      }
+    }
+  }
+  if(!usesVISCMalloc)
+    return NULL;
+
+  // TODO: Check if allocated pointer leaves parent node
+
+  // This is an allocation node
+  AllocationNodeProperty* ANP = new AllocationNodeProperty();
+  // Find the return statement.
+  // FIXME: For now, assuming there is just one BB. Terminator instruction of
+  // this BB is a return statement. The value returned is what we need
+  BasicBlock& BB = F->getEntryBlock();
+  assert(isa<ReturnInst>(BB.getTerminator())
+      && "Currently we do not handle the case where Allocation Node has multiple BB");
+  ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator());
+  // Find the returned struct
+  Value* val = RI->getReturnValue();
+  unsigned numOutputs = N->getOutputType()->getNumElements();
+  std::vector<Value*> OutValues(numOutputs, NULL); // BUGFIX: was a hard-coded 6; overflowed for nodes with more than 6 outputs
+  for(unsigned i = 0; i < numOutputs; i++) {
+    if(InsertValueInst* IV = dyn_cast<InsertValueInst>(val)) {
+      DEBUG(errs() << "Value at out edge" << numOutputs-1-i << ": " << *val << "\n");
+      OutValues[numOutputs-1-i] = IV->getOperand(1); // walk the insertvalue chain backwards
+      val = IV->getOperand(0);
+    }
+    else {
+      DEBUG(errs() << "Unexpected value at out edge: " << *val << "\n");
+      llvm_unreachable("Expecting InsertValue instruction. Error!");
+    }
+  }
+  // OutValues vector contains all the values that will go out
+  // Assume that the Allocation node only sends the pointers and their sizes
+  // forward
+  unsigned i=0;
+  while(i < numOutputs) { // outputs come in (pointer, size) pairs, so numOutputs is assumed even
+    assert(OutValues[i]->getType()->isPointerTy()
+        && "Expected outgoing edge to be of pointer type");
+    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(OutValues[i])) {
+      if(II->getIntrinsicID() == Intrinsic::visc_malloc) {
+        // Sanity check: Size passed to malloc intrinsic is same as the value
+        // going into the next outgoing edge
+        DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n");
+        DEBUG(errs() << "Out edge value: " << *OutValues[i+1] << "\n");
+        assert(II->getArgOperand(0) == OutValues[i+1]
+            && "Sanity Check Failed: VISC Malloc size argument != next outgoing edge");
+        ANP->insertAllocation(N->getOutDFEdgeAt(i), II->getArgOperand(0));
+        i = i+2; // consumed the (pointer, size) pair
+        continue;
+      }
+    }
+    llvm_unreachable("Expecting visc malloc intrinsic instruction!");
+  }
+  return ANP;
+}
+
+} // End of namespace
+
+char LocalMem::ID = 0; // LLVM pass identification token (address is the ID)
+static RegisterPass<LocalMem> X("localmem",
+                                    "Pass to identify nodes amenable to local memory allocation",
+                                    false /* does not modify the CFG */,
+                                    true /* transformation, not just analysis */); // NOTE(review): RegisterPass' 4th param is 'is_analysis'; value contradicts this comment — confirm
+
diff --git a/lib/LocalMem/LocalMem.exports b/lib/LocalMem/LocalMem.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/MergeDFN/CMakeLists.txt b/lib/MergeDFN/CMakeLists.txt
new file mode 100644
index 0000000000..30e7330d0c
--- /dev/null
+++ b/lib/MergeDFN/CMakeLists.txt
@@ -0,0 +1,12 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( LLVMMergeDFN
+  MergeDFN.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/lib/MergeDFN/LLVMBuild.txt b/lib/MergeDFN/LLVMBuild.txt
new file mode 100644
index 0000000000..099486e6c3
--- /dev/null
+++ b/lib/MergeDFN/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/MergeDFN/LLVMBuild.txt ------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = MergeDFN
+parent = Transforms
diff --git a/lib/MergeDFN/MergeDFN.cpp b/lib/MergeDFN/MergeDFN.cpp
new file mode 100644
index 0000000000..35e70e35ce
--- /dev/null
+++ b/lib/MergeDFN/MergeDFN.cpp
@@ -0,0 +1,2338 @@
+//=== MergeDFN.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "MergeDFN"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+
+#include <sstream>
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+static cl::opt<std::string> Node1Name("mc1",
+                                 cl::init(""),
+                                 cl::Hidden,
+                                 cl::desc("First node candidate for merge"));
+static cl::opt<std::string> Node2Name("mc2",
+                                 cl::init(""),
+                                 cl::Hidden,
+                                 cl::desc("Second node candidate for merge"));
+
+namespace {
+// Helper class declarations
+
+// Helper function declarations
+
+// MergeDFN - ModulePass that merges the two DFG nodes named by -mc1 and -mc2
+struct MergeDFN : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  MergeDFN() : ModulePass(ID) {}
+
+private:
+  // Member variables
+
+  // Functions
+
+public:
+  // Functions
+  bool runOnModule(Module &M); // entry point (defined elsewhere in this file)
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<BuildDFG>(); // needs the dataflow graph produced by BuildDFG
+    AU.addPreserved<BuildDFG>(); //TODO: Check
+  }
+
+};
+
+// Visitor that walks the DFG to locate the two merge candidates (n1, n2),
+class MergeTraversal : public DFNodeVisitor {
+
+private:
+  //Member variables
+  Module &M;
+  BuildDFG &DFG;
+  DFNode *n1; // first merge candidate, matched against -mc1
+  DFNode *n2; // second merge candidate, matched against -mc2
+  DFNode *m;  // NOTE(review): set to NULL here and not visibly used in this chunk — confirm purpose
+
+  //Functions
+  void testNodeName(DFNode* N); // records N as n1/n2 if its function name matches
+
+public:
+  // Constructor
+  MergeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { 
+    n1 = NULL;
+    n2 = NULL;
+    m = NULL;
+  }
+
+  virtual void visit(DFInternalNode* N) {
+    // Follows a bottom-up approach to find the nodes.
+    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
+        e = N->getChildGraph()->end(); i != e; ++i) {
+      DFNode* child = *i;
+      child->applyDFNodeVisitor(*this); // recurse into children before testing N itself
+    }
+
+    DEBUG(errs() << "Testing Node (I) - " << N->getFuncPointer()->getName() << "\n");
+    testNodeName(N);
+    DEBUG(errs() << "\tDONE - " << "\n");
+
+  }
+
+  virtual void visit(DFLeafNode* N) {
+    DEBUG(errs() << "Testing Node (L) - " << N->getFuncPointer()->getName() << "\n");
+    testNodeName(N);
+    DEBUG(errs() << "DONE" << "\n");
+  }
+
+  bool isValidMergeChoise(); // [sic] "Choise" — part of the public name, kept for compatibility
+
+  void mergeDFN(); // performs the actual merge of n1 and n2
+
+};
+
+//===--------------------- Helper Function Declarations --------------===//
+IntrinsicInst* createIdenticalCreateNodeWithDifferentFunction(Function* F,
+                                                            IntrinsicInst* II);
+IntrinsicInst* createNewCreateNodeBasedOn(Function* F, IntrinsicInst* II,
+                                                              Function* Fargs);
+IntrinsicInst* createIdenticalCreateEdgeWithDifferentPort(IntrinsicInst* II,
+unsigned port, bool srcport);
+IntrinsicInst* createIdenticalCreateEdgeWithDifferentNode(IntrinsicInst* II,
+IntrinsicInst* IInode, bool srcnode);
+IntrinsicInst* createIdenticalBindInputWithDifferentNode(IntrinsicInst* II,
+                                                         IntrinsicInst* IInode);
+IntrinsicInst* createIdenticalBindInputWithDifferentPort(IntrinsicInst* II,
+                                                         unsigned port,
+                                                         bool srcport);
+IntrinsicInst* createIdenticalBindOutputWithDifferentNode(IntrinsicInst* II,
+                                                         IntrinsicInst* IInode);
+IntrinsicInst* createIdenticalBindOutputWithDifferentPort(IntrinsicInst* II,
+                                                          unsigned port,
+                                                          bool srcport);
+void updateUsesOfCreateNodeInParent(IntrinsicInst* II1,
+                                    IntrinsicInst* II2,
+                                    IntrinsicInst* IInew,
+                                    std::map<unsigned, unsigned> InMap,
+                                    std::map<unsigned, unsigned> OutMap,
+                                    std::vector<DFEdge*> &DFEdgestoRemove,
+                                    BuildDFG &DFG);
+bool isIncomingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn);
+bool isOutgoingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn);
+bool hasSuccessor(DFNode* N1, DFNode* N2);
+bool hasImmediateSuccesssor(DFNode* N1, DFNode* N2);
+bool checkEdgesType(DFNode* N1, DFNode* N2);
+static void createArgTypes(DFNode* N1, DFNode* N2, std::vector<Type*> &ArgTypes);
+void getChildNodeSplit(DFInternalNode* N,
+                       std::vector<DFNode*> &AllocationNodes,
+                       std::vector<DFNode*> &ComputeNodes);
+void buildInputAndOutputMaps(DFNode* N1, DFNode* N2,
+                             std::map<unsigned, unsigned> &N1InMap,
+                             std::map<unsigned, unsigned> &N1OutMap,
+                             std::map<unsigned, unsigned> &N2InMap,
+                             std::map<unsigned, unsigned> &N2OutMap);
+void buildInAndOutEdgeMaps(DFNode* N1, DFNode* N2,
+                           std::map<unsigned, unsigned> &N1InMap,
+                           std::map<unsigned, unsigned> &N1OutMap,
+                           std::map<unsigned, unsigned> &N2InMap,
+                           std::map<unsigned, unsigned> &N2OutMap);
+static StructType* createReturnType(DFNode* N1, DFNode* N2);
+static void copyAttrList(DFNode* N1, DFNode* N2, Function* F);
+static void copyArgumentNames(DFNode* N1, DFNode* N2, Function* F);
+void createShiftMap(Function* F, unsigned fromPos, unsigned num,
+                    unsigned shift, std::vector<unsigned> &ShiftMap);
+void shiftArgs(Function* F, unsigned fromPos, unsigned num,
+               unsigned shift, std::vector<unsigned> &ShiftMap);
+static Function* createEmptyDFNodeFunction(DFNode* N1, DFNode* N2, Module &M);
+static Function* createLeafDFNodeFunction(DFNode* N1, DFNode* N2, Module &M,
+                                          unsigned numOfN1AllocArgs,
+                                          unsigned posOfN1AllocArgs,
+                                          unsigned numOfN2AllocArgs);
+static Function* createInternalDFNodeFunction(DFNode* N1, DFNode* N1an,
+                                              DFNode* N1cn, DFNode* N2,
+                                              DFNode* N2an, DFNode* N2cn,
+                                              Function* Fa, Function* Fc,
+                                              Module &M,
+                                              unsigned numOfN1AllocArgs,
+                                              unsigned posOfN1AllocArgs,
+                                              unsigned numOfN2AllocArgs);
+void createNewInternalNodeIntrinsics(DFNode* N1,
+                                     DFNode* N2,
+                                     DFNode* N1a,
+                                     DFNode* N1c,
+                                     DFNode* N2a,
+                                     DFNode* N2c,
+                                     IntrinsicInst* IInewa,
+                                     IntrinsicInst* IInewc,
+                                     Function* Fa, //FIXME: Unused
+                                     Function* Fc,
+                        std::vector<IntrinsicInst*> &IntrinsicInstructionsToAdd,
+                        std::vector<IntrinsicInst*> &IntermediateInstructions);
+Argument* getFunctionArgumentAt(Function* F, unsigned i);
+void removeUnnecessaryInputEdges(DFNode* N, DFNode* N1,
+                                 unsigned numOfN1AllocArgs,
+                                 unsigned numOfN2AllocArgs);
+void deleteInternalNodeFunction(DFNode* N, BuildDFG &DFG);
+static visc::Target getPreferredTarget(Function* F);
+static void addHint(Function* F, visc::Target T);
+static void removeHint(Function* F, visc::Target T);
+std::string getTestModuleName(Module &M);
+
+
+//===--------------------- MergeDFN Outlined Functions --------------===//
+void MergeTraversal::testNodeName(DFNode* N) { // record N as n1/n2 when its function name matches -mc1/-mc2
+
+  if (N->getFuncPointer()->getName() == Node1Name) { // first candidate (-mc1)
+  //if (N->getFuncPointer()->getName() ==  "WrapperDilate_cloned") {
+  //if (N->getFuncPointer()->getName() ==  "WrapperDilate_cloned_WrapperErode_cloned") {
+  //if (N->getFuncPointer()->getName() ==  "WrapperHorizontal_cloned") {
+  //if (N->getFuncPointer()->getName() ==  "WrapperHorizontal_cloned_WrapperVertical_cloned") {
+    n1 = N;
+  }
+  else if (N->getFuncPointer()->getName() == Node2Name) { // second candidate (-mc2)
+  //else if (N->getFuncPointer()->getName() == "WrapperErode_cloned") {
+  //else if (N->getFuncPointer()->getName() == "WrapperLincomb_cloned") {
+  //else if (N->getFuncPointer()->getName() == "WrapperVertical_cloned") {
+  //else if (N->getFuncPointer()->getName() == "WrapperSquareRoot_cloned") {
+    n2 = N;
+  }
+}
+
+//TODO: use the topological sort to find merge candidates
+bool MergeTraversal::isValidMergeChoise() {
+  if (!n1 || !n2)
+    return false; // both candidates must have been found by the traversal
+
+  // Check that n1 and n2 have the same
+  // - parent
+  // - hint
+  // - number and size of dimensions of dynamic instances
+  bool valid = (n1->getParent() == n2->getParent()) &&
+               (getPreferredTarget(n1->getFuncPointer()) ==
+                                    getPreferredTarget(n2->getFuncPointer())) &&
+               (n1->getNumOfDim() == n2->getNumOfDim());
+
+  std::vector<Value*> n1dim = n1->getDimLimits();
+  std::vector<Value*> n2dim = n2->getDimLimits();
+  for (unsigned i = 0; (i < n1dim.size()) && valid ; i++)
+    valid = valid && (n1dim[i] == n2dim[i]);
+
+  // n1 should not be a successor of n2
+  valid = valid && !hasSuccessor(n2, n1);
+  // n2 should not be a successor of n1, other than an immediate successor
+  valid = valid && (!hasSuccessor(n1, n2) || hasImmediateSuccesssor(n1, n2));
+
+  if (!valid)
+    return false;
+
+  // Now, check specifically for one or two level cases
+  if (dyn_cast<DFLeafNode>(n1) && dyn_cast<DFLeafNode>(n2)) { // BUGFIX: second operand used to test n1 twice
+    // For now, only allow one to one edges between them
+    return checkEdgesType(n1, n2);
+  }
+
+  //At this point, at least one of them is internal node
+
+  DFInternalNode* n1cast = dyn_cast<DFInternalNode>(n1);
+  DFInternalNode* n2cast = dyn_cast<DFInternalNode>(n2);
+
+  // If not both of them are internal nodes, it is not a valid merging
+  if (!n1cast || !n2cast)
+    return false;
+
+  // At this point, they are both internal nodes
+  // For internal nodes, we only allow one-to-one edges
+  valid = valid && checkEdgesType(n1->getParent(), n2->getParent()); // FIXME: n1 and n2?
+
+  // We need to check that they have the appropriate internal structure
+  std::vector<DFNode*> AllocNodes1, ComputeNodes1, AllocNodes2, ComputeNodes2;
+  getChildNodeSplit(n1cast, AllocNodes1, ComputeNodes1);
+  getChildNodeSplit(n2cast, AllocNodes2, ComputeNodes2);
+
+  // There must be at most a single allocation node within each one of them
+  // There must be exactly one compute node within each one of them
+  valid = valid &&
+    (AllocNodes1.size() <= 1) &&
+    (AllocNodes2.size() <= 1) &&
+    (ComputeNodes1.size() == 1) &&
+    (ComputeNodes2.size() == 1);
+
+  // The compute nodes must be leaf nodes with the same number and size of
+  // dimensions of dynamic instances
+  DFLeafNode* n1cn = dyn_cast<DFLeafNode>(ComputeNodes1[0]);
+  DFLeafNode* n2cn = dyn_cast<DFLeafNode>(ComputeNodes2[0]);
+  if (!n1cn || !n2cn)
+    return false;
+
+  errs() << "Checking if the sizes are same for internal nodes\n";
+
+  valid = valid && (n1cn->getNumOfDim() == n2cn->getNumOfDim());
+  std::vector<Value*> n1cndim = n1cn->getDimLimits();
+  std::vector<Value*> n2cndim = n2cn->getDimLimits();
+
+  for (unsigned i = 0; (i < n1cndim.size()) && valid ; i++) {
+    // These cannot fail, these values have been passed as arguments
+    Argument* n1arg = cast<Argument>(n1cndim[i]);
+    Argument* n2arg = cast<Argument>(n2cndim[i]);
+    unsigned n1argPos = n1arg->getArgNo();
+    unsigned n2argPos = n2arg->getArgNo();
+    // These values are coming from bind intrinsics, thus from the parent node
+    // The position of the argument is the same as the inPort of the incoming
+    // edge of their parent, n1 and n2.
+    DFEdge* n1argEdge = n1->getInDFEdgeAt(n1argPos);
+    DFEdge* n2argEdge = n2->getInDFEdgeAt(n2argPos);
+    // Get source position and node of these edges
+    unsigned n1SrcPos = n1argEdge->getSourcePosition();
+    DFNode* n1SrcNode = n1argEdge->getSourceDF();
+    unsigned n2SrcPos = n2argEdge->getSourcePosition();
+    DFNode* n2SrcNode = n2argEdge->getSourceDF();
+    valid = valid && (n1SrcPos == n2SrcPos) && (n1SrcNode == n2SrcNode);
+  }
+
+  // We must also make sure that any edge that is incoming to the allocation
+  // node of n2 is not from n1
+  if (AllocNodes2.size() == 1) {
+    DFNode* n2an = AllocNodes2[0];
+    unsigned inPort = 0;
+    for (DFNode::const_indfedge_iterator ei = n2an->indfedge_begin(),
+         ee = n2an->indfedge_end(); (ei != ee) && valid ; ei++, inPort++)
+      if (n2an->getExtendedInDFEdgeAt(inPort)->getSourceDF() == ComputeNodes1[0])
+        return false;
+  }
+
+  return valid;
+}
+
+void MergeTraversal::mergeDFN() {
+
+  Function* Fm;
+
+  if (dyn_cast<DFLeafNode>(n1)) { // One level node merging,
+    // n1 and n2 are leaf nodes
+    // Simply create the merged leaf function (with the calls)
+    Fm = createLeafDFNodeFunction(n1, n2, M, 0, 0, 0);
+    addHint(Fm, getPreferredTarget(n1->getFuncPointer()));
+    removeHint(n1->getFuncPointer(), getPreferredTarget(n1->getFuncPointer()));
+    removeHint(n2->getFuncPointer(), getPreferredTarget(n2->getFuncPointer()));
+  } else { // Two level node merging, n1 and n2 are internal nodes
+    // Correct form of internal nodes has been verified in isValidMerge
+    // Both n1 and n2 have at most two children:
+    // a compute node and maybe an allocation node
+    std::vector<DFNode*> AllocationNodes;
+    std::vector<DFNode*> ComputeNodes;
+
+    getChildNodeSplit(cast<DFInternalNode>(n1), AllocationNodes, ComputeNodes);
+    DFLeafNode* N1ComputeNode = cast<DFLeafNode>(ComputeNodes[0]);
+    DFLeafNode* N1AllocationNode =
+      (AllocationNodes.size() == 1) ? cast<DFLeafNode>(AllocationNodes[0]): NULL;
+    AllocationNodes.clear();
+    ComputeNodes.clear();
+    getChildNodeSplit(cast<DFInternalNode>(n2), AllocationNodes, ComputeNodes);
+    DFLeafNode* N2ComputeNode = cast<DFLeafNode>(ComputeNodes[0]);
+    DFLeafNode* N2AllocationNode =
+      (AllocationNodes.size() == 1) ? cast<DFLeafNode>(AllocationNodes[0]): NULL;
+
+    Function* Falloc = NULL;
+    if (N1AllocationNode && N2AllocationNode)
+      Falloc = createLeafDFNodeFunction(N1AllocationNode,
+                                        N2AllocationNode,
+                                        M, 0, 0, 0);
+    else if (N1AllocationNode)
+      Falloc = N1AllocationNode->getFuncPointer();
+    else if (N2AllocationNode)
+      Falloc = N2AllocationNode->getFuncPointer();
+
+    unsigned numOfN1AllocArgs = 0;
+    unsigned posOfN1AllocArgs = 0;
+    unsigned numOfN2AllocArgs = 0;
+    if (N1AllocationNode) {
+      StructType* F1RetTy =
+        cast<StructType>(N1AllocationNode->getFuncPointer()->getReturnType());
+        numOfN1AllocArgs = F1RetTy->getNumElements();
+        // The position where the allocation node's arguments of n1 alloc go in
+        // the merged function's parameter list is the same as it was in n1
+        // compute function, because all the incoming edges to n1 do not change.
+        // We need this information to shift the allocation parameters to the
+        // end of the merged function's parameter list
+        posOfN1AllocArgs =
+          N1AllocationNode->getOutDFEdgeAt(0)->getDestPosition();
+    }
+    if (N2AllocationNode) {
+      StructType* F2RetTy =
+        cast<StructType>(N2AllocationNode->getFuncPointer()->getReturnType());
+        numOfN2AllocArgs = F2RetTy->getNumElements();
+    }
+
+    errs () << "Working on leaf functions ...\n";
+    Function* Fcompute = 
+      createLeafDFNodeFunction(N1ComputeNode,
+                               N2ComputeNode,
+                               M, numOfN1AllocArgs,
+                               posOfN1AllocArgs, numOfN2AllocArgs);
+    addHint(Fcompute, getPreferredTarget(N1ComputeNode->getFuncPointer()));
+    removeHint(N1ComputeNode->getFuncPointer(),
+               getPreferredTarget(N1ComputeNode->getFuncPointer()));
+    removeHint(N2ComputeNode->getFuncPointer(),
+               getPreferredTarget(N2ComputeNode->getFuncPointer()));
+
+    errs () << "Leaf functions merged ...\n";
+    Fm = createInternalDFNodeFunction(n1, N1AllocationNode, N1ComputeNode,
+                                      n2, N2AllocationNode, N2ComputeNode,
+                                      Falloc, Fcompute,
+                                      M, numOfN1AllocArgs,
+                                      posOfN1AllocArgs, numOfN2AllocArgs);
+    addHint(Fm, getPreferredTarget(n1->getFuncPointer()));
+    removeHint(n1->getFuncPointer(), getPreferredTarget(n1->getFuncPointer()));
+    removeHint(n2->getFuncPointer(), getPreferredTarget(n2->getFuncPointer()));
+  }
+  errs () << "Leaf functions merged and Internal Function merged ...\n";
+  // This is before any code generation passes -> no genfunc
+
+  // FIX PARENT DFNode'S FUNCTION
+  DFInternalNode* ParentNode = n1->getParent();
+
+  // Find createNode intrinsics for initial nodes
+  IntrinsicInst* II1 = n1->getInstruction();
+  IntrinsicInst* II2 = n2->getInstruction();
+
+  // Generate createNode Intrinsic for new node and insert it
+  IntrinsicInst* CreateNodeII =
+    createIdenticalCreateNodeWithDifferentFunction(Fm, II1);
+
+  // It needs to be inserted before either of the two.
+  // Find which one is first and add the new intrinsic before it
+  IntrinsicInst* IIfirst = NULL;
+  for (inst_iterator ib = inst_begin(ParentNode->getFuncPointer()),
+       ie = inst_end(ParentNode->getFuncPointer());
+       (ib != ie) && !IIfirst ; ++ib) {
+    Instruction* I = &*ib; // Grab pointer to Instruction
+    if ((I == II1) || (I == II2)) {
+      IIfirst = cast<IntrinsicInst>(I);
+    }
+  }
+  CreateNodeII->insertBefore(IIfirst);
+
+/* The commented-out code below is an alternative to using the BuildDFG       *
+ * interface. It only creates this single node, not continuing with the graph *
+ * it contains, so it would not build the subgraph of an internal node.       *
+ * Instead, I use the call DFG.handleCreateNode.                              */
+
+/*
+// -------------------------------------------------------------------------- //
+// Updating the graph directly
+  // Create the new node and add it to the graph
+  DFLeafNode* mergeDFNode = DFLeafNode::Create(CreateNodeII, Fm,
+                n1->getTargetHint(),
+                ParentNode,
+                n1->getNumOfDim(),
+                n1->getDimLimits());
+  //Done Later: fix rank of mergeDFNode and successors, after edges are fixed
+  //  mergeDFNode->setRank((n1->getRank() > n2->getRank()) ?
+  //                       (n1->getRank()) : (n2->getRank()) );
+
+  ParentNode->addChildToDFGraph(mergeDFNode);
+// -------------------------------------------------------------------------- //
+*/
+
+// -------------------------------------------------------------------------- //
+// Updating the BuildDFG result
+// remove the two nodes from mapping, add the new one
+  errs () << "Updating intrinsics\n";
+  DFG.removeElementFromHandleToDFNodeMap(II1);
+  DFG.removeElementFromHandleToDFNodeMap(II2);
+//  DFG.addElementToHandleToDFNodeMap(CreateNodeII, mergeDFNode);
+  DFG.handleCreateNode(ParentNode, CreateNodeII);
+  DFNode* mergeDFNode = DFG.getHandleToDFNodeMap()[CreateNodeII];
+
+// -------------------------------------------------------------------------- //
+
+  // Need to update every use of the createNode in the parent node function
+  // -- that would be in create edge and bind
+  std::map<unsigned, unsigned> N1InMap;
+  std::map<unsigned, unsigned> N1OutMap;
+  std::map<unsigned, unsigned> N2InMap;
+  std::map<unsigned, unsigned> N2OutMap;
+  // These maps map the old location of an argument/output (to its function's
+  // parameter list/out struct) to the new, after edges removed and functions
+  // merged
+  buildInputAndOutputMaps(n1, n2, N1InMap, N1OutMap, N2InMap, N2OutMap);
+
+  // Edges from n1 to n2 need to be deleted.
+  // They are placed here for deletion at the end.
+  std::vector<DFEdge*> DFEdgestoRemove;
+
+  // Update uses of createNode - that would be createEdge and bind intrinsics -
+  // to use the new createNode intrinsic
+  updateUsesOfCreateNodeInParent(II1, II2, CreateNodeII, N1InMap, N1OutMap,
+                                 DFEdgestoRemove, DFG);
+  updateUsesOfCreateNodeInParent(II2, II1, CreateNodeII, N2InMap, N2OutMap,
+                                 DFEdgestoRemove, DFG);
+
+  // Both II1 and II2 have no uses left. It is safe to remove them.
+  errs() << "Erasing: " << *II1 << "\n";
+  II1->eraseFromParent();
+  errs() << "Erasing: " << *II2 << "\n";
+  II2->eraseFromParent();
+
+// -------------------------------------------------------------------------- //
+// Updating the graph directly
+
+  // Update
+  // - dataflow edges
+  // - successor lists
+  // - incoming and outgoing edge lists
+  // The edges are updated directly, therefore in the DFGraph DFEdgeList as well
+
+  // For n1
+  for (DFNode::indfedge_iterator indfedgeI = n1->indfedge_begin(),
+       indfedgeE = n1->indfedge_end(); indfedgeI != indfedgeE; indfedgeI++) {
+    DFEdge* E = *indfedgeI;
+    // Incoming edges are retargeted to new node in graph
+    E->setDestDF(mergeDFNode);
+    // Incoming edges are added to the incoming edge list
+    // ( no need to add them in the outgoing edge list of source nodes,
+    // they are already there )
+    mergeDFNode->addInDFEdge(E);
+    // Merge node is added to the successor list of the sources of the edges
+    E->getSourceDF()->addSuccessor(mergeDFNode);
+  }
+
+  for (DFNode::outdfedge_iterator outdfedgeI = n1->outdfedge_begin(),
+       outdfedgeE = n1->outdfedge_end(); outdfedgeI != outdfedgeE; outdfedgeI++) {
+    DFEdge* E = *outdfedgeI;
+    // Outgoing edges to n2 are deleted
+    if (E->getDestDF() == n2)  {
+      ParentNode->getChildGraph()->deleteEdge(E);
+      continue;
+    }
+
+    // Outgoing edges are retargeted to start from the new node in graph
+    E->setSourceDF(mergeDFNode);
+    // Outgoing edges' source port is updated
+    E->setSourcePosition(N1OutMap[E->getSourcePosition()]);
+    // Outgoing edges are added to the outgoing edge list
+    // ( no need to add them in the incoming edge list of destination nodes,
+    // they are already there )
+    mergeDFNode->addOutDFEdge(E);
+    // The destination node is added to the successor list of merge node
+    mergeDFNode->addSuccessor(E->getDestDF());
+  }
+
+  // For n2
+  for (DFNode::indfedge_iterator indfedgeI = n2->indfedge_begin(),
+       indfedgeE = n2->indfedge_end(); indfedgeI != indfedgeE; indfedgeI++) {
+    DFEdge* E = *indfedgeI;
+    // Incoming edges from n1 have already been removed from the graph - ignore
+    if (E->getSourceDF() == n1) {
+      DEBUG(errs() << "Edges between n1-n2 have already been removed from graph\n");
+    }
+
+    // Incoming edges are retargeted to new node in graph
+    E->setDestDF(mergeDFNode);
+    // Incoming edges' destination port is updated
+    E->setDestPosition(N2InMap[E->getDestPosition()]);
+    // Incoming edges are added to the incoming edge list
+    // ( no need to add them in the outgoing edge list of source nodes,
+    // they are already there )
+    mergeDFNode->addInDFEdge(E);
+    // Merge node is added to the successor list of the sources of the edges
+    E->getSourceDF()->addSuccessor(mergeDFNode);
+  }
+
+  for (DFNode::outdfedge_iterator outdfedgeI = n2->outdfedge_begin(),
+       outdfedgeE = n2->outdfedge_end(); outdfedgeI != outdfedgeE; outdfedgeI++) {
+    DFEdge* E = *outdfedgeI;
+    // Outgoing edges are retargeted to start from the new node in graph
+    E->setSourceDF(mergeDFNode);
+    // Outgoing edges' source port is updated
+    E->setSourcePosition(N2OutMap[E->getSourcePosition()]);
+    // Outgoing edges are added to the outgoing edge list
+    // ( no need to add them in the incoming edge list of destination nodes,
+    // they are already there )
+    mergeDFNode->addOutDFEdge(E);
+    // The destination node is added to the successor list of merge node
+    mergeDFNode->addSuccessor(E->getDestDF());
+  }
+  
+
+// -------------------------------------------------------------------------- //
+
+
+// -------------------------------------------------------------------------- //
+// Updating the graph directly
+
+  // Compute rank of mergeDFNode and update rank of successors
+  mergeDFNode->setRank((n1->getRank() > n2->getRank()) ?
+                       (n1->getRank()) : (n2->getRank()) );
+
+  // Clear their incoming and outgoing edges vectors, and the successors list
+  n1->clearGraphElements();
+  n2->clearGraphElements();
+
+  // Clear them from the parent graph
+  ParentNode->removeChildFromDFGraph(n1);
+  ParentNode->removeChildFromDFGraph(n2);
+
+  /*
+  delete n1;
+  delete n2;
+  for (unsigned i = 0 ; i < DFEdgestoRemove.size(); i++)
+    delete DFEdgestoRemove[i];
+*/
+
+// -------------------------------------------------------------------------- //
+  errs() << "Removing similar arguments\n";
+  if (dyn_cast<DFLeafNode>(n1)) {
+    removeUnnecessaryInputEdges(mergeDFNode, n1, 0, 0);
+    // Erase old functions from module
+    n1->getFuncPointer()->replaceAllUsesWith(UndefValue::get(n1->getFuncPointer()->getType()));
+    n1->getFuncPointer()->eraseFromParent();
+    n2->getFuncPointer()->replaceAllUsesWith(UndefValue::get(n2->getFuncPointer()->getType()));
+    n2->getFuncPointer()->eraseFromParent();
+
+  } else {
+    std::vector<DFNode*> AllocationNodes;
+    std::vector<DFNode*> ComputeNodes;
+
+    // Get components of n1
+    getChildNodeSplit(cast<DFInternalNode>(n1), AllocationNodes, ComputeNodes);
+    DFLeafNode* N1ComputeNode = cast<DFLeafNode>(ComputeNodes[0]);
+    DFLeafNode* N1AllocationNode =
+      (AllocationNodes.size() == 1) ? cast<DFLeafNode>(AllocationNodes[0]): NULL;
+
+    AllocationNodes.clear();
+    ComputeNodes.clear();
+
+    // Get components of n2
+    getChildNodeSplit(cast<DFInternalNode>(n2), AllocationNodes, ComputeNodes);
+    DFLeafNode* N2AllocationNode =
+      (AllocationNodes.size() == 1) ? cast<DFLeafNode>(AllocationNodes[0]): NULL;
+    DFLeafNode* N2ComputeNode = cast<DFLeafNode>(ComputeNodes[0]);
+
+    AllocationNodes.clear();
+    ComputeNodes.clear();
+
+    // Get components of mergeDFNode
+    getChildNodeSplit(cast<DFInternalNode>(mergeDFNode), AllocationNodes,
+                      ComputeNodes);
+    DFLeafNode* ComputeNode = cast<DFLeafNode>(ComputeNodes[0]);
+
+    unsigned numOfN1AllocArgs = 0;
+    unsigned numOfN2AllocArgs = 0;
+    if (N1AllocationNode) {
+      StructType* F1RetTy =
+        cast<StructType>(N1AllocationNode->getFuncPointer()->getReturnType());
+        numOfN1AllocArgs = F1RetTy->getNumElements();
+    }
+    if (N2AllocationNode) {
+      StructType* F2RetTy =
+        cast<StructType>(N2AllocationNode->getFuncPointer()->getReturnType());
+        numOfN2AllocArgs = F2RetTy->getNumElements();
+    }
+
+    errs() << "Removing unnecessary input arguments\n";
+    removeUnnecessaryInputEdges(ComputeNode, N1ComputeNode, numOfN1AllocArgs,
+                                numOfN2AllocArgs);
+
+    N1ComputeNode->getFuncPointer()->replaceAllUsesWith(UndefValue::get(N1ComputeNode->getFuncPointer()->getType()));
+    N1ComputeNode->getFuncPointer()->eraseFromParent();
+    N2ComputeNode->getFuncPointer()->replaceAllUsesWith(UndefValue::get(N2ComputeNode->getFuncPointer()->getType()));
+    N2ComputeNode->getFuncPointer()->eraseFromParent();
+  }
+
+  errs() << "Deleting internal nodes\n";
+
+  deleteInternalNodeFunction(n1, DFG);
+  deleteInternalNodeFunction(n2, DFG);
+
+  errs() << "Returning\n";
+  return;
+}
+
+bool MergeDFN::runOnModule(Module &M) {
+  errs() << "\nMergeDFN PASS\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Visitor for Code Generation Graph Traversal
+  MergeTraversal *MergeLookup = new MergeTraversal(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    MergeLookup->visit(rootNode);
+  }
+
+  if (MergeLookup->isValidMergeChoise()) {
+    errs() << "Valid Merge Choise. Begin merging..\n";
+    DEBUG(errs() << "Valid Merge Choise. Begin merging..\n");
+    MergeLookup->mergeDFN();
+  } else {
+    errs() << "Not Valid Merge Choise. Abort merging.\n";
+    DEBUG(errs() << "Not Valid Merge Choise. Abort merging.\n");
+  }
+
+  delete MergeLookup;
+
+  return true;
+}
+
+/******************************************************************************
+ *                              Helper functions                              *
+ ******************************************************************************/
+
+// Creates a new createNode intrinsic, similar to II but with different
+// associated function F instead
+IntrinsicInst* createIdenticalCreateNodeWithDifferentFunction(Function* F,
+                                                            IntrinsicInst* II) {
+  Module* M = F->getParent();
+
+  // Find which createNode intrinsic we need to create
+  Function* CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID());
+  Constant* Fp = ConstantExpr::getPointerCast(F,
+                                          Type::getInt8PtrTy(II->getContext()));
+
+  ArrayRef<Value*> CreateNodeArgs;
+  switch (II->getIntrinsicID()) {
+    case Intrinsic::visc_createNode:
+    {
+      CreateNodeArgs = ArrayRef<Value*>(Fp);
+      break;
+    }
+    case Intrinsic::visc_createNode1D:
+    {
+      Value* CreateNode1DArgs[] = {Fp, II->getArgOperand(1)};
+      CreateNodeArgs = ArrayRef<Value*>(CreateNode1DArgs, 2);
+      break;
+    }
+    case Intrinsic::visc_createNode2D:
+    {
+      Value* CreateNode2DArgs[] = {Fp, II->getArgOperand(1),
+                                       II->getArgOperand(2)};
+      CreateNodeArgs = ArrayRef<Value*>(CreateNode2DArgs, 3);
+      break;
+    }
+    case Intrinsic::visc_createNode3D:
+    {
+      Value* CreateNode3DArgs[] = {Fp, II->getArgOperand(1),
+                                       II->getArgOperand(2),
+                                       II->getArgOperand(3)};
+      CreateNodeArgs = ArrayRef<Value*>(CreateNode3DArgs, 4);
+      break;
+    }
+    default :
+      assert(false && "Unknown createNode intrinsic");
+      break;
+  }
+
+  CallInst* CI = CallInst::Create(CreateNodeF,
+                                  CreateNodeArgs,
+                                  F->getName()+".node");
+  IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI);
+  return CreateNodeII;
+}
+
+// Creates a new createNode intrinsic based on II.
+// The new intrinsic has different associated function F instead. II is used to
+// determine the location (in the parameter list of function Fargs) where the
+// arguments of the new intrinsic can be found.
+IntrinsicInst* createNewCreateNodeBasedOn(Function* F, IntrinsicInst* II,
+                                                              Function* Fargs) {
+  Module* M = F->getParent();
+
+  // Find which createNode intrinsic we need to create
+  Function* CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID());
+  Constant* Fp = ConstantExpr::getPointerCast(F,
+                                          Type::getInt8PtrTy(II->getContext()));
+
+  std::vector<Argument*> FArgList;
+  for (auto& arg: Fargs->getArgumentList()) {
+    FArgList.push_back(&arg);
+  }
+
+  ArrayRef<Value*> CreateNodeArgs;
+  switch (II->getIntrinsicID()) {
+    case Intrinsic::visc_createNode:
+    {
+      CreateNodeArgs = ArrayRef<Value*>(Fp);
+      break;
+    }
+    case Intrinsic::visc_createNode1D:
+    {
+      Value* CreateNode1DArgs[] = {Fp,
+                    FArgList[cast<Argument>(II->getArgOperand(1))->getArgNo()]};
+      CreateNodeArgs = ArrayRef<Value*>(CreateNode1DArgs, 2);
+      break;
+    }
+    case Intrinsic::visc_createNode2D:
+    {
+      Value* CreateNode2DArgs[] = {Fp,
+                    FArgList[cast<Argument>(II->getArgOperand(1))->getArgNo()],
+                    FArgList[cast<Argument>(II->getArgOperand(2))->getArgNo()]};
+      CreateNodeArgs = ArrayRef<Value*>(CreateNode2DArgs, 3);
+      break;
+    }
+    case Intrinsic::visc_createNode3D:
+    {
+      Value* CreateNode3DArgs[] = {Fp,
+                    FArgList[cast<Argument>(II->getArgOperand(1))->getArgNo()],
+                    FArgList[cast<Argument>(II->getArgOperand(2))->getArgNo()],
+                    FArgList[cast<Argument>(II->getArgOperand(3))->getArgNo()]};
+      CreateNodeArgs = ArrayRef<Value*>(CreateNode3DArgs, 4);
+      break;
+    }
+    default :
+      assert(false && "Unknown createNode intrinsic");
+      break;
+  }
+
+  CallInst* CI = CallInst::Create(CreateNodeF,
+                                  CreateNodeArgs,
+                                  F->getName()+".node");
+  IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI);
+  return CreateNodeII;
+}
+
+
+// create an identical createEdge with different src (true) or dst (false) node
+IntrinsicInst* createIdenticalCreateEdgeWithDifferentNode(IntrinsicInst* II,
+IntrinsicInst* IInode, bool srcnode) {
+  // Argument of the function to be called
+  Value* SrcNode = (srcnode) ? IInode: II->getArgOperand(0);
+  Value* DstNode = (srcnode) ? II->getArgOperand(1): IInode;
+
+  Value* EdgeArgs[] = {SrcNode, DstNode,
+                       II->getArgOperand(2),
+                       II->getArgOperand(3),
+                       II->getArgOperand(4),
+                       II->getArgOperand(5)
+                      };
+
+//  Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
+  Function* EdgeF = II->getCalledFunction();
+  CallInst* EdgeInst = CallInst::Create(EdgeF,
+                                        ArrayRef<Value*>(EdgeArgs, 6),
+                                        II->getName()+".repl");
+  IntrinsicInst* newII = dyn_cast<IntrinsicInst>(EdgeInst);
+  assert(newII && "Cannot cast createEdge to IntrinsicInst");
+
+  return newII;
+}
+
+// create an identical createEdge with different src (true) or dst (false) port
+IntrinsicInst* createIdenticalCreateEdgeWithDifferentPort(IntrinsicInst* II,
+unsigned port, bool srcport) {
+  // Argument of the function to be called
+  ConstantInt* PortConstant =
+    ConstantInt::get(Type::getInt32Ty(II->getContext()), port);
+  Value* SrcPort = (srcport) ? PortConstant: II->getArgOperand(3);
+  Value* DstPort = (srcport) ? II->getArgOperand(4): PortConstant;
+
+  Value* EdgeArgs[] = {II->getArgOperand(0),
+                       II->getArgOperand(1),
+                       II->getArgOperand(2),
+                       SrcPort, DstPort,
+                       II->getArgOperand(5)
+                      };
+
+//  Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
+  Function* EdgeF = II->getCalledFunction();
+  CallInst* EdgeInst = CallInst::Create(EdgeF,
+                                        ArrayRef<Value*>(EdgeArgs, 6),
+                                        II->getName()+".repl");
+  IntrinsicInst* newII = dyn_cast<IntrinsicInst>(EdgeInst);
+  assert(newII && "Cannot cast createEdge to IntrinsicInst");
+
+  return newII;
+}
+
+// create an identical bindInput with different destination node
+IntrinsicInst* createIdenticalBindInputWithDifferentNode(IntrinsicInst* II,
+                                                        IntrinsicInst* IInode) {
+  Value* BindArgs[] = {IInode,
+                       II->getArgOperand(1),
+                       II->getArgOperand(2),
+                       II->getArgOperand(3)
+                      };
+//  Function* BindF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
+  Function* BindF = II->getCalledFunction();
+  CallInst* BindInst = CallInst::Create(BindF,
+                                        ArrayRef<Value*>(BindArgs, 4),
+                                        "");
+  IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst);
+  assert(newII && "Cannot cast bind_output to IntrinsicInst");
+
+  return newII;
+}
+
+// create an identical bindInput with different src (true) or dst (false) port
+IntrinsicInst* createIdenticalBindInputWithDifferentPort(IntrinsicInst* II,
+                                                         unsigned port,
+                                                         bool srcport) {
+  // Argument of the function to be called
+  ConstantInt* PortConstant =
+    ConstantInt::get(Type::getInt32Ty(II->getContext()), port);
+  Value* SrcPort = (srcport) ? PortConstant: II->getArgOperand(1);
+  Value* DstPort = (srcport) ? II->getArgOperand(2): PortConstant;
+
+  Value* BindArgs[] = {II->getArgOperand(0),
+                       SrcPort,
+                       DstPort,
+                       II->getArgOperand(3)
+                      };
+//  Function* BindF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
+  Function* BindF = II->getCalledFunction();
+  CallInst* BindInst = CallInst::Create(BindF,
+                                        ArrayRef<Value*>(BindArgs, 4),
+                                        "");
+  IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst);
+  assert(newII && "Cannot cast bind_output to IntrinsicInst");
+
+  return newII;
+}
+
+// create an identical bindOutput with different source node
+IntrinsicInst* createIdenticalBindOutputWithDifferentNode(IntrinsicInst* II,
+                                                        IntrinsicInst* IInode) {
+  Value* BindArgs[] = {IInode,
+                       II->getArgOperand(1),
+                       II->getArgOperand(2),
+                       II->getArgOperand(3)
+                      };
+//  Function* BindF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
+  Function* BindF = II->getCalledFunction();
+  CallInst* BindInst = CallInst::Create(BindF,
+                                        ArrayRef<Value*>(BindArgs, 4),
+                                        "");
+  IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst);
+  assert(newII && "Cannot cast bind_output to IntrinsicInst");
+
+  return newII;
+}
+
+// create an identical bindOutput with different src (true) or dst (false) port
+IntrinsicInst* createIdenticalBindOutputWithDifferentPort(IntrinsicInst* II,
+                                                          unsigned port,
+                                                          bool srcport) {
+  // Argument of the function to be called
+  ConstantInt* PortConstant =
+    ConstantInt::get(Type::getInt32Ty(II->getContext()), port);
+  Value* SrcPort = (srcport) ? PortConstant: II->getArgOperand(1);
+  Value* DstPort = (srcport) ? II->getArgOperand(2): PortConstant;
+
+  Value* BindArgs[] = {II->getArgOperand(0),
+                       SrcPort,
+                       DstPort,
+                       II->getArgOperand(3)
+                      };
+//  Function* BindF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
+  Function* BindF = II->getCalledFunction();
+  CallInst* BindInst = CallInst::Create(BindF,
+                                        ArrayRef<Value*>(BindArgs, 4),
+                                        "");
+  IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst);
+  assert(newII && "Cannot cast bind_output to IntrinsicInst");
+
+  return newII;
+}
+
// Replaces every use of the createNode intrinsic II1 (one of the two merged
// nodes) with a use of IInew, the createNode intrinsic of the merged node.
// In the parent, a createNode handle can only be used by createEdge,
// bind_input, or bind_output intrinsics:
// - createEdge intrinsics connecting II1 and II2 (the createNode of the OTHER
//   merged node) describe edges internal to the merged node: they are erased,
//   and the corresponding DFEdge is queued in DFEdgestoRemove for the caller.
// - every other createEdge/bind intrinsic is re-created with its port
//   renumbered through InMap (input positions) / OutMap (output positions),
//   which map an argument/output's pre-merge position to its post-merge one.
// The BuildDFG handle-to-DFEdge mapping is updated to match.
// NOTE(review): the clones created here still reference II1 until the final
// replaceAllUsesWith, so II1's use list grows while it is being iterated;
// this appears to rely on LLVM prepending new uses - confirm on LLVM upgrade.
void updateUsesOfCreateNodeInParent(IntrinsicInst* II1,
                                    IntrinsicInst* II2,
                                    IntrinsicInst* IInew,
                                    std::map<unsigned, unsigned> InMap,
                                    std::map<unsigned, unsigned> OutMap,
                                    std::vector<DFEdge*> &DFEdgestoRemove,
                                    BuildDFG &DFG) {
  // Intrinsics to erase after the traversal (erasing during the user-list
  // walk would invalidate the iterator).
  std::vector<IntrinsicInst*> IItoRemove;

  for (Value::user_iterator i = II1->user_begin(), ie = II1->user_end();
       i != ie; ++i) {
    Instruction *VI = dyn_cast<Instruction>(*i);
    IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI);
    assert(II && "Use of a node handle outside of a visc intrinsic");

    switch(II->getIntrinsicID()) {
      case Intrinsic::visc_createEdge:
        {
        if (isOutgoingEdgeIntrinsic(II,II1)) { // check for outgoing edges
          if (isIncomingEdgeIntrinsic(II,II2)) {
            // edge is between merged nodes
            // createEdge is marked for deletion, if not already there
            // (the same intrinsic is visited once per use of II1 it contains)
            if (std::find(IItoRemove.begin(),IItoRemove.end(),II) == IItoRemove.end()) {
              IItoRemove.push_back(II);
              // ------------------------------------------------------------ //
              // Updating the BuildDFG result
              // remove handle for non-existing edge in mapping; the DFEdge
              // itself is handed back to the caller for graph-level removal
              DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II];
              DFG.removeElementFromHandleToDFEdgeMap(II);
              DFEdgestoRemove.push_back(EdgeInMapping);
              // ------------------------------------------------------------ //
            }
          } else { // Edge is outgoing, but to another node in the graph
            // We need to change Src and SrcPort
            // create an identical createEdge with different srcport
            unsigned srcPos = cast<ConstantInt>(II->getOperand(3))->getZExtValue();
            IntrinsicInst* newII =
              createIdenticalCreateEdgeWithDifferentPort(II,
                                                         OutMap[srcPos],
                                                         true);
            // and insert it before the current create edge
            newII->insertBefore(II);
            // change of operand II1 will happen at the end with replaceAllUsesWith
            // mark this createEdge for deletion
            IItoRemove.push_back(II);
            // -------------------------------------------------------------- //
            // Updating the BuildDFG result
            // replace handle for edge in mapping: same DFEdge, new intrinsic
            DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II];
            DFG.removeElementFromHandleToDFEdgeMap(II);
            DFG.addElementToHandleToDFEdgeMap(newII, EdgeInMapping);
            // -------------------------------------------------------------- //
          }
        } else { // isIncomingEdgeIntrinsic(II,II1) : check for incoming edges
          if (isOutgoingEdgeIntrinsic(II,II2)) {
            // edge is between merged nodes
            // createEdge is marked for deletion, if not already there
            if (std::find(IItoRemove.begin(),IItoRemove.end(),II) == IItoRemove.end()) {
              IItoRemove.push_back(II);
              // ------------------------------------------------------------ //
              // Updating the BuildDFG result
              // remove handle for non-existing edge in mapping
              DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II];
              DFG.removeElementFromHandleToDFEdgeMap(II);
              DFEdgestoRemove.push_back(EdgeInMapping);
              // ------------------------------------------------------------ //
            }
          } else { // Edge is incoming, but from another node
            // We need to change Dst node and DstPort
            // create an identical createEdge with different dstport
            unsigned dstPos = cast<ConstantInt>(II->getOperand(4))->getZExtValue();
            IntrinsicInst* newII =
              createIdenticalCreateEdgeWithDifferentPort(II,
                                                         InMap[dstPos],
                                                         false);
            // and insert it before the current create edge
            newII->insertBefore(II);
            // change of operand II1 will happen at the end with replaceAllUsesWith
            // mark this createEdge for deletion
            IItoRemove.push_back(II);
            // -------------------------------------------------------------- //
            // Updating the BuildDFG result
            // replace handle for edge in mapping
            DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II];
            DFG.removeElementFromHandleToDFEdgeMap(II);
            DFG.addElementToHandleToDFEdgeMap(newII, EdgeInMapping);
            // -------------------------------------------------------------- //
          }
        }
        }
        break;
      case Intrinsic::visc_bind_input:
        {
        // incoming bind from parent node
        // We need to change Dst node and DstPort
        // create an identical bindInput with different dstport
        unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
        IntrinsicInst* newII =
              createIdenticalBindInputWithDifferentPort(II, InMap[dstPos], false);
        // and insert it before the current bind
        newII->insertBefore(II);
        // change of operand II1 will happen at the end with replaceAllUsesWith
        // mark this bind for deletion
        IItoRemove.push_back(II);
        // ------------------------------------------------------------------ //
        // Updating the BuildDFG result
        // replace handle for edge in mapping (binds map to DFEdges too)
        DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II];
        DFG.removeElementFromHandleToDFEdgeMap(II);
        DFG.addElementToHandleToDFEdgeMap(newII, EdgeInMapping);
        // ------------------------------------------------------------------ //
        }
        break;
      case Intrinsic::visc_bind_output:
        {
        // outgoing bind to parent node
        // We need to change Src node and SrcPort
        // create an identical bindOutput with different srcport
        unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
        IntrinsicInst* newII =
              createIdenticalBindOutputWithDifferentPort(II, OutMap[srcPos], true);
        // and insert it before the current bind
        newII->insertBefore(II);
        // change of operand II1 will happen at the end with replaceAllUsesWith
        // mark this bind for deletion
        IItoRemove.push_back(II);
        // ------------------------------------------------------------------ //
        // Updating the BuildDFG result
        // replace handle for edge in mapping
        DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II];
        DFG.removeElementFromHandleToDFEdgeMap(II);
        DFG.addElementToHandleToDFEdgeMap(newII, EdgeInMapping);
        // ------------------------------------------------------------------ //
        }
        break;
      default :
        assert(false && "Unknown use of node handle");
        break;
    }
  }

  // Delete gathered instructions (deferred so the user-list walk stays valid)
  for (std::vector<IntrinsicInst *>::iterator ib = IItoRemove.begin(),
       ie = IItoRemove.end(); ib != ie; ++ib) {
    DEBUG(errs() << "Erasing: " << **ib << "\n");
    (*ib)->eraseFromParent();
  }

  // Change all remaining edge-bind intrinsics containing n1 to the new node
  II1->replaceAllUsesWith(IInew);

}
+
// Query the kind of edge described by a createEdge intrinsic
// with respect to the node handle IIn
+bool isIncomingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn) {
+  Value* Src = IIe->getArgOperand(1);
+  IntrinsicInst* ArgII = cast<IntrinsicInst>(Src);
+//  IntrinsicInst* ArgII = cast<IntrinsicInst>(Src->stripPointerCasts());
+  assert(ArgII && "First argument of createEdge is not an intrinsic");
+  return (ArgII == IIn);
+}
+
+bool isOutgoingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn) {
+  Value* Src = IIe->getArgOperand(0);
+  IntrinsicInst* ArgII = cast<IntrinsicInst>(Src);
+//  IntrinsicInst* ArgII = cast<IntrinsicInst>(Src->stripPointerCasts());
+  assert(ArgII && "First argument of createEdge is not an intrinsic");
+  return (ArgII == IIn);
+}
+
+/*
+ * Return true if n2 is a successor of n1
+ */
+bool hasSuccessor(DFNode* N1, DFNode* N2) {
+  for (DFNode::const_successor_iterator i = N1->successors_begin(),
+                                        e = N1->successors_end();
+       i != e; i++) {
+    DFNode* N = *i;
+    if ((N == N2) || (hasSuccessor(N,N1))) return true;
+  }
+  return false;
+}
+
+/*
+ * Return true if n2 is an immediate successor of n1
+ */
+bool hasImmediateSuccesssor(DFNode* N1, DFNode* N2) {
+  for (DFNode::const_successor_iterator i = N1->successors_begin(),
+                                        e = N1->successors_end();
+       i != e; i++) {
+    DFNode* N = *i;
+    if (N == N2) return true;
+  }
+  return false;
+}
+
+/*
+ * Return true if all edges between n1 and n2 are one-to-one
+ */
+bool checkEdgesType(DFNode* N1, DFNode* N2) {
+  for (DFNode::const_outdfedge_iterator i = N1->outdfedge_begin(),
+                                        e = N1->outdfedge_end();
+       i != e; i++) {
+    DFEdge* E = *i;
+    if ((E->getDestDF() == N2) && (E->getEdgeType())) return false;
+  }
+  return true;
+}
+
+// Construct argument list
+// Assuming that N2 cannot be an ansestor of N1
+static void createArgTypes(DFNode* N1, DFNode* N2, std::vector<Type*> &ArgTypes) {
+  Function* F1 = N1->getFuncPointer();
+  Function* F2 = N2->getFuncPointer();
+
+  for(auto& arg: F1->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    ArgTypes.push_back(arg.getType());
+  }
+
+  unsigned inport = 0;
+  for(auto& arg: F2->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    if (N2->getExtendedInDFEdgeAt(inport)->getSourceDF() != N1)
+      ArgTypes.push_back(arg.getType());
+    inport++;
+  }
+
+}
+
+// Returns the allocation nodes and the compute nodes of a parent dataflow node
+void getChildNodeSplit(DFInternalNode* N,
+                       std::vector<DFNode*> &AllocationNodes,
+                       std::vector<DFNode*> &ComputeNodes) {
+  DFGraph::const_children_iterator ci = N->getChildGraph()->begin();
+  DFGraph::const_children_iterator ce = N->getChildGraph()->end();
+
+  for ( ; ci != ce; ci++ ) {
+    DFNode* child = *ci;
+    if (child->isAllocationNode())
+      AllocationNodes.push_back(child);
+    else if (!child->isDummyNode())
+     ComputeNodes.push_back(child);
+  }
+
+}
+
+// Creates a map between the old locations of parameters and outputs in the
+// functions before merging, and the new one after merge. Those that correspond
+// to edges that no longer exist (between the merged nodes) are not in the maps.
+// N1's parameters come first in the merged function, so its input map is the
+// identity; N2's surviving parameters are appended after them. Likewise
+// N1's surviving outputs come first in the merged return struct, followed by
+// all of N2's outputs.
+void buildInputAndOutputMaps(DFNode* N1, DFNode* N2,
+                             std::map<unsigned, unsigned> &N1InMap,
+                             std::map<unsigned, unsigned> &N1OutMap,
+                             std::map<unsigned, unsigned> &N2InMap,
+                             std::map<unsigned, unsigned> &N2OutMap) {
+  // N1 inputs: identity mapping; n1NumInputs ends up as N1's parameter count.
+  unsigned n1NumInputs = 0;
+  for (unsigned i = 0; i < N1->getFuncPointer()->getArgumentList().size();
+       i++, n1NumInputs++) {
+    N1InMap[i] = i;
+  }
+  // N2 inputs: skip parameters fed by N1 (those edges disappear); the rest
+  // are placed after N1's inputs, in order.
+  for (unsigned i = 0, inpos = 0;
+       i < N2->getFuncPointer()->getArgumentList().size(); i++) {
+    if (N2->getExtendedInDFEdgeAt(i)->getSourceDF() != N1) {
+      N2InMap[i] = inpos+n1NumInputs;
+      inpos++;
+    }
+  }
+
+  // N1 outputs: skip struct fields consumed by N2; survivors are compacted
+  // to the front of the merged return struct.
+  unsigned n1NumOutputs = 0;
+  StructType* F1RetTy = cast<StructType>(N1->getFuncPointer()->getReturnType());
+  for (unsigned i = 0; i < F1RetTy->getNumElements(); i++) {
+    if (N1->getExtendedOutDFEdgeAt(i)->getDestDF() != N2) {
+      N1OutMap[i] = n1NumOutputs;
+      n1NumOutputs++;
+    }
+  }
+
+  // N2 outputs: all kept, appended after N1's surviving outputs.
+  StructType* F2RetTy = cast<StructType>(N2->getFuncPointer()->getReturnType());
+  for (unsigned i = 0; i < F2RetTy->getNumElements(); i++) {
+    N2OutMap[i] = i+n1NumOutputs;
+  }
+
+  return;
+}
+
+// Creates a map between the old edge ports in the
+// nodes before merging, and the new one after merge. Those that correspond
+// to edges that no longer exist (between the merged nodes) are not in the maps.
+// Mirrors buildInputAndOutputMaps, but operates on the nodes' external edge
+// ports (as seen by the parent graph) rather than function parameters/fields.
+void buildInAndOutEdgeMaps(DFNode* N1, DFNode* N2,
+                           std::map<unsigned, unsigned> &N1InMap,
+                           std::map<unsigned, unsigned> &N1OutMap,
+                           std::map<unsigned, unsigned> &N2InMap,
+                           std::map<unsigned, unsigned> &N2OutMap) {
+
+  // N1's input ports keep their positions (identity mapping).
+  unsigned n1NumInEdges = N1->getFuncPointer()->getArgumentList().size();
+  for (unsigned i = 0; i < n1NumInEdges; i++) {
+    N1InMap[i] = i;
+  }
+
+  // N1's output ports: drop those whose edge goes to N2 (edge becomes
+  // internal); the rest are compacted to the front.
+  unsigned n1NumOutEdges = 0;
+  StructType* F1RetTy = cast<StructType>(N1->getFuncPointer()->getReturnType());
+  for (unsigned i = 0; i < F1RetTy->getNumElements(); i++) {
+    if (N1->getExtendedOutDFEdgeAt(i)->getDestDF() != N2) {
+      N1OutMap[i] = n1NumOutEdges;
+      n1NumOutEdges++;
+    }
+  }
+
+  // N2's input ports: drop those fed by N1; survivors go after N1's inputs.
+  unsigned n2NumInEdges = N2->getFuncPointer()->getArgumentList().size();
+  for (unsigned i = 0, inpos = 0; i < n2NumInEdges; i++) {
+    if (N2->getExtendedInDFEdgeAt(i)->getSourceDF() != N1) {
+      N2InMap[i] = inpos+n1NumInEdges;
+      inpos++;
+    }
+  }
+
+  // N2's output ports: all kept, after N1's surviving outputs.
+  StructType* F2RetTy = cast<StructType>(N2->getFuncPointer()->getReturnType());
+  for (unsigned i = 0; i < F2RetTy->getNumElements(); i++) {
+    N2OutMap[i] = i+n1NumOutEdges;
+  }
+
+  return;
+}
+
+// Construct the return struct type of the merged function: N1's return
+// fields that are NOT consumed by N2, followed by all of N2's return fields.
+// Assuming that N2 cannot be an ancestor of N1.
+static StructType* createReturnType(DFNode* N1, DFNode* N2) {
+  Function* F1 = N1->getFuncPointer();
+  Function* F2 = N2->getFuncPointer();
+
+  // Node functions are expected to return structs (one field per out port).
+  StructType* F1RetTy = dyn_cast<StructType>(F1->getReturnType());
+  assert(F1RetTy && "Return Type must always be a struct");
+  StructType* F2RetTy = dyn_cast<StructType>(F2->getReturnType());
+  assert(F2RetTy && "Return Type must always be a struct");
+
+  std::vector<Type*> ReturnTypeElements;
+  unsigned outPos1 = 0, outPos2 = 0, outPosM = 0;
+  // Fields of F1's return struct whose edge goes to N2 become internal
+  // after the merge and are dropped from the combined return type.
+  for (StructType::element_iterator i = F1RetTy->element_begin(),
+       e = F1RetTy->element_end();
+       (i != e) && (outPos1 < F1RetTy->getNumElements()); i++, outPos1++) {
+    if (N1->getExtendedOutDFEdgeAt(outPos1)->getDestDF() == N2)
+      continue;
+    ReturnTypeElements.push_back(*i);
+    outPosM++;
+  }
+
+  // All of F2's return fields are appended unconditionally.
+  for (StructType::element_iterator i = F2RetTy->element_begin(),
+       e = F2RetTy->element_end();
+       i != e && outPos2 < F2RetTy->getNumElements(); i++, outPos2++) {
+    ReturnTypeElements.push_back(*i);
+    outPosM++;
+  }
+
+  errs() << "Return elements = " << ReturnTypeElements.size() << "\n";
+  // Packed struct named "<F1>.<F2>.ty".
+  StructType* FRetTy = StructType::create(F1->getContext(),
+                                  ArrayRef<Type*>(ReturnTypeElements),
+                                 (F1->getName()+"."+F2->getName()+".ty").str(), true);
+
+  errs() << "Struct type created\n";
+  return FRetTy;
+}
+
+// Copy parameter attributes from F1 and F2 onto the merged function F.
+// F1's attributes map positionally; F2's are copied only for parameters whose
+// incoming edge is not from N1 (those parameters were dropped in the merge).
+// Attribute indices are 1-based (+1): index 0 is the return value.
+// Assuming that N2 cannot be an ancestor of N1.
+static void copyAttrList(DFNode* N1, DFNode* N2, Function* F) {
+  Function* F1 = N1->getFuncPointer();
+  Function* F2 = N2->getFuncPointer();
+
+  Function::arg_iterator f1_ai = F1->arg_begin(), f1_ae = F1->arg_end();
+  Function::arg_iterator f2_ai = F2->arg_begin(), f2_ae = F2->arg_end();
+  Function::arg_iterator f_ai = F->arg_begin(), f_ae = F->arg_end();
+
+  unsigned inPos1 = 0, inPos2 = 0, inPosM = 0;
+  // F1's parameters occupy the first positions of F: copy one-to-one.
+  for(; f1_ai != f1_ae && f_ai != f_ae; ++f1_ai, ++f_ai, inPos1++, inPosM++) {
+    AttributeSet AS = F1->getAttributes();
+    DEBUG(errs() << "Copying attributes from " << F1->getName() << " at " << f1_ai->getArgNo() << "\n");
+    AttrBuilder AB(AS, f1_ai->getArgNo()+1);
+    AttributeSet argAS = AttributeSet::get(F1->getContext(), f_ai->getArgNo()+1, AB);
+    F->addAttributes(f_ai->getArgNo()+1, argAS);
+  }
+  // F2's parameters: f_ai advances only when the parameter survived the
+  // merge, so surviving parameters line up with F's remaining slots.
+  for(; f2_ai != f2_ae && f_ai != f_ae; ++f2_ai, inPos2++) {
+    if (N2->getExtendedInDFEdgeAt(inPos2)->getSourceDF() == N1)
+      continue;
+
+    AttributeSet AS = F2->getAttributes();
+    DEBUG(errs() << "Copying attributes from " << F2->getName() << " at " << f2_ai->getArgNo() << "\n");
+    AttrBuilder AB(AS, f2_ai->getArgNo()+1);
+    AttributeSet argAS = AttributeSet::get(F2->getContext(), f_ai->getArgNo()+1, AB);
+    F->addAttributes(f_ai->getArgNo()+1, argAS);
+    ++f_ai;
+    inPosM++;
+  }
+}
+
+// Name the merged function's arguments after the originals: F1's arguments
+// get an "n1_" prefix, F2's surviving arguments an "n2_" prefix. F2
+// arguments fed by an edge from N1 were dropped and get no name slot.
+static void copyArgumentNames(DFNode* N1, DFNode* N2, Function* F) {
+  Function* F1 = N1->getFuncPointer();
+  Function* F2 = N2->getFuncPointer();
+
+  Function::arg_iterator dest = F->arg_begin();
+
+  // All of F1's arguments come first.
+  for (auto &A : F1->getArgumentList()) {
+    dest->setName("n1_" + A.getName());
+    ++dest;
+  }
+
+  // Then F2's arguments, skipping those whose incoming edge is from N1.
+  unsigned port = 0;
+  for (auto &A : F2->getArgumentList()) {
+    if (N2->getExtendedInDFEdgeAt(port)->getSourceDF() != N1) {
+      dest->setName("n2_" + A.getName());
+      ++dest;
+    }
+    ++port;
+  }
+}
+
+// Build ShiftMap, mapping each old argument position of F to its new
+// position after the num arguments starting at fromPos are shifted right by
+// shift positions (and the shift arguments that followed them move left).
+void createShiftMap(Function* F, unsigned fromPos, unsigned num,
+                    unsigned shift, std::vector<unsigned> &ShiftMap) {
+  // Start from the identity mapping over every argument of F.
+  unsigned total = F->getArgumentList().size();
+  for (unsigned pos = 0; pos < total; pos++)
+    ShiftMap.push_back(pos);
+
+  // The num arguments at [fromPos, fromPos+num) move right by shift...
+  for (unsigned pos = fromPos; pos < fromPos + num; pos++)
+    ShiftMap[pos] += shift;
+
+  // ...and the shift arguments that followed them move left by num.
+  for (unsigned pos = fromPos + num; pos < fromPos + num + shift; pos++)
+    ShiftMap[pos] -= num;
+}
+
+// Shifts num arguments starting from fromPos by shift positions to the right,
+// swapping places with the shift arguments that followed them.
+// Updates ShiftMap, which maps old argument position to new.
+// The function type, argument names and parameter attributes of F are all
+// rearranged to match the new order.
+void shiftArgs(Function* F, unsigned fromPos, unsigned num,
+               unsigned shift, std::vector<unsigned> &ShiftMap) {
+  Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+  Function::arg_iterator from = ai;
+
+  // Advance 'from' to the first argument to be shifted.
+  unsigned cnt;
+  for (cnt = 0; from != ae && cnt < fromPos; from++, cnt++) {
+  }
+  assert((cnt == fromPos) && "Invalid start position for argument shifting");
+
+  Function::arg_iterator af = from;
+  std::vector<Type*> ArgTypes;
+  // Owned copies of the names: StringRefs would alias the arguments' name
+  // storage, which is overwritten during the renaming loop below.
+  std::vector<std::string> ArgNames;
+  unsigned argNo = 0;
+
+  // Keep a pristine clone of F so the original parameter attributes can be
+  // read back after F itself has been rearranged.
+  ValueToValueMapTy VMap;
+  Function* F_copy = CloneFunction(F, VMap);
+  F_copy->removeFromParent();
+
+  // Arguments before fromPos stay in place.
+  for ( ; ai != from && ai != ae; ai++, argNo++) {
+    ArgTypes.push_back(ai->getType());
+    ArgNames.push_back(ai->getName().str());
+  }
+
+  // The num arguments to be shifted are skipped for now; they move right.
+  for (unsigned i = 0; (i < num) && (ai != ae); i++, ai++, argNo++) {
+    ShiftMap[argNo] += shift;
+  }
+
+  // The following shift arguments are pushed first; they move left by num.
+  for (unsigned i = 0; (ai != ae) && (i < shift); i++, ai++, argNo++) {
+    ArgTypes.push_back(ai->getType());
+    ArgNames.push_back(ai->getName().str());
+    ShiftMap[argNo] -= num;
+  }
+
+  // Now push the num arguments that were skipped, in their new place.
+  for (unsigned i = 0; (i < num) && (af != ae); i++, af++, argNo++) {
+    ArgTypes.push_back(af->getType());
+    ArgNames.push_back(af->getName().str());
+  }
+
+  // Remaining arguments stay in place.
+  for (; ai != ae; ai++) {
+    ArgTypes.push_back(ai->getType());
+    ArgNames.push_back(ai->getName().str());
+  }
+
+  // Mutate the function type to reflect the new parameter order.
+  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+  PointerType* PTy = FTy->getPointerTo();
+  F->mutateType(PTy);
+
+  // Rename the arguments according to the new order.
+  ai = F->arg_begin();
+  for (unsigned i = 0; ai != ae; ai++, i++) {
+    ai->setName(ArgNames[i]);
+  }
+
+  // Shift attributes: each parameter in the rearranged region
+  // [fromPos, fromPos+num+shift) takes the attributes it carried at its old
+  // position, read from the pristine clone F_copy. Parameter attribute
+  // indices are 1-based (index 0 is the return value).
+  // NOTE: the previous version advanced an *uninitialized* arg_iterator to
+  // locate the source attributes (undefined behavior); this indexed form
+  // computes the old position explicitly, consistent with ShiftMap.
+  for (unsigned newPos = fromPos; newPos < fromPos + num + shift; newPos++) {
+    // New positions [fromPos, fromPos+shift) hold the arguments that moved
+    // left by num; the following num positions hold those that moved right.
+    unsigned oldPos = (newPos < fromPos + shift) ? newPos + num : newPos - shift;
+    AttributeSet ASf = F->getAttributes();
+    AttrBuilder ABfc(F_copy->getAttributes(), oldPos + 1);
+    AttributeSet argASfc = AttributeSet::get(F_copy->getContext(), newPos + 1, ABfc);
+    F->removeAttributes(newPos + 1, ASf.getParamAttributes(newPos + 1));
+    F->addAttributes(newPos + 1, argASfc);
+  }
+
+  // The scratch clone was detached from the module and is no longer needed.
+  delete F_copy;
+}
+
+/*
+ * Create the (empty) function for the merged node:
+ * - argument types: N1's parameters, then N2's parameters not fed by N1
+ * - struct return type: N1's surviving fields, then all of N2's fields
+ * Argument names and attributes are carried over from the original
+ * functions. The new function is inserted into module M.
+ */
+static Function* createEmptyDFNodeFunction(DFNode* N1, DFNode* N2, Module &M) {
+  Function* F1 = N1->getFuncPointer();
+  Function* F2 = N2->getFuncPointer();
+
+  // Combined argument type list.
+  errs() << "Constructing argument list\n";
+  std::vector<Type*> ArgTypes;
+  createArgTypes(N1, N2, ArgTypes);
+
+  // Combined return struct type.
+  errs() << "Constructing return type\n";
+  StructType* FRetTy = createReturnType(N1, N2);
+
+  // Create the empty function "<F1>_<F2>" with the merged signature,
+  // inheriting F1's linkage.
+  FunctionType* FTy = FunctionType::get(FRetTy, ArgTypes, false);
+  Function* F = Function::Create(FTy, F1->getLinkage(),
+                                 F1->getName() + "_" + F2->getName(), &M);
+
+  errs() << "Copying argument names\n";
+  // Carry over argument names, then parameter attributes.
+  copyArgumentNames(N1, N2, F);
+  copyAttrList(N1, N2, F);
+
+  return F;
+}
+
+/* 
+ * Create function of leaf node after merging
+ * - create type
+ * - Create the call instructions
+ * - Create intermediate assignments
+ * - Create assignments to output struct
+ * The two original node functions are called back-to-back, values flowing
+ * over the removed N1->N2 edges become extractvalue/argument pairs, and the
+ * calls are inlined at the end. Allocation-node arguments of N1 (if any) are
+ * shifted so the merged argument order matches the merged node's in ports.
+ */
+static Function* createLeafDFNodeFunction(DFNode* N1, DFNode* N2, Module &M,
+  unsigned numOfN1AllocArgs, unsigned posOfN1AllocArgs,
+  unsigned numOfN2AllocArgs) {
+
+  errs () << "Creating function signature\n";
+  /*
+   * Create empty node function of the correct type
+   */
+  Function* F = createEmptyDFNodeFunction(N1, N2, M);
+
+  // Get return type, needed for building the assignments to the return struct
+  StructType* FRetTy = cast<StructType>(F->getReturnType());
+
+  Function* F1 = N1->getFuncPointer();
+  Function* F2 = N2->getFuncPointer();
+
+  errs () << "Creating function body\n";
+  // This maps i: position in F argument list, to new position in F argument
+  // list (after shifting arguments maybe). Initially, no shift (identity).
+  std::vector<unsigned> FArgsShiftMap(F->getArgumentList().size());
+  for (unsigned i = 0; i < FArgsShiftMap.size(); i++)
+    FArgsShiftMap[i] = i;
+
+  if (numOfN1AllocArgs) {
+    // Number of remaining f2 parameters is initial parameter number of f2
+    // minus the number of edges between n1 and n2. We can also find this by
+    // getting the number of parameters of the new function F and subtract the
+    // number of parameters of F1, since this did not change.
+    unsigned shiftOfN1AllocArgs = F->getArgumentList().size() -
+                                  F1->getArgumentList().size() -
+                                  numOfN2AllocArgs;
+    // Move N1's allocation arguments past F2's surviving parameters,
+    // recording the rearrangement in FArgsShiftMap.
+    shiftArgs(F, posOfN1AllocArgs, numOfN1AllocArgs, shiftOfN1AllocArgs,
+              FArgsShiftMap);
+  }
+
+
+  // Add a basic block to the new, empty function; the placeholder return of
+  // an undef struct is replaced with the real return value at the end.
+  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
+  ReturnInst* RI = ReturnInst::Create(M.getContext(),
+                                      UndefValue::get(FRetTy), BB);
+
+
+  errs () << "Creating function call\n";
+  // Get Argument list of new function into a vector (for easier indexing)
+  std::vector<Value*> FArgs;
+  for (auto& arg: F->getArgumentList()) {
+    FArgs.push_back(&arg);
+  }
+
+  // Create call instruction for first node: F1's arguments are the first
+  // block of F's arguments, looked up through the shift map.
+  std::vector<Value*> Args;
+  for (unsigned i = 0; i < F1->getArgumentList().size(); i++) {
+    Args.push_back(FArgs[FArgsShiftMap[i]]);
+  }
+  CallInst* CI1 = CallInst::Create(F1,
+                                   ArrayRef<Value*>(Args),
+                                   "merged."+F1->getName(),
+                                   RI);
+  Args.clear();
+
+  errs () << "Creating function call for second node\n";
+  // Create call instruction for second node. Arguments previously fed by an
+  // edge from N1 are extracted from CI1's result struct; the rest come from
+  // F's arguments (fargNo tracks the next surviving F2 parameter slot).
+  for(unsigned fargNo = 0, i = 0;
+      i < F2->getArgumentList().size(); i++) {
+    Value* Arg;
+    if (N2->getExtendedInDFEdgeAt(i)->getSourceDF() == N1) {
+      ExtractValueInst *EI =
+        ExtractValueInst::Create(CI1,
+                                 N2->getExtendedInDFEdgeAt(i)->getSourcePosition(),
+                                 "",
+                                 RI);
+      Arg = EI;
+    } else {
+      Arg = FArgs[FArgsShiftMap[F1->getArgumentList().size() + fargNo++]];
+    }
+    Args.push_back(Arg);
+  }
+
+  CallInst* CI2 = CallInst::Create(F2,
+                                   ArrayRef<Value*>(Args),
+                                   "merged."+F2->getName(),
+                                   RI);
+
+  errs () << "Creating extract element instructions\n";
+  // Create extract element instructions for elements of output struct
+  std::vector<ExtractValueInst *> ExtractValueInstVec;
+
+  // First, from node n1: exclude those that go to n2
+  StructType *F1RetTy = dyn_cast<StructType>(F1->getReturnType());
+  for (unsigned i = 0; i < F1RetTy->getNumElements(); i++) {
+    if (N1->getExtendedOutDFEdgeAt(i)->getDestDF() != N2) {
+      ExtractValueInst *EI = ExtractValueInst::Create(CI1, i, "", RI);
+      ExtractValueInstVec.push_back(EI);
+    }
+  }
+  // Then, from node n2 (all of its outputs are kept)
+  StructType *F2RetTy = dyn_cast<StructType>(F2->getReturnType());
+  for (unsigned i = 0; i < F2RetTy->getNumElements(); i++) {
+    ExtractValueInst *EI = ExtractValueInst::Create(CI2, i, "", RI);
+    ExtractValueInstVec.push_back(EI);
+  }
+
+  errs () << "Creating output struct\n";
+  // Create output struct of type FRetTy by chaining insertvalue instructions
+  assert(FRetTy->getNumElements() == ExtractValueInstVec.size() &&
+    "Size of output struct does not match expected number of EE instructions");
+  Value* retVal = UndefValue::get(F->getReturnType());
+
+  for (unsigned i = 0; i < ExtractValueInstVec.size(); i++) {
+    InsertValueInst *IVI =
+      InsertValueInst::Create(retVal, ExtractValueInstVec[i], i, "", RI);
+    retVal = IVI;
+  }
+  // Swap the placeholder undef return for the assembled struct.
+  ReturnInst* newRI = ReturnInst::Create(M.getContext(), retVal);
+  ReplaceInstWithInst(RI, newRI);
+
+  // Inline the two calls so the merged leaf has a single flat body
+  InlineFunctionInfo IFI1, IFI2;
+  InlineFunction(CI1, IFI1, nullptr, false);
+  InlineFunction(CI2, IFI2, nullptr, false);
+
+  return F;
+}
+
+// Create the function of the merged *internal* node. N1/N2 are the internal
+// nodes being merged; N1an/N2an their (optional) allocation children;
+// N1cn/N2cn their compute children. Fa/Fc are the already-merged allocation
+// and compute functions. Emits createNode intrinsics for the merged children
+// and regenerates the createEdge/bind intrinsics via
+// createNewInternalNodeIntrinsics.
+static Function* createInternalDFNodeFunction(DFNode* N1, DFNode* N1an,
+  DFNode* N1cn, DFNode* N2, DFNode* N2an, DFNode* N2cn, Function* Fa,
+  Function* Fc, Module &M, unsigned numOfN1AllocArgs, unsigned posOfN1AllocArgs,
+  unsigned numOfN2AllocArgs) {
+
+  /*
+   * Create empty node function of the correct type
+   */
+  Function* F = createEmptyDFNodeFunction(N1, N2, M);
+
+  // Get return type, needed for building the assignments to the return struct
+  StructType* FRetTy = cast<StructType>(F->getReturnType());
+
+//  Function* F1 = N1->getFuncPointer();
+//  Function* F2 = N2->getFuncPointer();
+
+  // Add a basic block to the new, empty function; the undef return acts as
+  // an insertion point for the intrinsics generated below.
+  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
+  ReturnInst* RI = ReturnInst::Create(M.getContext(),
+                                      UndefValue::get(FRetTy), BB);
+
+  // Get Argument list of new function into a vector (for easier indexing)
+  std::vector<Value*> FArgs;
+  for (auto& arg: F->getArgumentList()) {
+    FArgs.push_back(&arg);
+  }
+
+  // Get pointers to functions inthe original graph
+//  Function* F1a = (N1an) ? N1an->getFuncPointer() : NULL;
+//  Function* F2a = (N2an) ? N2an->getFuncPointer() : NULL;
+//  Function* F1c = N1cn->getFuncPointer();
+//  Function* F2c = N2cn->getFuncPointer();
+
+  // Create the required createNode intrinsics. A single allocation node is
+  // created for the merged function Fa, modeled on whichever original
+  // allocation node exists (N1's takes precedence if both do).
+  IntrinsicInst* AllocII = NULL;
+  if (N1an)
+    AllocII = createIdenticalCreateNodeWithDifferentFunction(Fa,
+                                                        N1an->getInstruction());
+  else if (N2an)
+    AllocII = createIdenticalCreateNodeWithDifferentFunction(Fa,
+                                                        N2an->getInstruction());
+  if (AllocII)
+    AllocII->insertBefore(RI);
+
+  // The position in F (new node function) of the node dimensions parameters is
+  // the same as it was in n1 internal node function, because n1 is the first
+  // one to be added to the resulting merged node.
+  IntrinsicInst* ComputeII =
+    createNewCreateNodeBasedOn(Fc, N1cn->getInstruction(), F);
+  ComputeII->insertBefore(RI);
+
+  // Vector to be populated with instructions to be added to internal node
+  std::vector<IntrinsicInst*> IntrinsicInstructionsToAdd;
+  std::vector<IntrinsicInst*> IntermediateInstructions;
+
+  // Regenerate edge/bind intrinsics, retargeted at the merged children.
+  createNewInternalNodeIntrinsics(N1, N2, N1an, N1cn, N2an, N2cn,
+                                  AllocII, ComputeII,
+                                  Fa /* FIXME: Unused */, Fc,
+                                  IntrinsicInstructionsToAdd,
+                                  IntermediateInstructions);
+
+  // Insert generated intrinsics at new internal function
+  for (auto& Inst: IntrinsicInstructionsToAdd) {
+    Inst->insertBefore(RI);
+  }
+
+  // Intermediates are scaffolding only: they are inserted and then erased.
+  // NOTE(review): presumably the insert gives them a parent so
+  // eraseFromParent is valid and their uses are dropped cleanly -- confirm.
+  for (auto& Inst: IntermediateInstructions) {
+    Inst->insertBefore(RI);
+    Inst->eraseFromParent();
+  }
+
+  return F;
+}
+
+void createNewInternalNodeIntrinsics(DFNode* N1,
+                                     DFNode* N2,
+                                     DFNode* N1a,
+                                     DFNode* N1c,
+                                     DFNode* N2a,
+                                     DFNode* N2c,
+                                     IntrinsicInst* IInewa,
+                                     IntrinsicInst* IInewc,
+                                     Function* Fa, //FIXME: Unused
+                                     Function* Fc,
+                      std::vector<IntrinsicInst*>& IntrinsicInstructionsToAdd,
+                      std::vector<IntrinsicInst*>& IntermediateInstructions) {
+  IntrinsicInst* II1a = (N1a) ? N1a->getInstruction() : NULL;
+  IntrinsicInst* II1c = N1c->getInstruction();
+  IntrinsicInst* II2a = (N2a) ? N2a->getInstruction() : NULL;
+  IntrinsicInst* II2c = N2c->getInstruction();
+
+  Function* F1a = (N1a) ? N1a->getFuncPointer() : NULL;
+  Function* F1c = N1c->getFuncPointer();
+  Function* F2a = (N2a) ? N1a->getFuncPointer() : NULL;
+
+  unsigned n1aNumOfInputs = 0;
+  unsigned n1aNumOfOutputs = 0;
+  unsigned n1aPosOfOutputs = 0;
+  if (N1a) {
+    n1aNumOfInputs = F1a->getArgumentList().size();
+    n1aNumOfOutputs = cast<StructType>(F1a->getReturnType())->getNumElements();
+    n1aPosOfOutputs = N1a->getOutDFEdgeAt(0)->getDestPosition();
+  }
+  unsigned n2aNumOfOutputs = 0;
+  if (N2a) {
+    n2aNumOfOutputs = cast<StructType>(F2a->getReturnType())->getNumElements();
+  }
+
+  unsigned shiftOfN1AllocOutputs = Fc->getArgumentList().size() -
+                                   F1c->getArgumentList().size() -
+                                   n2aNumOfOutputs;
+
+  std::map<unsigned, unsigned> N1cInMap;
+  std::map<unsigned, unsigned> N1cOutMap;
+  std::map<unsigned, unsigned> N2cInMap;
+  std::map<unsigned, unsigned> N2cOutMap;
+  // These maps map the old location of an argument/output (to its function's
+  // parameter list/out struct) to the new, after edges removed and functions
+  // merged
+
+  // This accounts for argument shifting, due to allocation node n1
+  std::vector<unsigned> FcShiftMap;
+
+  buildInputAndOutputMaps(N1c, N2c, N1cInMap, N1cOutMap, N2cInMap, N2cOutMap);
+  createShiftMap(Fc, n1aPosOfOutputs, n1aNumOfOutputs, shiftOfN1AllocOutputs,
+                 FcShiftMap);
+
+
+  std::map<unsigned, unsigned> N1InDFEdgeMap;
+  std::map<unsigned, unsigned> N1OutDFEdgeMap;
+  std::map<unsigned, unsigned> N2InDFEdgeMap;
+  std::map<unsigned, unsigned> N2OutDFEdgeMap;
+  buildInAndOutEdgeMaps(N1, N2, N1InDFEdgeMap, N1OutDFEdgeMap, N2InDFEdgeMap,
+                        N2OutDFEdgeMap);
+
+
+  // Start with the intrinsics for allocation nodes n1a and n2a
+
+  // TODO: This is only for testing, not needed for functionality
+  std::map<IntrinsicInst*, IntrinsicInst*> CreateEdgeAndBindMap;
+
+  if (N1a) { // If there is an allocation node for the first node
+    for (Value::user_iterator i = II1a->user_begin(), ie = II1a->user_end();
+         i != ie; ++i) {
+      Value *v = *i;
+      Instruction *VI = dyn_cast<Instruction>(v);
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI);
+      assert(II && "Use of a node handle outside of a visc intrinsic");
+
+      switch(II->getIntrinsicID()) {
+        case Intrinsic::visc_createEdge:
+          // This is between allocation and compute node of n1.
+          {
+            // Change source to new allocation node
+            IntrinsicInst* IItemp1 =
+              createIdenticalCreateEdgeWithDifferentNode(II, IInewa, true);
+            // Do not change source port
+            // Change destination node to new compute node
+            IntrinsicInst* IItemp2 =
+              createIdenticalCreateEdgeWithDifferentNode(IItemp1, IInewc, false);
+            // Change destination port to new port, after inmap and shift
+            unsigned dstPos = cast<ConstantInt>(II->getOperand(4))->getZExtValue();
+            IntrinsicInst* EI =
+              createIdenticalCreateEdgeWithDifferentPort(IItemp2,
+                                           FcShiftMap[N1cInMap[dstPos]], false);
+            IntrinsicInstructionsToAdd.push_back(EI);
+            IntermediateInstructions.push_back(IItemp1);
+            IntermediateInstructions.push_back(IItemp2);
+            CreateEdgeAndBindMap[II] = EI;
+          }
+          break;
+        case Intrinsic::visc_bind_input:
+         // These are the inputs from the parent node.
+          {
+           // The destination ports will not change, only the destination will
+           // be changed to point to the new allocation node
+            IntrinsicInst* BI =
+              createIdenticalBindInputWithDifferentNode(II, IInewa);
+            IntrinsicInstructionsToAdd.push_back(BI);
+            CreateEdgeAndBindMap[II] = BI;
+          }
+          break;
+        case Intrinsic::visc_bind_output:
+          assert(false && "Allocation node handle found in visc_bind_output");
+          break;
+        default:
+          assert(false && "Unknown use of node handle");
+          break;
+      }
+    }
+  }
+
+  if (N2a) { // If there is an allocation node fot the second node
+    for (Value::user_iterator i = II2a->user_begin(), ie = II2a->user_end();
+         i != ie; ++i) {
+      Value *v = *i;
+      Instruction *VI = dyn_cast<Instruction>(v);
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI);
+      assert(II && "Use of a node handle outside of a visc intrinsic");
+
+      switch(II->getIntrinsicID()) {
+        case Intrinsic::visc_createEdge:
+          // This is between allocation and compute node of n2.
+          {
+            // Change source to new allocation node
+            IntrinsicInst* IItemp1 =
+              createIdenticalCreateEdgeWithDifferentNode(II, IInewa, true);
+            // Change source port to after all outputs of n1a
+            unsigned srcPos = cast<ConstantInt>(II->getOperand(3))->getZExtValue();
+            IntrinsicInst* IItemp2 =
+              createIdenticalCreateEdgeWithDifferentPort(IItemp1,
+                                                srcPos + n1aNumOfOutputs, true);
+            // Change destination node to new compute node
+            IntrinsicInst* IItemp3 =
+              createIdenticalCreateEdgeWithDifferentNode(IItemp2, IInewc, false);
+            // Change destination port to new port, after inmap and shift
+            // Use of FcShiftMap is not required here - allocation outputs of 
+            // n2a will not get shifted, but it is OK to use (1-1 at this point)
+            unsigned dstPos = cast<ConstantInt>(II->getOperand(4))->getZExtValue();
+            IntrinsicInst* EI =
+              createIdenticalCreateEdgeWithDifferentPort(IItemp3,
+                                           FcShiftMap[N2cInMap[dstPos]], false);
+            IntrinsicInstructionsToAdd.push_back(EI);
+            IntermediateInstructions.push_back(IItemp1);
+            IntermediateInstructions.push_back(IItemp2);
+            IntermediateInstructions.push_back(IItemp3);
+            CreateEdgeAndBindMap[II] = EI;
+          }
+          break;
+        case Intrinsic::visc_bind_input:
+         // These are the inputs from the parent node.
+          {
+            // Change destination node to new allocation node
+            IntrinsicInst* IItemp1 = 
+              createIdenticalBindInputWithDifferentNode(II, IInewa);
+            // Change source port to new port, after edgeinmap
+            unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
+            IntrinsicInst* IItemp2 = 
+              createIdenticalBindInputWithDifferentPort(IItemp1,
+                                                       N2InDFEdgeMap[srcPos], true);
+            // Change destination port to new port, after inmap and shift
+            unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
+            IntrinsicInst* BI = 
+              createIdenticalBindInputWithDifferentPort(IItemp2,
+                                                       dstPos + n1aNumOfInputs, false);
+            IntrinsicInstructionsToAdd.push_back(BI);
+            IntermediateInstructions.push_back(IItemp1);
+            IntermediateInstructions.push_back(IItemp2);
+            CreateEdgeAndBindMap[II] = BI;
+          }
+          break;
+        case Intrinsic::visc_bind_output:
+          assert(false && "Allocation node handle found in visc_bind_output");
+          break;
+        default:
+          assert(false && "Unknown use of node handle");
+          break;
+      }
+    }
+  }
+
+  // Continue with the intrinsics for compute nodes n1c and n2c
+
+  for (Value::user_iterator i = II1c->user_begin(), ie = II1c->user_end();
+       i != ie; ++i) { // Handle inputs and outputs of n1 compute node
+    Value *v = *i;
+    Instruction *VI = dyn_cast<Instruction>(v);
+    IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI);
+    assert(II && "Use of a node handle outside of a visc intrinsic");
+
+    switch(II->getIntrinsicID()) {
+      case Intrinsic::visc_createEdge:
+        // This is between allocation and compute node of n1.
+        {
+          // These edges should have been handled when dealing with the
+          // allocation nodes
+          assert(CreateEdgeAndBindMap.find(II) != CreateEdgeAndBindMap.end() &&
+            "Edge between A-C node should have been handled while processing A");
+        }
+        break;
+      case Intrinsic::visc_bind_input:
+       // These are the inputs from the parent node.
+        {
+         // The destination ports will not change, only the destination will
+         // be changed to point to the new compute node
+          IntrinsicInst* BI =
+            createIdenticalBindInputWithDifferentNode(II, IInewc);
+          IntrinsicInstructionsToAdd.push_back(BI);
+          CreateEdgeAndBindMap[II] = BI;
+        }
+        break;
+      case Intrinsic::visc_bind_output:
+       // These are the outputs to the parent node.
+        {
+          // If this goes to n2, ignore edge completely
+          unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
+          if (N1c->getExtendedOutDFEdgeAt(srcPos)->getDestDF() != N2c) {
+            // this bind creates an edge that ends up to another node in the graph
+            // Change source to new compute node
+            IntrinsicInst* IItemp1 =
+              createIdenticalBindOutputWithDifferentNode(II, IInewc);
+            // Change source port to new port after outmap
+            IntrinsicInst* IItemp2 =
+              createIdenticalBindOutputWithDifferentPort(IItemp1,
+                                                         N1cOutMap[srcPos], true);
+            // Change destination port to new port after edgeoutmap
+          unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
+            IntrinsicInst* BI =
+              createIdenticalBindOutputWithDifferentPort(IItemp2,
+                                                         N1OutDFEdgeMap[dstPos], false);
+            IntrinsicInstructionsToAdd.push_back(BI);
+            IntermediateInstructions.push_back(IItemp1);
+            IntermediateInstructions.push_back(IItemp2);
+            CreateEdgeAndBindMap[II] = BI;
+          }
+        }
+        break;
+      default:
+        errs() << "Unknown use: " << *II << "\n";
+        assert(false && "Unknown use of node handle");
+        break;
+    }
+  }
+
+  for (Value::user_iterator i = II2c->user_begin(), ie = II2c->user_end();
+       i != ie; ++i) { // Handle inputs and outputs of n2 compute node
+    Value *v = *i;
+    Instruction *VI = dyn_cast<Instruction>(v);
+    IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI);
+    assert(II && "Use of a node handle outside of a visc intrinsic");
+
+    switch(II->getIntrinsicID()) {
+      case Intrinsic::visc_createEdge:
+        // This is between allocation and compute node of n2.
+        {
+          // These edges should have been handled when dealing with the
+          // allocation nodes
+          assert(CreateEdgeAndBindMap.find(II) != CreateEdgeAndBindMap.end() &&
+            "Edge between A-C node should have been handled while processing A");
+        }
+        break;
+      case Intrinsic::visc_bind_input:
+       // These are the inputs from the parent node.
+        {
+         // If this is incoming from n1 compute node, ignore completely
+          unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
+          if (N2c->getExtendedInDFEdgeAt(dstPos)->getSourceDF() != N1c) {
+            // this bind creates an edge that comes from another node in the graph
+            // Change destination to new compute node
+            IntrinsicInst* IItemp1 =
+                createIdenticalBindInputWithDifferentNode(II, IInewc);
+            // Change source port to new port after edgeinmap
+            unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
+            IntrinsicInst* IItemp2 =
+              createIdenticalBindInputWithDifferentPort(IItemp1,
+                                                        N2InDFEdgeMap[srcPos], true);
+            // Change destination port to new port after inmap and shift
+            unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
+            IntrinsicInst* BI =
+              createIdenticalBindInputWithDifferentPort(IItemp2,
+                                                        FcShiftMap[N2cInMap[dstPos]], false);
+            IntrinsicInstructionsToAdd.push_back(BI);
+            IntermediateInstructions.push_back(IItemp1);
+            IntermediateInstructions.push_back(IItemp2);
+            CreateEdgeAndBindMap[II] = BI;
+          }
+        }
+        break;
+      case Intrinsic::visc_bind_output:
+       // These are the outputs to the parent node.
+        {
+          // this bind creates an edge that ends up to another node in the graph
+          // Change source to new compute node
+          IntrinsicInst* IItemp1 =
+              createIdenticalBindOutputWithDifferentNode(II, IInewc);
+          // Change source port to new port after outmap
+          unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
+          IntrinsicInst* IItemp2 =
+            createIdenticalBindOutputWithDifferentPort(IItemp1,
+                                                       N2cOutMap[srcPos], true);
+          // Change destination port to new port after edgeoutmap
+          unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
+          IntrinsicInst* BI =
+            createIdenticalBindOutputWithDifferentPort(IItemp2,
+                                                       N2OutDFEdgeMap[dstPos], false);
+          IntrinsicInstructionsToAdd.push_back(BI);
+          IntermediateInstructions.push_back(IItemp1);
+          IntermediateInstructions.push_back(IItemp2);
+          CreateEdgeAndBindMap[II] = BI;
+        }
+        break;
+      default:
+        assert(false && "Unknown use of node handle");
+        break;
+    }
+  }
+
+}
+
+// Erase an internal node's function from the module. Before erasing, scrub
+// the BuildDFG analysis result of every node/edge handle owned by the
+// function's graph-construction intrinsics, so the analysis stays consistent.
+// Leaf nodes are left untouched.
+void deleteInternalNodeFunction(DFNode* N, BuildDFG &DFG) {
+
+  // isa<> is the idiomatic boolean type test (dyn_cast<> is for using the
+  // result); behavior is identical.
+  if (isa<DFLeafNode>(N))
+    return;
+
+  for (inst_iterator i = inst_begin(N->getFuncPointer()),
+       e = inst_end(N->getFuncPointer()); i != e ; ++i) {
+    Instruction* I = &*i; // Grab pointer to Instruction
+    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) {
+      switch(II->getIntrinsicID()) {
+        case Intrinsic::visc_createNode:
+        case Intrinsic::visc_createNode1D:
+        case Intrinsic::visc_createNode2D:
+        case Intrinsic::visc_createNode3D:
+          // Updating the BuildDFG result:
+          // remove the node handle from the mapping
+          DFG.removeElementFromHandleToDFNodeMap(II);
+          break;
+        case Intrinsic::visc_createEdge:
+        case Intrinsic::visc_bind_input:
+        case Intrinsic::visc_bind_output:
+          // Updating the BuildDFG result:
+          // remove the edge handle from the mapping
+          DFG.removeElementFromHandleToDFEdgeMap(II);
+          break;
+        default:
+          errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" << *II << "\n";
+          break;
+      }
+    }
+  }
+
+  // Erase the function associated with node N. Replace any remaining uses
+  // with undef first so the module stays well-formed during erasure.
+  Function* F = N->getFuncPointer();
+  DEBUG(errs() << "Removing " << F->getName() << "\n");
+  F->replaceAllUsesWith(UndefValue::get(F->getType()));
+  F->eraseFromParent();
+
+}
+
+/*
+void shiftAttrsToLeftBy(Function* F, unsigned shift, unsigned argNo) {
+  // Source attr location : i+shift (+1), dst : i (+1)
+  for (unsigned i = argNo; i + shift < F->getArgumentList().size(); i++) {
+    AttributeSet AS = F->getAttributes();
+    AttrBuilder AB(AS, i+shift+1);
+    AttributeSet argAS = AttributeSet::get(F->getContext(), i+1, AB);
+    F->removeAttributes(i+1,AS.getParamAttributes(i+1));
+    F->addAttributes(i+1, argAS);
+  }
+
+}
+
+void shiftArgumentNamesToLeftBy(Function* F, unsigned shift, unsigned argNo) {
+  // Source attr location : i+shift (+1), dst : i (+1)
+
+  // Skip arguments up until argNo  
+  Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(), as = F->arg_begin();
+  for ( ; (ai != ae) && (ai->getArgNo() < argNo); ++ai, ++as) { }
+
+  // Find source of name  
+  for ( unsigned i = 0; (i < shift) && (as != ae); i++) {
+    ++as;
+  }
+
+  for ( ; (ai != ae) && (as != ae); ++ai, ++as) {
+    ai->setName(as->getName());
+  }
+
+}
+
+void removeFunctionArgument(Function* F, Argument *ArgToRemove) {
+
+  // Shift attributes one to the left
+  shiftAttrsToLeftBy(F, 1, ArgToRemove->getArgNo());
+  // Shift argument names one to the left
+  shiftArgumentNamesToLeftBy(F, 1, ArgToRemove->getArgNo());
+  // Update the type of F
+  std::vector<Type*> ArgTypes;
+  for(auto& arg: F->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    if (&arg != ArgToRemove)
+      ArgTypes.push_back(arg.getType());
+  }
+  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+  PointerType* PTy = FTy->getPointerTo();
+  F->mutateType(PTy);
+
+}
+*/
+
+// Return a pointer to the i-th formal argument of F. The bounds assertion
+// makes the trailing NULL return unreachable in practice.
+Argument* getFunctionArgumentAt(Function* F, unsigned i) {
+  assert((i < F->getArgumentList().size()) && "Requesting argument in invalid position");
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+       ai != ae; ++ai) {
+    Argument *A = &*ai;
+    if (A->getArgNo() == i)
+      return A;
+  }
+  return NULL;
+}
+
+// TODO: incomplete — the actual argument/edge removal below is still
+// commented out, so currently this only redirects uses of duplicate
+// arguments without shrinking the argument list.
+//
+// After fusing two nodes into N, inputs that feed both halves of N from the
+// same source port are redundant: the second-half argument can reuse the
+// first-half one. N1 is the first fused node; the trailing
+// numOfN1AllocArgs/numOfN2AllocArgs formals are allocation arguments and are
+// excluded from the scan.
+void removeUnnecessaryInputEdges(DFNode* N, DFNode* N1,
+                                 unsigned numOfN1AllocArgs,
+                                 unsigned numOfN2AllocArgs) {
+  Function* F = N->getFuncPointer();
+  Function* F1 = N1->getFuncPointer();
+  // Compute these once - they may change while in the loop
+  unsigned f1ArgListSize = F1->getArgumentList().size();
+  unsigned fArgListSize = F->getArgumentList().size();
+  // Iterate over input parameters of F1 without allocation arguments
+  for (unsigned i = 0; i < f1ArgListSize - numOfN1AllocArgs; i++) {
+    DFEdge* N1InEdge = N->getInDFEdgeAt(i);
+    unsigned n1SrcPos = N1InEdge->getSourcePosition();
+    // NOTE(review): j and pos start equal and advance together; they only
+    // diverge once the commented-out removal code at the bottom is restored.
+    for (unsigned j = f1ArgListSize - numOfN1AllocArgs,
+                  pos = f1ArgListSize - numOfN1AllocArgs;
+         j < fArgListSize - numOfN2AllocArgs; j++, pos++) {
+      DFEdge* N2InEdge = N->getInDFEdgeAt(pos);
+      unsigned n2SrcPos = N2InEdge->getSourcePosition();
+      Argument* n1arg = getFunctionArgumentAt(F, i);
+      Argument* n2arg = getFunctionArgumentAt(F, j);
+      DEBUG(errs() << "Comparing " << *n1arg << " with " << *n2arg << "\n");
+      // If the edges are coming from the same position of the same source node
+      // If the arguments are not pointer arguments, or if they are pointer
+      // arguments without the out attribute (they are only used as inputs)
+      if ((N1InEdge->getSourceDF() == N2InEdge->getSourceDF()) &&
+          (n1SrcPos == n2SrcPos) &&
+          ((!(n1arg->getType()->isPointerTy()) &&
+            !(n2arg->getType()->isPointerTy())) ||
+           (!(hasAttribute(F, i, Attribute::Out)) &&
+            !(hasAttribute(F, pos, Attribute::Out))) ) ) {
+         DEBUG(errs() << "Replacing " << *n1arg << " with " << *n2arg << "\n");
+        // It is safe to remove the second argument and replace its uses with
+        // the first one
+        n2arg->replaceAllUsesWith(n1arg);
+//        removeFunctionArgument(F, n2arg); TODO
+//        removeInputEdgeAt(F, pos);        TODO
+      } else {
+        // It is not safe to remove the second argument. Update position
+//        pos++;                            TODO increase here instead of loop increment
+      }
+    }
+  }
+}
+
+// This function checks the metadata in visc code for a function's target hint
+static visc::Target getPreferredTarget(Function* F) {
+  DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
+  Module* M = F->getParent();
+  // checking for GPU hint
+  NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+  for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* N = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::GPU_TARGET;
+  }
+
+  // checking for SPIR hint
+  HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+  for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* N = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::SPIR_TARGET;
+  }
+
+  // checking for CPU hint
+  HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+  for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* N = HintNode->getOperand(i);
+    Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
+    if(F == FHint)
+      return visc::CPU_TARGET;
+  }
+  return visc::None;
+}
+
+// This function adds the hint as metadata in visc code
+static void addHint(Function* F, visc::Target T) {
+  // Get Module
+  Module* M = F->getParent();
+  DEBUG(errs() << "Set preferred target for " << F->getName() << ": " << T << "\n");
+
+  // Based on the hint, get the hint metadata
+  NamedMDNode* HintNode;
+  switch (T) {
+    case visc::GPU_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+      break;
+    case visc::SPIR_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+      break;
+    case visc::CPU_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+      break;
+    default:
+      llvm_unreachable("Unsupported Target Hint!");
+      break;
+  }
+
+  // Create a node for the function and add it to the hint node
+  MDNode* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F)));
+  HintNode->addOperand(N);
+}
+
+// This function removes the hint as metadata in visc code
+static void removeHint(Function* F, visc::Target T) {
+  // Get Module
+  Module* M = F->getParent();
+  DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T << "\n");
+
+  // Based on the hint, get the hint metadata
+  NamedMDNode* HintNode;
+  switch (T) {
+    case visc::GPU_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+      break;
+    case visc::SPIR_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+      break;
+    case visc::CPU_TARGET:
+      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+      break;
+    default:
+      llvm_unreachable("Unsupported Target Hint!");
+      break;
+  }
+
+  // Gather metadata nodes, and keep those not associated with this function
+  MDNode* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F)));
+  std::vector<MDNode*> MDNodes;
+
+  for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    MDNode* MDN = HintNode->getOperand(i);
+    if (MDN == N) {
+      continue;
+    }
+    MDNodes.push_back(MDN);
+  }
+
+  HintNode->dropAllReferences();
+
+  for (unsigned i = 0; i < MDNodes.size(); i++) {
+    HintNode->addOperand(MDNodes[i]);
+  }
+
+}
+
+// Filename under which the original (pre-transformation) module is saved.
+std::string getTestModuleName(Module &M) {
+  return M.getModuleIdentifier() + ".original.ll";
+}
+
+} // End of namespace mergedfn
+
+char MergeDFN::ID = 0;
+// NOTE(review): RegisterPass's 3rd argument means CFGOnly ("only looks at
+// the CFG") and the 4th is is_analysis — the inline comments below describe
+// different semantics ("modifies the CFG" / "transformation"); confirm the
+// intended flag values against the LLVM pass-registration docs.
+static RegisterPass<MergeDFN> X("mergedfn",
+                                "Dataflow node merging optimization",
+                                true /* modifies the CFG */,
+                                true /* transformation,   *
+                                      * not just analysis */);
+
diff --git a/lib/MergeDFN/MergeDFN.exports b/lib/MergeDFN/MergeDFN.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/ReplaceIntrinsics/CMakeLists.txt b/lib/ReplaceIntrinsics/CMakeLists.txt
new file mode 100644
index 0000000000..0bfb2bf221
--- /dev/null
+++ b/lib/ReplaceIntrinsics/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( ReplaceIntrinsics
+  ReplaceIntrinsics.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/lib/ReplaceIntrinsics/LLVMBuild.txt b/lib/ReplaceIntrinsics/LLVMBuild.txt
new file mode 100644
index 0000000000..6450fa1714
--- /dev/null
+++ b/lib/ReplaceIntrinsics/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/ReplaceIntrinsics/LLVMBuild.txt ---------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ReplaceIntrinsics
+parent = Transforms
+
diff --git a/lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp b/lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp
new file mode 100644
index 0000000000..ef649d8e17
--- /dev/null
+++ b/lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp
@@ -0,0 +1,516 @@
+//=== ReplaceApproxHPVMIntrinsicsWithFCalls.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "REPLACE_APPROXHPVM_INTRINSICS_WITH_FCALLS"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm-c/Core.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h"
+#include <sstream>
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+// TODO: We still need in place analysis, if calls have the same interface
+using namespace inplacedfg;
+
+namespace {
+// Helper class declarations
+
+// Replace ApproxHPVM intrinsics with LLVM function calls.
+// aiming to go through the CPU backend code generation.
+
+// Module pass: walks each dataflow graph and, in CPU-hinted leaf nodes,
+// replaces ApproxHPVM tensor intrinsics with calls to LLVM-level tensor
+// runtime functions so the CPU backend can generate code for them.
+struct DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls() : DFG2LLVM(ID) {}
+
+  // Requires the dataflow graph and the in-place analysis; preserves both.
+  // (override added: these hook virtual methods of the legacy pass classes)
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BuildDFG>();
+    AU.addRequired<InPlaceDFGAnalysisWrapper>();
+    AU.addPreserved<BuildDFG>();
+    AU.addPreserved<InPlaceDFGAnalysisWrapper>();
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_ReplaceApproxHPVMIntrinsicsWithFCalls : public CodeGenTraversal {
+
+private:
+  // Member variables
+
+  // Result of the in-place DFG analysis; consulted when an intrinsic's CPU
+  // implementation operates in place on one of its operands.
+  InPlaceDFGAnalysis::InPlaceDFGParameter *IPP;
+
+  // VISC Runtime API and Tensor runtime API
+
+  /* TODO: I believe that TensorRt is not needed, since we will have llvm
+   implementations linked in, so init and cleanup calls can be removed and
+   relevant code also, but I leave it in for now until verified. */
+  Constant* llvm_hpvm_initTensorRt;    // runtime initialization entry point
+  Constant* llvm_hpvm_cleanupTensorRt; // runtime teardown entry point
+//  Constant* hpvm_request_tensor; DONE: request tensor will not be used
+
+  // Functions
+
+  // True if Op may legally be overwritten by an in-place tensor operation
+  // inside node N (whose function is Fgen).
+  bool isValidOperandForInPlaceOperation(Value *Op, Function *Fgen, DFNode *N);
+
+  // Virtual Functions (CodeGenTraversal hooks)
+  void init();
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+
+  // Constructor: caches the in-place analysis result and declares the
+  // runtime entry points immediately.
+  CGT_ReplaceApproxHPVMIntrinsicsWithFCalls(Module &_M, BuildDFG &_DFG, InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP)
+  : CodeGenTraversal(_M, _DFG), IPP(&_IPP) {
+    initRuntimeAPI();
+  }
+
+};
+
+// Decide whether operand Op of a tensor intrinsic in node N (function Fgen)
+// may legally be overwritten by an in-place tensor operation:
+// - function arguments defer to the in-place DFG analysis result;
+// - values produced by other tensor intrinsics / tensor runtime calls are
+//   fresh allocations and therefore safe to reuse.
+bool CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::isValidOperandForInPlaceOperation(Value *Op,
+                                                  Function *Fgen,
+                                                  DFNode *N) {
+  // We only expect the if branch to be taken
+  if (Argument *Arg = dyn_cast<Argument>(Op)) {
+    DEBUG(errs() << *Arg << "\t: argument, candidate for in place\n");
+    assert((Arg->getParent() == Fgen) &&
+          "Extra Parameter in body of Function\n");
+    // Candidate parameter is a function argument.
+    // In this case, consult the result of in place analysis.
+    // Find position in arg list
+    unsigned pos = Arg->getArgNo();
+    // If this parameter cannot be used for in place operation
+    // code gen cannot continue
+    if (IPP->at(N)[pos]) {
+      DEBUG(errs() << *Arg << "\t: argument, suitable for in place\n");
+      return true;
+    } else {
+      DEBUG(errs() << *Arg << "\t: argument, not suitable for in place\n");
+      return false;
+    }
+  }
+  else {
+    // If it is not an argument, then it needs to be the result of
+    // another intrinsic. These are new objects that are allocated,
+    // and consumed by next intrinsic. Alternatively, the intrinsic
+    // could have been replaced by a call to an LLVM function.
+    // We do not expect a merge pass to have run before the replacement pass,
+    // therefore we do not expect to go in the else branch.
+    DEBUG(errs() << *Op << "\t: Test for result of intrinsic operation\n");
+    if (dyn_cast<IntrinsicInst>(Op)) {
+      // Fixed: 'Arg' is not in scope in this branch; print Op instead
+      // (previously a compile error in +Asserts builds).
+      DEBUG(errs() << *Op << "\t: local, suitable for in place\n");
+      return true;
+    } else if (CallInst *CI = dyn_cast<CallInst>(Op)) {
+      // Guard against indirect calls, where getCalledFunction() is null.
+      Function *Callee = CI->getCalledFunction();
+      return Callee && (Callee->getName()).startswith("tensor");
+    }
+    else {
+      DEBUG(errs() << *Op << "\t: local, not suitable for in place\n");
+      return false;
+    }
+  }
+}
+
+
+// CodeGenTraversal hook: no pass-specific initialization is needed here
+// (runtime declarations are handled by initRuntimeAPI()).
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::init() {
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n");
+
+  // FIXME: set correct path
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_cpu_runtime.ll";
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  if(runtimeModule == nullptr)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n");
+
+  // Get or insert Global declarations for
+  // - initialization
+  // - cleanup
+  // - request a tensor
+  DECLARE(llvm_hpvm_initTensorRt);
+  DECLARE(llvm_hpvm_cleanupTensorRt);
+//  DECLARE(hpvm_request_tensor);
+
+  // Find visc.init and visc.cleanup calls, and add placeholder methods
+  // for initialization and cleanup of the hpvm tensor runtime
+
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n");
+  InitCall = cast<Instruction>(*VI->user_begin());
+  CallInst::Create(llvm_hpvm_initTensorRt,
+                   ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)),
+                   "", InitCall);
+
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n");
+  CleanupCall = cast<Instruction>(*VC->user_begin());
+  CallInst::Create(llvm_hpvm_cleanupTensorRt, ArrayRef<Value*>(), "", CleanupCall);
+
+}
+
+// Internal nodes carry no tensor intrinsics; report the visit and move on.
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::codeGen(DFInternalNode* N) {
+  errs() << "Inside node: " << N->getFuncPointer()->getName() << "\n"
+         << "Skipping internal node\n";
+}
+
+  
+// Leaf-node code generation: replace every ApproxHPVM tensor intrinsic in
+// the node's function with a call to the corresponding tensor_cpu_runtime
+// function, rewire the intrinsic's uses, and erase the replaced intrinsics
+// at the end. Only non-dummy, non-allocation nodes carrying the CPU target
+// hint are processed. (Cleanup: indentation normalized; stray unguarded
+// debug prints wrapped in DEBUG per the file's convention.)
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::codeGen(DFLeafNode* N) {
+
+  // Skip if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Abort if it is an allocation node
+  if(N->isAllocationNode()) {
+    assert(false && "Allocation Node not expected in ApproxHPVM");
+    return;
+  }
+
+  // Search for intrinsics only if the node has the right hint
+  if (!checkPreferredTarget(N, visc::CPU_TARGET)) {
+    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+    return;
+  }
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+  DEBUG(errs() << "function name = " << F->getName() << "\n");
+
+  // Intrinsics replaced below; erased together at the end
+  std::vector<IntrinsicInst *> IItoRemove;
+
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    if (BuildDFG::isViscIntrinsic(I)) {
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+        && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+      /********************* Handle VISC Tensor intrinsics ********************/
+      // Each intrinsic is replaced by a call to a function with an
+      // implementation at the LLVM level (tensor_cpu_runtime).
+      switch (II->getIntrinsicID()) {
+
+      case Intrinsic::visc_tensor_convolution:
+      { /* llvm.visc.tensor.convolution */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor convolution \n");
+
+        // Argument list for the runtime call: input, filter, vertical pad,
+        // horizontal pad, vertical stride, horizontal stride
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+        Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+
+        // conv_mode = 1, conv_precision = 0
+        // NOTE(review): the meaning of these constants is defined by the
+        // runtime — confirm against tensor_cpu_runtime.ll
+        Constant* conv_mode = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+        Constant* conv_precision = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+        Args.push_back(conv_mode);
+        Args.push_back(conv_precision);
+
+        // Create function call
+        Constant* tensorConvolutionCPU;
+        DECLARE(tensorConvolutionCPU);
+        CallInst* CI = CallInst::Create(tensorConvolutionCPU,
+                                        Args, "", II);
+        // Replace uses of the intrinsic with the LLVM call's result
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_mul:
+      { /* llvm.visc.tensor.mul */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor mul\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+
+        // Create function call
+        Constant* tensorGemmCPU;
+        DECLARE(tensorGemmCPU);
+        CallInst* CI = CallInst::Create(tensorGemmCPU,
+                                        Args, "", II);
+        // Replace uses of the intrinsic with the LLVM call's result
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_add:
+      { /* llvm.visc.tensor.add */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor add\n");
+        // Tensor add(a,b) is in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // FIXME: in-place validity must eventually be enforced; the assert is
+        // temporarily disabled, so the result is deliberately unused for now.
+        //assert(inplace &&
+        //       "Operand not valid for in place operation. Code gen aborted.\n");
+        (void)inplace; // suppress unused-variable warning while disabled
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+
+        // Create function call
+        Constant* tensorAddCPU;
+        DECLARE(tensorAddCPU);
+        CallInst::Create(tensorAddCPU, Args, "", II);
+        // In-place operation: the first argument now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_pool_max:
+      case Intrinsic::visc_tensor_pool_mean:
+      { /* llvm.visc.tensor.pool.max / llvm.visc.tensor.pool.mean */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor_pool_max\n");
+        // Tensor pooling is in place for its input argument.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list - tensorPooling(input, poolFunction, window_height,
+        //                               window_width, vertical_pad,
+        //                               horizontal_pad, vertical_stride,
+        //                               horizontal_stride);
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        // Pool function ID: 0 = max, 1 = mean
+        int pool_type = 0;
+        if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean) {
+          pool_type = 1;
+        }
+        Constant* constPoolType =
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), pool_type);
+        Args.push_back(constPoolType);
+        Args.push_back(II->getOperand(1));
+        Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+        Args.push_back(II->getOperand(6));
+
+        // Create function call
+        Constant* tensorPoolingCPU;
+        DECLARE(tensorPoolingCPU);
+        CallInst* CI = CallInst::Create(tensorPoolingCPU, Args, "", II);
+
+        // Replace uses of the intrinsic with the LLVM call's result
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_relu:
+      case Intrinsic::visc_tensor_clipped_relu:
+      case Intrinsic::visc_tensor_tanh:
+      { /* llvm.visc.tensor.{relu, clipped.relu, tanh} */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor activation functions \n");
+        // Activation functions are in place for their single argument.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        if (II->getIntrinsicID() == Intrinsic::visc_tensor_relu) {
+          Constant* tensorReluCPU;
+          DECLARE(tensorReluCPU);
+          CallInst::Create(tensorReluCPU, Args, "", II);
+        } else if (II->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu) {
+          // "Relu2" is the runtime's name for clipped relu
+          Constant* tensorRelu2CPU;
+          DECLARE(tensorRelu2CPU);
+          CallInst::Create(tensorRelu2CPU, Args, "", II);
+        } else if (II->getIntrinsicID() == Intrinsic::visc_tensor_tanh) {
+          Constant* tensorTanhCPU;
+          DECLARE(tensorTanhCPU);
+          CallInst::Create(tensorTanhCPU, Args, "", II);
+        }
+
+        // In-place operation: the first argument now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_softmax:
+      { /* llvm.visc.tensor.softmax */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor softmax\n");
+        // Tensor softmax is in place for its single argument.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        // Create function call
+        Constant* tensorSoftmaxCPU;
+        DECLARE(tensorSoftmaxCPU);
+        CallInst::Create(tensorSoftmaxCPU, Args, "", II);
+        // In-place operation: the first argument now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      default:
+        llvm_unreachable("Unknown VISC Intrinsic!");
+        break;
+
+      }
+
+    }
+
+  }
+
+  // We need to do this explicitly: DCE pass may not remove them.
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around.
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
+       re = IItoRemove.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    (*ri)->eraseFromParent();
+  }
+
+  return;
+}
+
+// Pass entry point: fetch the analysis results, then visit every DFG root,
+// replacing tensor intrinsics with CPU runtime calls along the way.
+bool DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls PASS\n";
+
+  // Dataflow graph computed by the BuildDFG analysis
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // In-place analysis result, printed for diagnostics
+  InPlaceDFGAnalysis::InPlaceDFGParameter IPP =
+    (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP();
+  printInPlaceDFGParameter(IPP);
+
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_ReplaceApproxHPVMIntrinsicsWithFCalls *CGTVisitor =
+    new CGT_ReplaceApproxHPVMIntrinsicsWithFCalls(M, DFG, IPP);
+
+  // Produce code for each independent DFG, starting from its root node
+  for (std::vector<DFInternalNode*>::iterator ri = Roots.begin(),
+       re = Roots.end(); ri != re; ++ri) {
+    CGTVisitor->visit(*ri);
+  }
+
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+  delete CGTVisitor;
+
+  return true;
+}
+
+
+/******************************************************************************
+ *                              Helper functions                              *
+ ******************************************************************************/
+
+
+} // End of namespace
+
+char DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls::ID = 0;
+// NOTE(review): RegisterPass's 3rd argument means CFGOnly ("only looks at
+// the CFG") and the 4th is is_analysis — the inline comments below describe
+// different semantics; confirm the intended flag values against the LLVM
+// pass-registration docs.
+static RegisterPass<DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls> X("replace-intrinsics",
+                                      "Replace ApproxHPVM intrinsics with LLVM calls",
+                                      false /* does not modify the CFG */,
+                                      true /* transformation,   *
+                                            * not just analysis */);
diff --git a/lib/ReplaceIntrinsics/ReplaceIntrinsics.exports b/lib/ReplaceIntrinsics/ReplaceIntrinsics.exports
new file mode 100644
index 0000000000..e69de29bb2
-- 
GitLab