diff --git a/hpvm/include/GenHPVM/GenHPVM.h b/hpvm/include/GenHPVM/GenHPVM.h
index f61d4a7c90dff2e4bff5f781c6c9cc92e3246232..4380433b979216ff3fd86591d117e857abfc5db4 100644
--- a/hpvm/include/GenHPVM/GenHPVM.h
+++ b/hpvm/include/GenHPVM/GenHPVM.h
@@ -26,7 +26,8 @@ using namespace llvm;
 
 namespace genhpvm {
 // GenHPVM - The first implementation.
-struct GenHPVM : public ModulePass {
+class GenHPVM : public ModulePass {
+  public:
   static char ID; // Pass identification, replacement for typeid
   GenHPVM() : ModulePass(ID) {}
 
diff --git a/hpvm/include/SupportHPVM/DFGTreeTraversal.h b/hpvm/include/SupportHPVM/DFGTreeTraversal.h
index e357bb3dd98527fd773570593beac7ce0386c6cf..cb3963b150e8a55b8de482dcb12292b24804a5ff 100644
--- a/hpvm/include/SupportHPVM/DFGTreeTraversal.h
+++ b/hpvm/include/SupportHPVM/DFGTreeTraversal.h
@@ -1,6 +1,3 @@
-#ifndef __DFGTREETRAVERSAL_H__
-#define __DFGTREETRAVERSAL_H__
-
 //=== DFGTreeTraversal.h - Header file for Tree Traversal of the HPVM DFG ====//
 //
 //                     The LLVM Compiler Infrastructure
@@ -10,6 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef __DFGTREETRAVERSAL_H__
+#define __DFGTREETRAVERSAL_H__
+
 #include "BuildDFG/BuildDFG.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
@@ -38,19 +38,22 @@ public:
 
   void visit(DFInternalNode *N) {
     // May visit a nodemore than once, there is no marking it as visited
-    DEBUG(errs() << "Start: In Node (I) - " << N->getFuncPointer()->getName()
-                 << "\n");
+    DEBUG(errs() << "In Node (I) - " << N->getFuncPointer()->getName() << "\n");
+
+    std::vector<DFNode *> Children;
 
     // Follows a bottom-up approach.
     for (DFGraph::children_iterator i = N->getChildGraph()->begin(),
                                     e = N->getChildGraph()->end();
          i != e; ++i) {
       DFNode *child = *i;
-      child->applyDFNodeVisitor(*this);
+      Children.push_back(child);
     }
-
+    for (auto child : Children)
+      child->applyDFNodeVisitor(*this);
     // Process this internal node now.
-    process(N);
+    if (N != nullptr)
+      process(N);
     DEBUG(errs() << "DONE: In Node (I) - " << N->getFuncPointer()->getName()
                  << "\n");
   }
diff --git a/hpvm/include/SupportHPVM/DFGraph.h b/hpvm/include/SupportHPVM/DFGraph.h
index 3da7c0b01a79d52f668795eb072fdcb6381813a9..f6c697bd2daf902bd24276f40bb2a0b8082bf120 100644
--- a/hpvm/include/SupportHPVM/DFGraph.h
+++ b/hpvm/include/SupportHPVM/DFGraph.h
@@ -20,8 +20,8 @@
 #ifndef LLVM_IR_DFGRAPH_H
 #define LLVM_IR_DFGRAPH_H
 
-#include "SupportHPVM/HPVMHint.h"
-#include "SupportHPVM/HPVMUtils.h"
+#include "HPVMHint.h"
+#include "HPVMUtils.h"
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -47,6 +47,9 @@ struct TargetGenFunctions {
   Function *GPUGenFunc;
   Function *CUDNNGenFunc;
   Function *PROMISEGenFunc;
+  Function *FFTGenFunc;
+  Function *VITERBIGenFunc;
+  Function *EPOCHSGenFunc;
 };
 
 struct TargetGenFuncInfo {
@@ -54,6 +57,9 @@ struct TargetGenFuncInfo {
   bool gpu_hasCPUFunc;
   bool cudnn_hasCPUFunc;
   bool promise_hasCPUFunc;
+  bool fft_hasCPUFunc;
+  bool viterbi_hasCPUFunc;
+  bool epochs_hasCPUFunc;
 };
 
 class DFGraph {
@@ -72,11 +78,15 @@ private:
   DFEdgeListType DFEdgeList;   ///< List of Dataflow edges among children
 
 public:
-  DFGraph(DFInternalNode *P);
+  inline DFGraph(DFInternalNode *P);
 
   virtual ~DFGraph() {}
 
-  void addChildDFNode(DFNode *child) { ChildrenList.push_back(child); }
+  void addChildDFNode(DFNode *child) {
+    if (std::find(ChildrenList.begin(), ChildrenList.end(), child) ==
+        ChildrenList.end())
+      ChildrenList.push_back(child);
+  }
 
   void removeChildDFNode(DFNode *child) {
     children_iterator position = std::find(begin(), end(), child);
@@ -85,7 +95,10 @@ public:
   }
 
   // Dataflow edge connecting child dataflow nodes
-  void addDFEdge(DFEdge *E) { DFEdgeList.push_back(E); }
+  void addDFEdge(DFEdge *E) {
+    if (std::find(DFEdgeList.begin(), DFEdgeList.end(), E) == DFEdgeList.end())
+      DFEdgeList.push_back(E);
+  }
 
   DFNode *getEntry() const { return Entry; }
 
@@ -95,8 +108,8 @@ public:
 
   bool isExit(const DFNode *N) const { return N == Exit; }
 
-  void sortChildren();
-  static bool compareRank(DFNode *A, DFNode *B);
+  inline void sortChildren();
+  static inline bool compareRank(DFNode *A, DFNode *B);
 
   // Iterators
   typedef DFNodeListType::iterator children_iterator;
@@ -140,9 +153,10 @@ public:
   //===--------------------------------------------------------------------===//
 
   DFInternalNode *getParent() const { return Parent; }
+  void setParent(DFInternalNode *_Parent) { Parent = _Parent; }
 
   // Child graph is streaming if any of the edges in the edge list is streaming
-  bool isStreaming();
+  inline bool isStreaming();
 
   //**************************************************************************//
   //*                  Functions to modify a dataflow graph                  *//
@@ -154,6 +168,8 @@ public:
     if (position != dfedge_end()) // the edge was found
       DFEdgeList.erase(position);
   }
+  // Returns whether an edge was removed (might not happen if edge not found).
+  inline bool removeEdge(const DFEdge *E);
 };
 
 // DFNode represents a single HPVM Dataflow Node in LLVM.
@@ -165,6 +181,7 @@ public:
 // 4. Pointer to parent Dataflow Node
 // 5. List of children Dataflow Nodes (empty if it is a leaf node)
 // 6. List of Dataflow Edges among children
+// 7. A Node Criticality value
 
 class DFNode {
 
@@ -210,12 +227,17 @@ private:
   const DFNodeKind Kind;          ///< Kind of Node Internal/Leaf
   hpvm::Target Tag;               ///< Code Generated for which backend
   hpvm::Target Hint;              ///< To store preferred backend
+  unsigned Criticality;
+  bool Root;
 
+  std::vector<hpvm::EPOCHS_DEVICES> EpochsTargets; // Store EPOCHS targets
 public:
   virtual ~DFNode() {
     // TODO: Check if fields DimLimits and OutputType need to freed here.
   }
 
+  StringRef getName() { return FuncPointer->getName(); }
+
   // Iterators
   typedef DFNodeListType::iterator successor_iterator;
   typedef DFNodeListType::const_iterator const_successor_iterator;
@@ -287,36 +309,75 @@ public:
 
   DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint,
          DFInternalNode *_Parent, unsigned _NumOfDim,
-         std::vector<Value *> _DimLimits, DFNodeKind _K);
+         std::vector<Value *> _DimLimits, DFNodeKind _K, unsigned _Criticality);
 
+  void setRoot() { Root = true; }
   bool isRoot() const {
-    // It is a root node is it was created from a launch intrinsic
-    if (II->getCalledFunction()->getName().equals("llvm.hpvm.launch")) {
-      assert(Level == 0 && "Root node's level is zero.");
+    if (Root)
       return true;
-    }
-    return false;
+    else
+      return false;
+    // It is a root node is it was created from a launch intrinsic
+    //    if (II->getCalledFunction()->getName().equals("llvm.hpvm.launch")) {
+    //      assert(Level == 0 && "Root node's level is zero.");
+    //      return true;
+    //    }
+    //    return false;
   }
 
   StructType *getOutputType() const { return OutputType; }
 
   void addSuccessor(DFNode *N) { Successors.push_back(N); }
 
+  void removeSuccessor(DFNode *N) {
+    Successors.erase(std::remove(Successors.begin(), Successors.end(), N),
+                     Successors.end());
+  }
+
   // Add incoming dataflow edge
-  void addInDFEdge(DFEdge *E) { InDFEdges.push_back(E); }
+  void addInDFEdge(DFEdge *E) {
+    if (std::find(InDFEdges.begin(), InDFEdges.end(), E) == InDFEdges.end())
+      InDFEdges.push_back(E);
+  }
 
   // Add outgoing dataflow edge
-  void addOutDFEdge(DFEdge *E) { OutDFEdges.push_back(E); }
+  void addOutDFEdge(DFEdge *E) {
+    if (std::find(OutDFEdges.begin(), OutDFEdges.end(), E) == OutDFEdges.end())
+      OutDFEdges.push_back(E);
+  }
+
+  // For removing dataflow edges
+  // Note (applies to add*Edge too): does not update edge data in other places
+  // (nodes and the graph). Does not update successors either.
+  void removeInDFEdge(const DFEdge *E) {
+    InDFEdges.erase(std::remove(InDFEdges.begin(), InDFEdges.end(), E),
+                    InDFEdges.end());
+  }
+
+  void removeOutDFEdge(const DFEdge *E) {
+    OutDFEdges.erase(std::remove(OutDFEdges.begin(), OutDFEdges.end(), E),
+                     OutDFEdges.end());
+  }
 
   Function *getFuncPointer() const { return FuncPointer; }
 
   void setFuncPointer(Function *_FuncPointer) { FuncPointer = _FuncPointer; }
 
   IntrinsicInst *getInstruction() const { return II; }
+  void setInstruction(IntrinsicInst *_II) { II = _II; }
 
   DFInternalNode *getParent() const { return Parent; }
+  void setParent(DFInternalNode *_Parent) { Parent = _Parent; }
+
+  // Adds the node to the graph of P and updates this node's parent
+  inline void addToNodeGraph(DFInternalNode *P);
 
   unsigned getNumOfDim() const { return NumOfDim; }
+  void setNumOfDim(unsigned numDims) { NumOfDim = numDims; }
+
+  const std::vector<Value *> &getDimLimits() const { return DimLimits; }
+  std::vector<Value *> &getDimLimits() { return DimLimits; }
+  void setDimLimits(std::vector<Value *> _DimLimits) { DimLimits = _DimLimits; }
 
   std::vector<Value *> getDimLimits() const { return DimLimits; }
 
@@ -365,6 +426,15 @@ public:
       break;
     case hpvm::CPU_OR_GPU_TARGET:
       break;
+    case hpvm::FFT_TARGET:
+      GenFuncInfo.fft_hasCPUFunc = isCPUFunc;
+      break;
+    case hpvm::VITERBI_TARGET:
+      GenFuncInfo.viterbi_hasCPUFunc = isCPUFunc;
+      break;
+    case hpvm::EPOCHS_TARGET:
+      GenFuncInfo.epochs_hasCPUFunc = isCPUFunc;
+      break;
     default:
       assert(false && "Unknown target\n");
       break;
@@ -384,6 +454,12 @@ public:
       return GenFuncInfo.cudnn_hasCPUFunc;
     case hpvm::TENSOR_TARGET:
       return GenFuncInfo.promise_hasCPUFunc;
+    case hpvm::FFT_TARGET:
+      return GenFuncInfo.fft_hasCPUFunc;
+    case hpvm::VITERBI_TARGET:
+      return GenFuncInfo.viterbi_hasCPUFunc;
+    case hpvm::EPOCHS_TARGET:
+      return GenFuncInfo.epochs_hasCPUFunc;
     case hpvm::CPU_OR_GPU_TARGET:
       assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n");
     default:
@@ -427,6 +503,32 @@ public:
       GenFuncs.PROMISEGenFunc = F;
       GenFuncInfo.promise_hasCPUFunc = isCPUFunc;
       break;
+    case hpvm::EPOCHS_TARGET:
+      if (isCPUFunc == true) {
+        if (GenFuncInfo.epochs_hasCPUFunc) {
+          DEBUG(errs() << "Warning: Second generated EPOCHS function for node "
+                       << FuncPointer->getName() << "\n");
+        }
+      }
+      GenFuncs.EPOCHSGenFunc = NULL; // We do not actually generate a function
+      GenFuncInfo.epochs_hasCPUFunc = isCPUFunc;
+      break;
+    case hpvm::FFT_TARGET:
+      if (GenFuncs.FFTGenFunc != NULL) {
+        DEBUG(errs() << "Warning: Second generated FFT function for node "
+                     << FuncPointer->getName() << "\n");
+      }
+      GenFuncs.FFTGenFunc = F;
+      GenFuncInfo.fft_hasCPUFunc = isCPUFunc;
+      break;
+    case hpvm::VITERBI_TARGET:
+      if (GenFuncs.VITERBIGenFunc != NULL) {
+        DEBUG(errs() << "Warning: Second generated VITERBI function for node "
+                     << FuncPointer->getName() << "\n");
+      }
+      GenFuncs.VITERBIGenFunc = F;
+      GenFuncInfo.viterbi_hasCPUFunc = isCPUFunc;
+      break;
     case hpvm::CPU_OR_GPU_TARGET:
       assert(false && "A node function should be set with a tag specifying its \
                 type, not the node hint itself\n");
@@ -449,6 +551,12 @@ public:
       return GenFuncs.CUDNNGenFunc;
     case hpvm::TENSOR_TARGET:
       return GenFuncs.PROMISEGenFunc;
+    case hpvm::FFT_TARGET:
+      return GenFuncs.FFTGenFunc;
+    case hpvm::VITERBI_TARGET:
+      return GenFuncs.VITERBIGenFunc;
+    case hpvm::EPOCHS_TARGET:
+      return GenFuncs.EPOCHSGenFunc;
     case hpvm::CPU_OR_GPU_TARGET:
       assert(false &&
              "Requesting genarated node function with dual tag instead of \
@@ -479,6 +587,18 @@ public:
       GenFuncs.PROMISEGenFunc = NULL;
       GenFuncInfo.promise_hasCPUFunc = false;
       break;
+    case hpvm::FFT_TARGET:
+      GenFuncs.FFTGenFunc = NULL;
+      GenFuncInfo.fft_hasCPUFunc = false;
+      break;
+    case hpvm::VITERBI_TARGET:
+      GenFuncs.VITERBIGenFunc = NULL;
+      GenFuncInfo.viterbi_hasCPUFunc = false;
+      break;
+    case hpvm::EPOCHS_TARGET:
+      GenFuncs.EPOCHSGenFunc = NULL;
+      GenFuncInfo.viterbi_hasCPUFunc = false;
+      break;
     case hpvm::CPU_OR_GPU_TARGET:
       assert(false &&
              "Removing genarated node function with dual tag instead of \
@@ -493,25 +613,28 @@ public:
 
   hpvm::Target getTargetHint() const { return Hint; }
 
+  std::vector<hpvm::EPOCHS_DEVICES> getEpochsDevices() { return EpochsTargets; }
+
   bool isDummyNode() const { return isEntryNode() || isExitNode(); }
 
   bool isAllocationNode() {
     // If Allocation Property is defined then it is not an allocation node
     return PropertyList.count(Allocation) != 0;
   }
-  void setRank(unsigned r);
-  bool isEntryNode() const;
-  bool isExitNode() const;
-  DFEdge *getInDFEdgeAt(unsigned inPort);
-  DFEdge *getExtendedInDFEdgeAt(unsigned inPort);
-  DFEdge *getOutDFEdgeAt(unsigned outPort);
-  DFEdge *getExtendedOutDFEdgeAt(unsigned outPort);
-  std::map<unsigned, unsigned> getInArgMap();
-  std::map<unsigned, std::pair<Value *, unsigned>> getSharedInArgMap();
-  std::vector<unsigned> getOutArgMap();
-  int getAncestorHops(DFNode *N);
-  bool hasSideEffects();
-
+  inline void setRank(unsigned r);
+  inline bool isEntryNode() const;
+  inline bool isExitNode() const;
+  inline DFEdge *getInDFEdgeAt(unsigned inPort);
+  inline DFEdge *getExtendedInDFEdgeAt(unsigned inPort);
+  inline DFEdge *getOutDFEdgeAt(unsigned outPort);
+  inline DFEdge *getExtendedOutDFEdgeAt(unsigned outPort);
+  inline std::map<unsigned, unsigned> getInArgMap();
+  inline std::map<unsigned, std::pair<Value *, unsigned>> getSharedInArgMap();
+  inline std::vector<unsigned> getOutArgMap();
+  inline int getAncestorHops(DFNode *N);
+  inline bool hasSideEffects();
+
+  unsigned getCriticality() { return Criticality; }
   virtual void applyDFNodeVisitor(DFNodeVisitor &V) = 0;
   //  virtual void applyDFEdgeVisitor(DFEdgeVisitor &V) = 0;
 
@@ -534,9 +657,9 @@ private:
   // Constructor
   DFInternalNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
                  DFInternalNode *Parent, int NumOfDim,
-                 std::vector<Value *> DimLimits)
-      : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits,
-               InternalNode) {
+                 std::vector<Value *> DimLimits, unsigned Criticality)
+      : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, InternalNode,
+               Criticality) {
 
     childGraph = new DFGraph(this);
   }
@@ -546,10 +669,11 @@ public:
   Create(IntrinsicInst *II, Function *FuncPointer,
          hpvm::Target Hint = hpvm::CPU_TARGET, DFInternalNode *Parent = NULL,
          int NumOfDim = 0,
-         std::vector<Value *> DimLimits = std::vector<Value *>()) {
+         std::vector<Value *> DimLimits = std::vector<Value *>(),
+         unsigned Criticality = 0) {
 
     return new DFInternalNode(II, FuncPointer, Hint, Parent, NumOfDim,
-                              DimLimits);
+                              DimLimits, Criticality);
   }
 
   static bool classof(const DFNode *N) { return N->getKind() == InternalNode; }
@@ -558,14 +682,16 @@ public:
 
   void removeChildFromDFGraph(DFNode *N) { childGraph->removeChildDFNode(N); }
 
-  void addEdgeToDFGraph(DFEdge *E);
+  inline void addEdgeToDFGraph(DFEdge *E);
+
+  void removeEdgeFromDFGraph(DFEdge *E) { childGraph->removeEdge(E); }
 
   DFGraph *getChildGraph() const { return childGraph; }
 
   bool isChildGraphStreaming() { return childGraph->isStreaming(); }
 
-  void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/
-  //  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
+  inline void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/
+  //  inline void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
 };
 
 /*****************************************************
@@ -577,21 +703,50 @@ private:
   // Constructor
   DFLeafNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
              DFInternalNode *Parent, int NumOfDim = 0,
-             std::vector<Value *> DimLimits = std::vector<Value *>())
-      : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode) {}
+             std::vector<Value *> DimLimits = std::vector<Value *>(),
+             unsigned Criticality = 0)
+      : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode,
+               Criticality) {
+    std::vector<Instruction *> toErase;
+    for (inst_iterator i = inst_begin(FuncPointer), e = inst_end(FuncPointer);
+         i != e; ++i) {
+
+      Instruction *I = &*i;
+
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+        if (II->getIntrinsicID() == Intrinsic::hpvm_task) {
+          ConstantInt *C = dyn_cast<ConstantInt>(II->getOperand(0));
+          assert(C && "Task ID should be a constant integer");
+          TaskID = C->getSExtValue();
+
+          toErase.push_back(II);
+        }
+      }
+    }
+
+    for (Instruction *I : toErase) {
+      I->eraseFromParent();
+    }
+  }
+
+  int TaskID;
 
 public:
   static DFLeafNode *
   Create(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
          DFInternalNode *Parent, int NumOfDim = 0,
-         std::vector<Value *> DimLimits = std::vector<Value *>()) {
-    return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits);
+         std::vector<Value *> DimLimits = std::vector<Value *>(),
+         unsigned Criticality = 0) {
+    return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits,
+                          Criticality);
   }
 
   static bool classof(const DFNode *N) { return N->getKind() == LeafNode; }
 
-  void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/
-  //  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
+  inline void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/
+  //  inline void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
+
+  int getTaskID() { return TaskID; }
 };
 
 // DFEdge represents a single HPVM Dataflow Edge in LLVM.
@@ -652,6 +807,8 @@ public:
 
   bool getEdgeType() const { return EdgeType; }
 
+  void setEdgeType(bool _EdgeType) { EdgeType = _EdgeType; }
+
   unsigned getSourcePosition() const { return SourcePosition; }
 
   void setSourcePosition(unsigned i) { SourcePosition = i; }
@@ -663,6 +820,8 @@ public:
   Type *getType() const { return ArgType; }
 
   bool isStreamingEdge() const { return isStreaming; }
+
+  void setStreamingEdge(bool _isStreaming) { isStreaming = _isStreaming; }
 };
 
 //===--------------------- DFGraph Outlined Functions --------------===//
@@ -690,12 +849,31 @@ bool DFGraph::isStreaming() {
   return false;
 }
 
+bool DFGraph::removeEdge(const DFEdge *E) {
+  dfedge_iterator position = std::find(dfedge_begin(), dfedge_end(), E);
+  if (position != dfedge_end()) // the edge was found
+    DFEdgeList.erase(position);
+
+  DFNode *S = E->getSourceDF();
+  DFNode *D = E->getDestDF();
+  S->removeSuccessor(D);
+  S->removeOutDFEdge(E);
+  D->removeInDFEdge(E);
+
+  if (position == dfedge_end())
+    DEBUG_WITH_TYPE(
+        "dfgraph",
+        errs() << "DFGraph::removeEdge called with non-existent edge.\n");
+  return position != dfedge_end();
+}
+
 //===--------------------- DFNode Outlined Functions --------------===//
 DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint,
                DFInternalNode *_Parent, unsigned _NumOfDim,
-               std::vector<Value *> _DimLimits, DFNodeKind _K)
+               std::vector<Value *> _DimLimits, DFNodeKind _K,
+               unsigned _Criticality)
     : II(_II), FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim),
-      DimLimits(_DimLimits), Kind(_K) {
+      DimLimits(_DimLimits), Kind(_K), Criticality(_Criticality) {
 
   Type *Ty = FuncPointer->getFunctionType()->getReturnType();
 
@@ -724,11 +902,24 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint,
   GenFuncs.GPUGenFunc = NULL;
   GenFuncs.CUDNNGenFunc = NULL;
   GenFuncs.PROMISEGenFunc = NULL;
+  GenFuncs.FFTGenFunc = NULL;
+  GenFuncs.VITERBIGenFunc = NULL;
+  GenFuncs.EPOCHSGenFunc = NULL;
 
   GenFuncInfo.cpu_hasCPUFunc = false;
   GenFuncInfo.gpu_hasCPUFunc = false;
   GenFuncInfo.cudnn_hasCPUFunc = false;
   GenFuncInfo.promise_hasCPUFunc = false;
+  GenFuncInfo.fft_hasCPUFunc = false;
+  GenFuncInfo.viterbi_hasCPUFunc = false;
+  GenFuncInfo.epochs_hasCPUFunc = false;
+  Root = false;
+  Hint = _Hint;
+}
+
+void DFNode::addToNodeGraph(DFInternalNode *P) {
+  Parent = P;
+  P->addChildToDFGraph(this);
 }
 
 void DFNode::setRank(unsigned r) {
@@ -796,13 +987,14 @@ DFEdge *DFNode::getOutDFEdgeAt(unsigned outPort) {
   // Cannot perform check for the number of outputs here,
   // it depends on the node's return type
 
-  unsigned index = 0;
+  // ADEL: Note - kept this same as non-release branch because wasn't sure how
+  // that change would affect Scheduler backend. The release-branch-change was
+  // made by Akash for Movidius stuff.
   for (outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e;
        ++i) {
     DFEdge *E = *i;
-    if (outPort == index) // E->getSourcePosition())
+    if (outPort == E->getSourcePosition())
       return E;
-    index++;
   }
   return NULL;
 }
@@ -1182,9 +1374,13 @@ template <> struct DOTGraphTraits<DFGraph *> : public DefaultDOTGraphTraits {
   static void addCustomGraphFeatures(DFGraph *G, GraphWriter<DFGraph *> &GW) {}
 };
 
-void viewDFGraph(DFGraph *G) {
-  llvm::WriteGraph(G, "DataflowGraph");
-  // llvm::ViewGraph(G, "DataflowGraph");
+inline void viewDFGraph(DFGraph *G) {
+  DFInternalNode *Parent = G->getParent();
+  assert(Parent && "Child Graph with no parent\n");
+
+  llvm::WriteGraph(G,
+                   (Parent->isRoot() ? "Root_Dataflow_Graph"
+                                     : (Parent->getFuncPointer()->getName())));
 }
 
 } // namespace llvm
diff --git a/hpvm/include/SupportHPVM/HPVMHint.h b/hpvm/include/SupportHPVM/HPVMHint.h
index 25020e82016b8b3320abb8ddf94b78f24bc91acd..a4946d235e221cf24b68580b1e6e27f0fa5ff42f 100644
--- a/hpvm/include/SupportHPVM/HPVMHint.h
+++ b/hpvm/include/SupportHPVM/HPVMHint.h
@@ -22,10 +22,37 @@ enum Target {
   CUDNN_TARGET,
   TENSOR_TARGET,
   CPU_OR_GPU_TARGET,
+  EPOCHS_TARGET,
   //    ALL_TARGETS,
+  FFT_TARGET,
+  VITERBI_TARGET,
   NUM_TARGETS
 };
 
+enum EPOCHS_DEVICES {
+  CPU_ACCEL=0,
+  OneD_FFT_ACCEL,
+  VITDEC_ACCEL,
+  CV_CNN_ACCEL,
+  NUM_DEVICES
+};
+
+enum EPOCHS_TASKS {
+  FFT_TASK=0,
+  RADAR_TASK,
+  CV_TASK,
+  VIT_TASK,
+  PNC_TASK,
+  NONE_TASK,
+  NUM_TASKS
+};
+
+enum NODE_CRITICALITY {
+  HPVM_BASE = 1,
+  HPVM_ELEVATED,
+  HPVM_CRITICAL
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/hpvm/include/SupportHPVM/HPVMUtils.h b/hpvm/include/SupportHPVM/HPVMUtils.h
index 2a5116ddb122b16b28ee45022d7c57409cdce566..e313083f93d83092c3990ca4ed168ad79cd71b06 100644
--- a/hpvm/include/SupportHPVM/HPVMUtils.h
+++ b/hpvm/include/SupportHPVM/HPVMUtils.h
@@ -18,7 +18,7 @@
 
 #include <assert.h>
 
-#include "SupportHPVM/HPVMHint.h"
+#include "HPVMHint.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
@@ -62,7 +62,7 @@ static bool isHPVMLaunchCall(Instruction *I) {
 }
 // Creates a new createNode intrinsic, similar to II but with different
 // associated function F instead
-IntrinsicInst *
+static IntrinsicInst *
 createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F,
                                                         IntrinsicInst *II) {
   Module *M = F->getParent();
@@ -72,27 +72,28 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F,
   Constant *Fp =
       ConstantExpr::getPointerCast(F, Type::getInt8PtrTy(II->getContext()));
 
-  ArrayRef<Value *> CreateNodeArgs;
+  std::vector<Value *> NodeArgs;
   switch (II->getIntrinsicID()) {
   case Intrinsic::hpvm_createNode: {
-    CreateNodeArgs = ArrayRef<Value *>(Fp);
+    NodeArgs.push_back(Fp);
     break;
   }
   case Intrinsic::hpvm_createNode1D: {
-    Value *CreateNode1DArgs[] = {Fp, II->getArgOperand(1)};
-    CreateNodeArgs = ArrayRef<Value *>(CreateNode1DArgs, 2);
+    NodeArgs.push_back(Fp);
+    NodeArgs.push_back(II->getArgOperand(1));
     break;
   }
   case Intrinsic::hpvm_createNode2D: {
-    Value *CreateNode2DArgs[] = {Fp, II->getArgOperand(1),
-                                 II->getArgOperand(2)};
-    CreateNodeArgs = ArrayRef<Value *>(CreateNode2DArgs, 3);
+    NodeArgs.push_back(Fp);
+    NodeArgs.push_back(II->getArgOperand(1));
+    NodeArgs.push_back(II->getArgOperand(2));
     break;
   }
   case Intrinsic::hpvm_createNode3D: {
-    Value *CreateNode3DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2),
-                                 II->getArgOperand(3)};
-    CreateNodeArgs = ArrayRef<Value *>(CreateNode3DArgs, 4);
+    NodeArgs.push_back(Fp);
+    NodeArgs.push_back(II->getArgOperand(1));
+    NodeArgs.push_back(II->getArgOperand(2));
+    NodeArgs.push_back(II->getArgOperand(2));
     break;
   }
   default:
@@ -100,14 +101,16 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F,
     break;
   }
 
+  ArrayRef<Value *> CreateNodeArgs(NodeArgs);
+
   CallInst *CI =
-      CallInst::Create(CreateNodeF, CreateNodeArgs, F->getName() + ".node");
+      CallInst::Create(CreateNodeF, CreateNodeArgs, F->getName() + ".node", II);
   IntrinsicInst *CreateNodeII = cast<IntrinsicInst>(CI);
   return CreateNodeII;
 }
 
 // Fix HPVM hints for this function
-void fixHintMetadata(Module &M, Function *F, Function *G) {
+static void fixHintMetadata(Module &M, Function *F, Function *G) {
   Metadata *MD_F = ValueAsMetadata::getIfExists(F);
   MDTuple *MDT_F =
       MDTuple::getIfExists(F->getContext(), ArrayRef<Metadata *>(MD_F));
@@ -129,12 +132,15 @@ void fixHintMetadata(Module &M, Function *F, Function *G) {
   FixHint("hpvm_hint_cpu_gpu");
   FixHint("hpvm_hint_cudnn");
   FixHint("hpvm_hint_promise");
+  FixHint("hpvm_hint_fft");
+  FixHint("hpvm_hint_viterbi");
+  FixHint("hpvm_hint_epochs");
 }
 
 // Assuming that the changed function is a node function, it is only used as a
 // first operand of createNode*. It is enough to iterate through all createNode*
 // calls in the program.
-void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
+static void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
 
   for (auto &Func : M) {
     DEBUG(errs() << "Function: " << Func.getName() << "\n");
@@ -148,7 +154,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
       if (isHPVMCreateNodeIntrinsic(I)) {
         IntrinsicInst *II = cast<IntrinsicInst>(I);
         // The found createNode is not associated with the changed function
-        if (II->getArgOperand(0) != F)
+        if (II->getArgOperand(0)->stripPointerCasts() != F)
           continue; // skip it
 
         // Otherwise, create a new createNode similar to the other one,
@@ -211,10 +217,10 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
 //      over extra pointer arguments.
 // The function returns the list of return instructions to the caller to fix in
 // case the return type is also changed.
-Function *cloneFunction(Function *F, FunctionType *newFT,
-                        bool isAddingPtrSizeArg,
-                        SmallVectorImpl<ReturnInst *> *Returns = NULL,
-                        std::vector<Argument *> *Args = NULL) {
+static Function *cloneFunction(Function *F, FunctionType *newFT,
+                               bool isAddingPtrSizeArg,
+                               SmallVectorImpl<ReturnInst *> *Returns = NULL,
+                               std::vector<Argument *> *Args = NULL) {
 
   DEBUG(errs() << "Cloning Function: " << F->getName() << "\n");
   DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n");
@@ -225,8 +231,8 @@ Function *cloneFunction(Function *F, FunctionType *newFT,
          "the old function!");
 
   // Create Function of specified type
-  Function *newF = Function::Create(newFT, F->getLinkage(),
-                                    F->getName() + "_cloned", F->getParent());
+  Function *newF = Function::Create(newFT, F->getLinkage(), F->getName() + "_c",
+                                    F->getParent());
   DEBUG(errs() << "Old Function name: " << F->getName() << "\n");
   DEBUG(errs() << "New Function name: " << newF->getName() << "\n");
   ValueToValueMapTy VMap;
@@ -308,13 +314,15 @@ Function *cloneFunction(Function *F, FunctionType *newFT,
   if (Returns == NULL)
     Returns = new SmallVector<ReturnInst *, 8>();
   CloneFunctionInto(newF, F, VMap, false, *Returns);
+  newF->setAttributes(F->getAttributes());
 
   return newF;
 }
 
 // Overloaded version of cloneFunction
-Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg,
-                        SmallVectorImpl<ReturnInst *> *Returns = NULL) {
+static Function *cloneFunction(Function *F, Function *newF,
+                               bool isAddingPtrSizeArg,
+                               SmallVectorImpl<ReturnInst *> *Returns = NULL) {
 
   DEBUG(errs() << "Cloning Function: " << F->getName() << "\n");
   DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n");
@@ -372,13 +380,15 @@ Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg,
     Returns = new SmallVector<ReturnInst *, 8>();
   CloneFunctionInto(newF, F, VMap, false, *Returns);
 
+  newF->setAttributes(F->getAttributes());
+
   return newF;
 }
 
 //------------------- Helper Functions For Handling Hints -------------------//
 
 // Return true if 1st arg (tag) contains 2nd (target)
-bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) {
+static bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) {
   switch (Tag) {
   case hpvm::None:
     return false;
@@ -403,6 +413,18 @@ bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) {
       if (T == hpvm::TENSOR_TARGET)
         return true;
       return false;
+  case hpvm::FFT_TARGET:
+    if (T == hpvm::FFT_TARGET)
+      return true;
+    return false;
+  case hpvm::VITERBI_TARGET:
+    if (T == hpvm::VITERBI_TARGET)
+      return true;
+    return false;
+  case hpvm::EPOCHS_TARGET:
+    if (T == hpvm::EPOCHS_TARGET)
+      return true;
+    return false;
   default:
     assert(false && "Unknown Target\n");
     return false;  // What kind of compiler doesn't know this is unreachable?!
@@ -411,15 +433,19 @@ bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) {
 
 bool isSingleTargetTag(hpvm::Target T) {
   return ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET) 
-       || (T == hpvm::CUDNN_TARGET) || (T == hpvm::TENSOR_TARGET));
+       || (T == hpvm::CUDNN_TARGET) || (T == hpvm::TENSOR_TARGET)||
+          (T == hpvm::FFT_TARGET) || (T == hpvm::VITERBI_TARGET) ||
+          (T == hpvm::EPOCHS_TARGET));
 }
 
 // Add the specified target to the given tag
 hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) {
   assert(((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET) 
-       || (T == hpvm::CUDNN_TARGET) || (T == hpvm::TENSOR_TARGET)) &&
+       || (T == hpvm::CUDNN_TARGET) || (T == hpvm::TENSOR_TARGET) ||
+       (T == hpvm::FFT_TARGET) || (T == hpvm::VITERBI_TARGET) ||
+      (T == hpvm::EPOCHS_TARGET)) &&
          "The target is only allowed to be a single target: CPU, GPU, SPIR, "
-         "CUDNN, PROMISE\n");
+          "CUDNN, PROMISE, FFT, VITERBI, EPOCHS\n");
 
   switch (Tag) {
   case hpvm::None:
@@ -441,6 +467,18 @@ hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) {
   case hpvm::CPU_OR_GPU_TARGET:
     assert((T != hpvm::CUDNN_TARGET) && (T != hpvm::TENSOR_TARGET) && "Unsupported target combination\n");
     return hpvm::CPU_OR_GPU_TARGET; 
+  case hpvm::FFT_TARGET:
+    if (T == hpvm::FFT_TARGET)
+      return hpvm::FFT_TARGET;
+    return T;
+  case hpvm::VITERBI_TARGET:
+    if (T == hpvm::VITERBI_TARGET)
+      return hpvm::VITERBI_TARGET;
+    return T;
+  case hpvm::EPOCHS_TARGET:
+    if (T == hpvm::EPOCHS_TARGET)
+      return hpvm::EPOCHS_TARGET;
+    return T;
   default:
     assert(false && "Unknown Target\n");
   }
@@ -448,8 +486,7 @@ hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) {
 }
 
 // This functions add the hint as metadata in hpvm code
-void addHint(Function *F, hpvm::Target T) {
-  DEBUG(errs() << "ADD HINT *************************\n");
+static void addHint(Function *F, hpvm::Target T) {
   // Get Module
   Module *M = F->getParent();
   DEBUG(errs() << "Set preferred target for " << F->getName() << ": ");
@@ -478,6 +515,18 @@ void addHint(Function *F, hpvm::Target T) {
       DEBUG(errs() << "PROMISE\n");
       HintNode = M->getOrInsertNamedMetadata("hpvm_hint_promise");
       break;
+  case hpvm::FFT_TARGET:
+    DEBUG(errs() << "FFT Target\n");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_fft");
+    break;
+  case hpvm::VITERBI_TARGET:
+    DEBUG(errs() << "VITERBI Target\n");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_viterbi");
+    break;
+  case hpvm::EPOCHS_TARGET:
+    DEBUG(errs() << "EPOCHS Target\n");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_epochs");
+    break;
   default:
     llvm_unreachable("Unsupported Target Hint!");
     break;
@@ -490,7 +539,7 @@ void addHint(Function *F, hpvm::Target T) {
 }
 
 // This function removes the hint as metadata in hpvm code
-void removeHint(Function *F, hpvm::Target T) {
+static void removeHint(Function *F, hpvm::Target T) {
   // Get Module
   Module *M = F->getParent();
   DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T
@@ -514,6 +563,15 @@ void removeHint(Function *F, hpvm::Target T) {
   case hpvm::TENSOR_TARGET:
       HintNode = M->getOrInsertNamedMetadata("hpvm_hint_promise");
       break;
+  case hpvm::FFT_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_fft");
+    break;
+  case hpvm::VITERBI_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_viterbi");
+    break;
+  case hpvm::EPOCHS_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_epochs");
+    break;
   default:
     llvm_unreachable("Unsupported Target Hint!");
     break;
@@ -539,7 +597,7 @@ void removeHint(Function *F, hpvm::Target T) {
   }
 }
 
-hpvm::Target getPreferredTarget(Function *F) {
+static hpvm::Target getPreferredTarget(Function *F) {
   DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
   Module *M = F->getParent();
 
@@ -565,6 +623,12 @@ hpvm::Target getPreferredTarget(Function *F) {
     return hpvm::CUDNN_TARGET;
   if (FoundPrefTarget("hpvm_hint_promise"))
     return hpvm::TENSOR_TARGET;
+  if (FoundPrefTarget("hpvm_hint_fft"))
+    return hpvm::FFT_TARGET;
+  if (FoundPrefTarget("hpvm_hint_viterbi"))
+    return hpvm::VITERBI_TARGET;
+  if (FoundPrefTarget("hpvm_hint_epochs"))
+    return hpvm::EPOCHS_TARGET;
   return hpvm::None;
 }
 
diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index b3b46de48260f965782b1fb13bc049d446f51da2..be3b3e65fe77843335a38d475af0126b6be10d2e 100644
--- a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -53,6 +53,7 @@ bool BuildDFG::runOnModule(Module &M) {
         // Intrinsic Instruction has been initialized from this point on.
         Function *F = cast<Function>(II->getOperand(0)->stripPointerCasts());
         Root = DFInternalNode::Create(II, F, hpvmUtils::getPreferredTarget(F));
+        Root->setRoot();
         // errs() << "INTRINSIC: " << II << "\n";
         // errs() << "ROOT NODE" << Root << "\n";
         Roots.push_back(Root);
@@ -179,11 +180,12 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
       isInternalNode = true;
   }
 
-  // Number of Dimensions would be equal to the (number of operands - 1) as
-  // the first operand is the pointer to associated Function and the
+  // Number of Dimensions would be equal to the (number of operands - 2) as
+  // the first operand is the pointer to associated Function, the last 
+  // operand is the node criticality and the
   // remaining operands are the limits in each dimension.
   unsigned numOfDim =
-      II->getCalledFunction()->getFunctionType()->getNumParams() - 1;
+      II->getCalledFunction()->getFunctionType()->getNumParams() - 2;
   assert(numOfDim <= 3 &&
          "Invalid number of dimensions for createNode intrinsic!");
   std::vector<Value *> dimLimits;
@@ -194,11 +196,15 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
     dimLimits.push_back(cast<Value>(II->getOperand(i)));
   }
 
+  ConstantInt* Criticality = dyn_cast<ConstantInt>(II->getArgOperand(numOfDim + 1));
+
+  assert(Criticality && (Criticality->getSExtValue() >= 0) && "Criticality must be a non-negative constant integer (i64) value");
+
   if (isInternalNode) {
     // Create Internal DFNode, add it to the map and recursively build its
     // dataflow graph
     DFInternalNode *childDFNode = DFInternalNode::Create(
-        II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+        II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits, Criticality->getSExtValue());
     // errs() << "INTERNAL NODE: " << childDFNode << "\n";
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
@@ -206,7 +212,7 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
   } else {
     // Create Leaf DFnode and add it to the map.
     DFLeafNode *childDFNode = DFLeafNode::Create(
-        II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+        II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits, Criticality->getSExtValue());
     // errs() << "LEAF NODE: " << childDFNode << "\n";
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
diff --git a/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/CMakeLists.txt b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a8850e61eb0dab26ac128b2b3965112261cf2ca0
--- /dev/null
+++ b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/CMakeLists.txt
@@ -0,0 +1,14 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}")
+
+add_llvm_library( LLVMDFG2LLVM_EPOCHS
+  MODULE
+  DFG2LLVM_EPOCHS.cpp
+
+  DEPENDS intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
diff --git a/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.cpp b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bba52c049d702b3e592235c4cd17f1483a577389
--- /dev/null
+++ b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.cpp
@@ -0,0 +1,2498 @@
+//===-------------------------- DFG2LLVM_EPOCHS.cpp
+//--------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for generating code for host code and kernel code
+// for EPOCHS target using HPVM dataflow graph.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#define DEBUG_TYPE "DFG2LLVM_EPOCHS"
+#include "SupportHPVM/DFG2LLVM.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#include <string>
+#include <fstream>
+#include <queue>
+#include <sys/stat.h>
+
+#ifndef LLVM_BUILD_DIR
+#error LLVM_BUILD_DIR is not defined
+#endif
+
+#define STR_VALUE(X) #X
+#define STRINGIFY(X) STR_VALUE(X)
+#define LLVM_BUILD_DIR_STR STRINGIFY(LLVM_BUILD_DIR)
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+//// HPVM Command line option to use timer or not
+// static cl::opt<bool> HPVMTimer_EPOCHS("hpvm-timers-epochs",
+// cl::desc("Enable hpvm timers"));
+
+static cl::opt<std::string> SCHDULER_LIBRARY_PATH(
+    "sched-lib-path", cl::Required,
+    cl::desc("Path to the schedular library bitcode module."),
+    cl::value_desc("scheduler_module.bc"));
+
+static cl::opt<std::string>
+    SCHDULER_CONFIG_PATH("sched-config", cl::Required,
+                         cl::desc("Path to the schedular configuration file."),
+                         cl::value_desc("scheduler configuration file"));
+
+static cl::opt<std::string> TASK_CONFIG_PATH(
+    "task-config", cl::Required,
+    cl::desc("Path to the task description configuration file."),
+    cl::value_desc("task configuration file"));
+
+static cl::opt<std::string>
+    TASK_LIBRARY_PATH("task-lib-path", cl::Required,
+                      cl::desc("Path to the task library bitcode module."),
+                      cl::value_desc("tasklib.bc"));
+
+namespace dfg2llvm {
+
+static void findReturnInst(Function *, std::vector<ReturnInst *> &);
+
+struct TaskDeviceInfo {
+  hpvm::EPOCHS_DEVICES TaskDeviceTy;
+  std::string TaskDeviceFn;
+
+  TaskDeviceInfo() {
+    TaskDeviceTy = hpvm::CPU_ACCEL;
+    TaskDeviceFn = "";
+  }
+
+  void clearTaskDeviceInfo() {
+    TaskDeviceTy = hpvm::CPU_ACCEL;
+    TaskDeviceFn = "";
+  }
+};
+
+struct TaskInfo {
+  int TaskID;
+  std::string TaskName;
+  std::string TaskDescription;
+  std::string TaskSetup;
+  std::string TaskPrint;
+  std::string TaskRunStats;
+  std::string TaskFinishExecution;
+  std::string TaskAutoFinish;
+  std::vector<TaskDeviceInfo> Devices;
+  Value *TaskHandleAddr;
+
+  TaskInfo() {
+    TaskID = 0;
+    TaskName = "";
+    TaskDescription = "";
+    TaskSetup = "";
+    TaskPrint = "";
+    TaskRunStats = "";
+    TaskAutoFinish = "";
+    TaskFinishExecution = "";
+    Devices.clear();
+  }
+
+  void printTaskInfo() {
+    DEBUG(errs() << "TaskID :\t" << TaskID << "\n");
+    DEBUG(errs() << "TaskName :\t" << TaskName << "\n");
+    DEBUG(errs() << "TaskDescription :\t" << TaskDescription << "\n");
+    DEBUG(errs() << "TaskSetup :\t" << TaskSetup << "\n");
+    DEBUG(errs() << "TaskPrint :\t" << TaskPrint << "\n");
+    DEBUG(errs() << "TaskRunStats :\t" << TaskRunStats << "\n");
+    DEBUG(errs() << "TaskFinishExecution :\t" << TaskFinishExecution << "\n");
+    DEBUG(errs() << "TaskAutoFinish :\t" << TaskAutoFinish << "\n");
+    DEBUG(errs() << "Task Devices:\n");
+
+    for (TaskDeviceInfo &TDI : Devices) {
+      DEBUG(errs() << "\tDevice Name :\t" << TDI.TaskDeviceTy << "\n");
+      DEBUG(errs() << "\tDevice Function :\t" << TDI.TaskDeviceFn << "\n");
+    }
+  }
+
+  unsigned getNumDevices() { return Devices.size(); }
+
+  std::vector<hpvm::EPOCHS_DEVICES> getDeviceTypes() {
+    std::vector<hpvm::EPOCHS_DEVICES> Devs;
+    for (TaskDeviceInfo &TDI : Devices) {
+      Devs.push_back(TDI.TaskDeviceTy);
+    }
+    return Devs;
+  }
+
+  void clearTaskInfo() {
+    TaskName = "";
+    TaskDescription = "";
+    TaskSetup = "";
+    TaskPrint = "";
+    TaskRunStats = "";
+    TaskAutoFinish = "";
+    TaskFinishExecution = "";
+    Devices.clear();
+  }
+
+  bool operator<(const TaskInfo &rhs) const { return TaskID < rhs.TaskID; }
+
+  bool operator==(const TaskInfo &rhs) const { return TaskID == rhs.TaskID; }
+};
+
+void strSplit(std::string &str, char delim, std::vector<std::string> &out) {
+  size_t start;
+  size_t end = 0;
+
+  while ((start = str.find_first_not_of(delim, end)) != std::string::npos) {
+    end = str.find(delim, start);
+    out.push_back(str.substr(start, end - start));
+  }
+}
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_EPOCHS : public CodeGenTraversal {
+
+private:
+  // Member variables
+  std::vector<TaskInfo> TaskTypes;
+  std::map<DFLeafNode *, TaskInfo> TaskHandleMap;
+  std::map<DFLeafNode *, std::map<unsigned, unsigned>> returnArgMap;
+  std::map<DFLeafNode *, GlobalVariable *> TaskMBMap;
+  std::set<TaskInfo> RegisteredTasks;
+  std::set<DFLeafNode *> FinishedTasks;
+
+  // Global Variables
+  GlobalVariable *SchedulerHandleAddress;
+  GlobalVariable *DagIDGlobal;
+
+  std::unique_ptr<Module> TaskModule;
+
+  FunctionCallee malloc;
+  // HPVM Runtime API
+  FunctionCallee llvm_hpvm_cpu_launch;
+  FunctionCallee llvm_hpvm_cpu_wait;
+  FunctionCallee llvm_hpvm_cpu_argument_ptr;
+
+  FunctionCallee llvm_hpvm_streamLaunch;
+  FunctionCallee llvm_hpvm_streamPush;
+  FunctionCallee llvm_hpvm_streamPop;
+  FunctionCallee llvm_hpvm_streamWait;
+  FunctionCallee llvm_hpvm_createBindInBuffer;
+  FunctionCallee llvm_hpvm_createBindOutBuffer;
+  FunctionCallee llvm_hpvm_createEdgeBuffer;
+  FunctionCallee llvm_hpvm_createLastInputBuffer;
+  FunctionCallee llvm_hpvm_createThread;
+  FunctionCallee llvm_hpvm_bufferPush;
+  FunctionCallee llvm_hpvm_bufferPop;
+  //  FunctionCallee llvm_hpvm_cpu_dstack_push;
+  //  FunctionCallee llvm_hpvm_cpu_dstack_pop;
+  FunctionCallee llvm_hpvm_cpu_getDimLimit;
+  FunctionCallee llvm_hpvm_cpu_getDimInstance;
+
+  // EPOCHS Scheduler API
+  FunctionCallee set_up_scheduler;
+  FunctionCallee initialize_scheduler_from_config_file;
+  FunctionCallee initialize_task_lib;
+  FunctionCallee register_task_type;
+  FunctionCallee set_up_task;
+  FunctionCallee request_execution;
+  FunctionCallee finish_task_execution;
+  FunctionCallee wait_on_tasklist;
+  FunctionCallee shutdown_scheduler;
+
+  // Functions
+  std::vector<IntrinsicInst *> *getUseList(Value *LI);
+  void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *);
+  Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
+  StructType *getArgumentListStructTy(DFNode *);
+  Function *createFunctionFilter(DFNode *C);
+  void startNodeThread(DFNode *, std::vector<Value *>,
+                       DenseMap<DFEdge *, Value *>, Value *, Value *,
+                       Instruction *);
+  Function *createLaunchFunction(DFInternalNode *);
+
+  void finishNodeEPOCHS(DFLeafNode *, Instruction *IB, Function *ParentF);
+  void waitOnTask(DFLeafNode *Leaf, Instruction *IB);
+
+
+  // Task File parsing and utility functions
+  TaskInfo &getTaskInfo(int TaskID);
+
+  FunctionCallee getTaskFunctionHelper(std::string TaskFunctionName);
+  FunctionCallee getTaskSetupFunction(TaskInfo &TI);
+  FunctionCallee getTaskPrintFunction(TaskInfo &TI);
+  FunctionCallee getTaskRunStatsFunction(TaskInfo &TI);
+  FunctionCallee getTaskAutoFinishFunction(TaskInfo &TI);
+  FunctionCallee getTaskFinishExecutionFunction(TaskInfo &TI);
+
+  FunctionCallee getTaskDeviceFunction(TaskInfo &TI,
+                                       hpvm::EPOCHS_DEVICES DevTy);
+
+  // Virtual Functions
+  void init() {
+    // HPVMTimer = HPVMTimer_EPOCHS;
+    TargetName = "EPOCHS";
+  }
+  void initRuntimeAPI();
+  void initSchedulerAPI();
+  void initTaskAPI();
+  void codeGen(DFInternalNode *N);
+  void codeGen(DFLeafNode *N);
+  Function *codeGenStreamPush(DFInternalNode *N);
+  Function *codeGenStreamPop(DFInternalNode *N);
+  void invokeChild(DFNode *C, Function *F_CPU, ValueToValueMapTy &VMap,
+                   Instruction *IB, hpvm::Target Tag);
+  Value *getInValueAt(DFNode *Child, unsigned i, Function *ParntF_CPU,
+                      Instruction *InsertBefore);
+  // hpvm::Target Tag = hpvm::CPU_TARGET);
+
+public:
+  // Constructor
+  CGT_EPOCHS(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
+    init();
+    initRuntimeAPI();
+    initSchedulerAPI();
+    initTaskAPI();
+  }
+
+  void codeGenLaunch(DFInternalNode *Root);
+  void codeGenLaunchStreaming(DFInternalNode *Root);
+  void parseTaskFile(std::string TaskPath);
+};
+
+bool runDFG2LLVM_EPOCHS(Module &M, BuildDFG &DFG) {
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode *> Roots = DFG.getRoots();
+  // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+  // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_EPOCHS *CGTVisitor = new CGT_EPOCHS(M, DFG);
+
+  // Populate scheduler library's task
+  // details from the provided task
+  // description file.
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto &rootNode : Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+    // Go ahead and replace the launch intrinsic with pthread call, otherwise
+    // return now.
+    // TODO: Later on, we might like to do this in a separate pass, which would
+    // allow us the flexibility to switch between complete static code
+    // generation for DFG or having a customized runtime+scheduler
+
+    // Do streaming code generation if root node is streaming. Usual otherwise
+    if (rootNode->isChildGraphStreaming())
+      CGTVisitor->codeGenLaunchStreaming(rootNode);
+    else
+      CGTVisitor->codeGenLaunch(rootNode);
+  }
+
+  for (auto &F : M) {
+    for (Function::arg_iterator ai = F.arg_begin(), ae = F.arg_end(); ai != ae;
+         ai++) {
+      Argument *Arg = &*ai;
+      if (Arg->hasAttribute(Attribute::In))
+        Arg->removeAttr(Attribute::In);
+      if (Arg->hasAttribute(Attribute::Out))
+        Arg->removeAttr(Attribute::Out);
+      if (Arg->hasAttribute(Attribute::InOut))
+        Arg->removeAttr(Attribute::InOut);
+    }
+  }
+
+  delete CGTVisitor;
+  return true;
+}
+
+bool DFG2LLVM_EPOCHS::runOnModule(Module &M) {
+  DEBUG(errs() << "\nDFG2LLVM_EPOCHS PASS\n");
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* hansles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  return runDFG2LLVM_EPOCHS(M, DFG);
+}
+void CGT_EPOCHS::initTaskAPI() {
+  // Load Task API Module
+  SMDiagnostic Err;
+
+  std::string TaskAPI = TASK_LIBRARY_PATH;
+  TaskModule = parseIRFile(TaskAPI, Err, M.getContext());
+  if (TaskModule == nullptr) {
+    DEBUG(errs() << Err.getMessage() << " " << TaskAPI << "\n");
+    assert(false && "Couldn't parse TaskAPI module!");
+  } else
+    DEBUG(errs() << "Successfully loaded Task API module\n");
+
+  DECLARE_EPOCHS(initialize_task_lib, TaskModule);
+  IRBuilder<> Builder(InitCall);
+  // Issue initialize_task_lib call
+  CallInst *InitializeTaskLibInst =
+      Builder.CreateCall(initialize_task_lib, ArrayRef<Value *>());
+  DEBUG(errs() << *InitializeTaskLibInst << "\n");
+
+  std::string TaskConfigFile = TASK_CONFIG_PATH;
+  struct stat buffer;
+
+  // Check if file provided is accessible
+  if (stat(TaskConfigFile.c_str(), &buffer) != 0) {
+    DEBUG(errs() << "Invalid Task Config File provided: " << TaskConfigFile
+                 << "!\n");
+    assert(false && "Couldn't access task config file!");
+  }
+
+  // Populate task library info from the provided task
+  // description file.
+
+  parseTaskFile(TaskConfigFile);
+}
+void CGT_EPOCHS::initSchedulerAPI() {
+  // Load Scheduler API Module
+  SMDiagnostic Err;
+
+  std::string SchedulerAPI = SCHDULER_LIBRARY_PATH;
+  auto SchedulerModule = parseIRFile(SchedulerAPI, Err, M.getContext());
+  if (SchedulerModule == nullptr) {
+    DEBUG(errs() << Err.getMessage() << " " << SchedulerAPI << "\n");
+    assert(false && "Couldn't parse SchedulerAPI module!");
+  } else
+    DEBUG(errs() << "Successfully loaded Scheduler API module\n");
+
+  DECLARE_EPOCHS(set_up_scheduler, SchedulerModule);
+  DECLARE_EPOCHS(initialize_scheduler_from_config_file, SchedulerModule);
+  DECLARE_EPOCHS(register_task_type, SchedulerModule);
+  DECLARE_EPOCHS(set_up_task, SchedulerModule);
+  DECLARE_EPOCHS(request_execution, SchedulerModule);
+  DECLARE_EPOCHS(finish_task_execution, SchedulerModule);
+  DECLARE_EPOCHS(wait_on_tasklist, SchedulerModule);
+  DECLARE_EPOCHS(shutdown_scheduler, SchedulerModule);
+
+
+  IRBuilder<> Builder(InitCall);
+
+  // Issue set_up_scheduler call
+  CallInst *SetupSchedulerInst =
+      Builder.CreateCall(set_up_scheduler, ArrayRef<Value *>());
+  DEBUG(errs() << *SetupSchedulerInst << "\n");
+
+  // Issue initialize_scheduler call
+  std::string SchedulerConfigFile = SCHDULER_CONFIG_PATH;
+  struct stat buffer;
+  // Check if file provided is accessible
+  if (stat(SchedulerConfigFile.c_str(), &buffer) != 0) {
+    DEBUG(errs() << "Invalid Scheduler Config File provided: "
+                 << SchedulerConfigFile << "!\n");
+    assert(false && "Couldn't access scheduler config file!");
+  }
+  Value *ConfigFileStrPtr = Builder.CreateGlobalStringPtr(SchedulerConfigFile);
+  DEBUG(errs() << *ConfigFileStrPtr << "\n");
+  Value *InitializeSchedulerArgs[] = {ConfigFileStrPtr};
+  CallInst *InitSchedInst = Builder.CreateCall(
+      initialize_scheduler_from_config_file,
+      ArrayRef<Value *>(InitializeSchedulerArgs, 1), "scheduler");
+  DEBUG(errs() << *InitSchedInst << "\n");
+
+  SchedulerHandleAddress = new GlobalVariable(
+      M, InitSchedInst->getType(), false, GlobalValue::CommonLinkage,
+      Constant::getNullValue(InitSchedInst->getType()), "scheduler.addr");
+  DEBUG(errs() << "Store at: " << *SchedulerHandleAddress << "\n");
+  auto *SI = Builder.CreateStore(InitSchedInst, SchedulerHandleAddress);
+  DEBUG(errs() << *SI << "\n");
+
+  // Insert the DagID Global Variable, initialized to 0
+  // TODO: what to do when there are multiple DFGs?
+  DagIDGlobal = new GlobalVariable(
+      M, Type::getInt32Ty(M.getContext()), false, GlobalValue::CommonLinkage,
+      ConstantInt::get(Type::getInt32Ty(M.getContext()), 0), "DagID");
+
+  DEBUG(errs() << *InitCall->getParent() << "\n");
+
+
+
+  // Add the shutdown scheduler call at the __hpvm__cleanup call.
+
+  Builder.SetInsertPoint(CleanupCall);
+
+
+  // Load the Schedule Pointer
+  auto *SchedulerPtr = Builder.CreateLoad(SchedulerHandleAddress);
+  std::vector<Value*> ShutdownArgs = {SchedulerPtr};
+  auto *ShutdownCall = Builder.CreateCall(shutdown_scheduler, ArrayRef<Value *>(ShutdownArgs));
+
+  DEBUG(errs() <<"Shutdown call: "<< *ShutdownCall << "\n");
+
+
+}
+
+// Initialize the HPVM runtime API. This makes it easier to insert these calls
+void CGT_EPOCHS::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  std::string runtimeAPI = std::string(LLVM_BUILD_DIR_STR) +
+                           "/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
+
+  runtimeModule = parseIRFile(runtimeAPI, Err, M.getContext());
+  if (runtimeModule == nullptr) {
+    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
+    assert(false && "couldn't parse runtime");
+  } else
+    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  DECLARE_EPOCHS(llvm_hpvm_cpu_launch, runtimeModule);
+  DECLARE_EPOCHS(malloc, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_cpu_wait, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_cpu_argument_ptr, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_streamLaunch, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_streamPush, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_streamPop, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_streamWait, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_createBindInBuffer, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_createBindOutBuffer, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_createEdgeBuffer, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_createLastInputBuffer, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_createThread, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_bufferPush, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_bufferPop, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_cpu_dstack_push, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_cpu_dstack_pop, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_cpu_getDimLimit, runtimeModule);
+  DECLARE_EPOCHS(llvm_hpvm_cpu_getDimInstance, runtimeModule);
+
+  // Get or insert timerAPI functions as well if you plan to use timers
+  // initTimerAPI();
+
+  // Insert init context in main
+  Function *VI = M.getFunction("llvm.hpvm.init");
+  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
+  // DEBUG(errs() << "Inserting cpu timer initialization\n");
+  InitCall = cast<Instruction>(*VI->user_begin());
+  // initializeTimerSet(InitCall);
+  // switchToTimer(hpvm_TimerID_NONE, I);
+  // Insert print instruction at hpvm exit
+  Function *VC = M.getFunction("llvm.hpvm.cleanup");
+  assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once");
+  CleanupCall = cast<Instruction>(*VC->user_begin());
+
+  // DEBUG(errs() << "Inserting cpu timer print\n");
+  // printTimerSet(InitCall);
+}
+
+/* Returns vector of all wait instructions
+ */
+std::vector<IntrinsicInst *> *CGT_EPOCHS::getUseList(Value *GraphID) {
+  std::vector<IntrinsicInst *> *UseList = new std::vector<IntrinsicInst *>();
+  // It must have been loaded from memory somewhere
+  for (Value::user_iterator ui = GraphID->user_begin(),
+                            ue = GraphID->user_end();
+       ui != ue; ++ui) {
+    if (IntrinsicInst *waitI = dyn_cast<IntrinsicInst>(*ui)) {
+      UseList->push_back(waitI);
+    } else {
+      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
+    }
+  }
+  return UseList;
+}
+
+/* Add Loop around the instruction I
+ * Algorithm:
+ * (1) Split the basic block of instruction I into three parts, where the
+ * middleblock/body would contain instruction I.
+ * (2) Add phi node before instruction I. Add incoming edge to phi node from
+ * predecessor
+ * (3) Add increment and compare instruction to index variable
+ * (4) Replace terminator/branch instruction of body with conditional branch
+ * which loops over bidy if true and goes to end if false
+ * (5) Update phi node of body
+ */
+void CGT_EPOCHS::addWhileLoop(Instruction *CondBlockStart,
+                              Instruction *BodyStart, Instruction *BodyEnd,
+                              Value *TerminationCond) {
+  BasicBlock *Entry = CondBlockStart->getParent();
+  BasicBlock *CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
+  BasicBlock *WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
+  BasicBlock *WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
+
+  // Replace the terminator instruction of conditional with new conditional
+  // branch which goes to while.body if true and branches to while.end otherwise
+  BranchInst *BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
+  ReplaceInstWithInst(CondBlock->getTerminator(), BI);
+
+  // While Body should jump to condition block
+  BranchInst *UnconditionalBranch = BranchInst::Create(CondBlock);
+  ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
+}
+
+Instruction *CGT_EPOCHS::addWhileLoopCounter(BasicBlock *Entry,
+                                             BasicBlock *Cond,
+                                             BasicBlock *Body) {
+  Module *M = Entry->getParent()->getParent();
+  Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+  // Insert a PHI instruction at the beginning of the condition block
+  Instruction *IB = Cond->getFirstNonPHI();
+  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
+
+  ConstantInt *IConst =
+      ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+  Instruction *CounterIncr =
+      BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                "cnt_incr", Body->getTerminator());
+
+  // Set incoming values for Phi node
+  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
+  CounterPhi->addIncoming(IConst, Entry);
+  CounterPhi->addIncoming(CounterIncr, Body);
+
+  // Return the pointer to the created PHI node in the corresponding argument
+  return CounterPhi;
+}
+
+// Returns a packed struct type. The structtype is created by packing the input
+// types, output types and isLastInput buffer type. All the streaming
+// inputs/outputs are converted to i8*, since this is the type of buffer
+// handles.
+StructType *CGT_EPOCHS::getArgumentListStructTy(DFNode *C) {
+  std::vector<Type *> TyList;
+  // Input types
+  Function *CF = C->getFuncPointer();
+  for (Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
+       ai != ae; ++ai) {
+    if (C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
+      TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+    else
+      TyList.push_back(ai->getType());
+  }
+  // Output Types
+  StructType *OutStructTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
+    // All outputs of a node are streaming edge
+    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() &&
+           "All output edges of child node have to be streaming");
+    TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+  }
+  // isLastInput buffer element
+  TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+
+  StructType *STy =
+      StructType::create(CF->getContext(), TyList,
+                         Twine("struct.thread." + CF->getName()).str(), true);
+  return STy;
+}
+
+void CGT_EPOCHS::startNodeThread(DFNode *C, std::vector<Value *> Args,
+                                 DenseMap<DFEdge *, Value *> EdgeBufferMap,
+                                 Value *isLastInputBuffer, Value *graphID,
+                                 Instruction *IB) {
+  DEBUG(errs() << "Starting Pipeline for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  // Create a filter/pipeline function for the child node
+  Function *C_Pipeline = createFunctionFilter(C);
+  Function *CF = C->getFuncPointer();
+
+  // Get module context and i32 0 constant, as they would be frequently used in
+  // this function.
+  LLVMContext &Ctx = IB->getParent()->getContext();
+  Constant *IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  // Marshall arguments
+  // Create a packed struct type with inputs of C followed by outputs and then
+  // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
+  // by i8*
+  //
+  StructType *STy = getArgumentListStructTy(C);
+  // Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
+  CallInst *CI =
+      CallInst::Create(malloc, ArrayRef<Value *>(ConstantExpr::getSizeOf(STy)),
+                       C->getFuncPointer()->getName() + ".inputs", IB);
+  CastInst *Struct = BitCastInst::CreatePointerCast(
+      CI, STy->getPointerTo(), CI->getName() + ".i8ptr", IB);
+  // AllocaInst* AI = new AllocaInst(STy,
+  // C->getFuncPointer()->getName()+".inputs", IB);
+  // Insert elements in the struct
+  DEBUG(errs() << "Marshall inputs for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  // Marshall Inputs
+  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
+    // Create constant int (i)
+    Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    // Get Element pointer instruction
+    Value *GEPIndices[] = {IntZero, Int_i};
+    GetElementPtrInst *GEP = GetElementPtrInst::Create(
+        nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+        Struct->getName() + ".arg_" + Twine(i), IB);
+    DFEdge *E = C->getInDFEdgeAt(i);
+    if (E->getSourceDF()->isEntryNode()) {
+      // This is a Bind Input Edge
+      if (E->isStreamingEdge()) {
+        // Streaming Bind Input edge. Get buffer corresponding to it
+        assert(EdgeBufferMap.count(E) &&
+               "No mapping buffer for a Streaming Bind DFEdge!");
+        new StoreInst(EdgeBufferMap[E], GEP, IB);
+      } else {
+        // Non-streaming Bind edge
+        new StoreInst(Args[i], GEP, IB);
+      }
+    } else {
+      // This is an edge between siblings.
+      // This must be an streaming edge. As it is our assumption that all edges
+      // between two nodes in a DFG are streaming.
+      assert(EdgeBufferMap.count(E) &&
+             "No mapping buffer for a Streaming DFEdge!");
+      new StoreInst(EdgeBufferMap[E], GEP, IB);
+    }
+  }
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
+  // Marshall Outputs
+  DEBUG(errs() << "Marshall outputs for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  for (unsigned i = 0; i < numOutputs; i++) {
+    // Create constant int (i+numInputs)
+    Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i + numInputs);
+    // Get Element pointer instruction
+    Value *GEPIndices[] = {IntZero, Int_i};
+    GetElementPtrInst *GEP = GetElementPtrInst::Create(
+        nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+        Struct->getName() + ".out_" + Twine(i), IB);
+    DFEdge *E = C->getOutDFEdgeAt(i);
+    assert(E->isStreamingEdge() &&
+           "Output Edge must be streaming of all nodes");
+    assert(EdgeBufferMap.count(E) &&
+           "No mapping buffer for a Out Streaming DFEdge!");
+    new StoreInst(EdgeBufferMap[E], GEP, IB);
+  }
+  // Marshall last argument. isLastInput buffer
+  DEBUG(errs() << "Marshall isLastInput for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  // Create constant int (i+numInputs)
+  Constant *Int_index =
+      ConstantInt::get(Type::getInt32Ty(Ctx), numInputs + numOutputs);
+  // Get Element pointer instruction
+  Value *GEPIndices[] = {IntZero, Int_index};
+  GetElementPtrInst *GEP = GetElementPtrInst::Create(
+      nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+      Struct->getName() + ".isLastInput", IB);
+  new StoreInst(isLastInputBuffer, GEP, IB);
+
+  // AllocaInst AI points to memory with all the arguments packed
+  // Call runtime to create the thread with these arguments
+  DEBUG(errs() << "Start Thread for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  // DEBUG(errs() << *llvm_hpvm_createThread << "\n");
+  DEBUG(errs() << *graphID->getType() << "\n");
+  DEBUG(errs() << *C_Pipeline->getType() << "\n");
+  DEBUG(errs() << *Struct->getType() << "\n");
+  // Bitcast AI to i8*
+  CastInst *BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx),
+                                                Struct->getName(), IB);
+  Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI};
+  CallInst::Create(llvm_hpvm_createThread,
+                   ArrayRef<Value *>(CreateThreadArgs, 3), "", IB);
+}
+
+Function *CGT_EPOCHS::createLaunchFunction(DFInternalNode *N) {
+  DEBUG(errs() << "Generating Streaming Launch Function\n");
+  // Get Function associated with Node N
+  Function *NF = N->getFuncPointer();
+
+  // Map from Streaming edge to buffer
+  DenseMap<DFEdge *, Value *> EdgeBufferMap;
+
+  /* Now we have all the necessary global declarations necessary to generate the
+   * Launch function, pointer to which can be passed to pthread utils to execute
+   * DFG. The Launch function has just one input: i8* data.addr
+   * This is the address of the all the input data that needs to be passed to
+   * this function. In our case it contains the input arguments of the Root
+   * function in the correct order.
+   * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
+   * (2) Extract each of inputs from data.addr
+   * (3) create Buffers for all the streaming edges
+   *     - Put buffers in the context
+   * (4) Go over each child node
+   *     - marshall its arguments together (use buffers in place of streaming
+   *       arguments)
+   *     - Start the threads
+   * (5) The return value from Root is stored in memory, pointer to which is
+   * passed to pthread_exit call.
+   */
+  // (1) Create Launch Function of type void (i8* args, i8* GraphID)
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  Type *ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
+  FunctionType *LaunchFuncTy = FunctionType::get(
+      Type::getVoidTy(NF->getContext()), ArrayRef<Type *>(ArgTypes, 2), false);
+  Function *LaunchFunc = Function::Create(
+      LaunchFuncTy, NF->getLinkage(), NF->getName() + ".LaunchFunction", &M);
+  DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
+  // Give a name to the argument which is used pass data to this thread
+  Argument *data = &*LaunchFunc->arg_begin();
+  // NOTE-HS: Check correctness with Maria
+  Argument *graphID = &*(LaunchFunc->arg_begin() + 1);
+  data->setName("data.addr");
+  graphID->setName("graphID");
+  // Add a basic block to this empty function and a return null statement to it
+  DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
+  BasicBlock *BB =
+      BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
+  ReturnInst *RI = ReturnInst::Create(LaunchFunc->getContext(), BB);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
+
+  // (2) Extract each of inputs from data.addr
+  std::vector<Type *> TyList;
+  std::vector<std::string> names;
+  std::vector<Value *> Args;
+
+  for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
+       ai != ae; ++ai) {
+    if (N->getChildGraph()
+            ->getEntry()
+            ->getOutDFEdgeAt(ai->getArgNo())
+            ->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back(Twine(ai->getName() + "_buffer").str());
+      continue;
+    }
+    TyList.push_back(ai->getType());
+    names.push_back(ai->getName());
+  }
+  Args = extractElements(data, TyList, names, RI);
+  DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc
+               << "\n");
+  // (3) Create buffers for all the streaming edges
+  for (DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
+                                de = N->getChildGraph()->dfedge_end();
+       di != de; ++di) {
+    DFEdge *Edge = *di;
+    DEBUG(errs() << *Edge->getType() << "\n");
+    Value *size = ConstantExpr::getSizeOf(Edge->getType());
+    Value *CallArgs[] = {graphID, size};
+    if (Edge->isStreamingEdge()) {
+      CallInst *CI;
+      // Create a buffer call
+      if (Edge->getSourceDF()->isEntryNode()) {
+        // Bind Input Edge
+        Constant *Int_ArgNo = ConstantInt::get(
+            Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition());
+        Value *BindInCallArgs[] = {graphID, size, Int_ArgNo};
+        CI = CallInst::Create(
+            llvm_hpvm_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3),
+            "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI);
+      } else if (Edge->getDestDF()->isExitNode()) {
+        // Bind Output Edge
+        CI = CallInst::Create(
+            llvm_hpvm_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2),
+            "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI);
+      } else {
+        // Streaming Edge
+        CI = CallInst::Create(
+            llvm_hpvm_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2),
+            Edge->getSourceDF()->getFuncPointer()->getName() + "." +
+                Edge->getDestDF()->getFuncPointer()->getName(),
+            RI);
+      }
+      EdgeBufferMap[Edge] = CI;
+    }
+  }
+  // Create buffer for isLastInput for all the child nodes
+  DFGraph *G = N->getChildGraph();
+  DenseMap<DFNode *, Value *> NodeLastInputMap;
+  for (DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce;
+       ++ci) {
+    DFNode *child = *ci;
+    if (child->isDummyNode())
+      continue;
+    Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
+    Value *CallArgs[] = {graphID, size};
+    CallInst *CI = CallInst::Create(
+        llvm_hpvm_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2),
+        "BindIn.isLastInput." + child->getFuncPointer()->getName(), RI);
+    NodeLastInputMap[child] = CI;
+  }
+  DEBUG(errs() << "Start Each child node filter\n");
+  // (4) Marshall arguments for each child node and start the thread with its
+  //     pipeline funtion
+  for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                  ce = N->getChildGraph()->end();
+       ci != ce; ++ci) {
+    DFNode *C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+
+    // Marshall all the arguments for this node into an i8*
+    // Pass to the runtime to create the thread
+    // Start the thread for child node C
+    startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI);
+  }
+
+  DEBUG(errs() << "Launch function:\n");
+  DEBUG(errs() << *LaunchFunc << "\n");
+
+  return LaunchFunc;
+}
+
+/* This fuction does the steps necessary to launch a streaming graph
+ * Steps
+ * Create Pipeline/Filter function for each node in child graph of Root
+ * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait
+ * Modify each of the instrinsic in host code
+ * Launch, Push, Pop, Wait
+ */
+void CGT_EPOCHS::codeGenLaunchStreaming(DFInternalNode *Root) {
+  IntrinsicInst *LI = Root->getInstruction();
+  Function *RootLaunch = createLaunchFunction(Root);
+  // Substitute launch intrinsic main
+  DEBUG(errs() << "Substitute launch intrinsic\n");
+  Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)};
+  CallInst *LaunchInst = CallInst::Create(
+      llvm_hpvm_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2),
+      "graph" + Root->getFuncPointer()->getName(), LI);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait instructions with cpu specific wait instructions
+  DEBUG(errs() << "Substitute wait, push, pop intrinsics\n");
+  std::vector<IntrinsicInst *> *UseList = getUseList(LI);
+  for (unsigned i = 0; i < UseList->size(); ++i) {
+    IntrinsicInst *II = UseList->at(i);
+    CallInst *CI;
+    Value *PushArgs[] = {LaunchInst, II->getOperand(1)};
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::hpvm_wait:
+      CI = CallInst::Create(llvm_hpvm_streamWait, ArrayRef<Value *>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::hpvm_push:
+      CI = CallInst::Create(llvm_hpvm_streamPush,
+                            ArrayRef<Value *>(PushArgs, 2), "");
+      break;
+    case Intrinsic::hpvm_pop:
+      CI = CallInst::Create(llvm_hpvm_streamPop, ArrayRef<Value *>(LaunchInst),
+                            "");
+      break;
+    default:
+      llvm_unreachable(
+          "GraphID is used by an instruction other than wait, push, pop");
+    };
+    DEBUG(errs() << "Replace:\n\t" << *II << "\n");
+    ReplaceInstWithInst(II, CI);
+    DEBUG(errs() << "\twith " << *CI << "\n");
+  }
+}
+
+void CGT_EPOCHS::codeGenLaunch(DFInternalNode *Root) {
+  // TODO: Place an assert to check if the constant passed by launch intrinsic
+  // as the number of arguments to DFG is same as the number of arguments of the
+  // root of DFG
+  DEBUG(errs() << "Generating Launch Function\n");
+  // Get Launch Instruction
+  IntrinsicInst *LI = Root->getInstruction();
+  // switchToTimer(hpvm_TimerID_PTHREAD_CREATE, LI);
+  DEBUG(errs() << "Generating Launch Function\n");
+
+  /* Now we have all the necessary global declarations necessary to generate the
+   * Launch function, pointer to which can be passed to pthread utils to execute
+   * DFG. The Launch function has just one input: i8* data.addr
+   * This is the address of the all the input data that needs to be passed to
+   * this function. In our case it contains the input arguments of the Root
+   * function in the correct order.
+   * (1) Create an empty Launch function of type i8*(i8*)
+   * (2) Extract each of inputs from data.addr and pass them as arguments to the
+   * call to Root function
+   * (3) The return value from Root is stored in memory, pointer to which is
+   * passed to pthread_exit call.
+   */
+  // Create Launch Function of type i8*(i8*) which calls the root function
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType *AppFuncTy = FunctionType::get(
+      i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
+  Function *AppFunc =
+      Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(),
+                       "LaunchDataflowGraph", &M);
+  DEBUG(errs() << "Generating Launch Function\n");
+  // Give a name to the argument which is used pass data to this thread
+  Value *data = &*AppFunc->arg_begin();
+  data->setName("data.addr");
+  // Add a basic block to this empty function and a return null statement to it
+  BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
+  ReturnInst *RI =
+      ReturnInst::Create(AppFunc->getContext(),
+                         Constant::getNullValue(AppFunc->getReturnType()), BB);
+  // switchToTimer(hpvm_TimerID_ARG_UNPACK, RI);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
+  // Find the CPU function generated for Root and
+  //  Function* RootF_CPU = Root->getGenFunc();
+  Function *RootF_CPU = Root->getGenFuncForTarget(hpvm::CPU_TARGET);
+  assert(RootF_CPU && "Error: No generated CPU function for Root node\n");
+  assert(Root->hasCPUGenFuncForTarget(hpvm::CPU_TARGET) &&
+         "Error: Generated Function for Root node with no cpu wrapper\n");
+
+  // Generate a call to RootF_CPU with null parameters for now
+  std::vector<Value *> Args;
+  for (unsigned i = 0; i < RootF_CPU->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(
+        Constant::getNullValue(RootF_CPU->getFunctionType()->getParamType(i)));
+  }
+  CallInst *CI =
+      CallInst::Create(RootF_CPU, Args, RootF_CPU->getName() + ".output", RI);
+
+  // Extract input data from i8* data.addr and patch them to correct argument of
+  // call to RootF_CPU. For each argument
+  std::vector<Type *> TyList;
+  std::vector<std::string> names;
+  for (Function::arg_iterator ai = RootF_CPU->arg_begin(),
+                              ae = RootF_CPU->arg_end();
+       ai != ae; ++ai) {
+    TyList.push_back(ai->getType());
+    names.push_back(ai->getName());
+  }
+  std::vector<Value *> elements = extractElements(data, TyList, names, CI);
+  // Patch the elements to the call arguments
+  for (unsigned i = 0; i < CI->getNumArgOperands(); i++)
+    CI->setArgOperand(i, elements[i]);
+
+  // Add timers around Call to RootF_CPU function
+  // switchToTimer(hpvm_TimerID_COMPUTATION, CI);
+  // switchToTimer(hpvm_TimerID_OUTPUT_PACK, RI);
+
+  StructType *RootRetTy =
+      cast<StructType>(RootF_CPU->getFunctionType()->getReturnType());
+
+  // if Root has non empty return
+  if (RootRetTy->getNumElements()) {
+    // We can't access the type of the arg struct - build it
+    std::vector<Type *> TyList;
+    for (Function::arg_iterator ai = RootF_CPU->arg_begin(),
+                                ae = RootF_CPU->arg_end();
+         ai != ae; ++ai) {
+      TyList.push_back(ai->getType());
+    }
+    TyList.push_back(CI->getType());
+
+    StructType *ArgStructTy = StructType::create(
+        M.getContext(), ArrayRef<Type *>(TyList),
+        (RootF_CPU->getName() + ".arg.struct.ty").str(), true);
+
+    // Cast the data pointer to the type of the arg struct
+    CastInst *OutputAddrCast = CastInst::CreatePointerCast(
+        data, ArgStructTy->getPointerTo(), "argStructCast.addr", RI);
+
+    // Result struct is the last element of the packed struct passed to launch
+    unsigned outStructIdx = ArgStructTy->getNumElements() - 1;
+
+    ConstantInt *IntZero =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+    ConstantInt *IntIdx =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), outStructIdx);
+
+    Value *GEPIIdxList[] = {IntZero, IntIdx};
+    // Get data pointer to the last element of struct - result field
+    GetElementPtrInst *OutGEPI = GetElementPtrInst::Create(
+        ArgStructTy, OutputAddrCast, ArrayRef<Value *>(GEPIIdxList, 2),
+        CI->getName() + ".addr", RI);
+    // Store result there
+    new StoreInst(CI, OutGEPI, RI);
+  } else {
+    // There is no return - no need to actually code gen, but for fewer
+    // changes maintain what code was already doing
+    // We were casting the data pointer to the result type of Root, and
+    // returning result there. This would work at the LLVM level, but not
+    // at the C level, thus the rewrite.
+    CastInst *OutputAddrCast = CastInst::CreatePointerCast(
+        data, CI->getType()->getPointerTo(), CI->getName() + ".addr", RI);
+    new StoreInst(CI, OutputAddrCast, RI);
+  }
+
+  // switchToTimer(hpvm_TimerID_NONE, RI);
+
+  DEBUG(errs() << "Application specific function:\n");
+  DEBUG(errs() << *AppFunc << "\n");
+
+  // Substitute launch intrinsic main
+  Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)};
+  CallInst *LaunchInst = CallInst::Create(
+      llvm_hpvm_cpu_launch, ArrayRef<Value *>(LaunchInstArgs, 2),
+      "graph" + Root->getFuncPointer()->getName(), LI);
+  // ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait instructions with cpu specific wait instructions
+  std::vector<IntrinsicInst *> *UseList = getUseList(LI);
+  for (unsigned i = 0; i < UseList->size(); ++i) {
+    IntrinsicInst *II = UseList->at(i);
+    CallInst *CI;
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::hpvm_wait:
+      CI = CallInst::Create(llvm_hpvm_cpu_wait, ArrayRef<Value *>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::hpvm_push:
+      CI = CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::hpvm_pop:
+      CI = CallInst::Create(llvm_hpvm_bufferPop, ArrayRef<Value *>(LaunchInst),
+                            "");
+      break;
+    default:
+      llvm_unreachable(
+          "GraphID is used by an instruction other than wait, push, pop");
+    };
+    ReplaceInstWithInst(II, CI);
+    DEBUG(errs() << *CI << "\n");
+  }
+}
+
+/* This function takes a DFNode, and creates a filter function for it. By filter
+ * function we mean a function which keeps on getting input from input buffers,
+ * applying the function on the inputs and then pushes data on output buffers
+ */
+// Create a function with void* (void*) type.
+// Create a new basic block
+// Add a return instruction to the basic block
+// extract arguments from the aggregate data input. Type list would be
+// Replace the streaming inputs with i8* types signifying handle to
+// corresponding buffers
+// Add a boolean argument isLastInput
+// Add runtime API calls to get input for each of the streaming inputs
+// Add a call to the generated function of the child node
+// Add runtime API calls to push output for each of the streaming outputs
+// Add loop around the basic block, which exits the loop if isLastInput is false
+
+Function *CGT_EPOCHS::createFunctionFilter(DFNode *C) {
+  DEBUG(errs() << "*********Creating Function filter for "
+               << C->getFuncPointer()->getName() << "*****\n");
+
+  /* Create a function with same argument list as child.*/
+  DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
+  // Get the generated function for child node
+  Function *CF = C->getFuncPointer();
+  // Create Filter Function of type i8*(i8*) which calls the root function
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType *CF_PipelineTy = FunctionType::get(
+      i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
+  Function *CF_Pipeline = Function::Create(CF_PipelineTy, CF->getLinkage(),
+                                           CF->getName() + "_Pipeline", &M);
+  DEBUG(errs() << "Generating Pipeline Function\n");
+  // Give a name to the argument which is used pass data to this thread
+  Value *data = &*CF_Pipeline->arg_begin();
+  data->setName("data.addr");
+  // Create a new basic block
+  DEBUG(errs() << "\tCreate new BB and add a return function\n");
+  // Add a basic block to this empty function
+  BasicBlock *BB =
+      BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
+  // Add a return instruction to the basic block
+  ReturnInst *RI =
+      ReturnInst::Create(CF_Pipeline->getContext(),
+                         UndefValue::get(CF_Pipeline->getReturnType()), BB);
+
+  /* Extract the elements from the aggregate argument to the function.
+   * Replace the streaming inputs with i8* types signifying handle to
+   * corresponding buffers
+   * Add outputs to the list as well
+   * Add isLastInput to the list
+   */
+  DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
+  // These Args will be used when passing arguments to the generated function
+  // inside loop, and reading outputs as well.
+  std::vector<Value *> Args;
+  std::vector<Type *> TyList;
+  std::vector<std::string> names;
+  // Adding inputs
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
+       ++i) {
+    if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back((Twine(i->getName()) + "_buffer").str());
+    } else {
+      TyList.push_back(i->getType());
+      names.push_back(i->getName());
+    }
+  }
+  // Adding outputs. FIXME: Since we assume all outputs to be streaming edges,
+  // because we get there buffer handles
+  StructType *RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i = 0; i < RetTy->getNumElements(); i++) {
+    TyList.push_back(i8Ty->getPointerTo());
+    names.push_back("out");
+  }
+  /* Add a boolean argument isLastInput */
+  DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n");
+  TyList.push_back(i8Ty->getPointerTo());
+  names.push_back("isLastInput_buffer");
+
+  // Extract the inputs, outputs
+  Args = extractElements(data, TyList, names, RI);
+  for (unsigned i = 0; i < Args.size(); i++) {
+    DEBUG(errs() << *Args[i] << "\n");
+  }
+
+  // Split the Args vector into, input output and isLastInput
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = RetTy->getNumElements();
+  std::vector<Value *> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value *> OutputArgs(Args.begin() + numInputs,
+                                  Args.begin() + numInputs + numOutputs);
+  Instruction *isLastInput = cast<Instruction>(Args[Args.size() - 1]);
+
+  /* Add runtime API calls to get input for each of the streaming input edges */
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the "
+                  "streaming input edges\n");
+  // First read the termination condition variable islastInput
+  CallInst *isLastInputPop = CallInst::Create(
+      llvm_hpvm_bufferPop, ArrayRef<Value *>(isLastInput), "", RI);
+
+  CastInst *BI = BitCastInst::CreateIntegerCast(
+      isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false,
+      "isLastInput", RI);
+  isLastInput = BI;
+  // Create a loop termination condition
+  CmpInst *Cond = CmpInst::Create(
+      Instruction::ICmp, CmpInst::ICMP_NE, isLastInput,
+      Constant::getNullValue(Type::getInt64Ty(CF->getContext())),
+      "isLastInputNotZero", RI);
+
+  // Get input from buffers of all the incoming streaming edges
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
+       ++i) {
+    if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      CallInst *bufferIn =
+          CallInst::Create(llvm_hpvm_bufferPop,
+                           ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI);
+      CastInst *BI;
+      if (i->getType()->isPointerTy()) {
+        BI = CastInst::Create(CastInst::IntToPtr, bufferIn, i->getType(),
+                              i->getName() + ".addr", RI);
+      } else if (i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn, i->getType(),
+                                    i->getName() + ".addr", RI);
+      } else {
+        BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), false,
+                                         i->getName() + ".addr", RI);
+      }
+      // Replace the argument in Args vector. We would be using the vector as
+      // parameters passed to the call
+      InputArgs[i->getArgNo()] = BI;
+    }
+  }
+  /* Add a call to the generated function of the child node */
+  DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
+  //  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+  //  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+  //                                  C->getGenFunc()->getName()+".output", RI);
+  Function *CGenF = C->getGenFuncForTarget(hpvm::CPU_TARGET);
+  DEBUG(errs() << "Type: " << *CGenF->getType() << "\n");
+  CallInst *CI =
+      CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI);
+
+  /* Add runtime API calls to push output for each of the streaming outputs */
+  // FIXME: Assumption
+  // All edges between siblings are streaming edges
+  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the "
+                  "streaming outputs\n");
+  for (unsigned i = 0; i < numOutputs; i++) {
+    // Extract output
+    ExtractValueInst *EI =
+        ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), "", RI);
+    // Convert to i64
+    CastInst *BI;
+    if (EI->getType()->isPointerTy())
+      BI =
+          CastInst::Create(CastInst::PtrToInt, EI,
+                           Type::getInt64Ty(CF_Pipeline->getContext()), "", RI);
+    else
+      BI = CastInst::CreateIntegerCast(
+          EI, Type::getInt64Ty(CF_Pipeline->getContext()), false, "", RI);
+    // Push to Output buffer
+    Value *bufferOutArgs[] = {OutputArgs[i], BI};
+    CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2),
+                     "", RI);
+  }
+
+  // Add loop around the basic block, which exits the loop if isLastInput is
+  // false Pointers to keep the created loop structure
+  Instruction *CondStartI = cast<Instruction>(isLastInputPop);
+  Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
+  addWhileLoop(CondStartI, BodyStartI, RI, Cond);
+
+  // Return the Function pointer
+  DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
+  DEBUG(errs() << *CF_Pipeline << "\n");
+  return CF_Pipeline;
+}
+// Redefining invokeChild() for CGT_EPOCHS because we need to use member
+// variables like the SPTR and the TASKDEFINITION
+void CGT_EPOCHS::invokeChild(DFNode *C, Function *F_CPU,
+                             ValueToValueMapTy &VMap, Instruction *IB,
+                             hpvm::Target Tag) {
+  Function *CF = C->getFuncPointer();
+  if (Tag == hpvm::CPU_TARGET) {
+
+    //  Function* CF_CPU = C->getGenFunc();
+    Function *CF_CPU = C->getGenFuncForTarget(Tag);
+    assert(CF_CPU != NULL &&
+           "Found leaf node for which code generation has not happened yet!\n");
+    assert(C->hasCPUGenFuncForTarget(Tag) &&
+           "The generated function to be called from cpu backend is not an cpu "
+           "function\n");
+    DEBUG(errs() << "Invoking child node" << CF_CPU->getName() << "\n");
+
+    std::vector<Value *> Args;
+    // Create argument list to pass to call instruction
+    // First find the correct values using the edges
+    // The remaing six values are inserted as constants for now.
+    for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
+      Args.push_back(getInValueAt(C, i, F_CPU, IB));
+    }
+
+    Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_CPU->getContext()), 0);
+    for (unsigned j = 0; j < 6; j++)
+      Args.push_back(I64Zero);
+
+    DEBUG(errs() << "Gen Function type: " << *CF_CPU->getType() << "\n");
+    DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n");
+    DEBUG(errs() << "Arguments: "; for (const Value *Arg
+                                        : Args) errs()
+                                   << *(Arg->getType()) << ", ";
+          errs() << "\n");
+
+    // Copying over index calculation.
+    copyChildIndexCalc(C, F_CPU, VMap, IB);
+
+    // Call the F_CPU function associated with this node
+    CallInst *CI =
+        CallInst::Create(CF_CPU, Args, CF_CPU->getName() + "_output", IB);
+    DEBUG(errs() << *CI << "\n");
+    OutputMap[C] = CI;
+
+    // Find num of dimensions this node is replicated in.
+    // Based on number of dimensions, insert loop instructions
+    std::string varNames[3] = {"x", "y", "z"};
+    unsigned numArgs = CI->getNumArgOperands();
+    for (unsigned j = 0; j < C->getNumOfDim(); j++) {
+      Value *indexLimit = NULL;
+      // Limit can either be a constant or an arguement of the internal node.
+      // In case of constant we can use that constant value directly in the
+      // new F_CPU function. In case of an argument, we need to get the mapped
+      // value using VMap
+      if (ConstantInt *ConstDimLimit =
+              dyn_cast<ConstantInt>(C->getDimLimits()[j])) {
+        indexLimit = C->getDimLimits()[j];
+        DEBUG(errs() << "In Constant case:\n"
+                     << "  indexLimit type = " << *indexLimit->getType()
+                     << "\n");
+        if (ConstDimLimit->getZExtValue() == 1) {
+          DEBUG(errs() << "DimLimit is 1, no need for loop!\n");
+          continue;
+        }
+      } else if (isa<Constant>(C->getDimLimits()[j])) {
+        indexLimit = C->getDimLimits()[j];
+        DEBUG(errs() << "In Constant case:\n"
+                     << "  indexLimit type = " << *indexLimit->getType()
+                     << "\n");
+      } else {
+        indexLimit = VMap[C->getDimLimits()[j]];
+        DEBUG(errs() << "In VMap case:"
+                     << "  indexLimit type = " << *indexLimit->getType()
+                     << "\n");
+      }
+      assert(indexLimit && "Invalid dimension limit!");
+      // Insert loop
+      Value *indexVar = addLoop(CI, indexLimit, varNames[j]);
+      DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
+      // Insert index variable and limit arguments
+      CI->setArgOperand(numArgs - 6 + j, indexVar);
+      CI->setArgOperand(numArgs - 3 + j, indexLimit);
+    }
+
+    // Insert call to runtime to push the dim limits and instanceID on the depth
+    // stack
+    Value *args[] = {
+        ConstantInt::get(Type::getInt32Ty(CI->getContext()),
+                         C->getNumOfDim()), // numDim
+        CI->getArgOperand(numArgs - 3 + 0), // limitX
+        CI->getArgOperand(numArgs - 6 + 0), // iX
+        CI->getArgOperand(numArgs - 3 + 1), // limitY
+        CI->getArgOperand(numArgs - 6 + 1), // iY
+        CI->getArgOperand(numArgs - 3 + 2), // limitZ
+        CI->getArgOperand(numArgs - 6 + 2)  // iZ
+    };
+
+    CallInst *Push = CallInst::Create(llvm_hpvm_cpu_dstack_push,
+                                      ArrayRef<Value *>(args, 7), "", CI);
+    DEBUG(errs() << "Push on stack: " << *Push << "\n");
+    // Insert call to runtime to pop the dim limits and instanceID from the
+    // depth stack
+    BasicBlock::iterator i(CI);
+    ++i;
+    Instruction *NextI = &*i;
+    // Next Instruction should also belong to the same basic block as the basic
+    // block will have a terminator instruction
+    assert(NextI->getParent() == CI->getParent() &&
+           "Next Instruction should also belong to the same basic block!");
+
+    CallInst *Pop = CallInst::Create(llvm_hpvm_cpu_dstack_pop, None, "", NextI);
+    DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
+    DEBUG(errs() << *CI->getParent()->getParent());
+  } else if (Tag == hpvm::EPOCHS_TARGET) {
+    // If this is an EPOCHS_TARGET node, we want to issue the setup_task call
+    IRBuilder<> Builder(IB);
+    DFLeafNode *CLeaf = dyn_cast<DFLeafNode>(C);
+    std::vector<Value *> SetupTaskArgs;
+
+    // Load the Schedule Pointer
+    auto *SchedulerPtr = Builder.CreateLoad(SchedulerHandleAddress);
+    DEBUG(errs() << *SchedulerPtr << "\n");
+    auto *SchedulerPtrBC = Builder.CreateBitCast(
+        SchedulerPtr, Type::getInt8PtrTy(M.getContext()), "scheduler_handle");
+    DEBUG(errs() << *SchedulerPtrBC << "\n");
+    SetupTaskArgs.push_back(SchedulerPtrBC);
+
+    // Load the Task handle
+    auto TI = TaskHandleMap[CLeaf];
+    auto *TaskHandleAddr = TI.TaskHandleAddr;
+    auto *TaskHandle = Builder.CreateLoad(TaskHandleAddr);
+    DEBUG(errs() << *TaskHandleAddr << "\n");
+    DEBUG(errs() << *TaskHandle << "\n");
+    SetupTaskArgs.push_back(TaskHandle);
+
+    // Get the criticality
+    auto Criticality = CLeaf->getCriticality();
+    if (Criticality < 2)
+      Criticality = 1;
+    else if (Criticality > 2)
+      Criticality = 3;
+    Value *CriticalityVal =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), Criticality);
+    DEBUG(errs() << *CriticalityVal << "\n");
+    SetupTaskArgs.push_back(CriticalityVal);
+
+    // Get the AutoFinish. If node does not have any Out DFEdges then it should
+    // have autofinish.
+    Value *AutoFinish;
+    if (CLeaf->outdfedge_empty())
+      AutoFinish = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+    else
+      AutoFinish = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+    DEBUG(errs() << *AutoFinish << "\n");
+    SetupTaskArgs.push_back(AutoFinish);
+
+    // Load the DAGId
+    auto *DagID = Builder.CreateLoad(DagIDGlobal);
+    DEBUG(errs() << *DagID << "\n");
+    SetupTaskArgs.push_back(DagID);
+
+    // Create argument list to pass to set_up_args
+    // Find the correct values using the edges
+    for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
+      auto *InValueArg = getInValueAt(C, i, F_CPU, IB);
+      DEBUG(errs() << i << ": " << *InValueArg << "\n");
+      SetupTaskArgs.push_back(InValueArg);
+    }
+
+    CallInst *SetupTaskInst =
+        Builder.CreateCall(set_up_task, ArrayRef<Value *>(SetupTaskArgs),
+                           "setup_task_" + TI.TaskName);
+    DEBUG(errs() << *SetupTaskInst << "\n");
+
+    auto *TaskMBBlockAddr = new GlobalVariable(
+        M, SetupTaskInst->getType(), false, GlobalValue::CommonLinkage,
+        Constant::getNullValue(SetupTaskInst->getType()),
+        "MB_" + TI.TaskName + ".addr");
+    DEBUG(errs() << *TaskMBBlockAddr << "\n");
+    auto *SI = Builder.CreateStore(SetupTaskInst, TaskMBBlockAddr);
+    DEBUG(errs() << *SI << "\n");
+
+    TaskMBMap[CLeaf] = TaskMBBlockAddr;
+
+    Value *RequestExecArgs[] = {SetupTaskInst};
+    CallInst *RequestExecutionInst = Builder.CreateCall(
+        request_execution, ArrayRef<Value *>(RequestExecArgs));
+    DEBUG(errs() << *RequestExecutionInst << "\n");
+  }
+}
+
+Value *CGT_EPOCHS::getInValueAt(DFNode *Child, unsigned i,
+                                Function *ParentF_CPU,
+                                Instruction *InsertBefore) {
+  Value *inputVal;
+  // TODO: Assumption is that each input port of a node has just one
+  // incoming edge. May change later on.
+  hpvm::Target Tag = Child->getTag();
+  // Find the incoming edge at the requested input port
+  DEBUG(errs() << "Finding incoming edge " << i << " for "
+               << Child->getFuncPointer()->getName() << "\n");
+  DFEdge *E = Child->getInDFEdgeAt(i);
+  assert(E && "No incoming edge or binding for input element!");
+  // Find the Source DFNode associated with the incoming edge
+  DFNode *SrcDF = E->getSourceDF();
+
+  // If Source DFNode is a dummyNode, edge is from parent. Get the
+  // argument from argument list of this internal node
+  if (SrcDF->isEntryNode()) {
+    inputVal = getArgumentAt(ParentF_CPU, E->getSourcePosition());
+    DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+  } else {
+    if (SrcDF->getTag() == hpvm::CPU_TARGET) {
+      // edge is from a sibling
+      // Check - code should already be generated for this source dfnode
+      assert(OutputMap.count(SrcDF) &&
+             "Source node call not found. Dependency violation!");
+
+      // Find CallInst associated with the Source DFNode using OutputMap
+      Value *CI = OutputMap[SrcDF];
+
+      // Extract element at source position from this call instruction
+      std::vector<unsigned> IndexList;
+      IndexList.push_back(E->getSourcePosition());
+      DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
+      ExtractValueInst *EI =
+          ExtractValueInst::Create(CI, IndexList, "", InsertBefore);
+      inputVal = EI;
+    } else if (SrcDF->getTag() == hpvm::EPOCHS_TARGET) {
+      // TODO: Check how to handle internal node source
+      // Assumption: source can only be a leaf node
+      assert(isa<DFLeafNode>(SrcDF) &&
+             "Source can only be a leaf node for now!");
+      DFLeafNode *SrcLeaf = dyn_cast<DFLeafNode>(SrcDF);
+
+      // Issue the call to finish()
+      finishNodeEPOCHS(SrcLeaf, InsertBefore, ParentF_CPU);
+
+      // Get the inputVal
+      auto OutPos = E->getSourcePosition();
+      // The returnArgMap tells us which argument in the leaf node corresponds
+      // to this output. We need to get the corresponding argument in the
+      // wrapper.
+      auto LeafArgNum = returnArgMap[SrcLeaf][OutPos];
+      auto *InEdge = SrcLeaf->getInDFEdgeAt(LeafArgNum);
+      auto ParentArgNum = InEdge->getSourcePosition();
+      inputVal = getArgumentAt(ParentF_CPU, ParentArgNum);
+      // inputVal = getArgumentAt(ParentF_CPU,
+      // returnArgMap[SrcLeaf][E->getSourcePosition()]);
+    }
+  }
+  return inputVal;
+}
+
+void CGT_EPOCHS::finishNodeEPOCHS(DFLeafNode *Leaf, Instruction *IB,
+                                  Function *ParentF_CPU) {
+  if (FinishedTasks.find(Leaf) != FinishedTasks.end()) {
+    // Task has already been finished. Exit;
+    return;
+  }
+
+  DEBUG(errs() << "Finishing Task: \n");
+
+  IRBuilder<> Builder(IB);
+  std::vector<Value *> FinishArgs;
+  auto *TaskMBBlockAddress = TaskMBMap[Leaf];
+  auto *TaskMBBlock = Builder.CreateLoad(TaskMBBlockAddress);
+  DEBUG(errs() << *TaskMBBlockAddress << "\n");
+  DEBUG(errs() << *TaskMBBlock << "\n");
+  FinishArgs.push_back(TaskMBBlock);
+
+  for (auto ei = Leaf->outdfedge_begin(); ei != Leaf->outdfedge_end(); ++ei) {
+    auto *E = *ei;
+    DEBUG(E->dump());
+    auto OutPos = E->getSourcePosition();
+    // The returnArgMap tells us which argument in the leaf node corresponds to
+    // this output. We need to get the corresponding argument in the wrapper.
+    auto LeafArgNum = returnArgMap[Leaf][OutPos];
+    auto *InEdge = Leaf->getInDFEdgeAt(LeafArgNum);
+    auto ParentArgNum = InEdge->getSourcePosition();
+    Argument *OutArg = getArgumentAt(ParentF_CPU, ParentArgNum);
+    DEBUG(errs() << *OutArg << "\n");
+    if (std::find(FinishArgs.begin(), FinishArgs.end(), OutArg) ==
+        FinishArgs.end())
+      FinishArgs.push_back(OutArg);
+  }
+
+  auto *FinishCall =
+      Builder.CreateCall(finish_task_execution, ArrayRef<Value *>(FinishArgs));
+  DEBUG(errs() << *FinishCall << "\n");
+  FinishedTasks.insert(Leaf);
+
+  // Add explicit wait call before finish call 
+  waitOnTask(Leaf, FinishCall);
+  
+
+
+}
+
+void CGT_EPOCHS::waitOnTask(DFLeafNode *Leaf, Instruction *IB) {
+
+  DEBUG(errs() << "Adding wait call \n");
+  
+  std::vector<Value *> WaitArgs;
+  IRBuilder<> Builder(IB);
+
+  // Load the Schedule Pointer
+  auto *SchedulerPtr = Builder.CreateLoad(SchedulerHandleAddress);
+  DEBUG(errs() << *SchedulerPtr << "\n");  
+  auto *SchedulerPtrBC = Builder.CreateBitCast(
+    SchedulerPtr, Type::getInt8PtrTy(M.getContext()), "scheduler_handle");
+  DEBUG(errs() << *SchedulerPtrBC << "\n");
+  WaitArgs.push_back(SchedulerPtrBC);
+
+
+  // Add number of tasks to wait on
+  // (In this case one task)
+  WaitArgs.push_back(ConstantInt::get(Type::getInt32Ty(M.getContext()), 1));
+
+
+  auto *TaskMBBlockAddress = TaskMBMap[Leaf];
+  auto *TaskMBBlock = Builder.CreateLoad(TaskMBBlockAddress);
+  DEBUG(errs() << *TaskMBBlockAddress << "\n");
+  DEBUG(errs() << *TaskMBBlock << "\n");
+  WaitArgs.push_back(TaskMBBlock);
+
+
+ 
+  auto *WaitCall =
+      Builder.CreateCall(wait_on_tasklist, ArrayRef<Value *>(WaitArgs));
+  DEBUG(errs() << *WaitCall << "\n");
+
+}
+
+void CGT_EPOCHS::codeGen(DFInternalNode *N) {
+  // Check if N is root node and its graph is streaming. We do not do codeGen
+  // for Root in such a case
+  if (N->isRoot() && N->isChildGraphStreaming())
+    return;
+
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+  //  if(N->getGenFunc() != NULL)
+  //    return;
+  if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) {
+    DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
+                 << " : skipping it\n");
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
+
+  // Sort children in topological order before code generation
+  N->getChildGraph()->sortChildren();
+
+  // Only process if all children have a CPU cpu function
+  // Otherwise skip to end
+  bool codeGen = true;
+  for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                  ce = N->getChildGraph()->end();
+       ci != ce; ++ci) {
+    DFNode *C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+
+    if (!(C->hasCPUGenFuncForTarget(hpvm::CPU_TARGET)) &&
+        !(C->hasCPUGenFuncForTarget(hpvm::EPOCHS_TARGET))) {
+      DEBUG(errs() << "No CPU & EPOCHS cpu version for child node "
+                   << C->getFuncPointer()->getName()
+                   << "\n  Skip code gen for parent node "
+                   << N->getFuncPointer()->getName() << "\n");
+      codeGen = false;
+    }
+  }
+
+  if (codeGen) {
+    Function *F = N->getFuncPointer();
+    // Create of clone of F with no instructions. Only the type is the same as F
+    // without the extra arguments.
+    Function *F_CPU;
+
+    // Clone the function, if we are seeing this function for the first time. We
+    // only need a clone in terms of type.
+    ValueToValueMapTy VMap;
+
+    // Create new function with the same type
+    F_CPU = Function::Create(F->getFunctionType(), F->getLinkage(),
+                             F->getName(), &M);
+
+    // Loop over the arguments, copying the names of arguments over.
+    Function::arg_iterator dest_iterator = F_CPU->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName()); // Copy the name over...
+      // Increment dest iterator
+      ++dest_iterator;
+    }
+
+    // Add a basic block to this empty function
+    BasicBlock *BB = BasicBlock::Create(F_CPU->getContext(), "entry", F_CPU);
+    ReturnInst *RI = ReturnInst::Create(
+        F_CPU->getContext(), UndefValue::get(F_CPU->getReturnType()), BB);
+
+    // Add Index and Dim arguments except for the root node and the child graph
+    // of parent node is not streaming
+    if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+      F_CPU = addIdxDimArgs(F_CPU);
+
+    BB = &*F_CPU->begin();
+    RI = cast<ReturnInst>(BB->getTerminator());
+
+    // Add generated function info to DFNode
+    //    N->setGenFunc(F_CPU, hpvm::CPU_TARGET);
+    N->addGenFunc(F_CPU, hpvm::CPU_TARGET, true);
+
+    // Loop over the arguments, to create the VMap.
+    dest_iterator = F_CPU->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      // Add mapping and increment dest iterator
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+
+    // Iterate over children in topological order
+    for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                    ce = N->getChildGraph()->end();
+         ci != ce; ++ci) {
+      DFNode *C = *ci;
+      // Skip dummy node call
+      if (C->isDummyNode())
+        continue;
+
+      // Create calls to CPU function of child node
+      invokeChild(C, F_CPU, VMap, RI, C->getTag());
+    }
+
+    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+    // Generate code for output bindings
+    // Get Exit node
+    DFNode *C = N->getChildGraph()->getExit();
+    // Get OutputType of this node
+    StructType *OutTy = N->getOutputType();
+    Value *retVal = UndefValue::get(F_CPU->getReturnType());
+    // Find all the input edges to exit node
+    for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
+      DEBUG(errs() << "Output Edge " << i << "\n");
+      // Find the incoming edge at the requested input port
+      DFEdge *E = C->getInDFEdgeAt(i);
+
+      assert(E && "No Binding for output element!");
+      // Find the Source DFNode associated with the incoming edge
+      DFNode *SrcDF = E->getSourceDF();
+
+      DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
+                   << "\n");
+
+      // If Source DFNode is a dummyNode, edge is from parent. Get the
+      // argument from argument list of this internal node
+      Value *inputVal;
+      if (SrcDF->isEntryNode()) {
+        inputVal = getArgumentAt(F_CPU, i);
+        DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+      } else {
+        if (SrcDF->getTag() == hpvm::CPU_TARGET) {
+          // edge is from a internal node
+          // Check - code should already be generated for this source dfnode
+          assert(OutputMap.count(SrcDF) &&
+                 "Source node call not found. Dependency violation!");
+
+          // Find Output Value associated with the Source DFNode using OutputMap
+          Value *CI = OutputMap[SrcDF];
+
+          // Extract element at source position from this call instruction
+          std::vector<unsigned> IndexList;
+          IndexList.push_back(E->getSourcePosition());
+          DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI
+                       << "\n");
+          ExtractValueInst *EI =
+              ExtractValueInst::Create(CI, IndexList, "", RI);
+          inputVal = EI;
+        } else if (SrcDF->getTag() == hpvm::EPOCHS_TARGET) {
+          // TODO: Check how to handle internal node source
+          // Assumption: source can only be a leaf node
+          assert(isa<DFLeafNode>(SrcDF) &&
+                 "Source can only be a leaf node for now!");
+          DFLeafNode *SrcLeaf = dyn_cast<DFLeafNode>(SrcDF);
+
+          // Issue the call to finish()
+          finishNodeEPOCHS(SrcLeaf, RI, F_CPU);
+
+          // Get the inputVal
+          auto OutPos = E->getSourcePosition();
+          // The returnArgMap tells us which argument in the leaf node
+          // corresponds to this output. We need to get the corresponding
+          // argument in the wrapper.
+          auto LeafArgNum = returnArgMap[SrcLeaf][OutPos];
+          auto *InEdge = SrcLeaf->getInDFEdgeAt(LeafArgNum);
+          auto ParentArgNum = InEdge->getSourcePosition();
+          inputVal = getArgumentAt(F_CPU, ParentArgNum);
+          // inputVal = getArgumentAt(
+          // F_CPU, returnArgMap[SrcLeaf][E->getSourcePosition()]);
+        }
+      }
+      std::vector<unsigned> IdxList;
+      IdxList.push_back(i);
+      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+    }
+    DEBUG(errs() << "Extracted all\n");
+    retVal->setName("output");
+    ReturnInst *newRI = ReturnInst::Create(F_CPU->getContext(), retVal);
+    ReplaceInstWithInst(RI, newRI);
+
+    // Increment the dagid
+    IRBuilder<> Builder(newRI);
+    auto *DagID = Builder.CreateLoad(DagIDGlobal, "DagID");
+    auto *Increment = Builder.CreateAdd(
+        DagID, ConstantInt::get(Type::getInt32Ty(M.getContext()), 1),
+        "DagID.increment");
+    Builder.CreateStore(Increment, DagIDGlobal);
+  }
+
+  //-------------------------------------------------------------------------//
+  // Here, we need to check if this node (N) has more than one versions
+  // If so, we query the policy and have a call to each version
+  // If not, we see which version exists, check that it is in fact an cpu
+  // function and save it as the CPU_TARGET function
+
+  // TODO: hpvm_id per node, so we can use this for id for policies
+  // For now, use node function name and change it later
+  Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
+  Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
+
+  bool CFcpu = N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET);
+  bool GFcpu = N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET);
+
+  DEBUG(errs() << "Before editing\n");
+  DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
+               << N->getTag() << "\n");
+  DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
+  DEBUG(errs() << "hascpuGenFuncForCPU : " << CFcpu << "\n");
+  DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
+  DEBUG(errs() << "hascpuGenFuncForGPU : " << GFcpu << "\n");
+
+  if (N->getTag() == hpvm::None) {
+    // No code is available for this node. This (usually) means that this
+    // node is a node that
+    // - from the accelerator backends has been mapped to an intermediate
+    // node, and thus they have not produced a genFunc
+    // - a child node had no CPU hint, thus no code gen for CPU could
+    // take place
+    DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node "
+                 << N->getFuncPointer()->getName() << "\n");
+  } else if (hpvmUtils::isSingleTargetTag(N->getTag())) {
+    // There is a single version for this node according to code gen hints.
+    // Therefore, we do not need to check the policy, we simply use the
+    // available implementation, whichever target it is for.
+
+    // Sanity check - to be removed TODO
+    switch (N->getTag()) {
+    case hpvm::CPU_TARGET:
+      assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) && "");
+      assert(N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET) && "");
+      assert(!(N->getGenFuncForTarget(hpvm::GPU_TARGET)) && "");
+      assert(!(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET)) && "");
+      break;
+    case hpvm::GPU_TARGET:
+      assert(!(N->getGenFuncForTarget(hpvm::CPU_TARGET)) && "");
+      assert(!(N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET)) && "");
+      assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) && "");
+      assert(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET) && "");
+      break;
+    default:
+      assert(false && "Unreachable: we checked that tag was single target!\n");
+      break;
+    }
+
+    N->addGenFunc(N->getGenFuncForTarget(N->getTag()), hpvm::CPU_TARGET, true);
+    N->removeGenFuncForTarget(hpvm::GPU_TARGET);
+    N->setTag(hpvm::CPU_TARGET);
+
+    // Sanity checks - to be removed TODO
+    CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
+    GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
+
+    CFcpu = N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET);
+    GFcpu = N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET);
+
+    DEBUG(errs() << "After editing\n");
+    DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
+                 << N->getTag() << "\n");
+    DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
+    DEBUG(errs() << "hascpuGenFuncForCPU : " << CFcpu << "\n");
+    DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
+    DEBUG(errs() << "hascpuGenFuncForGPU : " << GFcpu << "\n");
+
+  } else {
+    assert(false && "Multiple tags unsupported!");
+  }
+}
+
+// Code generation for leaf nodes
+void CGT_EPOCHS::codeGen(DFLeafNode *N) {
+  // Skip code generation if it is a dummy node
+  if (N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Allocation nodes are skipped
+  if (N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
+
+  // If this is a regular CPU node, perform regular CPU codegen.
+  if (N->getTargetHint() == hpvm::CPU_TARGET) {
+    if (N->getGenFuncForTarget(hpvm::CPU_TARGET) != NULL) {
+      DEBUG(errs() << "Already generated CPU code for this node!\n");
+      return;
+    }
+    //  assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
+    //         "Error: Visiting a node for which code already generated\n");
+
+    std::vector<IntrinsicInst *> IItoRemove;
+    std::vector<std::pair<IntrinsicInst *, Value *>> IItoReplace;
+    BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
+
+    // Get the function associated woth the dataflow node
+    Function *F = N->getFuncPointer();
+    DEBUG(errs() << "Generating CPU code for function " << F->getName()
+                 << "\n");
+
+    // Clone the function, if we are seeing this function for the first time.
+    Function *F_CPU;
+    ValueToValueMapTy VMap;
+    F_CPU = CloneFunction(F, VMap);
+    F_CPU->removeFromParent();
+    // Insert the cloned function into the module
+    M.getFunctionList().push_back(F_CPU);
+
+    // Add the new argument to the argument list. Add arguments only if the cild
+    // graph of parent node is not streaming
+    if (!N->getParent()->isChildGraphStreaming())
+      F_CPU = addIdxDimArgs(F_CPU);
+
+    // Add generated function info to DFNode
+    //  N->setGenFunc(F_CPU, hpvm::CPU_TARGET);
+    N->addGenFunc(F_CPU, hpvm::CPU_TARGET, true);
+
+    // Go through the arguments, and any pointer arguments with in attribute
+    // need to have cpu_argument_ptr call to get the cpu ptr of the argument
+    // Insert these calls in a new BB which would dominate all other BBs
+    // Create new BB
+    BasicBlock *EntryBB = &*F_CPU->begin();
+    BasicBlock *BB =
+        BasicBlock::Create(M.getContext(), "getHPVMPtrArgs", F_CPU, EntryBB);
+    BranchInst *Terminator = BranchInst::Create(EntryBB, BB);
+    // Insert calls
+    for (Function::arg_iterator ai = F_CPU->arg_begin(), ae = F_CPU->arg_end();
+         ai != ae; ++ai) {
+      if (F_CPU->getAttributes().hasAttribute(ai->getArgNo() + 1,
+                                              Attribute::In)) {
+        assert(ai->getType()->isPointerTy() &&
+               "Only pointer arguments can have hpvm in/out attributes ");
+        Function::arg_iterator aiNext = ai;
+        ++aiNext;
+        Argument *size = &*aiNext;
+        assert(size->getType() == Type::getInt64Ty(M.getContext()) &&
+               "Next argument after a pointer should be an i64 type");
+        CastInst *BI = BitCastInst::CreatePointerCast(
+            &*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr",
+            Terminator);
+        Value *ArgPtrCallArgs[] = {BI, size};
+        CallInst::Create(llvm_hpvm_cpu_argument_ptr,
+                         ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator);
+      }
+    }
+    DEBUG(errs() << *BB << "\n");
+
+    // Go through all the instructions
+    for (inst_iterator i = inst_begin(F_CPU), e = inst_end(F_CPU); i != e;
+         ++i) {
+      Instruction *I = &(*i);
+      DEBUG(errs() << *I << "\n");
+      // Leaf nodes should not contain HPVM graph intrinsics or launch
+      assert(!BuildDFG::isHPVMLaunchIntrinsic(I) &&
+             "Launch intrinsic within a dataflow graph!");
+      assert(!BuildDFG::isHPVMGraphIntrinsic(I) &&
+             "HPVM graph intrinsic within a leaf dataflow node!");
+
+      if (BuildDFG::isHPVMQueryIntrinsic(I)) {
+        IntrinsicInst *II = cast<IntrinsicInst>(I);
+        IntrinsicInst *ArgII;
+        DFNode *ArgDFNode;
+
+        /***********************************************************************
+         *                        Handle HPVM Query intrinsics *
+         ***********************************************************************/
+        switch (II->getIntrinsicID()) {
+        /**************************** llvm.hpvm.getNode() *******************/
+        case Intrinsic::hpvm_getNode: {
+          // add mapping <intrinsic, this node> to the node-specific map
+          Leaf_HandleToDFNodeMap[II] = N;
+          IItoRemove.push_back(II);
+          break;
+        }
+        /************************* llvm.hpvm.getParentNode() ****************/
+        case Intrinsic::hpvm_getParentNode: {
+          // get the parent node of the arg node
+          // get argument node
+          ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+          // get the parent node of the arg node
+          ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+          // Add mapping <intrinsic, parent node> to the node-specific map
+          // the argument node must have been added to the map, orelse the
+          // code could not refer to it
+          Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
+          IItoRemove.push_back(II);
+          break;
+        }
+        /*************************** llvm.hpvm.getNumDims() *****************/
+        case Intrinsic::hpvm_getNumDims: {
+          // get node from map
+          // get the appropriate field
+          ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+          int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim();
+          IntegerType *IntTy = Type::getInt32Ty(M.getContext());
+          ConstantInt *numOfDimConstant =
+              ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
+
+          II->replaceAllUsesWith(numOfDimConstant);
+          IItoRemove.push_back(II);
+          break;
+        }
+        /*********************** llvm.hpvm.getNodeInstanceID() **************/
+        case Intrinsic::hpvm_getNodeInstanceID_x:
+        case Intrinsic::hpvm_getNodeInstanceID_y:
+        case Intrinsic::hpvm_getNodeInstanceID_z: {
+          ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+          ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+
+          // The dfnode argument should be an ancestor of this leaf node or
+          // the leaf node itself
+          int parentLevel = N->getAncestorHops(ArgDFNode);
+          assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) &&
+                 "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+
+          // Get specified dimension
+          // (dim = 0) => x
+          // (dim = 1) => y
+          // (dim = 2) => z
+          int dim =
+              (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x);
+          assert(
+              (dim >= 0) && (dim < 3) &&
+              "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic "
+              "ID!");
+
+          // For immediate ancestor, use the extra argument introduced in
+          // F_CPU
+          int numParamsF = F->getFunctionType()->getNumParams();
+          int numParamsF_CPU = F_CPU->getFunctionType()->getNumParams();
+          assert((numParamsF_CPU - numParamsF == 6) &&
+                 "Difference of arguments between function and its clone is "
+                 "not 6!");
+
+          if (parentLevel == 0) {
+            // Case when the query is for this node itself
+            unsigned offset = 3 + (3 - dim);
+            // Traverse argument list of F_CPU in reverse order to find the
+            // correct index or dim argument.
+            Argument *indexVal = getArgumentFromEnd(F_CPU, offset);
+            assert(indexVal && "Index argument not found. Invalid offset!");
+
+            DEBUG(errs() << *II << " replaced with " << *indexVal << "\n");
+
+            II->replaceAllUsesWith(indexVal);
+            IItoRemove.push_back(II);
+          } else {
+            // Case when query is for an ancestor
+            Value *args[] = {
+                ConstantInt::get(Type::getInt32Ty(II->getContext()),
+                                 parentLevel),
+                ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
+            CallInst *CI = CallInst::Create(llvm_hpvm_cpu_getDimInstance,
+                                            ArrayRef<Value *>(args, 2),
+                                            "nodeInstanceID", II);
+            DEBUG(errs() << *II << " replaced with " << *CI << "\n");
+            II->replaceAllUsesWith(CI);
+            IItoRemove.push_back(II);
+          }
+          break;
+        }
+        /********************** llvm.hpvm.getNumNodeInstances() *************/
+        case Intrinsic::hpvm_getNumNodeInstances_x:
+        case Intrinsic::hpvm_getNumNodeInstances_y:
+        case Intrinsic::hpvm_getNumNodeInstances_z: {
+
+          ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+          ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+
+          // The dfnode argument should be an ancestor of this leaf node or
+          // the leaf node itself
+          int parentLevel = N->getAncestorHops(ArgDFNode);
+          assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) &&
+                 "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+
+          // Get specified dimension
+          // (dim = 0) => x
+          // (dim = 1) => y
+          // (dim = 2) => z
+          int dim = (int)(II->getIntrinsicID() -
+                          Intrinsic::hpvm_getNumNodeInstances_x);
+          assert((dim >= 0) && (dim < 3) &&
+                 "Invalid dimension for getNumNodeInstances_[xyz]. Check "
+                 "Intrinsic ID!");
+
+          // For immediate ancestor, use the extra argument introduced in
+          // F_CPU
+          int numParamsF = F->getFunctionType()->getNumParams();
+          int numParamsF_CPU = F_CPU->getFunctionType()->getNumParams();
+          assert((numParamsF_CPU - numParamsF == 6) &&
+                 "Difference of arguments between function and its clone is "
+                 "not 6!");
+
+          if (parentLevel == 0) {
+            // Case when the query is for this node itself
+            unsigned offset = 3 - dim;
+            // Traverse argument list of F_CPU in reverse order to find the
+            // correct index or dim argument.
+            Argument *limitVal = getArgumentFromEnd(F_CPU, offset);
+            assert(limitVal && "Limit argument not found. Invalid offset!");
+
+            DEBUG(errs() << *II << " replaced with " << *limitVal << "\n");
+
+            II->replaceAllUsesWith(limitVal);
+            IItoRemove.push_back(II);
+          } else {
+            // Case when query is from the ancestor
+            Value *args[] = {
+                ConstantInt::get(Type::getInt32Ty(II->getContext()),
+                                 parentLevel),
+                ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
+            CallInst *CI = CallInst::Create(llvm_hpvm_cpu_getDimLimit,
+                                            ArrayRef<Value *>(args, 2),
+                                            "numNodeInstances", II);
+            DEBUG(errs() << *II << " replaced with " << *CI << "\n");
+            II->replaceAllUsesWith(CI);
+            IItoRemove.push_back(II);
+          }
+
+          break;
+        }
+        default:
+          DEBUG(errs() << "Found unknown intrinsic with ID = "
+                       << II->getIntrinsicID() << "\n");
+          assert(false && "Unknown HPVM Intrinsic!");
+          break;
+        }
+
+      } else if (BuildDFG::isHPVMIntrinsic(I)) {
+        IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+        if (II->getIntrinsicID() == Intrinsic::hpvm_nz_loop) {
+          IItoRemove.push_back(II);
+        }
+      }
+    }
+
+    // Remove them in reverse order
+    for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
+         i != IItoRemove.end(); ++i) {
+      (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType()));
+      (*i)->eraseFromParent();
+    }
+
+    DEBUG(errs() << *F_CPU);
+
+  } else if (N->getTargetHint() == hpvm::EPOCHS_TARGET) {
+    // Do codegen for EPOCHS scheduler
+    // Only if the task hasn't been registered yet.
+    IRBuilder<> Builder(InitCall);
+
+    // Get Task Info
+    int TaskID = N->getTaskID();
+    TaskInfo &TI = getTaskInfo(TaskID);
+    // Check if task hasn't already been registered, then register it
+    if (RegisteredTasks.find(TI) == RegisteredTasks.end()) {
+      std::vector<Value *> RegisterTaskArgs;
+
+      // Load the Schedule Pointer
+      auto *SchedulerPtr = Builder.CreateLoad(SchedulerHandleAddress);
+      DEBUG(errs() << *SchedulerPtr << "\n");
+      auto *SchedulerPtrBC = Builder.CreateBitCast(
+          SchedulerPtr, Type::getInt8PtrTy(M.getContext()), "scheduler_handle");
+      DEBUG(errs() << *SchedulerPtrBC << "\n");
+      RegisterTaskArgs.push_back(SchedulerPtrBC);
+
+      std::string TaskName = TI.TaskName;
+      Value *TaskNameStrPtr = Builder.CreateGlobalStringPtr(TaskName);
+      DEBUG(errs() << TaskName << ": " << *TaskNameStrPtr << "\n");
+      RegisterTaskArgs.push_back(TaskNameStrPtr);
+
+      std::string TaskDescription = TI.TaskDescription;
+      Value *TaskDescriptionStrPtr =
+          Builder.CreateGlobalStringPtr(TaskDescription);
+      DEBUG(errs() << TaskDescription << ": " << *TaskDescriptionStrPtr
+                   << "\n");
+      RegisterTaskArgs.push_back(TaskDescriptionStrPtr);
+
+      FunctionCallee TaskSetupFunction = getTaskSetupFunction(TI);
+      Value *TaskSetupFuncPtr = TaskSetupFunction.getCallee();
+      DEBUG(errs() << *TaskSetupFuncPtr << "\n");
+      RegisterTaskArgs.push_back(TaskSetupFuncPtr);
+
+      FunctionCallee TaskFinishExecutionFunction =
+          getTaskFinishExecutionFunction(TI);
+      Value *TaskFinishExecutionFuncPtr =
+          TaskFinishExecutionFunction.getCallee();
+      DEBUG(errs() << *TaskFinishExecutionFuncPtr << "\n");
+      RegisterTaskArgs.push_back(TaskFinishExecutionFuncPtr);
+
+      FunctionCallee TaskAutoFinishFunction = getTaskAutoFinishFunction(TI);
+      Value *TaskAutoFinishFuncPtr = TaskAutoFinishFunction.getCallee();
+      DEBUG(errs() << *TaskAutoFinishFuncPtr << "\n");
+      RegisterTaskArgs.push_back(TaskAutoFinishFuncPtr);
+
+      FunctionCallee TaskPrintFunction = getTaskPrintFunction(TI);
+      Value *TaskPrintFuncPtr = TaskPrintFunction.getCallee();
+      DEBUG(errs() << *TaskPrintFuncPtr << "\n");
+      RegisterTaskArgs.push_back(TaskPrintFuncPtr);
+
+      FunctionCallee TaskRunStatsFunction = getTaskRunStatsFunction(TI);
+      Value *TaskRunStatsFuncPtr = TaskRunStatsFunction.getCallee();
+      DEBUG(errs() << *TaskRunStatsFuncPtr << "\n");
+      RegisterTaskArgs.push_back(TaskRunStatsFuncPtr);
+
+      Value *NumAccels = ConstantInt::get(Type::getInt32Ty(M.getContext()),
+                                          TI.getNumDevices());
+      DEBUG(errs() << *NumAccels << "\n");
+      RegisterTaskArgs.push_back(NumAccels);
+
+      for (auto TDI : TI.Devices) {
+        hpvm::EPOCHS_DEVICES TaskDevice = TDI.TaskDeviceTy;
+        Value *TaskDeviceInt =
+            ConstantInt::get(Type::getInt64Ty(M.getContext()), TaskDevice);
+
+        FunctionCallee TaskDeviceExecutionFunction =
+            getTaskDeviceFunction(TI, TaskDevice);
+        Value *TaskDeviceExecutionFuncPtr =
+            TaskDeviceExecutionFunction.getCallee();
+
+        DEBUG(errs() << *TaskDeviceInt << ": " << *TaskDeviceExecutionFuncPtr
+                     << "\n");
+        RegisterTaskArgs.push_back(TaskDeviceInt);
+        RegisterTaskArgs.push_back(TaskDeviceExecutionFuncPtr);
+      }
+
+      DEBUG(errs() << "Calling: " << *register_task_type.getCallee()
+                   << "\n\tType: " << *register_task_type.getFunctionType()
+                   << "\n");
+
+      CallInst *RegisterTaskInst = Builder.CreateCall(
+          register_task_type, ArrayRef<Value *>(RegisterTaskArgs),
+          "reg_task_" + TaskName);
+      DEBUG(errs() << *RegisterTaskInst << "\n");
+
+      auto *TaskHandleAddr = new GlobalVariable(
+          M, RegisterTaskInst->getType(), //->getPointerTo(),
+          false, GlobalValue::CommonLinkage,
+          Constant::getNullValue(RegisterTaskInst->getType()),
+          "task_" + TaskName + ".addr");
+      DEBUG(errs() << *TaskHandleAddr << "\n");
+      auto *SI = Builder.CreateStore(RegisterTaskInst, TaskHandleAddr);
+      DEBUG(errs() << *SI << "\n");
+      TI.TaskHandleAddr = TaskHandleAddr;
+      RegisteredTasks.insert(TI);
+    }
+
+    TaskHandleMap[N] = TI;
+
+    N->addGenFunc(NULL, hpvm::EPOCHS_TARGET, true);
+
+    // Build the returnArgMap to handle the edges and insert finish() calls
+    Function *F = N->getFuncPointer();
+    StructType *FRetTy = cast<StructType>(F->getReturnType());
+    std::vector<ReturnInst *> RIvec;
+    findReturnInst(F, RIvec);
+    assert(RIvec.size() < 2 && "Cannot have more than 1 return!");
+
+    ReturnInst *RI = RIvec[0];
+
+    Value *RV = RI->getReturnValue();
+    DEBUG(errs() << *RV << "\n");
+    assert(isa<InsertValueInst>(RV) &&
+           "Expecting an InsertValueInst for the return");
+    InsertValueInst *IVI = dyn_cast<InsertValueInst>(RV);
+    Value *insertedValue = IVI->getInsertedValueOperand();
+    if (Argument *insertedArgument = dyn_cast<Argument>(insertedValue)) {
+      DEBUG(errs() << "Returned value is an argument! Treat it differently!\n");
+      assert(IVI->getIndices().size() == 1 &&
+             "Invalid indeces, expecting only 1.");
+      unsigned index = IVI->getIndices()[0];
+      returnArgMap[N][index] = insertedArgument->getArgNo();
+    } else {
+      llvm_unreachable("Unhandled return value!\n");
+    }
+    for (unsigned i = 1; i < FRetTy->getNumElements(); ++i) {
+      RV = IVI->getAggregateOperand();
+      assert(isa<InsertValueInst>(RV) &&
+             "Expecting an InsertValueInst for the return");
+      IVI = dyn_cast<InsertValueInst>(RV);
+      insertedValue = IVI->getInsertedValueOperand();
+      if (Argument *insertedArgument = dyn_cast<Argument>(insertedValue)) {
+        DEBUG(
+            errs() << "Returned value is an argument! Treat it differently!\n");
+        assert(IVI->getIndices().size() == 1 &&
+               "Invalid indeces, expecting only 1.");
+        unsigned index = IVI->getIndices()[0];
+        returnArgMap[N][index] = insertedArgument->getArgNo();
+      } else {
+        llvm_unreachable("Unhandled return value!\n");
+      }
+    }
+
+  } else {
+    // if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) {
+    //
+    // Do we need this???
+    DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
+                 << " : skipping it\n");
+
+    switch (N->getTag()) {
+    case hpvm::GPU_TARGET:
+      // A leaf node should not have an cpu function for GPU
+      // by design of DFG2LLVM_OpenCL backend
+      assert(!(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET)) &&
+             "Leaf node not expected to have GPU GenFunc");
+      break;
+    default:
+      break;
+    }
+
+    // return;
+    //}
+  }
+}
+
+// Helper function, populate a vector with all return statements in a function
+static void findReturnInst(Function *F,
+                           std::vector<ReturnInst *> &ReturnInstVec) {
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    ReturnInst *RI = dyn_cast<ReturnInst>(I);
+    if (RI) {
+      ReturnInstVec.push_back(RI);
+    }
+  }
+}
+// Read the task details described in the Task configuration file
+// and populate the TaskTypes vector.
+void CGT_EPOCHS::parseTaskFile(std::string TaskPath) {
+  DEBUG(errs() << "parseTaskFile invoked on file path: " << TaskPath << "\n");
+  std::ifstream TaskFile(TaskPath);
+
+  TaskInfo TI;
+  TaskDeviceInfo TDI;
+
+  bool isTIEmpty = true;
+
+  for (std::string line; getline(TaskFile, line);) {
+    // std::cout << "Line: "<<line<<"\n";
+
+    std::vector<std::string> Tokens;
+
+    strSplit(line, ' ', Tokens);
+
+    if (Tokens.empty())
+      continue;
+
+    if (Tokens[0] == "TaskID") {
+      if (!isTIEmpty)
+        TaskTypes.push_back(TI);
+
+      TI.clearTaskInfo();
+      assert(Tokens.size() == 2 &&
+             "TaskID in Config File must provide integer TaskID");
+
+      TI.TaskID = std::stoi(Tokens[1]);
+      isTIEmpty = false;
+
+    } else if (Tokens[0] == "TaskName") {
+      assert(Tokens.size() == 2 &&
+             "TaskName in Config File must provide string TaskName");
+
+      TI.TaskName = Tokens[1];
+
+    } else if (Tokens[0] == "TaskDescription") {
+      assert(
+          Tokens.size() >= 2 &&
+          "TaskDescription in Config File must provide string TaskDescription");
+
+      std::string Description = "";
+      for (unsigned i = 1; i < Tokens.size(); i++) {
+        Description = Description + Tokens[i] + " ";
+      }
+
+      TI.TaskDescription = Description;
+
+    } else if (Tokens[0] == "TaskFinishExecution") {
+      assert(Tokens.size() == 2 && "TaskFinishExecution in Config File must "
+                                   "provide string Task Finish function name");
+
+      TI.TaskFinishExecution = Tokens[1];
+
+    } else if (Tokens[0] == "TaskAutoFinish") {
+      assert(Tokens.size() == 2 && "TaskAutoFinish in Config File must provide "
+                                   "string Task AutoFinish function name");
+
+      TI.TaskAutoFinish = Tokens[1];
+
+    } else if (Tokens[0] == "TaskSetup") {
+      assert(Tokens.size() == 2 && "TaskSetup in Config File must provide "
+                                   "string Task Setup function name");
+
+      TI.TaskSetup = Tokens[1];
+
+    } else if (Tokens[0] == "TaskPrint") {
+      assert(Tokens.size() == 2 && "TaskPrint in Config File must provide "
+                                   "string Task Print function name");
+
+      TI.TaskPrint = Tokens[1];
+
+    } else if (Tokens[0] == "TaskRunStats") {
+      assert(Tokens.size() == 2 && "TaskRunStats in Config File must provide "
+                                   "string Task Run Stats function name");
+
+      TI.TaskRunStats = Tokens[1];
+
+    } else if (Tokens[0].rfind("TaskDevice") == 0) {
+      assert(Tokens.size() == 2 &&
+             "TaskDevice* in Config File must provide string Task Device name");
+
+      TDI.clearTaskDeviceInfo();
+
+      // Check which device Ty
+      hpvm::EPOCHS_DEVICES DevTy = hpvm::CPU_ACCEL;
+
+      if (Tokens[1] == "CPU_ACCEL") {
+        DevTy = hpvm::CPU_ACCEL;
+      } else if (Tokens[1] == "1D_FFT_ACCEL") {
+        DevTy = hpvm::OneD_FFT_ACCEL;
+      } else if (Tokens[1] == "CV_CNN_ACCEL") {
+        DevTy = hpvm::CV_CNN_ACCEL;
+      } else if (Tokens[1] == "VITDEC_ACCEL") {
+        DevTy = hpvm::VITDEC_ACCEL;
+      } else {
+        assert(false && "Unknown Device Type encountered in Task Config file");
+      }
+
+      TDI.TaskDeviceTy = DevTy;
+
+    } else if (Tokens[0].rfind("TaskExecuteOnDevice") == 0) {
+      assert(Tokens.size() == 2 &&
+             "TaskExecuteOnDevice* in Config File must provide string Task "
+             "Execute on Device function name");
+      TDI.TaskDeviceFn = Tokens[1];
+      TI.Devices.push_back(TDI);
+
+    } else {
+      assert(false && "Unknown Token Type!");
+    }
+  }
+  if (!isTIEmpty) {
+    TaskTypes.push_back(TI);
+  }
+}
+
+TaskInfo &CGT_EPOCHS::getTaskInfo(int TaskID) {
+  for (TaskInfo &TI : TaskTypes) {
+    if (TI.TaskID == TaskID) {
+      return TI;
+    }
+  }
+
+  llvm_unreachable("Requested TaskInfo with unknown TaskID");
+}
+
+FunctionCallee CGT_EPOCHS::getTaskSetupFunction(TaskInfo &TI) {
+  return getTaskFunctionHelper(TI.TaskSetup);
+}
+
+FunctionCallee CGT_EPOCHS::getTaskPrintFunction(TaskInfo &TI) {
+  return getTaskFunctionHelper(TI.TaskPrint);
+}
+
+FunctionCallee CGT_EPOCHS::getTaskRunStatsFunction(TaskInfo &TI) {
+  return getTaskFunctionHelper(TI.TaskRunStats);
+}
+
+FunctionCallee CGT_EPOCHS::getTaskDeviceFunction(TaskInfo &TI,
+                                                 hpvm::EPOCHS_DEVICES DevTy) {
+
+  std::string DeviceFunctionName = "";
+
+  for (TaskDeviceInfo TDI : TI.Devices) {
+    if (TDI.TaskDeviceTy == DevTy) {
+      DeviceFunctionName = TDI.TaskDeviceFn;
+      break;
+    }
+  }
+
+  assert(DeviceFunctionName != "" && "Device function name not found!");
+
+  return getTaskFunctionHelper(DeviceFunctionName);
+}
+
+FunctionCallee CGT_EPOCHS::getTaskAutoFinishFunction(TaskInfo &TI) {
+  return getTaskFunctionHelper(TI.TaskAutoFinish);
+}
+
+FunctionCallee CGT_EPOCHS::getTaskFinishExecutionFunction(TaskInfo &TI) {
+  return getTaskFunctionHelper(TI.TaskFinishExecution);
+}
+
+FunctionCallee CGT_EPOCHS::getTaskFunctionHelper(std::string TaskFunctionName) {
+  Function *TaskFn;
+
+  DEBUG(errs() << "Requesting Task Function " << TaskFunctionName << " ...\n");
+  if (!(TaskFn = M.getFunction(TaskFunctionName))) {
+    TaskFn = TaskModule->getFunction(TaskFunctionName);
+
+    assert(TaskFn && "Requested Task function does not exist in task module");
+  }
+
+  return M.getOrInsertFunction(TaskFunctionName, TaskFn->getFunctionType());
+}
+
+} // namespace dfg2llvm
+
+char DFG2LLVM_EPOCHS::ID = 0;
+static RegisterPass<DFG2LLVM_EPOCHS>
+    X("dfg2llvm-epochs", "Dataflow Graph to LLVM for EPOCHS backend",
+      false /* does not modify the CFG */,
+      true /* transformation, not just analysis */);
diff --git a/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.exports b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.exports
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/LLVMBuild.txt b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c620bd6030aee7e3742c09b7213baad9626a0ac5
--- /dev/null
+++ b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/DFG2LLVM_EPOCHS/LLVMBuild.txt --------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_EPOCHS
+parent = Transforms
diff --git a/hpvm/llvm_patches/include/llvm/Analysis/DDG.h b/hpvm/llvm_patches/include/llvm/Analysis/DDG.h
deleted file mode 100644
index 165efc97a480e06b8ea5e8031e97da469d1f6418..0000000000000000000000000000000000000000
--- a/hpvm/llvm_patches/include/llvm/Analysis/DDG.h
+++ /dev/null
@@ -1,623 +0,0 @@
-//===- llvm/Analysis/DDG.h --------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the Data-Dependence Graph (DDG).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_DDG_H
-#define LLVM_ANALYSIS_DDG_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DirectedGraph.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/DependenceGraphBuilder.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Support/GraphWriter.h"
-#include <sstream>
-
-
-namespace llvm {
-class DDGNode;
-class DDGEdge;
-using DDGNodeBase = DGNode<DDGNode, DDGEdge>;
-using DDGEdgeBase = DGEdge<DDGNode, DDGEdge>;
-using DDGBase = DirectedGraph<DDGNode, DDGEdge>;
-class LPMUpdater;
-
-/// Data Dependence Graph Node
-/// The graph can represent the following types of nodes:
-/// 1. Single instruction node containing just one instruction.
-/// 2. Multiple instruction node where two or more instructions from
-///    the same basic block are merged into one node.
-/// 3. Pi-block node which is a group of other DDG nodes that are part of a
-///    strongly-connected component of the graph.
-///    A pi-block node contains more than one single or multiple instruction
-///    nodes. The root node cannot be part of a pi-block.
-/// 4. Root node is a special node that connects to all components such that
-///    there is always a path from it to any node in the graph.
-class DDGNode : public DDGNodeBase {
-public:
-  using InstructionListType = SmallVectorImpl<Instruction *>;
-
-  enum class NodeKind {
-    Unknown,
-    SingleInstruction,
-    MultiInstruction,
-    PiBlock,
-    Root,
-  };
-
-  DDGNode() = delete;
-  DDGNode(const NodeKind K) : DDGNodeBase(), Kind(K) {}
-  DDGNode(const DDGNode &N) : DDGNodeBase(N), Kind(N.Kind) {}
-  DDGNode(DDGNode &&N) : DDGNodeBase(std::move(N)), Kind(N.Kind) {}
-  virtual ~DDGNode() = 0;
-
-  DDGNode &operator=(const DDGNode &N) {
-    DGNode::operator=(N);
-    Kind = N.Kind;
-    return *this;
-  }
-
-  DDGNode &operator=(DDGNode &&N) {
-    DGNode::operator=(std::move(N));
-    Kind = N.Kind;
-    return *this;
-  }
-
-  /// Getter for the kind of this node.
-  NodeKind getKind() const { return Kind; }
-
-  /// Collect a list of instructions, in \p IList, for which predicate \p Pred
-  /// evaluates to true when iterating over instructions of this node. Return
-  /// true if at least one instruction was collected, and false otherwise.
-  bool collectInstructions(llvm::function_ref<bool(Instruction *)> const &Pred,
-                           InstructionListType &IList) const;
-
-protected:
-  /// Setter for the kind of this node.
-  void setKind(NodeKind K) { Kind = K; }
-
-private:
-  NodeKind Kind;
-};
-
-/// Subclass of DDGNode representing the root node of the graph.
-/// There should only be one such node in a given graph.
-class RootDDGNode : public DDGNode {
-public:
-  RootDDGNode() : DDGNode(NodeKind::Root) {}
-  RootDDGNode(const RootDDGNode &N) = delete;
-  RootDDGNode(RootDDGNode &&N) : DDGNode(std::move(N)) {}
-  ~RootDDGNode() {}
-
-  /// Define classof to be able to use isa<>, cast<>, dyn_cast<>, etc.
-  static bool classof(const DDGNode *N) {
-    return N->getKind() == NodeKind::Root;
-  }
-  static bool classof(const RootDDGNode *N) { return true; }
-};
-
-/// Subclass of DDGNode representing single or multi-instruction nodes.
-class SimpleDDGNode : public DDGNode {
-  friend class DDGBuilder;
-
-public:
-  SimpleDDGNode() = delete;
-  SimpleDDGNode(Instruction &I);
-  SimpleDDGNode(const SimpleDDGNode &N);
-  SimpleDDGNode(SimpleDDGNode &&N);
-  ~SimpleDDGNode();
-
-  SimpleDDGNode &operator=(const SimpleDDGNode &N) {
-    DDGNode::operator=(N);
-    InstList = N.InstList;
-    return *this;
-  }
-
-  SimpleDDGNode &operator=(SimpleDDGNode &&N) {
-    DDGNode::operator=(std::move(N));
-    InstList = std::move(N.InstList);
-    return *this;
-  }
-
-  /// Get the list of instructions in this node.
-  const InstructionListType &getInstructions() const {
-    assert(!InstList.empty() && "Instruction List is empty.");
-    return InstList;
-  }
-  InstructionListType &getInstructions() {
-    return const_cast<InstructionListType &>(
-        static_cast<const SimpleDDGNode *>(this)->getInstructions());
-  }
-
-  /// Get the first/last instruction in the node.
-  Instruction *getFirstInstruction() const { return getInstructions().front(); }
-  Instruction *getLastInstruction() const { return getInstructions().back(); }
-
-  /// Define classof to be able to use isa<>, cast<>, dyn_cast<>, etc.
-  static bool classof(const DDGNode *N) {
-    return N->getKind() == NodeKind::SingleInstruction ||
-           N->getKind() == NodeKind::MultiInstruction;
-  }
-  static bool classof(const SimpleDDGNode *N) { return true; }
-
-private:
-  /// Append the list of instructions in \p Input to this node.
-  void appendInstructions(const InstructionListType &Input) {
-    setKind((InstList.size() == 0 && Input.size() == 1)
-                ? NodeKind::SingleInstruction
-                : NodeKind::MultiInstruction);
-    InstList.insert(InstList.end(), Input.begin(), Input.end());
-  }
-  void appendInstructions(const SimpleDDGNode &Input) {
-    appendInstructions(Input.getInstructions());
-  }
-
-  /// List of instructions associated with a single or multi-instruction node.
-  SmallVector<Instruction *, 2> InstList;
-};
-
-/// Subclass of DDGNode representing a pi-block. A pi-block represents a group
-/// of DDG nodes that are part of a strongly-connected component of the graph.
-/// Replacing all the SCCs with pi-blocks results in an acyclic representation
-/// of the DDG. For example if we have:
-/// {a -> b}, {b -> c, d}, {c -> a}
-/// the cycle a -> b -> c -> a is abstracted into a pi-block "p" as follows:
-/// {p -> d} with "p" containing: {a -> b}, {b -> c}, {c -> a}
-class PiBlockDDGNode : public DDGNode {
-public:
-  using PiNodeList = SmallVector<DDGNode *, 4>;
-
-  PiBlockDDGNode() = delete;
-  PiBlockDDGNode(const PiNodeList &List);
-  PiBlockDDGNode(const PiBlockDDGNode &N);
-  PiBlockDDGNode(PiBlockDDGNode &&N);
-  ~PiBlockDDGNode();
-
-  PiBlockDDGNode &operator=(const PiBlockDDGNode &N) {
-    DDGNode::operator=(N);
-    NodeList = N.NodeList;
-    return *this;
-  }
-
-  PiBlockDDGNode &operator=(PiBlockDDGNode &&N) {
-    DDGNode::operator=(std::move(N));
-    NodeList = std::move(N.NodeList);
-    return *this;
-  }
-
-  /// Get the list of nodes in this pi-block.
-  const PiNodeList &getNodes() const {
-    assert(!NodeList.empty() && "Node list is empty.");
-    return NodeList;
-  }
-  PiNodeList &getNodes() {
-    return const_cast<PiNodeList &>(
-        static_cast<const PiBlockDDGNode *>(this)->getNodes());
-  }
-
-  /// Define classof to be able to use isa<>, cast<>, dyn_cast<>, etc.
-  static bool classof(const DDGNode *N) {
-    return N->getKind() == NodeKind::PiBlock;
-  }
-
-private:
-  /// List of nodes in this pi-block.
-  PiNodeList NodeList;
-};
-
-/// Data Dependency Graph Edge.
-/// An edge in the DDG can represent a def-use relationship or
-/// a memory dependence based on the result of DependenceAnalysis.
-/// A rooted edge connects the root node to one of the components
-/// of the graph.
-class DDGEdge : public DDGEdgeBase {
-public:
-  /// The kind of edge in the DDG
-  enum class EdgeKind {
-    Unknown,
-    RegisterDefUse,
-    MemoryDependence,
-    Rooted,
-    Last = Rooted // Must be equal to the largest enum value.
-  };
-
-  explicit DDGEdge(DDGNode &N) = delete;
-  DDGEdge(DDGNode &N, EdgeKind K) : DDGEdgeBase(N), Kind(K) {}
-  DDGEdge(const DDGEdge &E) : DDGEdgeBase(E), Kind(E.getKind()) {}
-  DDGEdge(DDGEdge &&E) : DDGEdgeBase(std::move(E)), Kind(E.Kind) {}
-  DDGEdge &operator=(const DDGEdge &E) {
-    DDGEdgeBase::operator=(E);
-    Kind = E.Kind;
-    return *this;
-  }
-
-  DDGEdge &operator=(DDGEdge &&E) {
-    DDGEdgeBase::operator=(std::move(E));
-    Kind = E.Kind;
-    return *this;
-  }
-
-  /// Get the edge kind
-  EdgeKind getKind() const { return Kind; };
-
-  /// Return true if this is a def-use edge, and false otherwise.
-  bool isDefUse() const { return Kind == EdgeKind::RegisterDefUse; }
-
-  /// Return true if this is a memory dependence edge, and false otherwise.
-  bool isMemoryDependence() const { return Kind == EdgeKind::MemoryDependence; }
-
-  /// Return true if this is an edge stemming from the root node, and false
-  /// otherwise.
-  bool isRooted() const { return Kind == EdgeKind::Rooted; }
-
-private:
-  EdgeKind Kind;
-};
-
-/// Encapsulate some common data and functionality needed for different
-/// variations of data dependence graphs.
-template <typename NodeType> class DependenceGraphInfo {
-public:
-  using DependenceList = SmallVector<std::unique_ptr<Dependence>, 1>;
-
-  DependenceGraphInfo() = delete;
-  DependenceGraphInfo(const DependenceGraphInfo &G) = delete;
-  DependenceGraphInfo(const std::string &N, const DependenceInfo &DepInfo)
-      : Name(N), DI(DepInfo), Root(nullptr) {}
-  DependenceGraphInfo(DependenceGraphInfo &&G)
-      : Name(std::move(G.Name)), DI(std::move(G.DI)), Root(G.Root) {}
-  virtual ~DependenceGraphInfo() {}
-
-  /// Return the label that is used to name this graph.
-  const StringRef getName() const { return Name; }
-
-  /// Return the root node of the graph.
-  NodeType &getRoot() const {
-    assert(Root && "Root node is not available yet. Graph construction may "
-                   "still be in progress\n");
-    return *Root;
-  }
-
-	/// Collect all the data dependency infos coming from any pair of memory
-  /// accesses from \p Src to \p Dst, and store them into \p Deps. Return true
-  /// if a dependence exists, and false otherwise.
-  bool getDependencies(const NodeType &Src, const NodeType &Dst,
-                       DependenceList &Deps) const;
-
-protected:
-  // Name of the graph.
-  std::string Name;
-
-  // Store a copy of DependenceInfo in the graph, so that individual memory
-  // dependencies don't need to be stored. Instead when the dependence is
-  // queried it is recomputed using @DI.
-  const DependenceInfo DI;
-
-  // A special node in the graph that has an edge to every connected component of
-  // the graph, to ensure all nodes are reachable in a graph walk.
-  NodeType *Root = nullptr;
-};
-
-//===--------------------------------------------------------------------===//
-// DependenceGraphInfo Implementation
-//===--------------------------------------------------------------------===//
-
-template <typename NodeType>
-bool DependenceGraphInfo<NodeType>::getDependencies(
-    const NodeType &Src, const NodeType &Dst, DependenceList &Deps) const {
-  assert(Deps.empty() && "Expected empty output list at the start.");
-
-  // List of memory access instructions from src and dst nodes.
-  SmallVector<Instruction *, 8> SrcIList, DstIList;
-  auto isMemoryAccess = [](const Instruction *I) {
-    return I->mayReadOrWriteMemory();
-  };
-  Src.collectInstructions(isMemoryAccess, SrcIList);
-  Dst.collectInstructions(isMemoryAccess, DstIList);
-
-  for (auto *SrcI : SrcIList)
-    for (auto *DstI : DstIList)
-      if (auto Dep =
-              const_cast<DependenceInfo *>(&DI)->depends(SrcI, DstI, true))
-        Deps.push_back(std::move(Dep));
-
-  return !Deps.empty();
-}
-
-using DDGInfo = DependenceGraphInfo<DDGNode>;
-
-/// Data Dependency Graph
-class DataDependenceGraph : public DDGBase, public DDGInfo {
-  friend AbstractDependenceGraphBuilder<DataDependenceGraph>;
-  friend class DDGBuilder;
-
-public:
-  using NodeType = DDGNode;
-  using EdgeType = DDGEdge;
-
-  DataDependenceGraph() = delete;
-  DataDependenceGraph(const DataDependenceGraph &G) = delete;
-  DataDependenceGraph(DataDependenceGraph &&G)
-      : DDGBase(std::move(G)), DDGInfo(std::move(G)) {}
-  DataDependenceGraph(Function &F, DependenceInfo &DI);
-  DataDependenceGraph(Loop &L, LoopInfo &LI, DependenceInfo &DI);
-  ~DataDependenceGraph();
-
-  /// If node \p N belongs to a pi-block return a pointer to the pi-block,
-  /// otherwise return null.
-  const PiBlockDDGNode *getPiBlock(const NodeType &N) const;
-
-protected:
-  /// Add node \p N to the graph, if it's not added yet, and keep track of the
-  /// root node as well as pi-blocks and their members. Return true if node is
-  /// successfully added.
-  bool addNode(NodeType &N);
-
-private:
-  using PiBlockMapType = DenseMap<const NodeType *, const PiBlockDDGNode *>;
-
-  /// Mapping from graph nodes to their containing pi-blocks. If a node is not
-  /// part of a pi-block, it will not appear in this map.
-  PiBlockMapType PiBlockMap;
-};
-
-/// Concrete implementation of a pure data dependence graph builder. This class
-/// provides custom implementation for the pure-virtual functions used in the
-/// generic dependence graph build algorithm.
-///
-/// For information about time complexity of the build algorithm see the
-/// comments near the declaration of AbstractDependenceGraphBuilder.
-class DDGBuilder : public AbstractDependenceGraphBuilder<DataDependenceGraph> {
-public:
-  DDGBuilder(DataDependenceGraph &G, DependenceInfo &D,
-             const BasicBlockListType &BBs)
-      : AbstractDependenceGraphBuilder(G, D, BBs) {}
-  DDGNode &createRootNode() final override {
-    auto *RN = new RootDDGNode();
-    assert(RN && "Failed to allocate memory for DDG root node.");
-    Graph.addNode(*RN);
-    return *RN;
-  }
-  DDGNode &createFineGrainedNode(Instruction &I) final override {
-    auto *SN = new SimpleDDGNode(I);
-    assert(SN && "Failed to allocate memory for simple DDG node.");
-    Graph.addNode(*SN);
-    return *SN;
-  }
-  DDGNode &createPiBlock(const NodeListType &L) final override {
-    auto *Pi = new PiBlockDDGNode(L);
-    assert(Pi && "Failed to allocate memory for pi-block node.");
-    Graph.addNode(*Pi);
-    return *Pi;
-  }
-  DDGEdge &createDefUseEdge(DDGNode &Src, DDGNode &Tgt) final override {
-    auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::RegisterDefUse);
-    assert(E && "Failed to allocate memory for edge");
-    Graph.connect(Src, Tgt, *E);
-    return *E;
-  }
-  DDGEdge &createMemoryEdge(DDGNode &Src, DDGNode &Tgt) final override {
-    auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::MemoryDependence);
-    assert(E && "Failed to allocate memory for edge");
-    Graph.connect(Src, Tgt, *E);
-    return *E;
-  }
-  DDGEdge &createRootedEdge(DDGNode &Src, DDGNode &Tgt) final override {
-    auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::Rooted);
-    assert(E && "Failed to allocate memory for edge");
-    assert(isa<RootDDGNode>(Src) && "Expected root node");
-    Graph.connect(Src, Tgt, *E);
-    return *E;
-  }
-
-  const NodeListType &getNodesInPiBlock(const DDGNode &N) final override {
-    auto *PiNode = dyn_cast<const PiBlockDDGNode>(&N);
-    assert(PiNode && "Expected a pi-block node.");
-    return PiNode->getNodes();
-  }
-
-  /// Return true if the two nodes \pSrc and \pTgt are both simple nodes and
-  /// the consecutive instructions after merging belong to the same basic block.
-  bool areNodesMergeable(const DDGNode &Src,
-                         const DDGNode &Tgt) const final override;
-  void mergeNodes(DDGNode &Src, DDGNode &Tgt) final override;
-  bool shouldSimplify() const final override;
-  bool shouldCreatePiBlocks() const final override;
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const DDGNode &N);
-raw_ostream &operator<<(raw_ostream &OS, const DDGNode::NodeKind K);
-raw_ostream &operator<<(raw_ostream &OS, const DDGEdge &E);
-raw_ostream &operator<<(raw_ostream &OS, const DDGEdge::EdgeKind K);
-raw_ostream &operator<<(raw_ostream &OS, const DataDependenceGraph &G);
-
-//===--------------------------------------------------------------------===//
-// DDG Analysis Passes
-//===--------------------------------------------------------------------===//
-
-/// Analysis pass that builds the DDG for a loop.
-class DDGAnalysis : public AnalysisInfoMixin<DDGAnalysis> {
-public:
-  using Result = std::unique_ptr<DataDependenceGraph>;
-  Result run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR);
-
-private:
-  friend AnalysisInfoMixin<DDGAnalysis>;
-  static AnalysisKey Key;
-};
-
-/// Textual printer pass for the DDG of a loop.
-class DDGAnalysisPrinterPass : public PassInfoMixin<DDGAnalysisPrinterPass> {
-public:
-  explicit DDGAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
-  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
-                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
-
-private:
-  raw_ostream &OS;
-};
-
-
-//===--------------------------------------------------------------------===//
-// GraphTraits specializations for the DDG
-//===--------------------------------------------------------------------===//
-
-
-
-/// non-const versions of the grapth trait specializations for DDG
-template <> struct GraphTraits<DDGNode *> {
-  using NodeRef = DDGNode *;
-
-  static DDGNode *DDGGetTargetNode(DGEdge<DDGNode, DDGEdge> *P) {
-    return &P->getTargetNode();
-  }
-
-  // Provide a mapped iterator so that the GraphTrait-based implementations can
-  // find the target nodes without having to explicitly go through the edges.
-  using ChildIteratorType =
-      mapped_iterator<DDGNode::iterator, decltype(&DDGGetTargetNode)>;
-  using ChildEdgeIteratorType = DDGNode::iterator;
-
-  static NodeRef getEntryNode(NodeRef N) { return N; }
-  static ChildIteratorType child_begin(NodeRef N) {
-    return ChildIteratorType(N->begin(), &DDGGetTargetNode);
-  }
-  static ChildIteratorType child_end(NodeRef N) {
-    return ChildIteratorType(N->end(), &DDGGetTargetNode);
-  }
-
-  static ChildEdgeIteratorType child_edge_begin(NodeRef N) {
-    return N->begin();
-  }
-  static ChildEdgeIteratorType child_edge_end(NodeRef N) { return N->end(); }
-};
-
-template <>
-struct GraphTraits<DataDependenceGraph *> : public GraphTraits<DDGNode *> {
-  using nodes_iterator = DataDependenceGraph::iterator;
-  static NodeRef getEntryNode(DataDependenceGraph *DG) {
-    return &DG->getRoot();
-  }
-  static nodes_iterator nodes_begin(DataDependenceGraph *DG) {
-    return DG->begin();
-  }
-  static nodes_iterator nodes_end(DataDependenceGraph *DG) { return DG->end(); }
-};
-
-template <> struct DOTGraphTraits<DataDependenceGraph*> : public DefaultDOTGraphTraits {
-	DOTGraphTraits (bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
-	
-	static std::string getGraphName(DataDependenceGraph *Graph) { return "DDG";}
-
-	static std::string getGraphProperties(DataDependenceGraph *Graph) {
-		return "\tcompound=true;";
-	}
-
-	std::string getNodeLabel(DDGNode *Node, DataDependenceGraph *Graph) {
-		std::string Str;
-		raw_string_ostream ss(Str);
-		ss << Node << " : ";
-		switch(Node->getKind()) {
-			case DDGNode::NodeKind::Root :
-				ss << "Root Node\n";
-				break;
-			case DDGNode::NodeKind::SingleInstruction :
-			case DDGNode::NodeKind::MultiInstruction :
-				{
-				ss << "Simple Node\n";
-				SimpleDDGNode *SN = dyn_cast<SimpleDDGNode>(Node);
-				DDGNode::InstructionListType &instructions = SN->getInstructions();
-				for (auto i : instructions) {
-					ss << *i << "\n"; 
-				}
-				break;
-				}
-			case DDGNode::NodeKind::PiBlock :
-				{		
-					ss << "Pi Block\n";
-					PiBlockDDGNode *PBN = dyn_cast<PiBlockDDGNode>(Node);
-					PiBlockDDGNode::PiNodeList &nodes = PBN->getNodes();
-					for (auto n : nodes) {
-						ss << n << "\n";
-					}
-					break;
-				}
-			default :
-				ss << "Unknown\n";
-		}
-		return ss.str();	
-	}
-
-	std::string getEdgeLabel(DDGEdge *Edge, DataDependenceGraph *Graph) {
-		std::string Str;
-		raw_string_ostream ss(Str);
-		switch(Edge->getKind()) {
-			case DDGEdge::EdgeKind::RegisterDefUse :
-				ss << "def-use";
-				break;
-			case DDGEdge::EdgeKind::MemoryDependence :
-				ss << "memory" ;
-				break;
-			default :
-				ss << "";
-		}
-
-		return ss.str();
-	}
-};
-
-/// const versions of the grapth trait specializations for DDG
-template <> struct GraphTraits<const DDGNode *> {
-  using NodeRef = const DDGNode *;
-
-  static const DDGNode *DDGGetTargetNode(const DGEdge<DDGNode, DDGEdge> *P) {
-    return &P->getTargetNode();
-  }
-
-  // Provide a mapped iterator so that the GraphTrait-based implementations can
-  // find the target nodes without having to explicitly go through the edges.
-  using ChildIteratorType =
-      mapped_iterator<DDGNode::const_iterator, decltype(&DDGGetTargetNode)>;
-  using ChildEdgeIteratorType = DDGNode::const_iterator;
-
-  static NodeRef getEntryNode(NodeRef N) { return N; }
-  static ChildIteratorType child_begin(NodeRef N) {
-    return ChildIteratorType(N->begin(), &DDGGetTargetNode);
-  }
-  static ChildIteratorType child_end(NodeRef N) {
-    return ChildIteratorType(N->end(), &DDGGetTargetNode);
-  }
-
-  static ChildEdgeIteratorType child_edge_begin(NodeRef N) {
-    return N->begin();
-  }
-  static ChildEdgeIteratorType child_edge_end(NodeRef N) { return N->end(); }
-};
-
-template <>
-struct GraphTraits<const DataDependenceGraph *>
-    : public GraphTraits<const DDGNode *> {
-  using nodes_iterator = DataDependenceGraph::const_iterator;
-  static NodeRef getEntryNode(const DataDependenceGraph *DG) {
-    return &DG->getRoot();
-  }
-  static nodes_iterator nodes_begin(const DataDependenceGraph *DG) {
-    return DG->begin();
-  }
-  static nodes_iterator nodes_end(const DataDependenceGraph *DG) {
-    return DG->end();
-  }
-};
-
-} // namespace llvm
-
-#endif // LLVM_ANALYSIS_DDG_H
diff --git a/hpvm/llvm_patches/include/llvm/Analysis/DependenceGraphBuilder.h b/hpvm/llvm_patches/include/llvm/Analysis/DependenceGraphBuilder.h
deleted file mode 100644
index 6f4e1be94164f797e4fc053dac754f8e28839e9e..0000000000000000000000000000000000000000
--- a/hpvm/llvm_patches/include/llvm/Analysis/DependenceGraphBuilder.h
+++ /dev/null
@@ -1,203 +0,0 @@
-//===- llvm/Analysis/DependenceGraphBuilder.h -------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a builder interface that can be used to populate dependence
-// graphs such as DDG and PDG.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H
-#define LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/SmallVector.h"
-
-namespace llvm {
-
-class BasicBlock;
-class DependenceInfo;
-class Instruction;
-
-/// This abstract builder class defines a set of high-level steps for creating
-/// DDG-like graphs. The client code is expected to inherit from this class and
-/// define concrete implementation for each of the pure virtual functions used
-/// in the high-level algorithm.
-template <class GraphType> class AbstractDependenceGraphBuilder {
-protected:
-  using BasicBlockListType = SmallVectorImpl<BasicBlock *>;
-
-private:
-  using NodeType = typename GraphType::NodeType;
-  using EdgeType = typename GraphType::EdgeType;
-
-public:
-  using ClassesType = EquivalenceClasses<BasicBlock *>;
-  using NodeListType = SmallVector<NodeType *, 4>;
-
-  AbstractDependenceGraphBuilder(GraphType &G, DependenceInfo &D,
-                                 const BasicBlockListType &BBs)
-      : Graph(G), DI(D), BBList(BBs) {}
-  virtual ~AbstractDependenceGraphBuilder() {}
-
-  /// The main entry to the graph construction algorithm. It starts by
-  /// creating nodes in increasing order of granularity and then
-  /// adds def-use and memory edges. As one of the final stages, it
-  /// also creates pi-block nodes to facilitate codegen in transformations
-  /// that use dependence graphs.
-  ///
-  /// The algorithmic complexity of this implementation is O(V^2 * I^2), where V
-  /// is the number of vertecies (nodes) and I is the number of instructions in
-  /// each node. The total number of instructions, N, is equal to V * I,
-  /// therefore the worst-case time complexity is O(N^2). The average time
-  /// complexity is O((N^2)/2).
-  void populate() {
-    computeInstructionOrdinals();
-    createFineGrainedNodes();
-    createDefUseEdges();
-    createMemoryDependencyEdges();
-    simplify();
-    createAndConnectRootNode();
-    createPiBlocks();
-    sortNodesTopologically();
-  }
-
-  /// Compute ordinal numbers for each instruction and store them in a map for
-  /// future look up. These ordinals are used to compute node ordinals which are
-  /// in turn used to order nodes that are part of a cycle.
-  /// Instruction ordinals are assigned based on lexical program order.
-  void computeInstructionOrdinals();
-
-  /// Create fine grained nodes. These are typically atomic nodes that
-  /// consist of a single instruction.
-  void createFineGrainedNodes();
-
-  /// Analyze the def-use chains and create edges from the nodes containing
-  /// definitions to the nodes containing the uses.
-  void createDefUseEdges();
-
-  /// Analyze data dependencies that exist between memory loads or stores,
-  /// in the graph nodes and create edges between them.
-  void createMemoryDependencyEdges();
-
-  /// Create a root node and add edges such that each node in the graph is
-  /// reachable from the root.
-  void createAndConnectRootNode();
-
-  /// Apply graph abstraction to groups of nodes that belong to a strongly
-  /// connected component of the graph to create larger compound nodes
-  /// called pi-blocks. The purpose of this abstraction is to isolate sets of
-  /// program elements that need to stay together during codegen and turn
-  /// the dependence graph into an acyclic graph.
-  void createPiBlocks();
-
-  /// Go through all the nodes in the graph and collapse any two nodes
-  /// 'a' and 'b' if all of the following are true:
-  ///   - the only edge from 'a' is a def-use edge to 'b' and
-  ///   - the only edge to 'b' is a def-use edge from 'a' and
-  ///   - there is no cyclic edge from 'b' to 'a' and
-  ///   - all instructions in 'a' and 'b' belong to the same basic block and
-  ///   - both 'a' and 'b' are simple (single or multi instruction) nodes.
-  void simplify();
-
-  /// Topologically sort the graph nodes.
-  void sortNodesTopologically();
-
-protected:
-  /// Create the root node of the graph.
-  virtual NodeType &createRootNode() = 0;
-
-  /// Create an atomic node in the graph given a single instruction.
-  virtual NodeType &createFineGrainedNode(Instruction &I) = 0;
-
-  /// Create a pi-block node in the graph representing a group of nodes in an
-  /// SCC of the graph.
-  virtual NodeType &createPiBlock(const NodeListType &L) = 0;
-
-  /// Create a def-use edge going from \p Src to \p Tgt.
-  virtual EdgeType &createDefUseEdge(NodeType &Src, NodeType &Tgt) = 0;
-
-  /// Create a memory dependence edge going from \p Src to \p Tgt.
-  virtual EdgeType &createMemoryEdge(NodeType &Src, NodeType &Tgt) = 0;
-
-  /// Create a rooted edge going from \p Src to \p Tgt .
-  virtual EdgeType &createRootedEdge(NodeType &Src, NodeType &Tgt) = 0;
-
-  /// Given a pi-block node, return a vector of all the nodes contained within
-  /// it.
-  virtual const NodeListType &getNodesInPiBlock(const NodeType &N) = 0;
-
-  /// Deallocate memory of edge \p E.
-  virtual void destroyEdge(EdgeType &E) { delete &E; }
-
-  /// Deallocate memory of node \p N.
-  virtual void destroyNode(NodeType &N) { delete &N; }
-
-  /// Return true if creation of pi-blocks are supported and desired,
-  /// and false otherwise.
-  virtual bool shouldCreatePiBlocks() const { return true; }
-
-  /// Return true if graph simplification step is requested, and false
-  /// otherwise.
-  virtual bool shouldSimplify() const { return true; }
-
-  /// Return true if it's safe to merge the two nodes.
-  virtual bool areNodesMergeable(const NodeType &A,
-                                 const NodeType &B) const = 0;
-
-  /// Append the content of node \p B into node \p A and remove \p B and
-  /// the edge between \p A and \p B from the graph.
-  virtual void mergeNodes(NodeType &A, NodeType &B) = 0;
-
-  /// Given an instruction \p I return its associated ordinal number.
-  size_t getOrdinal(Instruction &I) {
-    assert(InstOrdinalMap.find(&I) != InstOrdinalMap.end() &&
-           "No ordinal computed for this instruction.");
-    return InstOrdinalMap[&I];
-  }
-
-  /// Given a node \p N return its associated ordinal number.
-  size_t getOrdinal(NodeType &N) {
-    assert(NodeOrdinalMap.find(&N) != NodeOrdinalMap.end() &&
-           "No ordinal computed for this node.");
-    return NodeOrdinalMap[&N];
-  }
-
-  /// Map types to map instructions to nodes used when populating the graph.
-  using InstToNodeMap = DenseMap<Instruction *, NodeType *>;
-
-  /// Map Types to map instruction/nodes to an ordinal number.
-  using InstToOrdinalMap = DenseMap<Instruction *, size_t>;
-  using NodeToOrdinalMap = DenseMap<NodeType *, size_t>;
-
-  /// Reference to the graph that gets built by a concrete implementation of
-  /// this builder.
-  GraphType &Graph;
-
-  /// Dependence information used to create memory dependence edges in the
-  /// graph.
-  DependenceInfo &DI;
-
-  /// The list of basic blocks to consider when building the graph.
-  const BasicBlockListType &BBList;
-
-  /// A mapping from instructions to the corresponding nodes in the graph.
-  InstToNodeMap IMap;
-
-  /// A mapping from each instruction to an ordinal number. This map is used to
-  /// populate the \p NodeOrdinalMap.
-  InstToOrdinalMap InstOrdinalMap;
-
-  /// A mapping from nodes to an ordinal number. This map is used to sort nodes
-  /// in a pi-block based on program order.
-  NodeToOrdinalMap NodeOrdinalMap;
-};
-
-} // namespace llvm
-
-#endif // LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H
diff --git a/hpvm/llvm_patches/lib/Analysis/CMakeLists.txt b/hpvm/llvm_patches/lib/Analysis/CMakeLists.txt
deleted file mode 100644
index 0f1c6014a329fab214630a8f102f51d47279750c..0000000000000000000000000000000000000000
--- a/hpvm/llvm_patches/lib/Analysis/CMakeLists.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-add_llvm_library(LLVMAnalysis
-  AliasAnalysis.cpp
-  AliasAnalysisEvaluator.cpp
-  AliasAnalysisSummary.cpp
-  AliasSetTracker.cpp
-  Analysis.cpp
-  AssumptionCache.cpp
-  BasicAliasAnalysis.cpp
-  BlockFrequencyInfo.cpp
-  BlockFrequencyInfoImpl.cpp
-  BranchProbabilityInfo.cpp
-  CFG.cpp
-  CFGPrinter.cpp
-  CFLAndersAliasAnalysis.cpp
-  CFLSteensAliasAnalysis.cpp
-  CGSCCPassManager.cpp
-  CallGraph.cpp
-  CallGraphSCCPass.cpp
-  CallPrinter.cpp
-  CaptureTracking.cpp
-  CmpInstAnalysis.cpp
-  CostModel.cpp
-  CodeMetrics.cpp
-  ConstantFolding.cpp
-	DDG.cpp
-	Delinearization.cpp
-  DemandedBits.cpp
-  DependenceAnalysis.cpp
-	DependenceGraphBuilder.cpp
-  DivergenceAnalysis.cpp
-  DomPrinter.cpp
-  DomTreeUpdater.cpp
-  DominanceFrontier.cpp
-  EHPersonalities.cpp
-  GlobalsModRef.cpp
-  GuardUtils.cpp
-  IVDescriptors.cpp
-  IVUsers.cpp
-  IndirectCallPromotionAnalysis.cpp
-  InlineCost.cpp
-  InstCount.cpp
-  InstructionPrecedenceTracking.cpp
-  InstructionSimplify.cpp
-  Interval.cpp
-  IntervalPartition.cpp
-  LazyBranchProbabilityInfo.cpp
-  LazyBlockFrequencyInfo.cpp
-  LazyCallGraph.cpp
-  LazyValueInfo.cpp
-  LegacyDivergenceAnalysis.cpp
-  Lint.cpp
-  Loads.cpp
-  LoopAccessAnalysis.cpp
-  LoopAnalysisManager.cpp
-  LoopUnrollAnalyzer.cpp
-  LoopInfo.cpp
-  LoopPass.cpp
-  MemDepPrinter.cpp
-  MemDerefPrinter.cpp
-  MemoryBuiltins.cpp
-  MemoryDependenceAnalysis.cpp
-  MemoryLocation.cpp
-  MemorySSA.cpp
-  MemorySSAUpdater.cpp
-  ModuleDebugInfoPrinter.cpp
-  ModuleSummaryAnalysis.cpp
-  MustExecute.cpp
-  ObjCARCAliasAnalysis.cpp
-  ObjCARCAnalysisUtils.cpp
-  ObjCARCInstKind.cpp
-  OptimizationRemarkEmitter.cpp
-  OrderedBasicBlock.cpp
-  OrderedInstructions.cpp
-  PHITransAddr.cpp
-  PhiValues.cpp
-  PostDominators.cpp
-  ProfileSummaryInfo.cpp
-  PtrUseVisitor.cpp
-  RegionInfo.cpp
-  RegionPass.cpp
-  RegionPrinter.cpp
-  ScalarEvolution.cpp
-  ScalarEvolutionAliasAnalysis.cpp
-  ScalarEvolutionExpander.cpp
-  ScalarEvolutionNormalization.cpp
-  StackSafetyAnalysis.cpp
-  SyncDependenceAnalysis.cpp
-  SyntheticCountsUtils.cpp
-  TargetLibraryInfo.cpp
-  TargetTransformInfo.cpp
-  Trace.cpp
-  TypeBasedAliasAnalysis.cpp
-  TypeMetadataUtils.cpp
-  ScopedNoAliasAA.cpp
-  ValueLattice.cpp
-  ValueLatticeUtils.cpp
-  ValueTracking.cpp
-  VectorUtils.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis
-
-  DEPENDS
-  intrinsics_gen
-  )
diff --git a/hpvm/llvm_patches/lib/Analysis/DDG.cpp b/hpvm/llvm_patches/lib/Analysis/DDG.cpp
deleted file mode 100644
index 7bd3044918492ee29aae21599c77f6e66f38804c..0000000000000000000000000000000000000000
--- a/hpvm/llvm_patches/lib/Analysis/DDG.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-//===- DDG.cpp - Data Dependence Graph -------------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The implementation for the data dependence graph.
-//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/DDG.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Support/CommandLine.h"
-#include <memory>
-
-using namespace llvm;
-
-static cl::opt<bool> SimplifyDDG(
-    "ddg-simplify", cl::init(true), cl::Hidden, cl::ZeroOrMore,
-    cl::desc(
-        "Simplify DDG by merging nodes that have less interesting edges."));
-
-static cl::opt<bool>
-    CreatePiBlocks("ddg-pi-blocks", cl::init(true), cl::Hidden, cl::ZeroOrMore,
-                   cl::desc("Create pi-block nodes."));
-
-#define DEBUG_TYPE "ddg"
-
-template class llvm::DGEdge<DDGNode, DDGEdge>;
-template class llvm::DGNode<DDGNode, DDGEdge>;
-template class llvm::DirectedGraph<DDGNode, DDGEdge>;
-
-//===--------------------------------------------------------------------===//
-// DDGNode implementation
-//===--------------------------------------------------------------------===//
-DDGNode::~DDGNode() {}
-
-bool DDGNode::collectInstructions(
-    llvm::function_ref<bool(Instruction *)> const &Pred,
-    InstructionListType &IList) const {
-  assert(IList.empty() && "Expected the IList to be empty on entry.");
-  if (isa<SimpleDDGNode>(this)) {
-    for (Instruction *I : cast<const SimpleDDGNode>(this)->getInstructions())
-      if (Pred(I))
-        IList.push_back(I);
-  } else if (isa<PiBlockDDGNode>(this)) {
-    for (const DDGNode *PN : cast<const PiBlockDDGNode>(this)->getNodes()) {
-      assert(!isa<PiBlockDDGNode>(PN) && "Nested PiBlocks are not supported.");
-      SmallVector<Instruction *, 8> TmpIList;
-      PN->collectInstructions(Pred, TmpIList);
-      IList.insert(IList.end(), TmpIList.begin(), TmpIList.end());
-    }
-  } else
-    llvm_unreachable("unimplemented type of node");
-  return !IList.empty();
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode::NodeKind K) {
-  const char *Out;
-  switch (K) {
-  case DDGNode::NodeKind::SingleInstruction:
-    Out = "single-instruction";
-    break;
-  case DDGNode::NodeKind::MultiInstruction:
-    Out = "multi-instruction";
-    break;
-  case DDGNode::NodeKind::PiBlock:
-    Out = "pi-block";
-    break;
-  case DDGNode::NodeKind::Root:
-    Out = "root";
-    break;
-  case DDGNode::NodeKind::Unknown:
-    Out = "?? (error)";
-    break;
-  }
-  OS << Out;
-  return OS;
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode &N) {
-  OS << "Node Address:" << &N << ":" << N.getKind() << "\n";
-  if (isa<SimpleDDGNode>(N)) {
-    OS << " Instructions:\n";
-    for (const Instruction *I : cast<const SimpleDDGNode>(N).getInstructions())
-      OS.indent(2) << *I << "\n";
-  } else if (isa<PiBlockDDGNode>(&N)) {
-    OS << "--- start of nodes in pi-block ---\n";
-    auto &Nodes = cast<const PiBlockDDGNode>(&N)->getNodes();
-    unsigned Count = 0;
-    for (const DDGNode *N : Nodes)
-      OS << *N << (++Count == Nodes.size() ? "" : "\n");
-    OS << "--- end of nodes in pi-block ---\n";
-  } else if (!isa<RootDDGNode>(N))
-    llvm_unreachable("unimplemented type of node");
-
-  OS << (N.getEdges().empty() ? " Edges:none!\n" : " Edges:\n");
-  for (auto &E : N.getEdges())
-    OS.indent(2) << *E;
-  return OS;
-}
-
-//===--------------------------------------------------------------------===//
-// SimpleDDGNode implementation
-//===--------------------------------------------------------------------===//
-
-SimpleDDGNode::SimpleDDGNode(Instruction &I)
-  : DDGNode(NodeKind::SingleInstruction), InstList() {
-  assert(InstList.empty() && "Expected empty list.");
-  InstList.push_back(&I);
-}
-
-SimpleDDGNode::SimpleDDGNode(const SimpleDDGNode &N)
-    : DDGNode(N), InstList(N.InstList) {
-  assert(((getKind() == NodeKind::SingleInstruction && InstList.size() == 1) ||
-          (getKind() == NodeKind::MultiInstruction && InstList.size() > 1)) &&
-         "constructing from invalid simple node.");
-}
-
-SimpleDDGNode::SimpleDDGNode(SimpleDDGNode &&N)
-    : DDGNode(std::move(N)), InstList(std::move(N.InstList)) {
-  assert(((getKind() == NodeKind::SingleInstruction && InstList.size() == 1) ||
-          (getKind() == NodeKind::MultiInstruction && InstList.size() > 1)) &&
-         "constructing from invalid simple node.");
-}
-
-SimpleDDGNode::~SimpleDDGNode() { InstList.clear(); }
-
-//===--------------------------------------------------------------------===//
-// PiBlockDDGNode implementation
-//===--------------------------------------------------------------------===//
-
-PiBlockDDGNode::PiBlockDDGNode(const PiNodeList &List)
-    : DDGNode(NodeKind::PiBlock), NodeList(List) {
-  assert(!NodeList.empty() && "pi-block node constructed with an empty list.");
-}
-
-PiBlockDDGNode::PiBlockDDGNode(const PiBlockDDGNode &N)
-    : DDGNode(N), NodeList(N.NodeList) {
-  assert(getKind() == NodeKind::PiBlock && !NodeList.empty() &&
-         "constructing from invalid pi-block node.");
-}
-
-PiBlockDDGNode::PiBlockDDGNode(PiBlockDDGNode &&N)
-    : DDGNode(std::move(N)), NodeList(std::move(N.NodeList)) {
-  assert(getKind() == NodeKind::PiBlock && !NodeList.empty() &&
-         "constructing from invalid pi-block node.");
-}
-
-PiBlockDDGNode::~PiBlockDDGNode() { NodeList.clear(); }
-
-//===--------------------------------------------------------------------===//
-// DDGEdge implementation
-//===--------------------------------------------------------------------===//
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGEdge::EdgeKind K) {
-  const char *Out;
-  switch (K) {
-  case DDGEdge::EdgeKind::RegisterDefUse:
-    Out = "def-use";
-    break;
-  case DDGEdge::EdgeKind::MemoryDependence:
-    Out = "memory";
-    break;
-  case DDGEdge::EdgeKind::Rooted:
-    Out = "rooted";
-    break;
-  case DDGEdge::EdgeKind::Unknown:
-    Out = "?? (error)";
-    break;
-  }
-  OS << Out;
-  return OS;
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGEdge &E) {
-  OS << "[" << E.getKind() << "] to " << &E.getTargetNode() << "\n";
-  return OS;
-}
-
-//===--------------------------------------------------------------------===//
-// DataDependenceGraph implementation
-//===--------------------------------------------------------------------===//
-using BasicBlockListType = SmallVector<BasicBlock *, 8>;
-
-DataDependenceGraph::DataDependenceGraph(Function &F, DependenceInfo &D)
-    : DependenceGraphInfo(F.getName().str(), D) {
-  // Put the basic blocks in program order for correct dependence
-  // directions.
-  BasicBlockListType BBList;
-  for (auto &SCC : make_range(scc_begin(&F), scc_end(&F)))
-    for (BasicBlock * BB : SCC)
-      BBList.push_back(BB);
-  std::reverse(BBList.begin(), BBList.end());
-  DDGBuilder(*this, D, BBList).populate();
-}
-
-DataDependenceGraph::DataDependenceGraph(Loop &L, LoopInfo &LI,
-                                         DependenceInfo &D)
-    : DependenceGraphInfo(Twine(L.getHeader()->getParent()->getName() + "." +
-                                L.getHeader()->getName())
-                              .str(),
-                          D) {
-  // Put the basic blocks in program order for correct dependence
-  // directions.
-  LoopBlocksDFS DFS(&L);
-  DFS.perform(&LI);
-  BasicBlockListType BBList;
-  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
-    BBList.push_back(BB);
-  DDGBuilder(*this, D, BBList).populate();
-}
-
-DataDependenceGraph::~DataDependenceGraph() {
-  for (auto *N : Nodes) {
-    for (auto *E : *N)
-      delete E;
-    delete N;
-  }
-}
-
-bool DataDependenceGraph::addNode(DDGNode &N) {
-  if (!DDGBase::addNode(N))
-    return false;
-
-  // In general, if the root node is already created and linked, it is not safe
-  // to add new nodes since they may be unreachable by the root. However,
-  // pi-block nodes need to be added after the root node is linked, and they are
-  // always reachable by the root, because they represent components that are
-  // already reachable by root.
-  auto *Pi = dyn_cast<PiBlockDDGNode>(&N);
-  assert((!Root || Pi) &&
-         "Root node is already added. No more nodes can be added.");
-
-  if (isa<RootDDGNode>(N))
-    Root = &N;
-
-  if (Pi)
-    for (DDGNode *NI : Pi->getNodes())
-      PiBlockMap.insert(std::make_pair(NI, Pi));
-
-  return true;
-}
-
-const PiBlockDDGNode *DataDependenceGraph::getPiBlock(const NodeType &N) const {
-  if (PiBlockMap.find(&N) == PiBlockMap.end())
-    return nullptr;
-  auto *Pi = PiBlockMap.find(&N)->second;
-  assert(PiBlockMap.find(Pi) == PiBlockMap.end() &&
-         "Nested pi-blocks detected.");
-  return Pi;
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const DataDependenceGraph &G) {
-  for (DDGNode *Node : G)
-    // Avoid printing nodes that are part of a pi-block twice. They will get
-    // printed when the pi-block is printed.
-    if (!G.getPiBlock(*Node))
-      OS << *Node << "\n";
-  OS << "\n";
-  return OS;
-}
-
-//===--------------------------------------------------------------------===//
-// DDGBuilder implementation
-//===--------------------------------------------------------------------===//
-
-bool DDGBuilder::areNodesMergeable(const DDGNode &Src,
-                                   const DDGNode &Tgt) const {
-  // Only merge two nodes if they are both simple nodes and the consecutive
-  // instructions after merging belong to the same BB.
-  const auto *SimpleSrc = dyn_cast<const SimpleDDGNode>(&Src);
-  const auto *SimpleTgt = dyn_cast<const SimpleDDGNode>(&Tgt);
-  if (!SimpleSrc || !SimpleTgt)
-    return false;
-
-  return SimpleSrc->getLastInstruction()->getParent() ==
-         SimpleTgt->getFirstInstruction()->getParent();
-}
-
-void DDGBuilder::mergeNodes(DDGNode &A, DDGNode &B) {
-  DDGEdge &EdgeToFold = A.back();
-  assert(A.getEdges().size() == 1 && EdgeToFold.getTargetNode() == B &&
-         "Expected A to have a single edge to B.");
-  assert(isa<SimpleDDGNode>(&A) && isa<SimpleDDGNode>(&B) &&
-         "Expected simple nodes");
-
-  // Copy instructions from B to the end of A.
-  cast<SimpleDDGNode>(&A)->appendInstructions(*cast<SimpleDDGNode>(&B));
-
-  // Move to A any outgoing edges from B.
-  for (DDGEdge *BE : B)
-    Graph.connect(A, BE->getTargetNode(), *BE);
-
-  A.removeEdge(EdgeToFold);
-  destroyEdge(EdgeToFold);
-  Graph.removeNode(B);
-  destroyNode(B);
-}
-
-bool DDGBuilder::shouldSimplify() const { return SimplifyDDG; }
-
-bool DDGBuilder::shouldCreatePiBlocks() const { return CreatePiBlocks; }
-
-//===--------------------------------------------------------------------===//
-// DDG Analysis Passes
-//===--------------------------------------------------------------------===//
-
-/// DDG as a loop pass.
-DDGAnalysis::Result DDGAnalysis::run(Loop &L, LoopAnalysisManager &AM,
-                                     LoopStandardAnalysisResults &AR) {
-  Function *F = L.getHeader()->getParent();
-  DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
-  return std::make_unique<DataDependenceGraph>(L, AR.LI, DI);
-}
-AnalysisKey DDGAnalysis::Key;
-
-PreservedAnalyses DDGAnalysisPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
-                                              LoopStandardAnalysisResults &AR,
-                                              LPMUpdater &U) {
-  OS << "'DDG' for loop '" << L.getHeader()->getName() << "':\n";
-  OS << *AM.getResult<DDGAnalysis>(L, AR);
-  return PreservedAnalyses::all();
-}
diff --git a/hpvm/llvm_patches/lib/Analysis/DependenceGraphBuilder.cpp b/hpvm/llvm_patches/lib/Analysis/DependenceGraphBuilder.cpp
deleted file mode 100644
index 95a39f984d6378af4d7a8042954a5913bbf1a971..0000000000000000000000000000000000000000
--- a/hpvm/llvm_patches/lib/Analysis/DependenceGraphBuilder.cpp
+++ /dev/null
@@ -1,535 +0,0 @@
-//===- DependenceGraphBuilder.cpp ------------------------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This file implements common steps of the build algorithm for construction
-// of dependence graphs such as DDG and PDG.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/DependenceGraphBuilder.h"
-#include "llvm/ADT/EnumeratedArray.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DDG.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "dgb"
-
-STATISTIC(TotalGraphs, "Number of dependence graphs created.");
-STATISTIC(TotalDefUseEdges, "Number of def-use edges created.");
-STATISTIC(TotalMemoryEdges, "Number of memory dependence edges created.");
-STATISTIC(TotalFineGrainedNodes, "Number of fine-grained nodes created.");
-STATISTIC(TotalPiBlockNodes, "Number of pi-block nodes created.");
-STATISTIC(TotalConfusedEdges,
-          "Number of confused memory dependencies between two nodes.");
-STATISTIC(TotalEdgeReversals,
-          "Number of times the source and sink of dependence was reversed to "
-          "expose cycles in the graph.");
-
-using InstructionListType = SmallVector<Instruction *, 2>;
-
-//===--------------------------------------------------------------------===//
-// AbstractDependenceGraphBuilder implementation
-//===--------------------------------------------------------------------===//
-
-template <class G>
-void AbstractDependenceGraphBuilder<G>::computeInstructionOrdinals() {
-  // The BBList is expected to be in program order.
-  size_t NextOrdinal = 1;
-  for (auto *BB : BBList)
-    for (auto &I : *BB)
-      InstOrdinalMap.insert(std::make_pair(&I, NextOrdinal++));
-}
-
-template <class G>
-void AbstractDependenceGraphBuilder<G>::createFineGrainedNodes() {
-  ++TotalGraphs;
-  assert(IMap.empty() && "Expected empty instruction map at start");
-  for (BasicBlock *BB : BBList)
-    for (Instruction &I : *BB) {
-      auto &NewNode = createFineGrainedNode(I);
-      IMap.insert(std::make_pair(&I, &NewNode));
-      NodeOrdinalMap.insert(std::make_pair(&NewNode, getOrdinal(I)));
-      ++TotalFineGrainedNodes;
-    }
-}
-
-template <class G>
-void AbstractDependenceGraphBuilder<G>::createAndConnectRootNode() {
-  // Create a root node that connects to every connected component of the graph.
-  // This is done to allow graph iterators to visit all the disjoint components
-  // of the graph, in a single walk.
-  //
-  // This algorithm works by going through each node of the graph and for each
-  // node N, do a DFS starting from N. A rooted edge is established between the
-  // root node and N (if N is not yet visited). All the nodes reachable from N
-  // are marked as visited and are skipped in the DFS of subsequent nodes.
-  //
-  // Note: This algorithm tries to limit the number of edges out of the root
-  // node to some extent, but there may be redundant edges created depending on
-  // the iteration order. For example for a graph {A -> B}, an edge from the
-  // root node is added to both nodes if B is visited before A. While it does
-  // not result in minimal number of edges, this approach saves compile-time
-  // while keeping the number of edges in check.
-  auto &RootNode = createRootNode();
-  df_iterator_default_set<const NodeType *, 4> Visited;
-  for (auto *N : Graph) {
-    if (*N == RootNode)
-      continue;
-    for (auto I : depth_first_ext(N, Visited))
-      if (I == N)
-        createRootedEdge(RootNode, *N);
-  }
-}
-
-template <class G> void AbstractDependenceGraphBuilder<G>::createPiBlocks() {
-  if (!shouldCreatePiBlocks())
-    return;
-
-  LLVM_DEBUG(dbgs() << "==== Start of Creation of Pi-Blocks ===\n");
-
-  // The overall algorithm is as follows:
-  // 1. Identify SCCs and for each SCC create a pi-block node containing all
-  //    the nodes in that SCC.
-  // 2. Identify incoming edges incident to the nodes inside of the SCC and
-  //    reconnect them to the pi-block node.
-  // 3. Identify outgoing edges from the nodes inside of the SCC to nodes
-  //    outside of it and reconnect them so that the edges are coming out of the
-  //    SCC node instead.
-
-  // Adding nodes as we iterate through the SCCs cause the SCC
-  // iterators to get invalidated. To prevent this invalidation, we first
-  // collect a list of nodes that are part of an SCC, and then iterate over
-  // those lists to create the pi-block nodes. Each element of the list is a
-  // list of nodes in an SCC. Note: trivial SCCs containing a single node are
-  // ignored.
-  SmallVector<NodeListType, 4> ListOfSCCs;
-  for (auto &SCC : make_range(scc_begin(&Graph), scc_end(&Graph))) {
-    if (SCC.size() > 1)
-      ListOfSCCs.emplace_back(SCC.begin(), SCC.end());
-  }
-
-  for (NodeListType &NL : ListOfSCCs) {
-    LLVM_DEBUG(dbgs() << "Creating pi-block node with " << NL.size()
-                      << " nodes in it.\n");
-
-    // SCC iterator may put the nodes in an order that's different from the
-    // program order. To preserve original program order, we sort the list of
-    // nodes based on ordinal numbers computed earlier.
-    llvm::sort(NL, [&](NodeType *LHS, NodeType *RHS) {
-      return getOrdinal(*LHS) < getOrdinal(*RHS);
-    });
-
-    NodeType &PiNode = createPiBlock(NL);
-    ++TotalPiBlockNodes;
-
-    // Build a set to speed up the lookup for edges whose targets
-    // are inside the SCC.
-    SmallPtrSet<NodeType *, 4> NodesInSCC(NL.begin(), NL.end());
-
-    // We have the set of nodes in the SCC. We go through the set of nodes
-    // that are outside of the SCC and look for edges that cross the two sets.
-    for (NodeType *N : Graph) {
-
-      // Skip the SCC node and all the nodes inside of it.
-      if (*N == PiNode || NodesInSCC.count(N))
-        continue;
-
-      for (NodeType *SCCNode : NL) {
-
-        enum Direction {
-          Incoming,      // Incoming edges to the SCC
-          Outgoing,      // Edges going ot of the SCC
-          DirectionCount // To make the enum usable as an array index.
-        };
-
-        // Use these flags to help us avoid creating redundant edges. If there
-        // are more than one edges from an outside node to inside nodes, we only
-        // keep one edge from that node to the pi-block node. Similarly, if
-        // there are more than one edges from inside nodes to an outside node,
-        // we only keep one edge from the pi-block node to the outside node.
-        // There is a flag defined for each direction (incoming vs outgoing) and
-        // for each type of edge supported, using a two-dimensional boolean
-        // array.
-        using EdgeKind = typename EdgeType::EdgeKind;
-        EnumeratedArray<bool, EdgeKind> EdgeAlreadyCreated[DirectionCount]{
-            false, false};
-
-        auto createEdgeOfKind = [this](NodeType &Src, NodeType &Dst,
-                                       const EdgeKind K) {
-          switch (K) {
-          case EdgeKind::RegisterDefUse:
-            createDefUseEdge(Src, Dst);
-            break;
-          case EdgeKind::MemoryDependence:
-            createMemoryEdge(Src, Dst);
-            break;
-          case EdgeKind::Rooted:
-            createRootedEdge(Src, Dst);
-            break;
-          default:
-            llvm_unreachable("Unsupported type of edge.");
-          }
-        };
-
-        auto reconnectEdges = [&](NodeType *Src, NodeType *Dst, NodeType *New,
-                                  const Direction Dir) {
-          if (!Src->hasEdgeTo(*Dst))
-            return;
-          LLVM_DEBUG(dbgs()
-                     << "reconnecting("
-                     << (Dir == Direction::Incoming ? "incoming)" : "outgoing)")
-                     << ":\nSrc:" << *Src << "\nDst:" << *Dst
-                     << "\nNew:" << *New << "\n");
-          assert((Dir == Direction::Incoming || Dir == Direction::Outgoing) &&
-                 "Invalid direction.");
-
-          SmallVector<EdgeType *, 10> EL;
-          Src->findEdgesTo(*Dst, EL);
-          for (EdgeType *OldEdge : EL) {
-            EdgeKind Kind = OldEdge->getKind();
-            if (!EdgeAlreadyCreated[Dir][Kind]) {
-              if (Dir == Direction::Incoming) {
-                createEdgeOfKind(*Src, *New, Kind);
-                LLVM_DEBUG(dbgs() << "created edge from Src to New.\n");
-              } else if (Dir == Direction::Outgoing) {
-                createEdgeOfKind(*New, *Dst, Kind);
-                LLVM_DEBUG(dbgs() << "created edge from New to Dst.\n");
-              }
-              EdgeAlreadyCreated[Dir][Kind] = true;
-            }
-            Src->removeEdge(*OldEdge);
-            destroyEdge(*OldEdge);
-            LLVM_DEBUG(dbgs() << "removed old edge between Src and Dst.\n\n");
-          }
-        };
-
-        // Process incoming edges incident to the pi-block node.
-        reconnectEdges(N, SCCNode, &PiNode, Direction::Incoming);
-
-        // Process edges that are coming out of the pi-block node.
-        reconnectEdges(SCCNode, N, &PiNode, Direction::Outgoing);
-      }
-    }
-  }
-
-  // Ordinal maps are no longer needed.
-  InstOrdinalMap.clear();
-  NodeOrdinalMap.clear();
-
-  LLVM_DEBUG(dbgs() << "==== End of Creation of Pi-Blocks ===\n");
-}
-
-template <class G> void AbstractDependenceGraphBuilder<G>::createDefUseEdges() {
-  for (NodeType *N : Graph) {
-    InstructionListType SrcIList;
-    N->collectInstructions([](const Instruction *I) { return true; }, SrcIList);
-
-    // Use a set to mark the targets that we link to N, so we don't add
-    // duplicate def-use edges when more than one instruction in a target node
-    // use results of instructions that are contained in N.
-    SmallPtrSet<NodeType *, 4> VisitedTargets;
-
-    for (Instruction *II : SrcIList) {
-      for (User *U : II->users()) {
-        Instruction *UI = dyn_cast<Instruction>(U);
-        if (!UI)
-          continue;
-        NodeType *DstNode = nullptr;
-        if (IMap.find(UI) != IMap.end())
-          DstNode = IMap.find(UI)->second;
-
-        // In the case of loops, the scope of the subgraph is all the
-        // basic blocks (and instructions within them) belonging to the loop. We
-        // simply ignore all the edges coming from (or going into) instructions
-        // or basic blocks outside of this range.
-        if (!DstNode) {
-          LLVM_DEBUG(
-              dbgs()
-              << "skipped def-use edge since the sink" << *UI
-              << " is outside the range of instructions being considered.\n");
-          continue;
-        }
-
-        // Self dependencies are ignored because they are redundant and
-        // uninteresting.
-        if (DstNode == N) {
-          LLVM_DEBUG(dbgs()
-                     << "skipped def-use edge since the sink and the source ("
-                     << N << ") are the same.\n");
-          continue;
-        }
-
-        if (VisitedTargets.insert(DstNode).second) {
-          createDefUseEdge(*N, *DstNode);
-          ++TotalDefUseEdges;
-        }
-      }
-    }
-  }
-}
-
-template <class G>
-void AbstractDependenceGraphBuilder<G>::createMemoryDependencyEdges() {
-  using DGIterator = typename G::iterator;
-  auto isMemoryAccess = [](const Instruction *I) {
-    return I->mayReadOrWriteMemory();
-  };
-  for (DGIterator SrcIt = Graph.begin(), E = Graph.end(); SrcIt != E; ++SrcIt) {
-		LLVM_DEBUG(errs() << "Src Node: " << *SrcIt << "\n");
-    InstructionListType SrcIList;
-    (*SrcIt)->collectInstructions(isMemoryAccess, SrcIList);
-    if (SrcIList.empty())
-      continue;
-
-    for (DGIterator DstIt = SrcIt; DstIt != E; ++DstIt) {
-			if (**SrcIt == **DstIt)
-        continue;
-      InstructionListType DstIList;
-      (*DstIt)->collectInstructions(isMemoryAccess, DstIList);
-      if (DstIList.empty())
-        continue;
-      bool ForwardEdgeCreated = false;
-      bool BackwardEdgeCreated = false;
-			LLVM_DEBUG(errs() << "***********************************************\n");
-      for (Instruction *ISrc : SrcIList) {
-				LLVM_DEBUG(errs() << "Src: " << *ISrc << "\n");
-        for (Instruction *IDst : DstIList) {
-					LLVM_DEBUG(errs() << "Dst: " << *IDst << "\n");
-          auto D = DI.depends(ISrc, IDst, true);
-          if (!D) {
-            LLVM_DEBUG(errs() << "--> No Dependence, moving on!\n");
-						continue;
-
-					}
-					LLVM_DEBUG(D->dump(errs()));
-
-          // If we have a dependence with its left-most non-'=' direction
-          // being '>' we need to reverse the direction of the edge, because
-          // the source of the dependence cannot occur after the sink. For
-          // confused dependencies, we will create edges in both directions to
-          // represent the possibility of a cycle.
-
-          auto createConfusedEdges = [&](NodeType &Src, NodeType &Dst) {
-            if (!ForwardEdgeCreated) {
-              createMemoryEdge(Src, Dst);
-              ++TotalMemoryEdges;
-            }
-            if (!BackwardEdgeCreated) {
-              createMemoryEdge(Dst, Src);
-              ++TotalMemoryEdges;
-            }
-            ForwardEdgeCreated = BackwardEdgeCreated = true;
-            ++TotalConfusedEdges;
-          };
-
-          auto createForwardEdge = [&](NodeType &Src, NodeType &Dst) {
-            if (!ForwardEdgeCreated) {
-              createMemoryEdge(Src, Dst);
-              ++TotalMemoryEdges;
-            }
-            ForwardEdgeCreated = true;
-          };
-
-          auto createBackwardEdge = [&](NodeType &Src, NodeType &Dst) {
-            if (!BackwardEdgeCreated) {
-              createMemoryEdge(Dst, Src);
-              ++TotalMemoryEdges;
-            }
-            BackwardEdgeCreated = true;
-          };
-
-          if (D->isConfused()) {
-						LLVM_DEBUG(errs() << "--> Confused Dependence: creating Confused Edge\n");
-            createConfusedEdges(**SrcIt, **DstIt);
-					} else if (D->isOrdered() && !D->isLoopIndependent()) {
-						LLVM_DEBUG(errs() << "--> Ordered, Loop-Dependent Dependence:\n");
-            bool ReversedEdge = false;
-            for (unsigned Level = 1; Level <= D->getLevels(); ++Level) {
-							LLVM_DEBUG(errs() << "----> Lvl: " << Level << ": ");
-              if (D->getDirection(Level) == Dependence::DVEntry::EQ) {
-								LLVM_DEBUG(errs() << "EQ\n");
-                continue;
-							} else if (D->getDirection(Level) == Dependence::DVEntry::GT) {
-                LLVM_DEBUG(errs() << "GT\n");
-								LLVM_DEBUG(errs() << "------> Invalid Dependence. Creating Backward Edge!\n");
-								createBackwardEdge(**SrcIt, **DstIt);
-                ReversedEdge = true;
-                ++TotalEdgeReversals;
-                break;
-              } else if (D->getDirection(Level) == Dependence::DVEntry::LT){
-								LLVM_DEBUG(errs() << "LT\n");
-                break;
-							} else {
-								LLVM_DEBUG(errs() << " Confused\n");
-								createConfusedEdges(**SrcIt, **DstIt);
-								break;
-							}
-						}
-						if (!ReversedEdge) {
-							LLVM_DEBUG(errs() << "------> Creating Forward Edge!\n");
-							createForwardEdge(**SrcIt, **DstIt);
-						}
-					} else {
-						LLVM_DEBUG(errs() << "--> Creating Forward Edge!\n");
-						createForwardEdge(**SrcIt, **DstIt);
-					}
-					// Avoid creating duplicate edges.
-					if (ForwardEdgeCreated && BackwardEdgeCreated) { 
-						LLVM_DEBUG(errs() << "-->  Created all possible edges between Src and Dst!\n");
-						break;
-					}
-				}
-
-				// If we've created edges in both directions, there is no more
-				// unique edge that we can create between these two nodes, so we
-				// can exit early.
-				if (ForwardEdgeCreated && BackwardEdgeCreated) {
-					LLVM_DEBUG(errs() << "No more unique edges possible!\n");
-					break;
-				}
-			}
-		}
-	}
-}
-
-template <class G> void AbstractDependenceGraphBuilder<G>::simplify() {
-	if (!shouldSimplify())
-		return;
-	LLVM_DEBUG(dbgs() << "==== Start of Graph Simplification ===\n");
-
-	// This algorithm works by first collecting a set of candidate nodes that have
-	// an out-degree of one (in terms of def-use edges), and then ignoring those
-	// whose targets have an in-degree more than one. Each node in the resulting
-	// set can then be merged with its corresponding target and put back into the
-	// worklist until no further merge candidates are available.
-	SmallPtrSet<NodeType *, 32> CandidateSourceNodes;
-
-	// A mapping between nodes and their in-degree. To save space, this map
-	// only contains nodes that are targets of nodes in the CandidateSourceNodes.
-	DenseMap<NodeType *, unsigned> TargetInDegreeMap;
-
-	for (NodeType *N : Graph) {
-		if (N->getEdges().size() != 1)
-			continue;
-		EdgeType &Edge = N->back();
-		if (!Edge.isDefUse())
-			continue;
-		CandidateSourceNodes.insert(N);
-
-		// Insert an element into the in-degree map and initialize to zero. The
-		// count will get updated in the next step.
-		TargetInDegreeMap.insert({&Edge.getTargetNode(), 0});
-	}
-
-	LLVM_DEBUG({
-			dbgs() << "Size of candidate src node list:" << CandidateSourceNodes.size()
-			<< "\nNode with single outgoing def-use edge:\n";
-			for (NodeType *N : CandidateSourceNodes) {
-			dbgs() << N << "\n";
-			}
-			});
-
-	for (NodeType *N : Graph) {
-		for (EdgeType *E : *N) {
-			NodeType *Tgt = &E->getTargetNode();
-			auto TgtIT = TargetInDegreeMap.find(Tgt);
-			if (TgtIT != TargetInDegreeMap.end())
-				++(TgtIT->second);
-		}
-	}
-
-	LLVM_DEBUG({
-			dbgs() << "Size of target in-degree map:" << TargetInDegreeMap.size()
-			<< "\nContent of in-degree map:\n";
-			for (auto &I : TargetInDegreeMap) {
-			dbgs() << I.first << " --> " << I.second << "\n";
-			}
-			});
-
-	SmallVector<NodeType *, 32> Worklist(CandidateSourceNodes.begin(),
-			CandidateSourceNodes.end());
-	while (!Worklist.empty()) {
-		NodeType &Src = *Worklist.pop_back_val();
-		// As nodes get merged, we need to skip any node that has been removed from
-		// the candidate set (see below).
-		if (CandidateSourceNodes.find(&Src) == CandidateSourceNodes.end())
-			continue;
-		CandidateSourceNodes.erase(&Src);
-
-		assert(Src.getEdges().size() == 1 &&
-				"Expected a single edge from the candidate src node.");
-		NodeType &Tgt = Src.back().getTargetNode();
-		assert(TargetInDegreeMap.find(&Tgt) != TargetInDegreeMap.end() &&
-				"Expected target to be in the in-degree map.");
-
-		if (TargetInDegreeMap[&Tgt] != 1)
-			continue;
-
-		if (!areNodesMergeable(Src, Tgt))
-			continue;
-
-		// Do not merge if there is also an edge from target to src (immediate
-		// cycle).
-		if (Tgt.hasEdgeTo(Src))
-			continue;
-
-		LLVM_DEBUG(dbgs() << "Merging:" << Src << "\nWith:" << Tgt << "\n");
-
-		mergeNodes(Src, Tgt);
-
-		// If the target node is in the candidate set itself, we need to put the
-		// src node back into the worklist again so it gives the target a chance
-		// to get merged into it. For example if we have:
-		// {(a)->(b), (b)->(c), (c)->(d), ...} and the worklist is initially {b, a},
-		// then after merging (a) and (b) together, we need to put (a,b) back in
-		// the worklist so that (c) can get merged in as well resulting in
-		// {(a,b,c) -> d}
-		// We also need to remove the old target (b), from the worklist. We first
-		// remove it from the candidate set here, and skip any item from the
-		// worklist that is not in the set.
-		if (CandidateSourceNodes.find(&Tgt) != CandidateSourceNodes.end()) {
-			Worklist.push_back(&Src);
-			CandidateSourceNodes.insert(&Src);
-			CandidateSourceNodes.erase(&Tgt);
-			LLVM_DEBUG(dbgs() << "Putting " << &Src << " back in the worklist.\n");
-		}
-	}
-	LLVM_DEBUG(dbgs() << "=== End of Graph Simplification ===\n");
-}
-
-template <class G>
-void AbstractDependenceGraphBuilder<G>::sortNodesTopologically() {
-
-	// If we don't create pi-blocks, then we may not have a DAG.
-	if (!shouldCreatePiBlocks())
-		return;
-
-	SmallVector<NodeType *, 64> NodesInPO;
-	using NodeKind = typename NodeType::NodeKind;
-	for (NodeType *N : post_order(&Graph)) {
-		if (N->getKind() == NodeKind::PiBlock) {
-			// Put members of the pi-block right after the pi-block itself, for
-			// convenience.
-			const NodeListType &PiBlockMembers = getNodesInPiBlock(*N);
-			NodesInPO.insert(NodesInPO.end(), PiBlockMembers.begin(),
-					PiBlockMembers.end());
-		}
-		NodesInPO.push_back(N);
-	}
-
-	size_t OldSize = Graph.Nodes.size();
-	Graph.Nodes.clear();
-	for (NodeType *N : reverse(NodesInPO))
-		Graph.Nodes.push_back(N);
-	if (Graph.Nodes.size() != OldSize)
-		assert(false &&
-				"Expected the number of nodes to stay the same after the sort");
-}
-
-template class llvm::AbstractDependenceGraphBuilder<DataDependenceGraph>;
-template class llvm::DependenceGraphInfo<DDGNode>;
diff --git a/hpvm/projects/hpvm-rt/hpvm-rt.cpp b/hpvm/projects/hpvm-rt/hpvm-rt.cpp
index f0716378fe8c62639555396a3d268aae051534ba..0ff234e952f6aec93377ec33d51fa2a1a9f64c4e 100644
--- a/hpvm/projects/hpvm-rt/hpvm-rt.cpp
+++ b/hpvm/projects/hpvm-rt/hpvm-rt.cpp
@@ -1,6 +1,3 @@
-
-//#define HPVM_USE_OPENCL 1
-
 #include <algorithm>
 #include <cassert>
 #include <cstdio>
@@ -10,13 +7,8 @@
 #include <map>
 #include <pthread.h>
 #include <string>
-#include <unistd.h>
 
-#ifdef HPVM_USE_OPENCL
-
-#include <CL/cl.h>
-
-#endif
+#include <unistd.h>
 
 #if _POSIX_VERSION >= 200112L
 #include <sys/time.h>
@@ -48,22 +40,6 @@ typedef struct {
 } DFNodeContext_CPU;
 
 
-#ifdef HPVM_USE_OPENCL
-
-typedef struct {
-  cl_context clOCLContext;
-  cl_command_queue clCommandQue;
-  cl_program clProgram;
-  cl_kernel clKernel;
-} DFNodeContext_OCL;
-
-cl_context globalOCLContext;
-cl_device_id *clDevices;
-cl_command_queue globalCommandQue;
-
-#endif
-
-
 MemTracker MTracker;
 vector<DFGDepth> DStack;
 // Mutex to prevent concurrent access by multiple thereads in pipeline
@@ -72,167 +48,6 @@ pthread_mutex_t ocl_mtx;
 #define NUM_TESTS 1
 hpvm_TimerSet kernel_timer;
 
-#ifdef HPVM_USE_OPENCL
-
-static const char *getErrorString(cl_int error) {
-  switch (error) {
-  // run-time and JIT compiler errors
-  case 0:
-    return "CL_SUCCESS";
-  case -1:
-    return "CL_DEVICE_NOT_FOUND";
-  case -2:
-    return "CL_DEVICE_NOT_AVAILABLE";
-  case -3:
-    return "CL_COMPILER_NOT_AVAILABLE";
-  case -4:
-    return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
-  case -5:
-    return "CL_OUT_OF_RESOURCES";
-  case -6:
-    return "CL_OUT_OF_HOST_MEMORY";
-  case -7:
-    return "CL_PROFILING_INFO_NOT_AVAILABLE";
-  case -8:
-    return "CL_MEM_COPY_OVERLAP";
-  case -9:
-    return "CL_IMAGE_FORMAT_MISMATCH";
-  case -10:
-    return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
-  case -11:
-    return "CL_BUILD_PROGRAM_FAILURE";
-  case -12:
-    return "CL_MAP_FAILURE";
-  case -13:
-    return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
-  case -14:
-    return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
-  case -15:
-    return "CL_COMPILE_PROGRAM_FAILURE";
-  case -16:
-    return "CL_LINKER_NOT_AVAILABLE";
-  case -17:
-    return "CL_LINK_PROGRAM_FAILURE";
-  case -18:
-    return "CL_DEVICE_PARTITION_FAILED";
-  case -19:
-    return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
-
-  // compile-time errors
-  case -30:
-    return "CL_INVALID_VALUE";
-  case -31:
-    return "CL_INVALID_DEVICE_TYPE";
-  case -32:
-    return "CL_INVALID_PLATFORM";
-  case -33:
-    return "CL_INVALID_DEVICE";
-  case -34:
-    return "CL_INVALID_CONTEXT";
-  case -35:
-    return "CL_INVALID_QUEUE_PROPERTIES";
-  case -36:
-    return "CL_INVALID_COMMAND_QUEUE";
-  case -37:
-    return "CL_INVALID_HOST_PTR";
-  case -38:
-    return "CL_INVALID_MEM_OBJECT";
-  case -39:
-    return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
-  case -40:
-    return "CL_INVALID_IMAGE_SIZE";
-  case -41:
-    return "CL_INVALID_SAMPLER";
-  case -42:
-    return "CL_INVALID_BINARY";
-  case -43:
-    return "CL_INVALID_BUILD_OPTIONS";
-  case -44:
-    return "CL_INVALID_PROGRAM";
-  case -45:
-    return "CL_INVALID_PROGRAM_EXECUTABLE";
-  case -46:
-    return "CL_INVALID_KERNEL_NAME";
-  case -47:
-    return "CL_INVALID_KERNEL_DEFINITION";
-  case -48:
-    return "CL_INVALID_KERNEL";
-  case -49:
-    return "CL_INVALID_ARG_INDEX";
-  case -50:
-    return "CL_INVALID_ARG_VALUE";
-  case -51:
-    return "CL_INVALID_ARG_SIZE";
-  case -52:
-    return "CL_INVALID_KERNEL_ARGS";
-  case -53:
-    return "CL_INVALID_WORK_DIMENSION";
-  case -54:
-    return "CL_INVALID_WORK_GROUP_SIZE";
-  case -55:
-    return "CL_INVALID_WORK_ITEM_SIZE";
-  case -56:
-    return "CL_INVALID_GLOBAL_OFFSET";
-  case -57:
-    return "CL_INVALID_EVENT_WAIT_LIST";
-  case -58:
-    return "CL_INVALID_EVENT";
-  case -59:
-    return "CL_INVALID_OPERATION";
-  case -60:
-    return "CL_INVALID_GL_OBJECT";
-  case -61:
-    return "CL_INVALID_BUFFER_SIZE";
-  case -62:
-    return "CL_INVALID_MIP_LEVEL";
-  case -63:
-    return "CL_INVALID_GLOBAL_WORK_SIZE";
-  case -64:
-    return "CL_INVALID_PROPERTY";
-  case -65:
-    return "CL_INVALID_IMAGE_DESCRIPTOR";
-  case -66:
-    return "CL_INVALID_COMPILER_OPTIONS";
-  case -67:
-    return "CL_INVALID_LINKER_OPTIONS";
-  case -68:
-    return "CL_INVALID_DEVICE_PARTITION_COUNT";
-
-  // extension errors
-  case -1000:
-    return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
-  case -1001:
-    return "CL_PLATFORM_NOT_FOUND_KHR";
-  case -1002:
-    return "CL_INVALID_D3D10_DEVICE_KHR";
-  case -1003:
-    return "CL_INVALID_D3D10_RESOURCE_KHR";
-  case -1004:
-    return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
-  case -1005:
-    return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
-  default:
-    return "Unknown OpenCL error";
-  }
-}
-
-static inline void checkErr(cl_int err, cl_int success, const char *name) {
-  if (err != success) {
-    cout << "ERROR: " << name << flush << "\n";
-    cout << "ErrorCode: " << getErrorString(err) << flush << "\n";
-    exit(EXIT_FAILURE);
-  }
-}
-
-#endif
-
-
-void openCLAbort(){
- cout <<" ERROR: OpenCL NOT found!. Please Recompile with OpenCL - Make sure to have OpenCL on System \n ";
- abort();
-}
-
-
 /************************* Depth Stack Routines ***************************/
 
 void llvm_hpvm_cpu_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
@@ -284,9 +99,6 @@ uint64_t llvm_hpvm_cpu_getDimInstance(unsigned level, unsigned dim) {
 /********************** Memory Tracking Routines **************************/
 
 void llvm_hpvm_track_mem(void *ptr, size_t size) {
-
-#ifdef HPVM_USE_OPENCL 
-  
   DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE != NULL) {
@@ -296,19 +108,9 @@ void llvm_hpvm_track_mem(void *ptr, size_t size) {
   DEBUG(cout << "Inserting ID " << ptr << " in the MemTracker Table\n");
   MTracker.insert(ptr, size, MemTrackerEntry::HOST, ptr);
   DEBUG(MTracker.print());
-
-#else
-
-  openCLAbort();
-  
-#endif
-  
 }
 
 void llvm_hpvm_untrack_mem(void *ptr) {
-
-#ifdef HPVM_USE_OPENCL 
-
   DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE == NULL) {
@@ -317,94 +119,16 @@ void llvm_hpvm_untrack_mem(void *ptr) {
     return;
   }
   DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n");
-  if (MTE->getLocation() == MemTrackerEntry::DEVICE)
-    clReleaseMemObject((cl_mem)MTE->getAddress());
   MTracker.remove(ptr);
   DEBUG(MTracker.print());
-
-#else
-
-  openCLAbort();
-  
-#endif
-  
-}
-
-
-#ifdef HPVM_USE_OPENCL 
-
-static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size,
-                                       DFNodeContext_OCL *Context, bool isInput,
-                                       bool isOutput) {
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "[OCL] Request memory: " << ptr
-             << " for context: " << Context->clOCLContext << flush << "\n");
-  MemTrackerEntry *MTE = MTracker.lookup(ptr);
-  if (MTE == NULL) {
-    MTracker.print();
-    cout << "ERROR: Requesting memory not present in Table\n";
-    exit(EXIT_FAILURE);
-  }
-  // If already on device
-  if (MTE->getLocation() == MemTrackerEntry::DEVICE &&
-      ((DFNodeContext_OCL *)MTE->getContext())->clOCLContext ==
-          Context->clOCLContext) {
-    DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush
-               << "\n");
-    pthread_mutex_unlock(&ocl_mtx);
-    return MTE->getAddress();
-  }
-
-  DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush
-             << "\n");
-  DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n");
-  // Else copy and update the latest copy
-  cl_mem_flags clFlags;
-  cl_int errcode;
-
-  if (isInput && isOutput)
-    clFlags = CL_MEM_READ_WRITE;
-  else if (isInput)
-    clFlags = CL_MEM_READ_ONLY;
-  else if (isOutput)
-    clFlags = CL_MEM_WRITE_ONLY;
-  else
-    clFlags = CL_MEM_READ_ONLY;
-
-  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY);
-  cl_mem d_input =
-      clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
-  DEBUG(cout << "\nMemory allocated on device: " << d_input << flush << "\n");
-  if (isInput) {
-    DEBUG(cout << "\tCopying ...");
-    errcode = clEnqueueWriteBuffer(Context->clCommandQue, d_input, CL_TRUE, 0,
-                                   size, MTE->getAddress(), 0, NULL, NULL);
-    checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device");
-  }
-
-  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
-  DEBUG(cout << " done\n");
-  MTE->update(MemTrackerEntry::DEVICE, (void *)d_input, Context);
-  DEBUG(cout << "Updated Table\n");
-  DEBUG(MTracker.print());
-  pthread_mutex_unlock(&ocl_mtx);
-  return d_input;
-  
 }
 
-#endif
-
 
 void *llvm_hpvm_cpu_argument_ptr(void *ptr, size_t size) {
   return llvm_hpvm_request_mem(ptr, size);
 }
 
 void *llvm_hpvm_request_mem(void *ptr, size_t size) {
-
-#ifdef HPVM_USE_OPENCL 
-
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "[CPU] Request memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
@@ -422,32 +146,8 @@ void *llvm_hpvm_request_mem(void *ptr, size_t size) {
   }
 
   // Else copy from device and update table
-  DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush
-             << "\n");
-  DEBUG(cout << "\tCopying ...");
-  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY);
-  // pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clEnqueueReadBuffer(
-      ((DFNodeContext_OCL *)MTE->getContext())->clCommandQue,
-      (cl_mem)MTE->getAddress(), CL_TRUE, 0, size, ptr, 0, NULL, NULL);
-  // pthread_mutex_unlock(&ocl_mtx);
-  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
-  DEBUG(cout << " done\n");
-  checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output");
-  DEBUG(cout << "Free mem object on device\n");
-  clReleaseMemObject((cl_mem)MTE->getAddress());
-  DEBUG(cout << "Updated Table\n");
-  MTE->update(MemTrackerEntry::HOST, ptr);
-  DEBUG(MTracker.print());
-  pthread_mutex_unlock(&ocl_mtx);
-  return ptr;
-
-#else
-
-  openCLAbort();
-
-#endif
-
+	assert(0 && "Should not reach this point with CPU-only code-gen");
+	return NULL;
 }
 
 /*************************** Timer Routines **********************************/
@@ -481,134 +181,17 @@ get_last_async(struct hpvm_TimerSet *timers) {
 }
 
 static void insert_marker(struct hpvm_TimerSet *tset, enum hpvm_TimerID timer) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  cl_int ciErrNum = CL_SUCCESS;
-  struct hpvm_async_time_marker_list **new_event = &(tset->async_markers);
-
-  while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
-    new_event = &((*new_event)->next);
-  }
-
-  if (*new_event == NULL) {
-    *new_event = (struct hpvm_async_time_marker_list *)malloc(
-        sizeof(struct hpvm_async_time_marker_list));
-    (*new_event)->marker = calloc(1, sizeof(cl_event));
-    (*new_event)->next = NULL;
-  }
-
-  /* valid event handle now aquired: insert the event record */
-  (*new_event)->label = NULL;
-  (*new_event)->timerID = timer;
-  ciErrNum =
-      clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
-  if (ciErrNum != CL_SUCCESS) {
-    fprintf(stderr, "Error Enqueueing Marker!\n");
-  }
-
-#else
-
-  openCLAbort();
-
-#endif
-
+	return;
 }
 
 static void insert_submarker(struct hpvm_TimerSet *tset, char *label,
                              enum hpvm_TimerID timer) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  cl_int ciErrNum = CL_SUCCESS;
-  struct hpvm_async_time_marker_list **new_event = &(tset->async_markers);
-
-  while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
-    new_event = &((*new_event)->next);
-  }
-
-  if (*new_event == NULL) {
-    *new_event = (struct hpvm_async_time_marker_list *)malloc(
-        sizeof(struct hpvm_async_time_marker_list));
-    (*new_event)->marker = calloc(1, sizeof(cl_event));
-    (*new_event)->next = NULL;
-  }
-
-  /* valid event handle now aquired: insert the event record */
-  (*new_event)->label = label;
-  (*new_event)->timerID = timer;
-  ciErrNum =
-      clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
-  if (ciErrNum != CL_SUCCESS) {
-    fprintf(stderr, "Error Enqueueing Marker!\n");
-  }
-
-#else
-
-  openCLAbort();
-
-#endif
-  
+	return;
 }
 
 /* Assumes that all recorded events have completed */
 static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  struct hpvm_async_time_marker_list *next_interval = NULL;
-  struct hpvm_async_time_marker_list *last_marker = get_last_async(tset);
-  hpvm_Timestamp total_async_time = 0;
-
-  for (next_interval = tset->async_markers; next_interval != last_marker;
-       next_interval = next_interval->next) {
-    cl_ulong command_start = 0, command_end = 0;
-    cl_int ciErrNum = CL_SUCCESS;
-
-    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker),
-                                       CL_PROFILING_COMMAND_END,
-                                       sizeof(cl_ulong), &command_start, NULL);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum);
-    }
-
-    ciErrNum = clGetEventProfilingInfo(
-        *((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END,
-        sizeof(cl_ulong), &command_end, NULL);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error getting second EventProfilingInfo: %d\n",
-              ciErrNum);
-    }
-
-    hpvm_Timestamp interval =
-        (hpvm_Timestamp)(((double)(command_end - command_start)));
-    tset->timers[next_interval->timerID].elapsed += interval;
-    if (next_interval->label != NULL) {
-      struct hpvm_SubTimer *subtimer =
-          tset->sub_timer_list[next_interval->timerID]->subtimer_list;
-      while (subtimer != NULL) {
-        if (strcmp(subtimer->label, next_interval->label) == 0) {
-          subtimer->timer.elapsed += interval;
-          break;
-        }
-        subtimer = subtimer->next;
-      }
-    }
-    total_async_time += interval;
-    next_interval->timerID = INVALID_TIMERID;
-  }
-
-  if (next_interval != NULL)
-    next_interval->timerID = INVALID_TIMERID;
-
-  return total_async_time;
-
-#else
-
-  openCLAbort();
-
-#endif
-  
+	return 0;
 }
 
 static void accumulate_time(hpvm_Timestamp *accum, hpvm_Timestamp start,
@@ -825,295 +408,10 @@ void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label,
 }
 
 void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  // cerr << "Switch to timer: " << timer << flush << "\n";
-  /* Stop the currently running timer */
-  if (timers->current != hpvm_TimerID_NONE) {
-    struct hpvm_SubTimerList *subtimerlist =
-        timers->sub_timer_list[timers->current];
-    struct hpvm_SubTimer *currSubTimer =
-        (subtimerlist != NULL) ? subtimerlist->current : NULL;
-
-    if (!is_async(timers->current)) {
-      if (timers->current != timer) {
-        if (currSubTimer != NULL) {
-          hpvm_StopTimerAndSubTimer(&timers->timers[timers->current],
-                                    &currSubTimer->timer);
-        } else {
-          hpvm_StopTimer(&timers->timers[timers->current]);
-        }
-      } else {
-        if (currSubTimer != NULL) {
-          hpvm_StopTimer(&currSubTimer->timer);
-        }
-      }
-    } else {
-      insert_marker(timers, timer);
-      if (!is_async(timer)) { // if switching to async too, keep driver going
-        hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]);
-      }
-    }
-  }
-
-  hpvm_Timestamp currentTime = get_time();
-
-  /* The only cases we check for asynchronous task completion is
-   * when an overlapping CPU operation completes, or the next
-   * segment blocks on completion of previous async operations */
-  if (asyncs_outstanding(timers) &&
-      (!is_async(timers->current) || is_blocking(timer))) {
-
-    struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
-    /* CL_COMPLETE if completed */
-
-    cl_int ciErrNum = CL_SUCCESS;
-    cl_int async_done = CL_COMPLETE;
-
-    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker),
-                              CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int),
-                              &async_done, NULL);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stdout, "Error Querying EventInfo1!\n");
-    }
-
-    if (is_blocking(timer)) {
-      /* Async operations completed after previous CPU operations:
-       * overlapped time is the total CPU time since this set of async
-       * operations were first issued */
-
-      // timer to switch to is COPY or NONE
-      if (async_done != CL_COMPLETE) {
-        accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed),
-                        timers->async_begin, currentTime);
-      }
-
-      /* Wait on async operation completion */
-      ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
-      if (ciErrNum != CL_SUCCESS) {
-        fprintf(stderr, "Error Waiting for Events!\n");
-      }
-
-      hpvm_Timestamp total_async_time = record_async_times(timers);
-
-      /* Async operations completed before previous CPU operations:
-       * overlapped time is the total async time */
-      if (async_done == CL_COMPLETE) {
-        // fprintf(stderr, "Async_done: total_async_type = %lld\n",
-        // total_async_time);
-        timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time;
-      }
-
-    } else
-        /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-        // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are
-        // outstanding so something is deeper in stack
-        if (async_done == CL_COMPLETE) {
-      /* Async operations completed before previous CPU operations:
-       * overlapped time is the total async time */
-      timers->timers[hpvm_TimerID_OVERLAP].elapsed +=
-          record_async_times(timers);
-    }
-  }
-
-  /* Start the new timer */
-  if (timer != hpvm_TimerID_NONE) {
-    if (!is_async(timer)) {
-      hpvm_StartTimer(&timers->timers[timer]);
-    } else {
-      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
-      if (!asyncs_outstanding(timers)) {
-        /* No asyncs outstanding, insert a fresh async marker */
-
-        insert_marker(timers, timer);
-        timers->async_begin = currentTime;
-      } else if (!is_async(timers->current)) {
-        /* Previous asyncs still in flight, but a previous SwitchTo
-         * already marked the end of the most recent async operation,
-         * so we can rename that marker as the beginning of this async
-         * operation */
-
-        struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
-        last_event->label = NULL;
-        last_event->timerID = timer;
-      }
-      if (!is_async(timers->current)) {
-        hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]);
-      }
-    }
-  }
-  timers->current = timer;
-
-#else
-
-  openCLAbort();
-
-#endif
-
-
 }
 
 void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label,
                            enum hpvm_TimerID category) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  struct hpvm_SubTimerList *subtimerlist =
-      timers->sub_timer_list[timers->current];
-  struct hpvm_SubTimer *curr =
-      (subtimerlist != NULL) ? subtimerlist->current : NULL;
-
-  if (timers->current != hpvm_TimerID_NONE) {
-    if (!is_async(timers->current)) {
-      if (timers->current != category) {
-        if (curr != NULL) {
-          hpvm_StopTimerAndSubTimer(&timers->timers[timers->current],
-                                    &curr->timer);
-        } else {
-          hpvm_StopTimer(&timers->timers[timers->current]);
-        }
-      } else {
-        if (curr != NULL) {
-          hpvm_StopTimer(&curr->timer);
-        }
-      }
-    } else {
-      insert_submarker(timers, label, category);
-      if (!is_async(category)) { // if switching to async too, keep driver going
-        hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]);
-      }
-    }
-  }
-
-  hpvm_Timestamp currentTime = get_time();
-
-  /* The only cases we check for asynchronous task completion is
-   * when an overlapping CPU operation completes, or the next
-   * segment blocks on completion of previous async operations */
-  if (asyncs_outstanding(timers) &&
-      (!is_async(timers->current) || is_blocking(category))) {
-
-    struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
-    /* CL_COMPLETE if completed */
-
-    cl_int ciErrNum = CL_SUCCESS;
-    cl_int async_done = CL_COMPLETE;
-
-    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker),
-                              CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int),
-                              &async_done, NULL);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stdout, "Error Querying EventInfo2!\n");
-    }
-
-    if (is_blocking(category)) {
-      /* Async operations completed after previous CPU operations:
-       * overlapped time is the total CPU time since this set of async
-       * operations were first issued */
-
-      // timer to switch to is COPY or NONE
-      // if it hasn't already finished, then just take now and use that as the
-      // elapsed time in OVERLAP anything happening after now isn't OVERLAP
-      // because everything is being stopped to wait for synchronization it
-      // seems that the extra sync wall time isn't being recorded anywhere
-      if (async_done != CL_COMPLETE)
-        accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed),
-                        timers->async_begin, currentTime);
-
-      /* Wait on async operation completion */
-      ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
-      if (ciErrNum != CL_SUCCESS) {
-        fprintf(stderr, "Error Waiting for Events!\n");
-      }
-      hpvm_Timestamp total_async_time = record_async_times(timers);
-
-      /* Async operations completed before previous CPU operations:
-       * overlapped time is the total async time */
-      // If it did finish, then accumulate all the async time that did happen
-      // into OVERLAP the immediately preceding EventSynchronize theoretically
-      // didn't have any effect since it was already completed.
-      if (async_done == CL_COMPLETE /*cudaSuccess*/)
-        timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time;
-
-    } else
-        /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-        // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are
-        // outstanding so something is deeper in stack
-        if (async_done == CL_COMPLETE /*cudaSuccess*/) {
-      /* Async operations completed before previous CPU operations:
-       * overlapped time is the total async time */
-      timers->timers[hpvm_TimerID_OVERLAP].elapsed +=
-          record_async_times(timers);
-    }
-    // else, this isn't blocking, so just check the next time around
-  }
-
-  subtimerlist = timers->sub_timer_list[category];
-  struct hpvm_SubTimer *subtimer = NULL;
-
-  if (label != NULL) {
-    subtimer = subtimerlist->subtimer_list;
-    while (subtimer != NULL) {
-      if (strcmp(subtimer->label, label) == 0) {
-        break;
-      } else {
-        subtimer = subtimer->next;
-      }
-    }
-  }
-
-  /* Start the new timer */
-  if (category != hpvm_TimerID_NONE) {
-    if (!is_async(category)) {
-      if (subtimerlist != NULL) {
-        subtimerlist->current = subtimer;
-      }
-
-      if (category != timers->current && subtimer != NULL) {
-        hpvm_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
-      } else if (subtimer != NULL) {
-        hpvm_StartTimer(&subtimer->timer);
-      } else {
-        hpvm_StartTimer(&timers->timers[category]);
-      }
-    } else {
-      if (subtimerlist != NULL) {
-        subtimerlist->current = subtimer;
-      }
-
-      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
-      if (!asyncs_outstanding(timers)) {
-        /* No asyncs outstanding, insert a fresh async marker */
-        insert_submarker(timers, label, category);
-        timers->async_begin = currentTime;
-      } else if (!is_async(timers->current)) {
-        /* Previous asyncs still in flight, but a previous SwitchTo
-         * already marked the end of the most recent async operation,
-         * so we can rename that marker as the beginning of this async
-         * operation */
-
-        struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
-        last_event->timerID = category;
-        last_event->label = label;
-      } // else, marker for switchToThis was already inserted
-
-      // toSwitchto is already asynchronous, but if current/prev state is async
-      // too, then DRIVER is already running
-      if (!is_async(timers->current)) {
-        hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]);
-      }
-    }
-  }
-
-  timers->current = category;
-
-#else
-
-  openCLAbort();
-
-#endif
-  
 }
 
 void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) {
@@ -1182,53 +480,6 @@ void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) {
 }
 
 void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  /* clean up all of the async event markers */
-  struct hpvm_async_time_marker_list *event = timers->async_markers;
-  while (event != NULL) {
-
-    cl_int ciErrNum = CL_SUCCESS;
-    ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker);
-    if (ciErrNum != CL_SUCCESS) {
-      // fprintf(stderr, "Error Waiting for Events!\n");
-    }
-
-    ciErrNum = clReleaseEvent(*((cl_event *)(event)->marker));
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Release Events!\n");
-    }
-
-    free((event)->marker);
-    struct hpvm_async_time_marker_list *next = ((event)->next);
-
-    free(event);
-
-    event = next;
-  }
-
-  int i = 0;
-  for (i = 0; i < hpvm_TimerID_LAST; ++i) {
-    if (timers->sub_timer_list[i] != NULL) {
-      struct hpvm_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
-      struct hpvm_SubTimer *prev = NULL;
-      while (subtimer != NULL) {
-        free(subtimer->label);
-        prev = subtimer;
-        subtimer = subtimer->next;
-        free(prev);
-      }
-      free(timers->sub_timer_list[i]);
-    }
-  }
-
-#else
-
-  openCLAbort();
-
-#endif
-
 }
 
 /**************************** Pipeline API ************************************/
@@ -1427,541 +678,6 @@ void llvm_hpvm_cpu_wait(void *graphID) {
   DEBUG(cout << "\t... pthread Done!\n");
 }
 
-
-#ifdef HPVM_USE_OPENCL
-
-// Returns the platform name.
-std::string getPlatformName(cl_platform_id pid) {
- 
-  cl_int status;
-  size_t sz;
-  status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz);
-  checkErr(status, CL_SUCCESS, "Query for platform name size failed");
-
-  char *name = new char[sz];
-  status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL);
-  checkErr(status, CL_SUCCESS, "Query for platform name failed");
-
-  const auto &tmp = std::string(name, name + sz);
-  delete[] name;
-  return tmp;  
-}
-
-#endif
-
-
-#ifdef HPVM_USE_OPENCL
-
-// Searches all platforms for the first platform whose name
-// contains the search string (case-insensitive).
-cl_platform_id findPlatform(const char *platform_name_search) {
-  
-  cl_int status;
-
-  std::string search = platform_name_search;
-  std::transform(search.begin(), search.end(), search.begin(), ::tolower);
-
-  // Get number of platforms.
-  cl_uint num_platforms;
-  status = clGetPlatformIDs(0, NULL, &num_platforms);
-  checkErr(status, CL_SUCCESS, "Query for number of platforms failed");
-
-  // Get a list of all platform ids.
-  cl_platform_id *pids =
-      (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
-  status = clGetPlatformIDs(num_platforms, pids, NULL);
-  checkErr(status, CL_SUCCESS, "Query for all platform ids failed");
-
-  // For each platform, get name and compare against the search string.
-  for (unsigned i = 0; i < num_platforms; ++i) {
-    std::string name = getPlatformName(pids[i]);
-
-    // Convert to lower case.
-    std::transform(name.begin(), name.end(), name.begin(), ::tolower);
-
-    if (name.find(search) != std::string::npos) {
-      // Found!
-      cl_platform_id pid = pids[i];
-      free(pids);
-      return pid;
-    }
-  }
-
-  free(pids);
-  // No platform found.
-  assert(false && "No matching platform found!");
-}
-
-#endif
-
-
-void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(std::string Target = T == hpvm::GPU_TARGET ? "GPU" : "SPIR");
-  DEBUG(cout << "Initializing Context for " << Target << " device\n");
-  cl_uint numPlatforms;
-  cl_int errcode;
-  errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
-  checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
-
-  // now get all the platform IDs
-  cl_platform_id *platforms =
-      (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);
-  errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
-
-  for (unsigned i = 0; i < numPlatforms; i++) {
-    char buffer[10240];
-    DEBUG(cout << "Device " << i << " Info -->\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
-    DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
-    DEBUG(cout << "\tVERSION = " << buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
-    DEBUG(cout << "\tNAME = " << buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
-    DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer,
-                      NULL);
-    DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n");
-  }
-  cl_platform_id platformId;
-  if (T == hpvm::GPU_TARGET) {
-    platformId = findPlatform("nvidia");
-    char buffer[10240];
-    DEBUG(cout << "Found NVIDIA Device \n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_PROFILE, 10240, buffer, NULL);
-    DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 10240, buffer, NULL);
-    DEBUG(cout << "\tVERSION = " << buffer << flush << "\n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_NAME, 10240, buffer, NULL);
-    DEBUG(cout << "\tNAME = " << buffer << flush << "\n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_VENDOR, 10240, buffer, NULL);
-    DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
-    DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n");
-  } else {
-    platformId = findPlatform("intel");
-    char buffer[10240];
-    DEBUG(cout << "Found Intel Device \n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_PROFILE, 10240, buffer, NULL);
-    DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 10240, buffer, NULL);
-    DEBUG(cout << "\tVERSION = " << buffer << flush << "\n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_NAME, 10240, buffer, NULL);
-    DEBUG(cout << "\tNAME = " << buffer << flush << "\n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_VENDOR, 10240, buffer, NULL);
-    DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n");
-    clGetPlatformInfo(platformId, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
-    DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n");
-  }
-  DEBUG(cout << "Found plarform with id: " << platformId << "\n");
-  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (long)platformId,
-                                        0};
-  globalOCLContext = clCreateContextFromType(
-      properties,
-      T == hpvm::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL,
-      NULL, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create context");
-  // get the list of OCL devices associated with context
-  size_t dataBytes;
-  errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0, NULL,
-                             &dataBytes);
-  checkErr(errcode, CL_SUCCESS, "Failure to get context info length");
-
-  DEBUG(cout << "Got databytes: " << dataBytes << "\n");
-
-  clDevices = (cl_device_id *)malloc(dataBytes);
-  errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes,
-                              clDevices, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to get context info");
-
-  free(platforms);
-  DEBUG(cout << "\tContext " << globalOCLContext << flush << "\n");
-  checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
-
-  DEBUG(cout << "Initialize Kernel Timer\n");
-  hpvm_InitializeTimerSet(&kernel_timer);
-
-  pthread_mutex_unlock(&ocl_mtx);
-  return globalOCLContext;
-
-#else
-
-  openCLAbort();
-
-#endif
-  
-}
-
-void llvm_hpvm_ocl_clearContext(void *graphID) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Clear Context\n");
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
-  // FIXME: Have separate function to release command queue and clear context.
-  // Would be useful when a context has multiple command queues
-  clReleaseKernel(Context->clKernel);
-  free(Context);
-  DEBUG(cout << "Done with OCL kernel\n");
-  cout << "Printing HPVM Timer: KernelTimer\n";
-  hpvm_PrintTimerSet(&kernel_timer);
-  pthread_mutex_unlock(&ocl_mtx);
-
-#else
-
-  openCLAbort();
-
-#endif
-  
-}
-
-void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Set Shared Memory Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
-             << flush << "\n");
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
-  DEBUG(cout << "Using Context: " << Context << flush << "\n");
-  DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
-  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument");
-  pthread_mutex_unlock(&ocl_mtx);
-
-#else
-
-  openCLAbort();
-
-#endif
-  
-}
-
-void llvm_hpvm_ocl_argument_scalar(void *graphID, void *input, int arg_index,
-                                   size_t size) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Set Scalar Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
-             << flush << "\n");
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
-  DEBUG(cout << "Using Context: " << Context << flush << "\n");
-  DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
-  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
-  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
-  pthread_mutex_unlock(&ocl_mtx);
-
-#else
-
-  openCLAbort();
-
-#endif
-
-}
-
-void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index,
-                                 size_t size, bool isInput, bool isOutput) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Set Pointer Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input
-             << ", Size = " << size << flush << "\n");
-  // Size should be non-zero
-  assert(size != 0 && "Size of data pointed to has to be non-zero!");
-  DEBUG(cout << "\tInput = " << isInput << "\tOutput = " << isOutput << flush
-             << "\n");
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
-
-  pthread_mutex_unlock(&ocl_mtx);
-  // Check with runtime the location of this memory
-  cl_mem d_input = (cl_mem)llvm_hpvm_ocl_request_mem(input, size, Context,
-                                                     isInput, isOutput);
-
-  pthread_mutex_lock(&ocl_mtx);
-  // Set Kernel Argument
-  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem),
-                                  (void *)&d_input);
-  checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
-  DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n");
-  pthread_mutex_unlock(&ocl_mtx);
-  return d_input;
-
-#else
-
-  openCLAbort();
-
-#endif
-
-}
-
-void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Set device memory for Output Struct:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
-             << flush << "\n");
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
-  cl_int errcode;
-  cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY,
-                                   size, NULL, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device");
-  errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem),
-                           (void *)&d_output);
-  checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
-  DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n");
-  pthread_mutex_unlock(&ocl_mtx);
-  return d_output;
-
-#else
-
-  openCLAbort();
-
-#endif
-
-}
-
-void llvm_hpvm_ocl_free(void *ptr) {}
-
-void *llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output,
-                              size_t size) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Get Output:\n");
-  DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output
-             << ", Size = " << size << flush << "\n");
-  if (h_output == NULL)
-    h_output = malloc(size);
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
-  cl_int errcode =
-      clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0,
-                          size, h_output, 0, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output");
-  pthread_mutex_unlock(&ocl_mtx);
-  return h_output;
-
-#else
-
-  openCLAbort();
-
-#endif
-
-}
-
-void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim,
-                                const size_t *localWorkSize,
-                                const size_t *globalWorkSize) {
-
-#ifdef HPVM_USE_OPENCL 
- 
-  pthread_mutex_lock(&ocl_mtx);
-
-  size_t GlobalWG[3];
-  size_t LocalWG[3];
-
-  // OpenCL EnqeueNDRangeKernel function results in segementation fault if we
-  // directly use local and global work groups arguments. Hence, allocating it
-  // on stack and copying.
-  for (unsigned i = 0; i < workDim; i++) {
-    GlobalWG[i] = globalWorkSize[i];
-  }
-
-  // OpenCL allows local workgroup to be null.
-  if (localWorkSize != NULL) {
-    for (unsigned i = 0; i < workDim; i++) {
-      LocalWG[i] = localWorkSize[i];
-    }
-  }
-
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
-  // TODO: Would like to use event to ensure better scheduling of kernels.
-  // Currently passing the event paratemeter results in seg fault with
-  // clEnqueueNDRangeKernel.
-  DEBUG(cout << "Enqueuing kernel:\n");
-  DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n");
-  DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n");
-  DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n");
-  DEBUG(cout << "\tGlobal Work Group: ( ");
-  for (unsigned i = 0; i < workDim; i++) {
-    DEBUG(cout << GlobalWG[i] << " ");
-  }
-  DEBUG(cout << ")\n");
-  if (localWorkSize != NULL) {
-    DEBUG(cout << "\tLocal Work Group: ( ");
-    for (unsigned i = 0; i < workDim; i++) {
-      DEBUG(cout << LocalWG[i] << " ");
-    }
-    DEBUG(cout << ")\n");
-  }
-  clFinish(Context->clCommandQue);
-  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COMPUTATION);
-  cl_int errcode = clEnqueueNDRangeKernel(
-      Context->clCommandQue, Context->clKernel, workDim, NULL, GlobalWG,
-      (localWorkSize == NULL) ? NULL : LocalWG, 0, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
-  clFinish(Context->clCommandQue);
-  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
-
-  pthread_mutex_unlock(&ocl_mtx);
-  return NULL;
-
-#else
-
-  openCLAbort();
-
-#endif
-
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Loads a Program binary file.
-//!
-//! @return the source string if succeeded, 0 otherwise
-//! @param Filename        program filename
-//! @param szFinalLength    returned length of the code string
-//////////////////////////////////////////////////////////////////////////////
-static char *LoadProgSource(const char *Filename, size_t *szFinalLength) {
-  DEBUG(cout << "Load Prog Source\n");
-  // locals
-  FILE *pFileStream = NULL;
-  size_t szSourceLength;
-
-  // open the OpenCL source code file
-  pFileStream = fopen(Filename, "rb");
-  if (pFileStream == 0) {
-    return NULL;
-  }
-
-  // get the length of the source code
-  fseek(pFileStream, 0, SEEK_END);
-  szSourceLength = ftell(pFileStream);
-  fseek(pFileStream, 0, SEEK_SET);
-
-  // allocate a buffer for the source code string and read it in
-  char *cSourceString = (char *)malloc(szSourceLength + 1);
-  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) {
-    fclose(pFileStream);
-    free(cSourceString);
-    return 0;
-  }
-
-  // close the file and return the total length of the combined (preamble +
-  // source) string
-  fclose(pFileStream);
-  if (szFinalLength != 0) {
-    *szFinalLength = szSourceLength;
-  }
-  cSourceString[szSourceLength] = '\0';
-
-  return cSourceString;
-}
-
-void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Launch OCL Kernel\n");
-  // Initialize OpenCL
-
-  // OpenCL specific variables
-  DFNodeContext_OCL *Context =
-      (DFNodeContext_OCL *)malloc(sizeof(DFNodeContext_OCL));
-
-  size_t kernelLength;
-  cl_int errcode;
-
-  // For a single context for all kernels
-  Context->clOCLContext = globalOCLContext;
-
-  // Create a command-queue
-  Context->clCommandQue = clCreateCommandQueue(
-      Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);
-  globalCommandQue = Context->clCommandQue;
-  checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
-
-  DEBUG(cout << "Loading program binary: " << FileName << flush << "\n");
-  char *programSource = LoadProgSource(FileName, &kernelLength);
-  checkErr(programSource != NULL, 1 /*bool true*/,
-           "Failure to load Program Binary");
-
-  Context->clProgram = clCreateProgramWithSource(
-      Context->clOCLContext, 1, (const char **)&programSource, NULL, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
-
-  DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName
-             << flush << "\n");
-  errcode =
-      clBuildProgram(Context->clProgram, 1, &clDevices[0], "", NULL, NULL);
-  // If build fails, get build log from device
-  if (errcode != CL_SUCCESS) {
-    cout << "ERROR: Failure to build program\n";
-    size_t len = 0;
-    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0],
-                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
-    cout << "LOG LENGTH: " << len << flush << "\n";
-    checkErr(errcode, CL_SUCCESS,
-             "Failure to collect program build log length");
-    char *log = (char *)malloc(len * sizeof(char));
-    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0],
-                                    CL_PROGRAM_BUILD_LOG, len, log, NULL);
-    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
-
-    cout << "Device Build Log:\n" << log << flush << "\n";
-    free(log);
-    pthread_mutex_unlock(&ocl_mtx);
-    exit(EXIT_FAILURE);
-  }
-
-  Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
-
-  DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n");
-  free(programSource);
-
-  pthread_mutex_unlock(&ocl_mtx);
-  return Context;
-
-#else
-
-  openCLAbort();
-
-#endif
-
-}
-
-void llvm_hpvm_ocl_wait(void *graphID) {
-
-#ifdef HPVM_USE_OPENCL
-  
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Wait\n");
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
-  clFinish(Context->clCommandQue);
-  pthread_mutex_unlock(&ocl_mtx);
-
-#else
-
-  openCLAbort();
-
-#endif
-
-}
-
 void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID timer) {
   pthread_mutex_lock(&ocl_mtx);
   pthread_mutex_unlock(&ocl_mtx);
@@ -1984,5 +700,3 @@ void *llvm_hpvm_initializeTimerSet() {
   pthread_mutex_unlock(&ocl_mtx);
   return TS;
 }
-
-