diff --git a/hpvm/include/SupportHPVM/DFG2LLVM.h b/hpvm/include/SupportHPVM/DFG2LLVM.h
index c1ade92e9a7201a5c3c80e9302b9bac57c750537..0703eda772088b87d7edc917babd1df84718d563 100644
--- a/hpvm/include/SupportHPVM/DFG2LLVM.h
+++ b/hpvm/include/SupportHPVM/DFG2LLVM.h
@@ -1,6 +1,3 @@
-#ifndef __DFG2LLVM_H__
-#define __DFG2LLVM_H__
-
 //===---- DFG2LLVM.h - Header file for "HPVM Dataflow Graph to Target" ----===//
 //
 //                     The LLVM Compiler Infrastructure
@@ -15,6 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef __DFG2LLVM_H__
+#define __DFG2LLVM_H__
+
 #include "BuildDFG/BuildDFG.h"
 #include "SupportHPVM/HPVMHint.h"
 #include "SupportHPVM/HPVMTimer.h"
@@ -25,6 +25,8 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <queue>
 
 using namespace llvm;
 using namespace builddfg;
@@ -37,13 +39,19 @@ using namespace builddfg;
   } while (0)
 #define DECLARE(X)                                                             \
   X = M.getOrInsertFunction(                                                   \
-      #X, runtimeModule->getFunction(#X)->getFunctionType());
+      getMangledName(#X), runtimeModule->getFunction(getMangledName(#X))->getFunctionType());
 
+#define DECLARE_EPOCHS(X, Y)                                                             \
+  X = M.getOrInsertFunction(                                                   \
+      getMangledName(#X), Y->getFunction(getMangledName(#X))->getFunctionType());
 namespace dfg2llvm {
 // Helper Functions
 static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID);
 
-bool hasAttribute(Function *, unsigned, Attribute::AttrKind);
+inline bool hasAttribute(Function *, unsigned, Attribute::AttrKind);
+
+//  void addArgument(Function*, Type*, const Twine& Name = "");
+inline Function *addArgument(Function *, Type *, const Twine &Name = "");
 
 // DFG2LLVM abstract class implementation
 class DFG2LLVM : public ModulePass {
@@ -65,6 +73,40 @@ public:
   }
 };
 
+// DFG2LLVM_CPU - Module pass for the HPVM CPU backend (see runDFG2LLVM_CPU).
+struct DFG2LLVM_CPU : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_CPU() : DFG2LLVM(ID) {}
+
+private:
+  // Member variables (none)
+
+  // Functions (none)
+
+public:
+  bool runOnModule(Module &M);
+};
+
+// Utility function that invokes the HPVM CPU backend.
+bool runDFG2LLVM_CPU(Module &M, BuildDFG &DFG);
+
+// DFG2LLVM_EPOCHS - Module pass for the HPVM EPOCHS backend.
+struct DFG2LLVM_EPOCHS : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_EPOCHS() : DFG2LLVM(ID) {}
+
+private:
+  // Member variables (none)
+
+  // Functions (none)
+
+public:
+  bool runOnModule(Module &M); // NOTE(review): defined outside this header
+};
+
+// Utility function that invokes the HPVM EPOCHS backend.
+bool runDFG2LLVM_EPOCHS(Module &M, BuildDFG &DFG);
+
 // Abstract Visitor for Code generation traversal (tree traversal for now)
 class CodeGenTraversal : public DFNodeVisitor {
 
@@ -87,22 +129,22 @@ protected:
   FunctionCallee llvm_hpvm_initializeTimerSet;
   FunctionCallee llvm_hpvm_switchToTimer;
   FunctionCallee llvm_hpvm_printTimerSet;
+  FunctionCallee llvm_hpvm_cpu_dstack_push;
+  FunctionCallee llvm_hpvm_cpu_dstack_pop;
   GlobalVariable *TimerSet;
   GlobalVariable *GraphIDAddr;
   Instruction *InitCall;
   Instruction *CleanupCall;
 
   // Functions
-  Value *getStringPointer(const Twine &S, Instruction *InsertBefore,
-                          const Twine &Name = "");
-  //  void addArgument(Function*, Type*, const Twine& Name = "");
-  Function *addArgument(Function *, Type *, const Twine &Name = "");
+  inline Value *getStringPointer(const Twine &S, Instruction *InsertBefore,
+				 const Twine &Name = "");
   //  void addIdxDimArgs(Function* F);
-  Function *addIdxDimArgs(Function *F);
-  std::vector<Value *> extractElements(Value *, std::vector<Type *>,
-                                       std::vector<std::string>, Instruction *);
-  Argument *getArgumentAt(Function *F, unsigned offset);
-  void initTimerAPI();
+  inline Function *addIdxDimArgs(Function *F);
+  inline std::vector<Value *> extractElements(Value *, std::vector<Type *>,
+    std::vector<std::string>, Instruction *);
+  inline Argument *getArgumentAt(Function *F, unsigned offset);
+  inline void initTimerAPI();
 
   // Pure Virtual Functions
   virtual void init() = 0;
@@ -111,19 +153,39 @@ protected:
   virtual void codeGen(DFLeafNode *N) = 0;
 
   // Virtual Functions
-  virtual void initializeTimerSet(Instruction *);
-  virtual void switchToTimer(enum hpvm_TimerID, Instruction *);
-  virtual void printTimerSet(Instruction *);
+  virtual inline void initializeTimerSet(Instruction *);
+  virtual inline void switchToTimer(enum hpvm_TimerID, Instruction *);
+  virtual inline void printTimerSet(Instruction *);
 
   virtual ~CodeGenTraversal() {}
 
+  inline Value *addLoop(Instruction *I, Value *limit,
+    const Twine &indexName = "");
+  inline void invokeChild(DFNode *C, Function *F, ValueToValueMapTy &VMap,
+    Instruction *InsertBefore, hpvm::Target Tag);
+  inline Value *getInValueAt(DFNode *Child, unsigned i, Function *ParentF_CPU,
+    Instruction *InsertBefore);
+  inline Argument *getArgumentFromEnd(Function *F, unsigned offset);
+
+  inline void copyChildIndexCalc(DFNode *C, Function *F_CPU, ValueToValueMapTy &VMap,
+                       Instruction *InsertBefore);
+
+  std::string getMangledName(std::string Name) {
+    // Translates a runtime API function name into the symbol name that the
+    // DECLARE / DECLARE_EPOCHS macros look up and insert. Currently the
+    // identity mapping. Disabled special cases used the mangled names
+    // "_Z20llvm_hpvm_ocl_launchPKcS0_N4hpvm6TargetEb" (llvm_hpvm_ocl_launch)
+    // and "_Z25llvm_hpvm_ocl_executeNodePvjPKmS1_" (llvm_hpvm_ocl_executeNode).
+      return Name;
+  }
+
 public:
   // Constructor
   CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
 
-  static bool checkPreferredTarget(DFNode *N, hpvm::Target T);
-  static bool preferredTargetIncludes(DFNode *N, hpvm::Target T);
-  hpvm::Target getPreferredTarget(DFNode *N);
+  inline static bool checkPreferredTarget(DFNode *N, hpvm::Target T);
+  inline static bool preferredTargetIncludes(DFNode *N, hpvm::Target T);
+  inline hpvm::Target getPreferredTarget(DFNode *N);
 
   virtual void visit(DFInternalNode *N) {
     // If code has already been generated for this internal node, skip the
@@ -160,6 +222,313 @@ public:
 
 // -------------- CodeGenTraversal Implementation -----------------
 
+/* Return the argument that lies `offset` positions before the end of the
+ * argument list of function F (offset 1 is the last argument). The offset
+ * must be in the range [1, number of parameters of F]. */
+Argument *CodeGenTraversal::getArgumentFromEnd(Function *F, unsigned offset) {
+  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) &&
+         "Invalid offset to access arguments!");
+  // arg_end() is the past-the-end position, not a real argument, so walk
+  // backwards from it exactly `offset` steps. Stepping before dereferencing
+  // keeps the iterator inside [arg_begin(), arg_end()] even when offset
+  // equals the parameter count, and the result is never left uninitialized.
+  Function::arg_iterator e = F->arg_end();
+  for (; offset != 0; --offset)
+    --e;
+  Argument *arg = &*e;
+  return arg;
+}
+Value *CodeGenTraversal::getInValueAt(DFNode *Child, unsigned i,
+                                      Function *ParentF_CPU,
+                                      Instruction *InsertBefore) {
+  // Returns the value that feeds input port i of Child.
+  // TODO: Assumption is that each input port of a node has just one
+  // incoming edge. May change later on.
+  // Find the incoming edge at the requested input port
+  DEBUG(errs() << "Finding incoming edge " << i << " for "
+    << Child->getFuncPointer()->getName() << "\n");
+  DFEdge *E = Child->getInDFEdgeAt(i);
+  assert(E && "No incoming edge or binding for input element!");
+  // Find the Source DFNode associated with the incoming edge
+  DFNode *SrcDF = E->getSourceDF();
+
+  // If the source is the entry (dummy) node, the edge comes from the parent:
+  // use the argument of ParentF_CPU at the edge's source position.
+  Value *inputVal;
+  if (SrcDF->isEntryNode()) {
+    inputVal = getArgumentAt(ParentF_CPU, E->getSourcePosition());
+    DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+  } else {
+    // edge is from a sibling
+    // Check - code should already be generated for this source dfnode
+    assert(OutputMap.count(SrcDF) &&
+           "Source node call not found. Dependency violation!");
+
+    // Find the value recorded for the Source DFNode in OutputMap
+    Value *CI = OutputMap[SrcDF];
+
+    // Extract the element at the source position; inserted before InsertBefore
+    std::vector<unsigned> IndexList;
+    IndexList.push_back(E->getSourcePosition());
+    DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
+    ExtractValueInst *EI =
+        ExtractValueInst::Create(CI, IndexList, "", InsertBefore);
+    inputVal = EI;
+  }
+  return inputVal;
+}
+
+// Clone the instructions that compute the dimension limits of node C so they
+// are available before InsertBefore; every clone is recorded in VMap.
+void CodeGenTraversal::copyChildIndexCalc(DFNode *C, Function *F_CPU, ValueToValueMapTy &VMap,
+        Instruction *InsertBefore) {
+    // F_CPU is only used for the debug dump at the end.
+    for(unsigned j =0; j < C->getNumOfDim(); j++){
+        std::vector<Instruction*> toClone;
+        std::queue<Value*> workList;
+
+        workList.push(C->getDimLimits()[j]);
+
+        // Phase 1: walk the operands of the limit value breadth-first and
+        // collect every instruction it depends on.
+        while (workList.size()){
+            Value* entry = workList.front();
+            workList.pop();
+
+            // Arguments are already available in
+            // the cloned function.
+            if(isa<Argument>(entry)) continue;
+
+            // Constant values should be available in
+            // the operand;
+            if(isa<Constant>(entry)) continue;
+
+
+            // Only calls with scalar operands are currently allowed
+            if(CallInst* CI = dyn_cast<CallInst>(entry)){
+
+                for(unsigned c = 0; c < CI->getNumArgOperands(); c++){
+                    Value* CArg = CI->getArgOperand(c);
+                    assert(!CArg->getType()->isPointerTy() && "Index calculation chain of instructions can not contain calls with pointer operands");
+                }
+            }
+
+            assert(!isa<LoadInst>(entry) && !isa<StoreInst>(entry) && "Only non-memory reading/writing operations legal in dimension limit calculation");
+
+            // Anything left must be an instruction; other values are rejected.
+            Instruction* I = dyn_cast<Instruction>(entry);
+
+            if(!I){
+                DEBUG(errs()<<"Unknown value type: "<<*entry<<"\n");
+                assert(false && "Non-argument or instruction type used in limit calc.");
+            }
+
+            toClone.push_back(I);
+            // Operands are queued after I, so they appear later in toClone.
+            for(unsigned k =0; k < I->getNumOperands(); k++){
+                workList.push(I->getOperand(k));
+
+            }
+        }
+
+
+        // Phase 2: clone in reverse collection order so that operands are
+        // cloned before the instructions that use them.
+        for(int i = toClone.size()-1; i >= 0; i--){
+            Instruction* I = toClone[i];
+            // Already copied over value (an instruction may have been
+            // collected more than once, or mapped by an earlier call).
+            if(VMap.find(&*I) != VMap.end()) continue;
+
+
+            Instruction* NewInst = I->clone();
+
+            // Update the cloned instructions operands to use
+            // the VMapped Values.
+            for(unsigned o = 0; o < NewInst->getNumOperands(); o++){
+
+                if(isa<Constant>(NewInst->getOperand(o))) continue;
+
+                assert(VMap.find(I->getOperand(o)) != VMap.end() &&
+                        "Copied Value use-def chain not correctly copied!");
+
+                NewInst->setOperand(o,VMap[I->getOperand(o)]);
+            }
+
+            NewInst->insertBefore(InsertBefore);
+
+            // Add instruction to VMap;
+            VMap[&*I] = NewInst;
+
+
+        }
+    }
+
+    DEBUG(errs()<<"F_CPU after cloning in index values: "<<*F_CPU<<"\n");
+}
+
+// Emit a call to child node C's generated function before IB in F_CPU.
+void CodeGenTraversal::invokeChild(DFNode *C, Function *F_CPU,
+                                   ValueToValueMapTy &VMap, Instruction *IB,
+                                   hpvm::Target Tag) {
+  Function *CF = C->getFuncPointer();
+  // Tag selects which generated function of C is called. VMap is consulted
+  // to translate non-constant dimension limits into values usable in F_CPU.
+  Function *CF_CPU = C->getGenFuncForTarget(Tag);
+  assert(CF_CPU != nullptr &&
+         "Found leaf node for which code generation has not happened yet!\n");
+  assert(C->hasCPUGenFuncForTarget(Tag) &&
+         "The generated function to be called from cpu backend is not an cpu "
+         "function\n");
+  DEBUG(errs() << "Invoking child node" << CF_CPU->getName() << "\n");
+
+  std::vector<Value *> Args;
+  // Create argument list to pass to call instruction
+  // First find the correct values using the edges
+  // The remaining six values are inserted as constants for now.
+  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(getInValueAt(C, i, F_CPU, IB));
+  }
+
+  Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_CPU->getContext()), 0);
+  for (unsigned j = 0; j < 6; j++)
+    Args.push_back(I64Zero);
+
+  DEBUG(errs() << "Gen Function type: " << *CF_CPU->getType() << "\n");
+  DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n");
+  DEBUG(errs() << "Arguments: ";
+  for (const Value *Arg : Args)
+    errs() << *(Arg->getType()) << ", ";
+  errs() << "\n");
+
+  // Copying over index calculation.
+  copyChildIndexCalc(C, F_CPU, VMap, IB);
+
+  // Call the F_CPU function associated with this node
+  CallInst *CI =
+      CallInst::Create(CF_CPU, Args, CF_CPU->getName() + "_output", IB);
+  DEBUG(errs() << *CI << "\n");
+  OutputMap[C] = CI;
+  assert(C->getNumOfDim() <= 3 && "At most three dimensions are supported!");
+  // Find num of dimensions this node is replicated in.
+  // Based on number of dimensions, insert loop instructions
+  std::string varNames[3] = {"x", "y", "z"};
+  unsigned numArgs = CI->getNumArgOperands();
+  for (unsigned j = 0; j < C->getNumOfDim(); j++) {
+    Value *indexLimit = nullptr;
+    // Limit can either be a constant or a value of the internal node (an
+    // argument or a computed instruction). A constant can be used directly in
+    // the new F_CPU function. Any other value must be translated through
+    // VMap to its counterpart in F_CPU.
+    if (ConstantInt *ConstDimLimit = dyn_cast<ConstantInt>(C->getDimLimits()[j])) {
+      indexLimit = C->getDimLimits()[j];
+      DEBUG(errs() << "In Constant case:\n"
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
+      if (ConstDimLimit->getZExtValue() == 1) {
+        DEBUG(errs() << "DimLimit is 1, no need for loop!\n");
+        continue;
+      }
+    } else if (isa<Constant>(C->getDimLimits()[j])) {
+      indexLimit = C->getDimLimits()[j];
+      DEBUG(errs() << "In Constant case:\n"
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    } else {
+      indexLimit = VMap[C->getDimLimits()[j]];
+      // Validate before the debug print below dereferences the mapped value.
+      assert(indexLimit && "Invalid dimension limit!");
+      DEBUG(errs() << "In VMap case:"
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    Value *indexVar = addLoop(CI, indexLimit, varNames[j]);
+    DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
+    // Insert index variable and limit arguments
+    CI->setArgOperand(numArgs - 6 + j, indexVar);
+    CI->setArgOperand(numArgs - 3 + j, indexLimit);
+  }
+
+  if (Tag == hpvm::CPU_TARGET) {
+    // Insert call to runtime to push the dim limits and instanceID on the depth
+    // stack
+    Value *args[] = {
+        ConstantInt::get(Type::getInt32Ty(CI->getContext()),
+                         C->getNumOfDim()), // numDim
+        CI->getArgOperand(numArgs - 3 + 0), // limitX
+        CI->getArgOperand(numArgs - 6 + 0), // iX
+        CI->getArgOperand(numArgs - 3 + 1), // limitY
+        CI->getArgOperand(numArgs - 6 + 1), // iY
+        CI->getArgOperand(numArgs - 3 + 2), // limitZ
+        CI->getArgOperand(numArgs - 6 + 2)  // iZ
+    };
+
+    CallInst *Push = CallInst::Create(llvm_hpvm_cpu_dstack_push,
+                                      ArrayRef<Value *>(args, 7), "", CI);
+    DEBUG(errs() << "Push on stack: " << *Push << "\n");
+    // Insert call to runtime to pop the dim limits and instanceID from the
+    // depth stack
+    BasicBlock::iterator i(CI);
+    ++i;
+    Instruction *NextI = &*i;
+    // Next Instruction should also belong to the same basic block as the basic
+    // block will have a terminator instruction
+    assert(NextI->getParent() == CI->getParent() &&
+           "Next Instruction should also belong to the same basic block!");
+
+    CallInst *Pop = CallInst::Create(llvm_hpvm_cpu_dstack_pop, None, "", NextI);
+    DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
+    DEBUG(errs() << *CI->getParent()->getParent());
+  }
+}
+/* Add a loop around the instruction I and return its i64 index variable.
+ * Algorithm:
+ * (1) Split the basic block of instruction I into three parts, where the
+ * middleblock/body would contain instruction I.
+ * (2) Add phi node before instruction I. Add incoming edge to phi node from
+ * predecessor
+ * (3) Add increment and compare instruction to index variable
+ * (4) Replace terminator/branch instruction of body with conditional branch
+ * which loops over body if true and goes to end if false
+ * (5) Update phi node of body. The test follows the body, so I runs once
+ * even when limit is 0. */
+Value *CodeGenTraversal::addLoop(Instruction *I, Value *limit,
+                                 const Twine &indexName) {
+  BasicBlock *Entry = I->getParent();
+  BasicBlock *ForBody = Entry->splitBasicBlock(I, "for.body");
+
+  BasicBlock::iterator i(I);
+  ++i;
+  Instruction *NextI = &*i;
+  // Next Instruction should also belong to the same basic block as the basic
+  // block will have a terminator instruction
+  assert(NextI->getParent() == ForBody &&
+         "Next Instruction should also belong to the same basic block!");
+  BasicBlock *ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
+
+  // Add Phi Node for index variable
+  PHINode *IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), 2,
+                                      "index." + indexName, I);
+
+  // Add incoming edge to phi: the index starts at 0 when coming from Entry
+  IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
+                        Entry);
+  // Increment index variable
+  BinaryOperator *IndexInc = BinaryOperator::Create(
+      Instruction::Add, IndexPhi,
+      ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
+      "index." + indexName + ".inc", ForBody->getTerminator());
+
+  // Compare the incremented index with limit (unsigned less-than)
+  CmpInst *Cond =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, limit,
+                      "cond." + indexName, ForBody->getTerminator());
+
+  // Replace the terminator instruction of for.body with new conditional
+  // branch which loops over body if true and branches to for.end otherwise
+  BranchInst *BI = BranchInst::Create(ForBody, ForEnd, Cond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+  // Add incoming edge to phi node in body
+  IndexPhi->addIncoming(IndexInc, ForBody);
+  return IndexPhi;
+}
 bool CodeGenTraversal::checkPreferredTarget(DFNode *N, hpvm::Target T) {
   Function *F = N->getFuncPointer();
   Module *M = F->getParent();
@@ -168,7 +537,7 @@ bool CodeGenTraversal::checkPreferredTarget(DFNode *N, hpvm::Target T) {
   case hpvm::GPU_TARGET:
     HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu");
     break;
-  case hpvm::CUDNN_TARGET:
+    case hpvm::CUDNN_TARGET:
     HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cudnn");
     break;
   case hpvm::CPU_TARGET:
@@ -195,7 +564,6 @@ hpvm::Target CodeGenTraversal::getPreferredTarget(DFNode *N) {
 }
 
 bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, hpvm::Target T) {
-
   Function *F = N->getFuncPointer();
   Module *M = F->getParent();
   std::vector<NamedMDNode *> HintNode;
@@ -230,6 +598,7 @@ bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, hpvm::Target T) {
         return true;
     }
   }
+
   return false;
 }
 
@@ -249,7 +618,7 @@ Value *CodeGenTraversal::getStringPointer(const Twine &S, Instruction *IB,
   return SPtr;
 }
 
-void renameNewArgument(Function *newF, const Twine &argName) {
+inline void renameNewArgument(Function *newF, const Twine &argName) {
   // Get Last argument in Function Arg List and rename it to given name
   Argument *lastArg = &*(newF->arg_end() - 1);
   lastArg->setName(argName);
@@ -257,8 +626,7 @@ void renameNewArgument(Function *newF, const Twine &argName) {
 
 // Creates a function with an additional argument of the specified type and
 // name. The previous function is not deleted.
-Function *CodeGenTraversal::addArgument(Function *F, Type *Ty,
-                                        const Twine &name) {
+inline Function *addArgument(Function *F, Type *Ty, const Twine &name) {
   Argument *new_arg = new Argument(Ty, name);
 
   // Create the argument type list with added argument types
@@ -276,7 +644,7 @@ Function *CodeGenTraversal::addArgument(Function *F, Type *Ty,
   FunctionType *FTy =
       FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
   Function *newF = Function::Create(FTy, F->getLinkage(),
-                                    F->getName() + "_cloned", F->getParent());
+                                    F->getName() + "_c", F->getParent());
   renameNewArgument(newF, name);
   newF = hpvmUtils::cloneFunction(F, newF, false);
 
@@ -291,7 +659,6 @@ Function *CodeGenTraversal::addArgument(Function *F, Type *Ty,
 // Return new function with additional index and limit arguments.
 // The original function is removed from the module and erased.
 Function *CodeGenTraversal::addIdxDimArgs(Function *F) {
-  DEBUG(errs() << "Adding dimension and limit arguments to Function: " << F->getName());
   DEBUG(errs() << "Function Type: " << *F->getFunctionType() << "\n");
   // Add Index and Dim arguments
   std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
diff --git a/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp b/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
index 10667ddeecc7f072222032e930d27fd1f75e7b2d..70a40ba07226e613300e3cb4ae073cfde9e8a584 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
@@ -12,10 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/IR/Intrinsics.h"
 #define DEBUG_TYPE "DFG2LLVM_CPU"
-
 #include "SupportHPVM/DFG2LLVM.h"
-
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
@@ -28,6 +27,8 @@
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
+#include <queue>
+
 #ifndef LLVM_BUILD_DIR
 #error LLVM_BUILD_DIR is not defined
 #endif
@@ -44,21 +45,7 @@ using namespace dfg2llvm;
 static cl::opt<bool> HPVMTimer_CPU("hpvm-timers-cpu",
                                    cl::desc("Enable hpvm timers"));
 
-namespace {
-
-// DFG2LLVM_CPU - The first implementation.
-struct DFG2LLVM_CPU : public DFG2LLVM {
-  static char ID; // Pass identification, replacement for typeid
-  DFG2LLVM_CPU() : DFG2LLVM(ID) {}
-
-private:
-  // Member variables
-
-  // Functions
-
-public:
-  bool runOnModule(Module &M);
-};
+namespace dfg2llvm {
 
 // Visitor for Code generation traversal (tree traversal for now)
 class CGT_CPU : public CodeGenTraversal {
@@ -83,21 +70,15 @@ private:
   FunctionCallee llvm_hpvm_createThread;
   FunctionCallee llvm_hpvm_bufferPush;
   FunctionCallee llvm_hpvm_bufferPop;
-  FunctionCallee llvm_hpvm_cpu_dstack_push;
-  FunctionCallee llvm_hpvm_cpu_dstack_pop;
+//  FunctionCallee llvm_hpvm_cpu_dstack_push;
+//  FunctionCallee llvm_hpvm_cpu_dstack_pop;
   FunctionCallee llvm_hpvm_cpu_getDimLimit;
   FunctionCallee llvm_hpvm_cpu_getDimInstance;
 
   // Functions
   std::vector<IntrinsicInst *> *getUseList(Value *LI);
-  Value *addLoop(Instruction *I, Value *limit, const Twine &indexName = "");
   void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *);
   Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
-  Argument *getArgumentFromEnd(Function *F, unsigned offset);
-  Value *getInValueAt(DFNode *Child, unsigned i, Function *ParentF_CPU,
-                      Instruction *InsertBefore);
-  void invokeChild_CPU(DFNode *C, Function *F_CPU, ValueToValueMapTy &VMap,
-                       Instruction *InsertBefore);
   void invokeChild_PTX(DFNode *C, Function *F_CPU, ValueToValueMapTy &VMap,
                        Instruction *InsertBefore);
   StructType *getArgumentListStructTy(DFNode *);
@@ -129,14 +110,7 @@ public:
   void codeGenLaunchStreaming(DFInternalNode *Root);
 };
 
-bool DFG2LLVM_CPU::runOnModule(Module &M) {
-  DEBUG(errs() << "\nDFG2LLVM_CPU PASS\n");
-
-  // Get the BuildDFG Analysis Results:
-  // - Dataflow graph
-  // - Maps from i8* hansles to DFNode and DFEdge
-  BuildDFG &DFG = getAnalysis<BuildDFG>();
-
+bool runDFG2LLVM_CPU (Module &M, BuildDFG&DFG) {
   // DFInternalNode *Root = DFG.getRoot();
   std::vector<DFInternalNode *> Roots = DFG.getRoots();
   // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
@@ -161,11 +135,35 @@ bool DFG2LLVM_CPU::runOnModule(Module &M) {
     else
       CGTVisitor->codeGenLaunch(rootNode);
   }
+	
+	for (auto &F : M) {
+		for (Function::arg_iterator ai = F.arg_begin(), ae = F.arg_end(); ai != ae; ai++) {
+			Argument *Arg = &*ai;
+			if(Arg->hasAttribute(Attribute::In))
+				Arg->removeAttr(Attribute::In);
+			if(Arg->hasAttribute(Attribute::Out))
+				Arg->removeAttr(Attribute::Out);
+			if(Arg->hasAttribute(Attribute::InOut))
+				Arg->removeAttr(Attribute::InOut);
+
+		}
+	}
 
   delete CGTVisitor;
   return true;
 }
 
+bool DFG2LLVM_CPU::runOnModule(Module &M) {
+  // Pass entry point: announce the pass, fetch the BuildDFG analysis
+  // results (the dataflow graph and the maps from i8* handles to DFNode
+  // and DFEdge), then hand the module and the graph to runDFG2LLVM_CPU,
+  // whose result is returned unchanged.
+  DEBUG(errs() << "\nDFG2LLVM_CPU PASS\n");
+  BuildDFG &graphAnalysis = getAnalysis<BuildDFG>();
+  const bool changed = runDFG2LLVM_CPU(M, graphAnalysis);
+  return changed;
+}
+
 // Initialize the HPVM runtime API. This makes it easier to insert these calls
 void CGT_CPU::initRuntimeAPI() {
 
@@ -238,22 +236,6 @@ std::vector<IntrinsicInst *> *CGT_CPU::getUseList(Value *GraphID) {
   return UseList;
 }
 
-/* Traverse the function argument list in reverse order to get argument at a
- * distance offset fromt he end of argument list of function F
- */
-Argument *CGT_CPU::getArgumentFromEnd(Function *F, unsigned offset) {
-  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) &&
-         "Invalid offset to access arguments!");
-  Function::arg_iterator e = F->arg_end();
-  // Last element of argument iterator is dummy. Skip it.
-  e--;
-  Argument *arg;
-  for (; offset != 0; e--) {
-    offset--;
-    arg = &*e;
-  }
-  return arg;
-}
 
 /* Add Loop around the instruction I
  * Algorithm:
@@ -283,58 +265,31 @@ void CGT_CPU::addWhileLoop(Instruction *CondBlockStart, Instruction *BodyStart,
   ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
 }
 
-/* Add Loop around the instruction I
- * Algorithm:
- * (1) Split the basic block of instruction I into three parts, where the
- * middleblock/body would contain instruction I.
- * (2) Add phi node before instruction I. Add incoming edge to phi node from
- * predecessor
- * (3) Add increment and compare instruction to index variable
- * (4) Replace terminator/branch instruction of body with conditional branch
- * which loops over bidy if true and goes to end if false
- * (5) Update phi node of body
- */
-Value *CGT_CPU::addLoop(Instruction *I, Value *limit, const Twine &indexName) {
-  BasicBlock *Entry = I->getParent();
-  BasicBlock *ForBody = Entry->splitBasicBlock(I, "for.body");
-
-  BasicBlock::iterator i(I);
-  ++i;
-  Instruction *NextI = &*i;
-  // Next Instruction should also belong to the same basic block as the basic
-  // block will have a terminator instruction
-  assert(NextI->getParent() == ForBody &&
-         "Next Instruction should also belong to the same basic block!");
-  BasicBlock *ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
-
-  // Add Phi Node for index variable
-  PHINode *IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), 2,
-                                      "index." + indexName, I);
-
-  // Add incoming edge to phi
-  IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
-                        Entry);
-  // Increment index variable
-  BinaryOperator *IndexInc = BinaryOperator::Create(
-      Instruction::Add, IndexPhi,
-      ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
-      "index." + indexName + ".inc", ForBody->getTerminator());
-
-  // Compare index variable with limit
-  CmpInst *Cond =
-      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, limit,
-                      "cond." + indexName, ForBody->getTerminator());
-
-  // Replace the terminator instruction of for.body with new conditional
-  // branch which loops over body if true and branches to for.end otherwise
-  BranchInst *BI = BranchInst::Create(ForBody, ForEnd, Cond);
-  ReplaceInstWithInst(ForBody->getTerminator(), BI);
-
-  // Add incoming edge to phi node in body
-  IndexPhi->addIncoming(IndexInc, ForBody);
-  return IndexPhi;
+Instruction *CGT_CPU::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+                                          BasicBlock *Body) {
+  Module *M = Entry->getParent()->getParent();
+  Type *Int64Ty = Type::getInt64Ty(M->getContext());
+  // Builds an i64 counter: 0 when coming from Entry, previous value + 1 from Body
+  // Insert a PHI instruction ahead of the first non-PHI of the condition block
+  Instruction *IB = Cond->getFirstNonPHI();
+  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
+
+  ConstantInt *IConst =
+      ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+  Instruction *CounterIncr =
+      BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                "cnt_incr", Body->getTerminator());
+
+  // Set incoming values for Phi node
+  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
+  CounterPhi->addIncoming(IConst, Entry);
+  CounterPhi->addIncoming(CounterIncr, Body);
+
+  // Return the created PHI node, i.e. the counter value in the condition block
+  return CounterPhi;
 }
 
+
 // Returns a packed struct type. The structtype is created by packing the input
 // types, output types and isLastInput buffer type. All the streaming
 // inputs/outputs are converted to i8*, since this is the type of buffer
@@ -831,135 +786,7 @@ void CGT_CPU::codeGenLaunch(DFInternalNode *Root) {
   }
 }
 
-Value *CGT_CPU::getInValueAt(DFNode *Child, unsigned i, Function *ParentF_CPU,
-                             Instruction *InsertBefore) {
-  // TODO: Assumption is that each input port of a node has just one
-  // incoming edge. May change later on.
-
-  // Find the incoming edge at the requested input port
-  DFEdge *E = Child->getInDFEdgeAt(i);
-  assert(E && "No incoming edge or binding for input element!");
-  // Find the Source DFNode associated with the incoming edge
-  DFNode *SrcDF = E->getSourceDF();
-
-  // If Source DFNode is a dummyNode, edge is from parent. Get the
-  // argument from argument list of this internal node
-  Value *inputVal;
-  if (SrcDF->isEntryNode()) {
-    inputVal = getArgumentAt(ParentF_CPU, E->getSourcePosition());
-    DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
-  } else {
-    // edge is from a sibling
-    // Check - code should already be generated for this source dfnode
-    assert(OutputMap.count(SrcDF) &&
-           "Source node call not found. Dependency violation!");
-
-    // Find CallInst associated with the Source DFNode using OutputMap
-    Value *CI = OutputMap[SrcDF];
-
-    // Extract element at source position from this call instruction
-    std::vector<unsigned> IndexList;
-    IndexList.push_back(E->getSourcePosition());
-    DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
-    ExtractValueInst *EI =
-        ExtractValueInst::Create(CI, IndexList, "", InsertBefore);
-    inputVal = EI;
-  }
-  return inputVal;
-}
-
-void CGT_CPU::invokeChild_CPU(DFNode *C, Function *F_CPU,
-                              ValueToValueMapTy &VMap, Instruction *IB) {
-  Function *CF = C->getFuncPointer();
-
-  //  Function* CF_CPU = C->getGenFunc();
-  Function *CF_CPU = C->getGenFuncForTarget(hpvm::CPU_TARGET);
-  assert(CF_CPU != NULL &&
-         "Found leaf node for which code generation has not happened yet!\n");
-  assert(C->hasCPUGenFuncForTarget(hpvm::CPU_TARGET) &&
-         "The generated function to be called from cpu backend is not an cpu "
-         "function\n");
-  DEBUG(errs() << "Invoking child node" << CF_CPU->getName() << "\n");
-
-  std::vector<Value *> Args;
-  // Create argument list to pass to call instruction
-  // First find the correct values using the edges
-  // The remaing six values are inserted as constants for now.
-  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
-    Args.push_back(getInValueAt(C, i, F_CPU, IB));
-  }
-
-  Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_CPU->getContext()), 0);
-  for (unsigned j = 0; j < 6; j++)
-    Args.push_back(I64Zero);
-
-  DEBUG(errs() << "Gen Function type: " << *CF_CPU->getType() << "\n");
-  DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n");
-  DEBUG(errs() << "Arguments: " << Args.size() << "\n");
 
-  // Call the F_CPU function associated with this node
-  CallInst *CI =
-      CallInst::Create(CF_CPU, Args, CF_CPU->getName() + "_output", IB);
-  DEBUG(errs() << *CI << "\n");
-  OutputMap[C] = CI;
-
-  // Find num of dimensions this node is replicated in.
-  // Based on number of dimensions, insert loop instructions
-  std::string varNames[3] = {"x", "y", "z"};
-  unsigned numArgs = CI->getNumArgOperands();
-  for (unsigned j = 0; j < C->getNumOfDim(); j++) {
-    Value *indexLimit = NULL;
-    // Limit can either be a constant or an arguement of the internal node.
-    // In case of constant we can use that constant value directly in the
-    // new F_CPU function. In case of an argument, we need to get the mapped
-    // value using VMap
-    if (isa<Constant>(C->getDimLimits()[j])) {
-      indexLimit = C->getDimLimits()[j];
-      DEBUG(errs() << "In Constant case:\n"
-                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
-    } else {
-      indexLimit = VMap[C->getDimLimits()[j]];
-      DEBUG(errs() << "In VMap case:"
-                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
-    }
-    assert(indexLimit && "Invalid dimension limit!");
-    // Insert loop
-    Value *indexVar = addLoop(CI, indexLimit, varNames[j]);
-    DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
-    // Insert index variable and limit arguments
-    CI->setArgOperand(numArgs - 6 + j, indexVar);
-    CI->setArgOperand(numArgs - 3 + j, indexLimit);
-  }
-  // Insert call to runtime to push the dim limits and instanceID on the depth
-  // stack
-  Value *args[] = {
-      ConstantInt::get(Type::getInt32Ty(CI->getContext()),
-                       C->getNumOfDim()), // numDim
-      CI->getArgOperand(numArgs - 3 + 0), // limitX
-      CI->getArgOperand(numArgs - 6 + 0), // iX
-      CI->getArgOperand(numArgs - 3 + 1), // limitY
-      CI->getArgOperand(numArgs - 6 + 1), // iY
-      CI->getArgOperand(numArgs - 3 + 2), // limitZ
-      CI->getArgOperand(numArgs - 6 + 2)  // iZ
-  };
-
-  CallInst *Push = CallInst::Create(llvm_hpvm_cpu_dstack_push,
-                                    ArrayRef<Value *>(args, 7), "", CI);
-  DEBUG(errs() << "Push on stack: " << *Push << "\n");
-  // Insert call to runtime to pop the dim limits and instanceID from the depth
-  // stack
-  BasicBlock::iterator i(CI);
-  ++i;
-  Instruction *NextI = &*i;
-  // Next Instruction should also belong to the same basic block as the basic
-  // block will have a terminator instruction
-  assert(NextI->getParent() == CI->getParent() &&
-         "Next Instruction should also belong to the same basic block!");
-
-  CallInst *Pop = CallInst::Create(llvm_hpvm_cpu_dstack_pop, None, "", NextI);
-  DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
-  DEBUG(errs() << *CI->getParent()->getParent());
-}
 
 /* This function takes a DFNode, and creates a filter function for it. By filter
  * function we mean a function which keeps on getting input from input buffers,
@@ -1140,6 +967,7 @@ Function *CGT_CPU::createFunctionFilter(DFNode *C) {
   return CF_Pipeline;
 }
 
+
 void CGT_CPU::codeGen(DFInternalNode *N) {
   // Check if N is root node and its graph is streaming. We do not do codeGen
   // for Root in such a case
@@ -1241,7 +1069,7 @@ void CGT_CPU::codeGen(DFInternalNode *N) {
         continue;
 
       // Create calls to CPU function of child node
-      invokeChild_CPU(C, F_CPU, VMap, RI);
+      invokeChild(C, F_CPU, VMap, RI, hpvm::CPU_TARGET);
     }
 
     DEBUG(errs() << "*** Generating epilogue code for the function****\n");
@@ -1404,13 +1232,12 @@ void CGT_CPU::codeGen(DFLeafNode *N) {
                  << " : skipping it\n");
 
     switch (N->getTag()) {
-    case hpvm::GPU_TARGET: {
+    case hpvm::GPU_TARGET:
       // A leaf node should not have an cpu function for GPU
       // by design of DFG2LLVM_OpenCL backend
       assert(!(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET)) &&
              "Leaf node not expected to have GPU GenFunc");
       break;
-    }
     case hpvm::CUDNN_TARGET: {
       DEBUG(errs() << "CUDNN hint found. Store CUDNN function as CPU funtion.\n");
       // Make sure there is a generated CPU function for cudnn
@@ -1448,16 +1275,18 @@ void CGT_CPU::codeGen(DFLeafNode *N) {
        N->setTag(hpvm::CPU_TARGET);
        break;
      }
-     default:
-     {
-       break;
-     }
+    default:
+      break;
     }
 
     return;
   }
 
-  assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
+ /* if(N->getGenFuncForTarget(hpvm::CPU_TARGET) != NULL) {
+    DEBUG(errs() << "Already generated CPU code for this node!\n");
+    return;
+  }*/
+ assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
          "Error: Visiting a node for which code already generated\n");
 
   std::vector<IntrinsicInst *> IItoRemove;
@@ -1466,6 +1295,7 @@ void CGT_CPU::codeGen(DFLeafNode *N) {
 
   // Get the function associated woth the dataflow node
   Function *F = N->getFuncPointer();
+	DEBUG(errs() << "Generating CPU code for function " << F->getName() << "\n");
 
   // Clone the function, if we are seeing this function for the first time.
   Function *F_CPU;
@@ -1691,7 +1521,11 @@ void CGT_CPU::codeGen(DFLeafNode *N) {
         break;
       }
 
-    } else {
+    } else if (BuildDFG::isHPVMIntrinsic(I)) {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+      if (II->getIntrinsicID() == Intrinsic::hpvm_nz_loop) {
+        IItoRemove.push_back(II);
+      }
     }
   }
 
diff --git a/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp
index eda655e3196450ee94ab44a70d500a1188007a66..44d6eec7075b93df987935078604da42ff6639fd 100644
--- a/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp
+++ b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp
@@ -13,6 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/IR/Intrinsics.h"
 #define DEBUG_TYPE "genhpvm"
 #include "GenHPVM/GenHPVM.h"
 
@@ -97,23 +98,50 @@ static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID,
     FunctionType *FTy = F->getFunctionType();
     DEBUG(errs() << *F << "\n");
 
-    // Create argument list
-    assert(CI->getNumArgOperands() == FTy->getNumParams() &&
-           "Number of arguments of call do not match with Intrinsic");
-    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
-      Value *V = CI->getArgOperand(i);
-      // Either the type should match or both should be of pointer type
-      assert((V->getType() == FTy->getParamType(i) ||
-              (V->getType()->isPointerTy() &&
-               FTy->getParamType(i)->isPointerTy())) &&
-             "Dummy function call argument does not match with Intrinsic "
-             "argument!");
-      // If the types do not match, then both must be pointer type and pointer
-      // cast needs to be performed
-      if (V->getType() != FTy->getParamType(i)) {
-        V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
-      }
+    // Adding support for loop tripcount info
+    // The call lowered to hpvm_nz_loop may carry one or two operands
+    if (IntrinsicID == Intrinsic::hpvm_nz_loop) {
+      assert((CI->getNumArgOperands() > 0 &&
+              (CI->getNumArgOperands() == FTy->getNumParams() ||
+               CI->getNumArgOperands() + 1 == FTy->getNumParams())) &&
+             "Number of arguments of call do not match with Intrinsic");
+      // We always should have 1 argument that points to the PHI Node
+      Value *V = CI->getArgOperand(0);
+      assert(V->getType() == FTy->getParamType(0) &&
+             "isNonZeroLoop argument 0 does not match!");
       args.push_back(V);
+      // We can optionally have a second argument which includes the trip count.
+      // If we don't have it, set to Zero.
+      // This trip count is only used by hpvm-hypermapper for DSE
+      if (CI->getNumArgOperands() == 2) {
+        Value *V2 = CI->getArgOperand(1);
+        assert(V2->getType() == FTy->getParamType(1) &&
+               "isNonZeroLoop argument 1 does not match!");
+        args.push_back(V2);
+      } else {
+        Value *Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
+        args.push_back(Zero);
+      }
+    } else {
+
+      // Create argument list
+      assert(CI->getNumArgOperands() == FTy->getNumParams() &&
+             "Number of arguments of call do not match with Intrinsic");
+      for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+        Value *V = CI->getArgOperand(i);
+        // Either the type should match or both should be of pointer type
+        assert((V->getType() == FTy->getParamType(i) ||
+                (V->getType()->isPointerTy() &&
+                 FTy->getParamType(i)->isPointerTy())) &&
+               "Dummy function call argument does not match with Intrinsic "
+               "argument!");
+        // If the types do not match, then both must be pointer type and pointer
+        // cast needs to be performed
+        if (V->getType() != FTy->getParamType(i)) {
+          V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
+        }
+        args.push_back(V);
+      }
     }
   }
   // Insert call instruction
@@ -169,6 +197,8 @@ IS_HPVM_CALL(requestMemory)
 IS_HPVM_CALL(attributes)
 IS_HPVM_CALL(hint)
 
+IS_HPVM_CALL(task)
+
 // Tensor Operators
 IS_HPVM_CALL(tensor_mul)
 IS_HPVM_CALL(tensor_convolution)
@@ -186,6 +216,8 @@ IS_HPVM_CALL(tensor_softmax)
 
 IS_HPVM_CALL(node_id)
 
+IS_HPVM_CALL(isNonZeroLoop)
+
 // Return the constant integer represented by value V
 static unsigned getNumericValue(Value *V) {
   assert(
@@ -274,6 +306,26 @@ static void handleHPVMAttributes(Function *F, CallInst *CI) {
                << *F << "\n");
 }
 
+// Appends F to `functions`, after first (recursively) appending each child function found in F.
+void insertChildren(Function* F, std::vector<Function*>& functions){
+    for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+      Instruction *I = &*i; // Grab pointer to Instruction
+      // Only call instructions are candidates for node-creation calls
+      CallInst* CI = dyn_cast<CallInst>(I);
+
+      if(!CI) continue;
+      // Operand 1 of a createNodeND call is taken as the child function; recurse into it
+      if (isHPVMCall_createNodeND(I)) {
+          Function* ChildF = cast<Function>(CI->getArgOperand(1)->stripPointerCasts());
+          insertChildren(ChildF, functions);
+      }
+    }
+    if(std::find(functions.begin(),functions.end(), F) != functions.end()) return;
+    functions.push_back(F); // children were handled above, so they precede F in the list
+
+}
+
+
 // Public Functions of GenHPVM pass
 bool GenHPVM::runOnModule(Module &M) {
   DEBUG(errs() << "\nGENHPVM PASS\n");
@@ -313,6 +365,7 @@ bool GenHPVM::runOnModule(Module &M) {
   // Insert init context in main
   DEBUG(errs() << "Locate __hpvm__init()\n");
   Function *VI = M.getFunction("__hpvm__init");
+  assert(VI != NULL && "__hpvm__init not found!");
   assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
   Instruction *I = cast<Instruction>(*VI->user_begin());
 
@@ -332,11 +385,59 @@ bool GenHPVM::runOnModule(Module &M) {
   std::vector<Instruction *> toBeErased;
   std::vector<Function *> functions;
 
-  for (auto &F : M)
-    functions.push_back(&F);
+  /*
+  for (auto &F : M){ 
+    if(!F.isDeclaration()){
+      functions.push_back(&F);
+    }
+  }*/
+
+  Function* LaunchF = M.getFunction("__hpvm__launch");
+  for(auto* User: LaunchF->users()){
+
+      CallInst* CI = dyn_cast<CallInst>(User);
+
+      Function* Host = CI->getParent()->getParent();
+
+
+      Function* RootFn = dyn_cast<Function>(CI->getArgOperand(1)->stripPointerCasts());
+      insertChildren(RootFn, functions);
+
+      if(std::find(functions.begin(),functions.end(), Host) == functions.end())
+          functions.push_back(Host);
+
+
+  }
+  
+  Function* InitF = M.getFunction("__hpvm__init");
+  for(auto* User: InitF->users()){
+
+      CallInst* CI = dyn_cast<CallInst>(User);
+
+      Function* Host = CI->getParent()->getParent();
+
+      if(std::find(functions.begin(),functions.end(), Host) == functions.end())
+          functions.push_back(Host);
+
+  }
+  Function* ClearF = M.getFunction("__hpvm__cleanup");
+  for(auto* User: ClearF->users()){
+
+      CallInst* CI = dyn_cast<CallInst>(User);
+
+      Function* Host = CI->getParent()->getParent();
+
+      if(std::find(functions.begin(),functions.end(), Host) == functions.end())
+          functions.push_back(Host);
+
+  }
+
+
+
 
   // Iterate over all functions in the module
-  for (Function *f : functions) {
+  for (unsigned i = 0; i < functions.size(); ++i) {
+    Function *f = functions[i];
     DEBUG(errs() << "Function: " << f->getName() << "\n");
 
     // List with the required additions in the function's return type
@@ -378,13 +479,21 @@ bool GenHPVM::runOnModule(Module &M) {
       if (isHPVMCall_requestMemory(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_requestMemory, &toBeErased);
       }
+      if (isHPVMCall_task(I)){
+          assert(isa<ConstantInt>(CI->getArgOperand(0)) 
+                  && "Argument to hpvm_task must be a constant integer");
+          ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_task, &toBeErased);
+      }
       if (isHPVMCall_hint(I)) {
-        assert(isa<ConstantInt>(CI->getArgOperand(0)) &&
-               "Argument to hint must be constant integer!");
-        ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0));
-        DEBUG(errs() << "HINT INSTRUCTION: " << *I << "\n");
-        hpvm::Target t = (hpvm::Target)hint->getZExtValue();
-        addHint(CI->getParent()->getParent(), t);
+        // Iterate over variadic hint call
+        for(unsigned h = 0; h < CI->getNumArgOperands(); h ++){
+            assert(isa<ConstantInt>(CI->getArgOperand(h)) &&
+                   "Argument to hint must be constant integer!");
+            ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(h));
+
+            hpvm::Target t = (hpvm::Target)hint->getZExtValue();
+            addHint(CI->getParent()->getParent(), t);
+        }
         DEBUG(errs() << "Found hpvm hint call: " << *CI << "\n");
         toBeErased.push_back(CI);
       }
@@ -394,6 +503,7 @@ bool GenHPVM::runOnModule(Module &M) {
         DEBUG(errs() << *LaunchF << "\n");
         // Get i8* cast to function pointer
         Function *graphFunc = cast<Function>(CI->getArgOperand(1));
+        auto OldFuncPosition = std::find(functions.begin(), functions.end(), graphFunc);
         graphFunc = transformReturnTypeToStruct(graphFunc);
         Constant *F =
             ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
@@ -401,6 +511,8 @@ bool GenHPVM::runOnModule(Module &M) {
             F &&
             "Function invoked by HPVM launch has to be define and constant.");
 
+        std::replace(functions.begin(), functions.end(), *OldFuncPosition, graphFunc);
+
         ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(0));
         assert(Op && "HPVM launch's streaming argument is a constant value.");
         Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
@@ -430,9 +542,17 @@ bool GenHPVM::runOnModule(Module &M) {
         assert(CI->getNumArgOperands() > 0 &&
                "Too few arguments for __hpvm__createNodeND call");
         unsigned numDims = getNumericValue(CI->getArgOperand(0));
-        // We need as meny dimension argments are there are dimensions
-        assert(CI->getNumArgOperands() - 2 == numDims &&
-               "Too few arguments for __hpvm_createNodeND call!\n");
+
+        // We need as many dimension arguments as there are dimensions
+        // assert(CI->getNumArgOperands() - 2 == numDims &&
+        //       "Too few arguments for __hpvm_createNodeND call!\n");
+
+        unsigned numArgs = CI->getNumArgOperands();
+        
+
+        assert((numArgs - numDims == 2 ||
+                numArgs - numDims == 3) &&
+                "Invalid number of arguments passed to __hpvm__createNodeND call");
 
         Function *CreateNodeF;
         switch (numDims) {
@@ -463,22 +583,41 @@ bool GenHPVM::runOnModule(Module &M) {
 
         // Get i8* cast to function pointer
         Function *graphFunc = cast<Function>(CI->getArgOperand(1));
+        auto OldFuncPosition = std::find(functions.begin(), functions.end(), graphFunc);
         graphFunc = transformReturnTypeToStruct(graphFunc);
         Constant *F =
             ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
 
+        // Insert transformed functions into list of functions
+        // to process.
+        std::replace(functions.begin(), functions.end(), *OldFuncPosition, graphFunc);
+
         CallInst *CreateNodeInst;
+
+        // Each Node has a default criticality of 0
+        ConstantInt* NodeCriticality = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+        // If optional criticality provided
+        if(numArgs - numDims == 3){
+            ConstantInt* Crit = dyn_cast<ConstantInt>(CI->getArgOperand(numArgs - 1)
+                    );
+            assert(Crit && "Criticality Value must be a constant integer");
+            NodeCriticality = Crit;
+        }
+
         switch (numDims) {
-        case 0:
-          CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F),
+        case 0: {
+          Value *CreateNodeArgs[] = {F, NodeCriticality};
+          CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(CreateNodeArgs,2),
                                             graphFunc->getName() + ".node", CI);
-          break;
+                
+        } break;
         case 1: {
           assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 2, expected to be i64\n");
-          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)};
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), NodeCriticality};
           CreateNodeInst = CallInst::Create(
-              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2),
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3),
               graphFunc->getName() + ".node", CI);
         } break;
         case 2: {
@@ -487,9 +626,9 @@ bool GenHPVM::runOnModule(Module &M) {
           assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 3, expected to be i64\n");
           Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
-                                     CI->getArgOperand(3)};
+                                     CI->getArgOperand(3),NodeCriticality};
           CreateNodeInst = CallInst::Create(
-              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3),
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4),
               graphFunc->getName() + ".node", CI);
         } break;
         case 3: {
@@ -501,9 +640,10 @@ bool GenHPVM::runOnModule(Module &M) {
                  "CreateNodeND dimension argument, 4, expected to be i64\n");
           Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
                                      CI->getArgOperand(3),
-                                     CI->getArgOperand(4)};
+                                     CI->getArgOperand(4),
+                                     NodeCriticality};
           CreateNodeInst = CallInst::Create(
-              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4),
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 5),
               graphFunc->getName() + ".node", CI);
         } break;
         default:
@@ -729,6 +869,9 @@ bool GenHPVM::runOnModule(Module &M) {
       if (isHPVMCall_cos(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased);
       }
+      if (isHPVMCall_isNonZeroLoop(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_nz_loop, &toBeErased);
+      }
       if (isHPVMCall_tensor_convolution(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_tensor_convolution,
                                  &toBeErased);
@@ -868,16 +1011,17 @@ void GenHPVM::initializeTimerSet(Instruction *InsertBefore) {
             GlobalValue::CommonLinkage,
             Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
             "hpvmTimerSet_GenHPVM"));
-  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet
-               << "\n");
+  TIMER(DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet
+                     << "\n"));
   // DEBUG(errs() << "Inserting call to: " << *llvm_hpvm_initializeTimerSet <<
   // "\n");
 
   TIMER(TimerSetAddr = CallInst::Create(llvm_hpvm_initializeTimerSet, None, "",
                                         InsertBefore));
-  DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n");
+  TIMER(DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n"));
   TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
-  DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n");
+  TIMER(DEBUG(errs() << "Store Timer Address in Global variable: " << *SI
+                     << "\n"));
 }
 
 void GenHPVM::switchToTimer(enum hpvm_TimerID timer,