From 330fc7698e1ad28ddc6770e21255e480c737ab3d Mon Sep 17 00:00:00 2001
From: Maria Kotsifakou <kotsifa2@illinois.edu>
Date: Mon, 7 Aug 2017 16:19:05 -0500
Subject: [PATCH] Compiler and runtime support for scheduling on different
 targets (CPU/GPU/SPIR) based on policy. Support for scheduling per iteration
 included. Further testing required.

---
 llvm/include/llvm/IR/DFGraph.h                | 157 +++++
 llvm/include/llvm/SupportVISC/DFG2LLVM.h      |  40 ++
 llvm/include/llvm/SupportVISC/VISCHint.h      |   6 +-
 llvm/include/llvm/SupportVISC/VISCUtils.h     | 182 ++++++
 llvm/lib/Transforms/BuildDFG/BuildDFG.cpp     |  20 +-
 .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         |  28 +-
 .../Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp  | 577 +++++++++++++++---
 llvm/lib/Transforms/GenVISC/GenVISC.cpp       |  60 +-
 llvm/projects/visc-rt/CMakeLists.txt          |   7 +
 llvm/projects/visc-rt/policy.h                |  35 ++
 llvm/projects/visc-rt/visc-rt.cpp             |  14 +
 llvm/projects/visc-rt/visc-rt.h               |   8 +-
 12 files changed, 958 insertions(+), 176 deletions(-)
 create mode 100644 llvm/projects/visc-rt/policy.h

diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h
index 210270e809..d068301bc7 100644
--- a/llvm/include/llvm/IR/DFGraph.h
+++ b/llvm/include/llvm/IR/DFGraph.h
@@ -29,6 +29,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/SupportVISC/VISCUtils.h"
 
 
 namespace llvm {
@@ -42,6 +43,17 @@ class DFTreeTraversal;
 class DFEdgeVisitor;
 class DFGraph;
 
+struct TargetGenFunctions {       // Backend-generated function per target
+  Function *CPUGenFunc = NULL;    // NULL while no CPU function is stored
+  Function *GPUGenFunc = NULL;    // NULL while no GPU function is stored
+  Function *SPIRGenFunc = NULL;   // NULL while no SPIR function is stored
+};
+
+struct TargetGenFuncInfo {        // Is the per-target function an x86 one?
+  bool cpu_hasX86Func = false;    // flag for the CPU_TARGET function
+  bool gpu_hasX86Func = false;    // flag for the GPU_TARGET function
+  bool spir_hasX86Func = false;   // flag for the SPIR_TARGET function
+};
 
 class DFGraph {
 
@@ -194,6 +206,12 @@ private:
   IntrinsicInst* II;              ///< Associated IntrinsicInst/Value
   Function* FuncPointer;          ///< Associated Function
   Function* GenFunc = NULL;       ///< Associated Function generated by backend
+  struct TargetGenFunctions GenFuncs;
+                                  ///< Associated Functions generated by backends
+                                  ///< (if multiple are available)
+  struct TargetGenFuncInfo GenFuncInfo;
+                                  ///< True for each target generated function
+                                  ///< if the associated genFunc is an x86 function
   DFInternalNode* Parent;         ///< Pointer to parent dataflow Node
   unsigned NumOfDim;              ///< Number of dimensions
   std::vector<Value*> DimLimits;  ///< Number of instances in each dimension
@@ -375,6 +393,136 @@ public:
     return GenFunc;
   }
 
+  /// Records whether the function generated for the single target T is an
+  /// x86 function. Only CPU, GPU and SPIR own a flag; a dual tag or any
+  /// other value trips an assertion and leaves every flag untouched.
+  /// \param T          single target whose flag is written
+  /// \param isX86Func  value stored into that flag
+  void setHasX86FuncForTarget(visc::Target T, bool isX86Func) {
+    if (T == visc::CPU_TARGET) {
+      GenFuncInfo.cpu_hasX86Func = isX86Func;
+    } else if (T == visc::GPU_TARGET) {
+      GenFuncInfo.gpu_hasX86Func = isX86Func;
+    } else if (T == visc::SPIR_TARGET) {
+      GenFuncInfo.spir_hasX86Func = isX86Func;
+    } else if (T == visc::CPU_OR_GPU_TARGET ||
+               T == visc::CPU_OR_SPIR_TARGET) {
+      // A dual tag names two targets, so no single flag can be picked.
+      assert(false && "Single target expected (CPU/GPU/SPIR)\n");
+    } else {
+      // Any remaining value is not one of the targets handled above.
+      assert(false && "Unknown target\n");
+    }
+  }
+
+  /// Reports whether the function generated for the single target T is an
+  /// x86 function. Dual or unknown targets assert, then yield false.
+  bool hasX86GenFuncForTarget(visc::Target T) {
+    if (T == visc::CPU_TARGET)
+      return GenFuncInfo.cpu_hasX86Func;
+    if (T == visc::GPU_TARGET)
+      return GenFuncInfo.gpu_hasX86Func;
+    if (T == visc::SPIR_TARGET)
+      return GenFuncInfo.spir_hasX86Func;
+
+    // No flag exists for this value; pick the matching diagnostic.
+    if (T == visc::CPU_OR_GPU_TARGET || T == visc::CPU_OR_SPIR_TARGET) {
+      assert(false && "Single target expected (CPU/GPU/SPIR)\n");
+      return false;
+    }
+    assert(false && "Unknown target\n");
+    return false;
+  }
+
+  /// Registers F as the function generated for the single target T, notes
+  /// whether F is an x86 function and merges T into the node's Tag. A
+  /// function already registered for T is overwritten after a warning.
+  /// \param F          generated function to store
+  /// \param T          single target (CPU/GPU/SPIR) that F was generated for
+  /// \param isX86Func  whether F is an x86 function
+  void addGenFunc(Function* F, visc::Target T, bool isX86Func) {
+    // Store F in the slot that matches T, warning if the slot is taken.
+    if (T == visc::CPU_TARGET) {
+      if (GenFuncs.CPUGenFunc != NULL)
+        errs() << "Warning: Second generated CPU function for node "
+               << FuncPointer->getName() << "\n";
+      GenFuncs.CPUGenFunc = F;
+      GenFuncInfo.cpu_hasX86Func = isX86Func;
+    } else if (T == visc::GPU_TARGET) {
+      if (GenFuncs.GPUGenFunc != NULL)
+        errs() << "Warning: Second generated GPU function for node "
+               << FuncPointer->getName() << "\n";
+      GenFuncs.GPUGenFunc = F;
+      GenFuncInfo.gpu_hasX86Func = isX86Func;
+    } else if (T == visc::SPIR_TARGET) {
+      if (GenFuncs.SPIRGenFunc != NULL)
+        errs() << "Warning: Second generated SPIR function for node "
+               << FuncPointer->getName() << "\n";
+      GenFuncs.SPIRGenFunc = F;
+      GenFuncInfo.spir_hasX86Func = isX86Func;
+    } else if (T == visc::CPU_OR_GPU_TARGET ||
+               T == visc::CPU_OR_SPIR_TARGET) {
+      // Dual tags are node hints, not the type of one generated function.
+      assert(false &&
+             "A node function should be set with a tag specifying its \
+                type, not the node hint itself\n");
+    } else {
+      assert(false && "Unknown target for generated function\n");
+    }
+
+    // Merge T into the tag. This runs for every T; getUpdatedTag itself
+    // asserts that T is one of CPU/GPU/SPIR.
+    Tag = viscUtils::getUpdatedTag(Tag,T);
+  }
+
+  /// Looks up the function generated for the single target T; NULL when
+  /// none is stored. Dual or unknown targets assert, then yield NULL.
+  Function* getGenFuncForTarget(visc::Target T) {
+    if (T == visc::CPU_TARGET)
+      return GenFuncs.CPUGenFunc;
+    if (T == visc::GPU_TARGET)
+      return GenFuncs.GPUGenFunc;
+    if (T == visc::SPIR_TARGET)
+      return GenFuncs.SPIRGenFunc;
+
+    if (T == visc::CPU_OR_GPU_TARGET || T == visc::CPU_OR_SPIR_TARGET) {
+      assert(false &&
+             "Requesting genarated node function with dual tag instead of \
+                CPU/GPU/SPIR\n");
+      return NULL;
+    }
+    assert(false && "Unknown target for generated function\n");
+    return NULL;
+  }
+
+  /// Drops the function generated for the single target T: its slot becomes
+  /// NULL and its x86 flag false. Dual or unknown targets only assert.
+  void removeGenFuncForTarget(visc::Target T) {
+    DEBUG(errs() << "Target tag = " << T << "\n");  // trace, -debug only
+    switch (T) {
+      case visc::CPU_TARGET:
+        GenFuncs.CPUGenFunc = NULL;
+        GenFuncInfo.cpu_hasX86Func = false;
+        break;
+      case visc::GPU_TARGET:
+        GenFuncs.GPUGenFunc = NULL;
+        GenFuncInfo.gpu_hasX86Func = false;
+        break;
+      case visc::SPIR_TARGET:
+        GenFuncs.SPIRGenFunc = NULL;
+        GenFuncInfo.spir_hasX86Func = false;
+        break;
+      case visc::CPU_OR_GPU_TARGET:
+      case visc::CPU_OR_SPIR_TARGET:
+        assert(false &&
+               "Removing genarated node function with dual tag instead of \
+                CPU/GPU/SPIR\n");
+        break;
+      default:
+        assert(false && "Unknown target for generated function\n");
+    }
+  }
+
   void setTargetHint(visc::Target T) {
     Hint = T;
   }
@@ -645,6 +793,15 @@ DFNode::DFNode(IntrinsicInst* _II, Function* _FuncPointer, visc::Target _Hint,
   OutputType = cast<StructType>(Ty);
   Level = (_Parent) ? _Parent->getLevel() + 1 : 0 ;
   Rank = 0;
+
+  Tag = visc::None;
+  GenFuncs.CPUGenFunc = NULL;
+  GenFuncs.GPUGenFunc = NULL;
+  GenFuncs.SPIRGenFunc = NULL;
+
+  GenFuncInfo.cpu_hasX86Func = false;
+  GenFuncInfo.gpu_hasX86Func = false;
+  GenFuncInfo.spir_hasX86Func = false;
 }
 
 void DFNode::setRank(int r) {
diff --git a/llvm/include/llvm/SupportVISC/DFG2LLVM.h b/llvm/include/llvm/SupportVISC/DFG2LLVM.h
index 5cc8a79564..a036d255c8 100644
--- a/llvm/include/llvm/SupportVISC/DFG2LLVM.h
+++ b/llvm/include/llvm/SupportVISC/DFG2LLVM.h
@@ -111,6 +111,7 @@ public:
   CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
 
   static bool checkPreferredTarget(DFNode* N, visc::Target T);
+  static bool preferredTargetIncludes(DFNode* N, visc::Target T);
 
 
   virtual void visit(DFInternalNode* N) {
@@ -169,6 +170,45 @@ bool CodeGenTraversal::checkPreferredTarget(DFNode* N, visc::Target T) {
   return false;
 }
 
+// True if N's function is listed under a hint that allows single target T.
+bool CodeGenTraversal::preferredTargetIncludes(DFNode* N, visc::Target T) {
+  Function* F = N->getFuncPointer();
+  Module* M = F->getParent();
+  std::vector<NamedMDNode *> HintNode;
+  switch (T) {
+    case visc::GPU_TARGET:
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu"));
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
+      break;
+    case visc::SPIR_TARGET:
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir"));
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+      break;
+    case visc::CPU_TARGET:
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu"));
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
+      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+      break;
+    case visc::CPU_OR_GPU_TARGET:
+    case visc::CPU_OR_SPIR_TARGET:
+      assert(false && "Target should be one of CPU/GPU/SPIR\n");
+      break;
+    default:
+      llvm_unreachable("Target Not supported yet!");
+  }
+
+  for (NamedMDNode *Hints : HintNode) {
+    for (unsigned i = 0, e = Hints->getNumOperands(); i != e; ++i) {
+      Metadata *Op = Hints->getOperand(i)->getOperand(0).get();
+      ValueAsMetadata *VAM = dyn_cast_or_null<ValueAsMetadata>(Op);
+      if (VAM && VAM->getValue() == F)  // skip null / non-value operands
+        return true;
+    }
+  }
+  return false;
+}
+
+
 // Generate Code for declaring a constant string [L x i8] and return a pointer
 // to the start of it.
 Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
diff --git a/llvm/include/llvm/SupportVISC/VISCHint.h b/llvm/include/llvm/SupportVISC/VISCHint.h
index 405b4e359f..aa9ee86364 100644
--- a/llvm/include/llvm/SupportVISC/VISCHint.h
+++ b/llvm/include/llvm/SupportVISC/VISCHint.h
@@ -20,10 +20,14 @@ namespace visc {
     CPU_TARGET,
     GPU_TARGET,
     SPIR_TARGET,
+    CPU_OR_GPU_TARGET,
+    CPU_OR_SPIR_TARGET,
+//    ALL_TARGETS,
     NUM_TARGETS
   };
 
 #ifdef __cplusplus
 }
 #endif
-#endif //VISC_RT_HEADER
+
+#endif //VISC_HINT_HEADER
diff --git a/llvm/include/llvm/SupportVISC/VISCUtils.h b/llvm/include/llvm/SupportVISC/VISCUtils.h
index 47e7582e5e..0077c81441 100644
--- a/llvm/include/llvm/SupportVISC/VISCUtils.h
+++ b/llvm/include/llvm/SupportVISC/VISCUtils.h
@@ -8,13 +8,24 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef VISC_UTILS_HEADER
+#define VISC_UTILS_HEADER
+
+#include <assert.h>
+ 
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 using namespace llvm;
 
@@ -147,6 +158,16 @@ void fixHintMetadata(Module &M, Function* F, Function* G) {
         if(HintNode->getOperand(i) == MDT_F)
            HintNode->setOperand(0, MDT_G);
     }
+    HintNode = M.getOrInsertNamedMetadata("visc_hint_cpu_gpu");
+    for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+        if(HintNode->getOperand(i) == MDT_F)
+           HintNode->setOperand(0, MDT_G);
+    }
+    HintNode = M.getOrInsertNamedMetadata("visc_hint_cpu_spir");
+    for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+        if(HintNode->getOperand(i) == MDT_F)
+           HintNode->setOperand(0, MDT_G);
+    }
 }
 
 // Assuming that the changed function is a node function, it is only used as a
@@ -276,6 +297,167 @@ Function* cloneFunction(Function* F, FunctionType* newFT, bool
   return newF;
 }
 
+ //------------------- Helper Functions For Handling Hints -------------------//
+  
+// Returns true if the first argument (Tag) includes the second (T):
+//  - None includes nothing;
+//  - a single-target tag includes only that same target;
+//  - a dual tag includes each of its two targets and itself.
+// An unknown Tag asserts. With assertions compiled out it yields false,
+// so control never runs off the end of this non-void function.
+// Tag and T are both taken by value and nothing else is read or written.
+bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
+  switch (Tag) {
+    case visc::None:
+      return false;
+
+    // Single-target tags.
+    case visc::CPU_TARGET:
+      return T == visc::CPU_TARGET;
+    case visc::GPU_TARGET:
+      return T == visc::GPU_TARGET;
+    case visc::SPIR_TARGET:
+      return T == visc::SPIR_TARGET;
+
+    // Dual tags.
+    case visc::CPU_OR_GPU_TARGET:
+      return (T == visc::CPU_TARGET) ||
+             (T == visc::GPU_TARGET) ||
+             (T == visc::CPU_OR_GPU_TARGET);
+    case visc::CPU_OR_SPIR_TARGET:
+      return (T == visc::CPU_TARGET) ||
+             (T == visc::SPIR_TARGET) ||
+             (T == visc::CPU_OR_SPIR_TARGET);
+
+    default:
+      break;
+  }
+
+  // Only an unrecognized Tag gets here; every known case returned above.
+  assert(false && "Unknown Target\n");
+  return false;
+}
+
+// True only for the tags naming exactly one target: CPU, GPU or SPIR.
+bool isSingleTargetTag(visc::Target T) {
+  if (T == visc::CPU_TARGET || T == visc::GPU_TARGET) return true;
+  return T == visc::SPIR_TARGET;
+}
+
+// Returns Tag extended with the single target T (CPU, GPU or SPIR). A GPU
+// and SPIR mix is unsupported and asserts; with assertions compiled out,
+// a GPU/SPIR mix or an unknown Tag yields Tag unchanged.
+visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) {
+  assert(((T == visc::CPU_TARGET) ||
+          (T == visc::GPU_TARGET) ||
+          (T == visc::SPIR_TARGET)) &&
+         "The target is only allowed to be CPU, GPU, or SPIR\n");
+
+  switch (Tag) {
+    case visc::None:
+      return T;
+    case visc::CPU_TARGET:
+      if (T == visc::GPU_TARGET)
+        return visc::CPU_OR_GPU_TARGET;
+      if (T == visc::SPIR_TARGET)
+        return visc::CPU_OR_SPIR_TARGET;
+      return visc::CPU_TARGET;
+    case visc::GPU_TARGET:
+      assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n");
+      if (T == visc::CPU_TARGET)
+        return visc::CPU_OR_GPU_TARGET;
+      return visc::GPU_TARGET;
+    case visc::SPIR_TARGET:
+      assert((T != visc::GPU_TARGET) && "Unsupported target combination\n");
+      if (T == visc::CPU_TARGET)
+        return visc::CPU_OR_SPIR_TARGET;
+      return visc::SPIR_TARGET;
+    case visc::CPU_OR_GPU_TARGET:
+      assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n");
+      return visc::CPU_OR_GPU_TARGET;
+    case visc::CPU_OR_SPIR_TARGET:
+      assert((T != visc::GPU_TARGET) && "Unsupported target combination\n");
+      return visc::CPU_OR_SPIR_TARGET;
+    default:
+      assert(false && "Unknown Target\n");
+      return Tag;
+  }
+}
+
+// Records target hint T for function F by appending F to the module-level
+// named metadata list that corresponds to T.
+void addHint(Function* F, visc::Target T) {
+  Module* Mod = F->getParent();
+  DEBUG(errs() << "Set preferred target for " << F->getName() << ": ");
+  // Pick the name of the metadata list that matches the hint.
+  const char* HintMDName = NULL;
+  switch (T) {
+    case visc::GPU_TARGET:
+      DEBUG(errs() << "GPU Target\n");
+      HintMDName = "visc_hint_gpu";
+      break;
+    case visc::SPIR_TARGET:
+      DEBUG(errs() << "SPIR Target\n");
+      HintMDName = "visc_hint_spir";
+      break;
+    case visc::CPU_TARGET:
+      DEBUG(errs() << "CPU Target\n");
+      HintMDName = "visc_hint_cpu";
+      break;
+    case visc::CPU_OR_GPU_TARGET:
+      DEBUG(errs() << "CPU or GPU Target\n");
+      HintMDName = "visc_hint_cpu_gpu";
+      break;
+    case visc::CPU_OR_SPIR_TARGET:
+      DEBUG(errs() << "CPU or SPIR Target\n");
+      HintMDName = "visc_hint_cpu_spir";
+      break;
+    default:
+      llvm_unreachable("Unsupported Target Hint!");
+  }
+  NamedMDNode* HintList = Mod->getOrInsertNamedMetadata(HintMDName);
+
+  // Wrap F in a one-operand tuple and append it to the chosen list.
+  Metadata* FuncMD = ValueAsMetadata::get(F);
+  MDTuple* Entry = MDNode::get(Mod->getContext(), ArrayRef<Metadata*>(FuncMD));
+  HintList->addOperand(Entry);
+}
+
+// Returns the target hint recorded for F in the module's hint metadata.
+// The lists are searched in the order gpu, spir, cpu_gpu, cpu_spir and the
+// first one that mentions F wins; CPU_TARGET is the answer when none does.
+// Each list is looked up with getOrInsertNamedMetadata when it is reached.
+visc::Target getPreferredTarget(Function* F) {
+  DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
+  Module* M = F->getParent();
+
+  // Metadata list name paired with the hint it stands for, in lookup order.
+  static const struct {
+    const char* MDName;
+    visc::Target Hint;
+  } HintTable[] = {
+    {"visc_hint_gpu", visc::GPU_TARGET},
+    {"visc_hint_spir", visc::SPIR_TARGET},
+    {"visc_hint_cpu_gpu", visc::CPU_OR_GPU_TARGET},
+    {"visc_hint_cpu_spir", visc::CPU_OR_SPIR_TARGET},
+  };
+
+  for (const auto& Entry : HintTable) {
+    NamedMDNode* HintNode = M->getOrInsertNamedMetadata(Entry.MDName);
+    for (unsigned i = 0, e = HintNode->getNumOperands(); i != e; ++i) {
+      // An operand that is null or does not wrap a value is skipped
+      // instead of being dereferenced.
+      ValueAsMetadata* VAM = dyn_cast_or_null<ValueAsMetadata>(
+          HintNode->getOperand(i)->getOperand(0).get());
+      if (VAM && VAM->getValue() == F)
+        return Entry.Hint;
+    }
+  }
+
+  return visc::CPU_TARGET;
+}
+
 
 } // End of namespace
 
+#endif //VISC_UTILS_HEADER
diff --git a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index 0abeb095a9..04b01e332b 100644
--- a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -16,24 +16,12 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/SupportVISC/VISCUtils.h"
 
 using namespace llvm;
 
 namespace builddfg {
 
-static visc::Target getPreferredTarget(Function* F) {
-  DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
-  Module* M = F->getParent();
-  NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
-  for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
-    MDNode* N = HintNode->getOperand(i);
-    Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
-    if(F == FHint)
-      return visc::GPU_TARGET;
-  }
-  return visc::CPU_TARGET;
-}
-
 bool BuildDFG::runOnModule(Module &M) {
   errs() << "\nBUILDDFG PASS\n";
   DEBUG(errs() << "-------- Searching for launch sites ----------\n");
@@ -55,7 +43,7 @@ bool BuildDFG::runOnModule(Module &M) {
 
         // Intrinsic Instruction has been initialized from this point on.
         Function* F = cast<Function>(II->getOperand(0)->stripPointerCasts());
-        Root = DFInternalNode::Create(II, F, getPreferredTarget(F));
+        Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F));
         Roots.push_back(Root);
         BuildGraph(Root, F);
 
@@ -198,14 +186,14 @@ void BuildDFG::handleCreateNode(DFInternalNode* N, IntrinsicInst* II) {
   if(isInternalNode) {
     // Create Internal DFNode, add it to the map and recursively build its
     // dataflow graph
-    DFInternalNode* childDFNode = DFInternalNode::Create(II, F, getPreferredTarget(F), N, numOfDim, dimLimits);
+    DFInternalNode* childDFNode = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
     BuildGraph(childDFNode, F);
   }
   else {
     // Create Leaf DFnode and add it to the map.
-    DFLeafNode* childDFNode = DFLeafNode::Create(II, F, getPreferredTarget(F), N, numOfDim, dimLimits);
+    DFLeafNode* childDFNode = DFLeafNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
   }
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index f4b7fe2500..bb83544969 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -321,7 +321,10 @@ void CGT_NVPTX::initRuntimeAPI() {
 void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
   // Check if clone already exists. If it does, it means we have visited this
   // function before.
-  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+//  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+
+  assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL &&
+         "Code already generated for this node");
 
   // Useful values
   Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
@@ -362,7 +365,8 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
                                       UndefValue::get(F_X86->getReturnType()), BB);
 
   //Add the generated function info to DFNode
-  N->setGenFunc(F_X86, visc::CPU_TARGET);
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::GPU_TARGET, true);
 
   // FIXME: Adding Index and Dim arguments are probably not required except
   // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
@@ -814,7 +818,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 // Right now, only targeting the one level case. In general, device functions
 // can return values so we don't need to change them
 void CGT_NVPTX::codeGen(DFInternalNode* N) {
-  errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n";
+  errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n";
   if(KernelLaunchNode == NULL)
     errs () << "No kernel launch node\n";
   else {
@@ -901,6 +905,7 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
 }
 
 void CGT_NVPTX::codeGen(DFLeafNode* N) {
+  errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n";
 
   // Skip code generation if it is a dummy node
   if(N->isDummyNode()) {
@@ -915,7 +920,11 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   }
 
   // Generate code only if it has the right hint
-  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
+//  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
+//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+//    return;
+//  }
+  if(!preferredTargetIncludes(N, visc::GPU_TARGET)) {
     errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
     return;
   }
@@ -969,7 +978,8 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // Look up if we have visited this function before. If we have, then just
   // get the cloned function pointer from DFNode. Otherwise, create the cloned
   // function and add it to the DFNode GenFunc.
-  Function *F_nvptx = N->getGenFunc();
+//  Function *F_nvptx = N->getGenFunc();
+  Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET);
 
   assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated");
   // Clone the function
@@ -978,13 +988,12 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   Twine FName = F->getName();
   F_nvptx = CloneFunction(F, VMap);
   F_nvptx->setName(FName+"_nvptx");
-  errs() << "Old Function Name: " << F->getName() << "\n";
-  errs() << "New Function Name: " << F_nvptx->getName() << "\n";
+//  errs() << "Old Function Name: " << F->getName() << "\n";
+//  errs() << "New Function Name: " << F_nvptx->getName() << "\n";
 
   F_nvptx->removeFromParent();
 
 
-
   // Insert the cloned function into the kernels module
   KernelM->getFunctionList().push_back(F_nvptx);
 
@@ -999,7 +1008,8 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   F_nvptx = transformFunctionToVoid(F_nvptx);
   
   //Add generated function info to DFNode
-  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
+//  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
+  N->addGenFunc(F_nvptx, visc::GPU_TARGET, false);
 
   DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n");
   F_nvptx->removeAttributes(AttributeSet::FunctionIndex, F_nvptx->getAttributes().getFnAttributes());
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 6da326f701..8794f423e0 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -29,8 +29,26 @@ using namespace dfg2llvm;
 static cl::opt<bool>
 VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
 
+
 namespace {
 
+// Helper Functions
+static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
+  CallInst *Call = dyn_cast<CallInst>(I);
+  if (!Call) return false;  // anything that is not a call cannot match
+  Value *Callee = Call->getCalledValue()->stripPointerCasts();
+  return Callee->getName().equals("llvm_visc_policy_getVersion");
+}
+
+// Returns the first call to llvm_visc_policy_getVersion in F, NULL if none.
+CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
+  inst_iterator It = inst_begin(F);
+  const inst_iterator End = inst_end(F);
+  while (It != End && !isVISCCall_llvm_visc_policy_getVersion(&*It))
+    ++It;
+  return (It == End) ? NULL : cast<CallInst>(&*It);
+}
+
 // DFG2LLVM_X86 - The first implementation.
 struct DFG2LLVM_X86 : public DFG2LLVM {
   static char ID; // Pass identification, replacement for typeid
@@ -79,6 +97,8 @@ private:
   Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
   void addDoWhileLoop(Instruction*, Instruction*, Value*);
   void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
+  void addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *,
+                           Instruction *);
   Argument* getArgumentFromEnd(Function* F, unsigned offset);
   Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
                       Instruction* InsertBefore);
@@ -204,13 +224,24 @@ void CGT_X86::initRuntimeAPI() {
   Instruction* I = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(I);
   switchToTimer(visc_TimerID_NONE, I);
+  // Insert code for initializing the scheduling policy
+  Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init",
+    runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()));
+  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  DEBUG(errs() << *IPCallInst << "\n");
 
   // Insert print instruction at visc exit
   Function* VC = M.getFunction("llvm.visc.cleanup");
   assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
 
-  DEBUG(errs() << "Inserting x86 timer print\n");
+  // Insert code for clearing the scheduling policy
   I = cast<Instruction>(*VC->user_begin());
+  IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear",
+    runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()));
+  IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  errs() << *IPCallInst << "\n";
+
+  DEBUG(errs() << "Inserting x86 timer print\n");
   printTimerSet(I);
 
 }
@@ -284,6 +315,30 @@ void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
 
 }
 
+// Creates an i64 counter PHI ("cnt") in Cond: 0 from Entry, +1 from Body.
+void CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+                                  BasicBlock *Body, Instruction *Cnt) {
+  Module *M = Entry->getParent()->getParent();
+  Type *Int64Ty = Type::getInt64Ty(M->getContext());
+  // Insert a PHI instruction at the beginning of the condition block
+  Instruction *IB = Cond->getFirstNonPHI();
+  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
+  // Add 1 to the counter just before the terminator of the body block
+  ConstantInt *IConst =
+    ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+  Instruction *CounterIncr =
+    BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                            "cnt_incr", Body->getTerminator());
+
+  // Set incoming values for Phi node: 0 from Entry, incremented value from Body
+  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
+  CounterPhi->addIncoming(IConst, Entry);
+  CounterPhi->addIncoming(CounterIncr, Body);
+  // NOTE(review): Cnt is passed by value, so this assignment is invisible to
+  // the caller; handing the PHI back would require Instruction *&Cnt.
+  Cnt = CounterPhi;
+}
+
 /* Add Loop around the instruction I
  * Algorithm:
  * (1) Split the basic block of instruction I into three parts, where the
@@ -755,7 +810,12 @@ void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
 
   DEBUG(errs() << "Created Empty Launch Function\n");
   // Find the X86 function generated for Root and
-  Function* RootF_X86 = Root->getGenFunc();
+//  Function* RootF_X86 = Root->getGenFunc();
+  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
+  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "Error: Generated Function for Root node with no x86 wrapper\n");
+
   // Generate a call to RootF_X86 with null parameters for now
   std::vector<Value*>Args;
   for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
@@ -873,10 +933,13 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
                               ValueToValueMapTy &VMap,Instruction* IB) {
   Function* CF = C->getFuncPointer();
 
-  Function* CF_X86 = C->getGenFunc();
-  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
+//  Function* CF_X86 = C->getGenFunc();
+  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
   assert(CF_X86 != NULL
-         && "Found leaf node for which code generation has not happened yet!");
+         && "Found leaf node for which code generation has not happened yet!\n");
+  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "The generated function to be called from x86 backend is not an x86 function\n");
+  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
 
   std::vector<Value*> Args;
   // Create argument list to pass to call instruction
@@ -1102,9 +1165,17 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   }
   /* Add a call to the generated function of the child node */
   DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
-  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
-  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
-                                  C->getGenFunc()->getName()+".output", RI);
+//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+//                                  C->getGenFunc()->getName()+".output", RI);
+  Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
+  DEBUG(errs() << "Type: "
+               << *CGenF->getType()
+               << "\n");
+  CallInst* CI = CallInst::Create(CGenF,
+                                  InputArgs,
+                                  CGenF->getName()+".output",
+                                  RI);
 
   /* Add runtime API calls to push output for each of the streaming outputs */
   // FIXME: Assumption
@@ -1134,8 +1205,59 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
 
   // Add loop around the basic block, which exits the loop if isLastInput is false
   //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond);
-  addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(),
-                RI, Cond);
+  // The former one-line addWhileLoop call is spelled out below so that the
+  // blocks around the loop can be captured for addWhileLoopCounter.
+
+  // Remember the instructions that start the loop condition and the loop
+  // body, plus the block holding the condition before the loop is built.
+  BasicBlock *EntryBB, *CondBB, *BodyBB;
+  Instruction *CondStartI = cast<Instruction>(isLastInputPop);
+  Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
+  EntryBB = CondStartI->getParent();
+
+  addWhileLoop(CondStartI, BodyStartI, RI, Cond);
+  CondBB = CondStartI->getParent();
+  BodyBB = CI->getParent();
+  Instruction *CntI = NULL;  // expected to be set by addWhileLoopCounter (asserted below)
+  CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF);
+
+  // If the node function calls the visc runtime to query the policy, we feed
+  // it the loop counter. This means passing an additional argument to the
+  // generated function, the iteration number, which is then used as the
+  // second argument of the policy_getVersion call.
+  if (GetPolicyCI) {
+    addWhileLoopCounter(EntryBB, CondBB, BodyBB, CntI);
+    assert(CntI && "Counter instruction not found\n");
+
+    // Create new function type (with additional argument for iteration number)
+    Type *NewRetTy = CGenF->getFunctionType()->getReturnType();
+    std::vector<Type*> NewArgTypes;
+    for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end();
+         ai != ae ; ++ai) {
+      NewArgTypes.push_back(ai->getType());
+    }
+    NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
+    FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false);
+    Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false);
+    // At least one (the last) argument exists (we added it)
+    Function::arg_iterator ae = NewCGenF->arg_end();
+    --ae;
+    Argument *CntArg = &*ae;
+    CntArg->setName("iteration");
+    // Replace the old cpu gen func with this one
+    C->addGenFunc(NewCGenF, visc::CPU_TARGET, true);
+
+    // Add counter to the actual parameter list, to create the new call
+    InputArgs.push_back(CntI);
+    CallInst* newCI = CallInst::Create(NewCGenF,
+                                       InputArgs,
+                                       NewCGenF->getName()+".output");
+    ReplaceInstWithInst(CI, newCI);
+
+    // Point the policy_getVersion call's second operand at the new argument.
+    // NOTE(review): assumes cloneFunction moved GetPolicyCI into NewCGenF.
+    GetPolicyCI->setArgOperand(1, CntArg);
+  }
 
   // Return the Function pointer
   DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
@@ -1151,47 +1273,23 @@ void CGT_X86::codeGen(DFInternalNode* N) {
 
   // Check if clone already exists. If it does, it means we have visited this
   // function before and nothing else needs to be done for this leaf node.
-  if(N->getGenFunc() != NULL)
+  // Nodes whose target hints do not include the CPU are skipped here; a
+  // node must not be visited twice (asserted below).
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
     return;
-
-  Function* F = N->getFuncPointer();
-  // Create of clone of F with no instructions. Only the type is the same as F
-  // without the extra arguments.
-  Function* F_X86;
-
-  // Clone the function, if we are seeing this function for the first time. We
-  // only need a clone in terms of type.
-  ValueToValueMapTy VMap;
-
-  // Create new function with the same type
-  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
-
-  // Loop over the arguments, copying the names of arguments over.
-  Function::arg_iterator dest_iterator = F_X86->arg_begin();
-  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
-       i != e; ++i) {
-    dest_iterator->setName(i->getName()); // Copy the name over...
-    // Add mapping to VMap and increment dest iterator
-    ++ dest_iterator;
-    VMap[&*i] = &*dest_iterator;
   }
 
-  // Add a basic block to this empty function
-  BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
-  ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
-                                      UndefValue::get(F_X86->getReturnType()), BB);
-
-  //Add generated function info to DFNode
-  N->setGenFunc(F_X86, visc::CPU_TARGET);
-
-  // Add Index and Dim arguments except for the root node and the child graph of
-  // parent node is not streaming
-  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
-    addIdxDimArgs(F_X86);
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
 
   // Sort children in topological order before code generation
   N->getChildGraph()->sortChildren();
 
+  // Only generate the x86 function if all (non-dummy) children have a CPU
+  // x86 function. Otherwise skip ahead to the version selection code.
+  bool codeGen = true;  // cleared when a child without a CPU x86 function is found
   for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
       ce = N->getChildGraph()->end(); ci != ce; ++ci) {
     DFNode* C = *ci;
@@ -1199,61 +1297,344 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     if (C->isDummyNode())
       continue;
 
-    // Check if Child Node has PTX tag or X86 tag
-    invokeChild_X86(C, F_X86, VMap, RI);
+    if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
+      errs() << "No CPU x86 version for child node "
+             << C->getFuncPointer()->getName()
+             << " . Skip code gen for parent node "
+             << N->getFuncPointer()->getName() << "\n";
+      codeGen = false;
+    }
   }
 
-  DEBUG(errs() << "*** Generating epilogue code for the function****\n");
-  // Generate code for output bindings
-  // Get Exit node
-  DFNode* C = N->getChildGraph()->getExit();
-  // Get OutputType of this node
-  StructType* OutTy = N->getOutputType();
-  Value *retVal = UndefValue::get(F_X86->getReturnType());
-  // Find all the input edges to exit node
-  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
-    DEBUG(errs() << "Output Edge " << i << "\n");
-    // Find the incoming edge at the requested input port
-    DFEdge* E = C->getInDFEdgeAt(i);
+  if (codeGen) {
+    Function* F = N->getFuncPointer();
+    // Create of clone of F with no instructions. Only the type is the same as F
+    // without the extra arguments.
+    Function* F_X86;
+
+    // Clone the function, if we are seeing this function for the first time. We
+    // only need a clone in terms of type.
+    ValueToValueMapTy VMap;
+
+    // Create new function with the same type
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+    errs() << "--------------" << F->getName() << "\n";
+    // Loop over the arguments, copying the names of arguments over.
+    Function::arg_iterator dest_iterator = F_X86->arg_begin();
+    DEBUG(errs() << "Created empty x86 function for internal node\n");
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName()); // Copy the name over...
+      // Add mapping to VMap and increment dest iterator
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+
+    DEBUG(errs() << "Mapped arguments of internal node function\n");
+
+    // Add a basic block to this empty function
+    BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
+    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
+                                        UndefValue::get(F_X86->getReturnType()), BB);
+
+    //Add generated function info to DFNode
+    // (registered for the CPU target; replaces the old N->setGenFunc call)
+    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+    // Add Index and Dim arguments except for the root node and the child graph of
+    // parent node is not streaming
+    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+      addIdxDimArgs(F_X86);
+
+    // Iterate over children in topological order
+    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+      DFNode* C = *ci;
+      // Skip dummy node call
+      if (C->isDummyNode())
+        continue;
+
+      // Create calls to CPU function of child node
+      invokeChild_X86(C, F_X86, VMap, RI);
+
+    }
+
+    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+    // Generate code for output bindings
+    // Get Exit node
+    DFNode* C = N->getChildGraph()->getExit();
+    // Get OutputType of this node
+    StructType* OutTy = N->getOutputType();
+    Value *retVal = UndefValue::get(F_X86->getReturnType());
+    // Find all the input edges to exit node
+    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+      DEBUG(errs() << "Output Edge " << i << "\n");
+      // Find the incoming edge at the requested input port
+      DFEdge* E = C->getInDFEdgeAt(i);
+
+      assert(E && "No Binding for output element!");
+      // Find the Source DFNode associated with the incoming edge
+      DFNode* SrcDF = E->getSourceDF();
+
+      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+
+      // If Source DFNode is a dummyNode, edge is from parent. Get the
+      // argument from argument list of this internal node
+      Value* inputVal;
+      if(SrcDF->isEntryNode()) {
+        inputVal = getArgumentAt(F_X86, i);
+        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+      }
+      else {
+        // edge is from a internal node
+        // Check - code should already be generated for this source dfnode
+        assert(OutputMap.count(SrcDF)
+               && "Source node call not found. Dependency violation!");
+
+        // Find Output Value associated with the Source DFNode using OutputMap
+        Value* CI = OutputMap[SrcDF];
+
+        // Extract element at source position from this call instruction
+        std::vector<unsigned> IndexList;
+        IndexList.push_back(E->getSourcePosition());
+        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                               "",RI);
+        inputVal = EI;
+      }
+      std::vector<unsigned> IdxList;
+      IdxList.push_back(i);
+      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+    }
+    DEBUG(errs() << "Extracted all\n");
+    retVal->setName("output");
+    // Swap the placeholder undef return for one returning the built struct
+    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReplaceInstWithInst(RI, newRI);
 
-    assert(E && "No Binding for output element!");
-    // Find the Source DFNode associated with the incoming edge
-    DFNode* SrcDF = E->getSourceDF();
+  }
 
-    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+  //-------------------------------------------------------------------------//
+  // Here, we check whether this node (N) has more than one version.
+  // If so, we build a wrapper that queries the policy and calls one of them.
+  // If not, we see which version exists, check that it is in fact an x86
+  // function and save it as the CPU_TARGET function.
+
+  // TODO: visc_id per node, so it can serve as the node id for policies.
+  // For now the node function name is used; change it later.
+  errs() << "Node Name (for policy) : "
+         << N->getFuncPointer()->getName() << "\n";
+  Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+  Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+  Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+  bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+  bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+  bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+  errs() << "Node: " << N->getFuncPointer()->getName()  // NOTE(review): dump is not DEBUG-guarded
+                     << " with tag " << N->getTag() << "\n";
+  errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
+  errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
+  errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
+
+
+  if (viscUtils::isSingleTargetTag(N->getTag())) {
+    // There is a single version for this node according to code gen hints.
+    // Therefore, we do not need to check the policy, we simply use the
+    // available implementation, whichever target it is for.
+
+    // Sanity check - to be removed TODO
+    switch (N->getTag()) {
+      case visc::CPU_TARGET:
+        assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+        break;
+      case visc::GPU_TARGET:
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+        break;
+      case visc::SPIR_TARGET:
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && "");
+        break;
+      default:
+        assert(false && "Unreachable: we checked that tag was single target!\n");
+        break;
+    }
 
-    // If Source DFNode is a dummyNode, edge is from parent. Get the
-    // argument from argument list of this internal node
-    Value* inputVal;
-    if(SrcDF->isEntryNode()) {
-      inputVal = getArgumentAt(F_X86, i);
-      DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+    N->addGenFunc(N->getGenFuncForTarget(N->getTag()),
+                  visc::CPU_TARGET,
+                  true);
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::CPU_TARGET);
+
+    // Sanity checks - to be removed TODO
+    CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+    GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+    CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+    GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+    errs() << "After editing\n";
+    errs() << "Node: " << N->getFuncPointer()->getName()
+                       << " with tag " << N->getTag() << "\n";
+    errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
+    errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
+    errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
+
+    //  assert(false && "got to the point where we have to select\n");
+  } else {
+    // We have more than one target: build a wrapper function that asks the
+    // policy which version to call and forwards its arguments to it.
+    Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+    Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+    bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+    bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+    // These assertions express what we can support with the current runtime.
+    // Code generation works the same way even for other target combinations.
+    // For now, we want either CPU and GPU, or CPU and SPIR
+    assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n");
+    assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) &&
+           "Generated functions without appropriate x86 wrapper\n");
+
+    FunctionType *FT = CF->getFunctionType();
+    if (GF)
+      assert(FT == GF->getFunctionType() &&
+             "Type mismatch between generated functions for GPU and CPU targets.\n");
+    if (SF)
+      assert(FT == SF->getFunctionType() &&
+             "Type mismatch between generated functions for SPIR and CPU targets.\n");
+
+    // Code generation of wrapper function
+    Function *F_wrapper;
+    ValueToValueMapTy VMap;
+    F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M);
+
+    // Copy argument names over
+    Function::arg_iterator dest_iterator = F_wrapper->arg_begin();
+    for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName());
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
     }
-    else {
-      // edge is from a internal node
-      // Check - code should already be generated for this source dfnode
-      assert(OutputMap.count(SrcDF)
-             && "Source node call not found. Dependency violation!");
-
-      // Find Output Value associated with the Source DFNode using OutputMap
-      Value* CI = OutputMap[SrcDF];
-
-      // Extract element at source position from this call instruction
-      std::vector<unsigned> IndexList;
-      IndexList.push_back(E->getSourcePosition());
-      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                             "",RI);
-      inputVal = EI;
+    // Gather all arguments of wrapper in a vector, to prepare the call to
+    // the individual gen functions
+    std::vector<Value *> GenFuncCallArgs;
+    for (Function::arg_iterator i = F_wrapper->arg_begin(), e = F_wrapper->arg_end();
+         i != e; ++i) {
+      GenFuncCallArgs.push_back(&*i);
+    }
+
+    BasicBlock *BBcurrent, *BBtrue, *BBfalse;
+
+    BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper);
+
+    StringRef FName = N->getFuncPointer()->getName();
+    size_t nameSize = FName.size()+1;  // +1 for the terminating '\0'
+    std::vector<Constant *> NameV;
+    for (char c: FName) {
+      NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c));
     }
-    std::vector<unsigned> IdxList;
-    IdxList.push_back(i);
-    retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+    NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0'));
+    ArrayType *NameType =
+      ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize);
+    AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent);
+    Constant *NameConst = ConstantArray::get(NameType, NameV);
+    StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent);
+    CastInst *BI = BitCastInst::CreatePointerCast(AI,
+                     Type::getInt8PtrTy(M.getContext()), "", BBcurrent);
+    std::vector<Value *> Args;
+    Args.push_back(BI);
+    Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true));
+    Function *RTF =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion",
+      runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType()));
+    CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent);
+
+    ConstantInt *CmpConst =  // policy answer 0 selects the CPU version
+      ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true);
+    CmpInst *CmpI = CmpInst::Create(Instruction::ICmp,
+                                    CmpInst::ICMP_EQ,
+                                    RTFInst, CmpConst,
+                                    "", BBcurrent);
+
+    BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper);
+    BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper);
+    BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+    CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue);
+    ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (GF) {
+      // We have a GPU version. Generate policy check and call
+      CmpConst =
+         ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue =  BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+      GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+    }
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (SF) {
+      // We have a SPIR version. Generate policy check and call
+      CmpConst =
+         ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue =  BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+      GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+    }
+
+    // No version matched the policy's answer: the last block returns undef
+    RI = ReturnInst::Create(M.getContext(),
+                            UndefValue::get(FT->getReturnType()), BBfalse);
+
+    // Now, make the node cpu gen func to be this one
+    // Remove all other versions and update the tag
+    N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::CPU_TARGET);
+
+    DEBUG(errs() << "Generated policy wrapper " << F_wrapper->getName() << "\n");
   }
-  DEBUG(errs() << "Extracted all\n");
-  retVal->setName("output");
-  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
-  ReplaceInstWithInst(RI, newRI);
 
 }
 
@@ -1267,8 +1648,17 @@ void CGT_X86::codeGen(DFLeafNode* N) {
 
   // Check if clone already exists. If it does, it means we have visited this
   // function before and nothing else needs to be done for this leaf node.
-  if(N->getGenFunc() != NULL)
+  // Leaf nodes whose target hints do not include the CPU are skipped here;
+  // a node must not be visited twice (asserted below).
+
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
     return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
 
   std::vector<IntrinsicInst *> IItoRemove;
   std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
@@ -1286,7 +1676,8 @@ void CGT_X86::codeGen(DFLeafNode* N) {
   M.getFunctionList().push_back(F_X86);
 
   // Add generated function info to DFNode
-  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  // Registered as the node's CPU-target function (was: N->setGenFunc).
+  N->addGenFunc(F_X86, visc::CPU_TARGET, true);  // 'true' presumably marks it as an x86 function - confirm in DFGraph.h
 
   // Add the new argument to the argument list. Add arguments only if the cild
   // graph of parent node is not streaming
diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
index 8c0429af57..8de95a28d4 100644
--- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/SupportVISC/VISCUtils.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
@@ -240,62 +241,6 @@ static Value* genCodeForReturn(CallInst* CI) {
   return IV;
 }
 
-
-// This functions add the hint as metadata in visc code
-static void addHint(Function* F, visc::Target T) {
-  // Get Module
-  Module* M = F->getParent();
-  DEBUG(errs() << "Set preferred target for " << F->getName() << ": ");
-
-  //assert(isa<ConstantInt>(CI->getArgOperand(0))
-  //&& "Argument to hint must be constant integer!");
-  //ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0));
-
-  // Based on the hint, get the hint metadata
-  NamedMDNode* HintNode;
-  switch (T) {
-    case visc::GPU_TARGET:
-      DEBUG(errs() << "GPU Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
-      break;
-    case visc::SPIR_TARGET:
-      DEBUG(errs() << "SPIR Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
-      break;
-    case visc::CPU_TARGET:
-      DEBUG(errs() << "CPU Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
-      break;
-    default:
-      llvm_unreachable("Unsupported Target Hint!");
-      break;
-  }
-
-  // Create a node for the function and add it to the hint node
-  MDTuple* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F)));
-  HintNode->addOperand(N);
-}
-
-static visc::Target getPreferredTarget(Function* F) {
-  DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
-  Module* M = F->getParent();
-  NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
-  for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
-    MDNode* N = HintNode->getOperand(i);
-    Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
-    if(F == FHint)
-      return visc::GPU_TARGET;
-  }
-  HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
-  for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
-    MDNode* N = HintNode->getOperand(i);
-    Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
-    if(F == FHint)
-      return visc::SPIR_TARGET;
-  }
-  return visc::CPU_TARGET;
-}
-
 // The visc launch intrinsic requires all the input parameters to the kernel
 // function be placed in contiguous memory and pointer to that input be passed
 // as the second argument to the launch intrinsic. This generates code to bring
@@ -462,6 +407,9 @@ static Function* genInternalNode(Function* KernelF, unsigned level,
   if(level > 1) {
     ChildNodeF = genInternalNode(KernelF, level-1, numArgs, numDims, dimOffset, CI);
     addHint(ChildNodeF, getPreferredTarget(KernelF));
+//    Disabled alternative: internal nodes always get a CPU hint. If code
+//     generation for them is not needed and can be skipped, this is handled
+//     by the accelerator backends: addHint(ChildNodeF, visc::CPU_TARGET);
   } else {
     ChildNodeF = KernelF;
   }
diff --git a/llvm/projects/visc-rt/CMakeLists.txt b/llvm/projects/visc-rt/CMakeLists.txt
index 824f751baf..e7c5f56ef8 100644
--- a/llvm/projects/visc-rt/CMakeLists.txt
+++ b/llvm/projects/visc-rt/CMakeLists.txt
@@ -1,4 +1,11 @@
 add_custom_target(visc-rt ALL)
+add_custom_command(  # copies policy.h from the source dir into the build dir
+  TARGET visc-rt PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+    ${CMAKE_CURRENT_SOURCE_DIR}/policy.h
+    ${CMAKE_CURRENT_BINARY_DIR}/policy.h
+  DEPENDS policy.h  # NOTE(review): DEPENDS is documented for the OUTPUT form only; confirm it has any effect here
+  COMMENT "Copying policy.h")
 add_custom_command(
   TARGET visc-rt PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy
diff --git a/llvm/projects/visc-rt/policy.h b/llvm/projects/visc-rt/policy.h
new file mode 100644
index 0000000000..3aaafce539
--- /dev/null
+++ b/llvm/projects/visc-rt/policy.h
@@ -0,0 +1,35 @@
+#ifndef __POLICY__
+#define __POLICY__
+
+#include <cstdint>
+#include <string>
+
+/************************* Policies *************************************/
+// A policy maps a node name and an iteration number to a version number.
+class Policy {
+  public:
+    virtual ~Policy() {} // objects are released through a Policy pointer
+    virtual int getVersion(const char *, int64_t) = 0;
+};
+
+// Returns version 1 for the listed kernel launch node, 0 for any other node.
+class NodePolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    std::string s(name);
+    std::string NodeNames[1] = { "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" };
+    if (s.compare(NodeNames[0]) == 0) // 0 means equal: the kernel launch node
+      return 1;
+    return 0;
+  }
+};
+
+// Alternates versions: 0 on even iterations, 1 on odd iterations.
+class IterationPolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    if (it % 2 == 0)
+      return 0;
+    return 1;
+  }
+};
+
+#endif // __POLICY__
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 4cb652135b..e61802d25e 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -51,6 +51,7 @@ cl_context globalOCLContext;
 cl_device_id* clDevices;
 cl_command_queue globalCommandQue;
 
+Policy *policy = NULL;  // active scheduling policy; NULL until llvm_visc_policy_init() runs
 MemTracker MTracker;
 vector<DFGDepth> DStack;
 // Mutex to prevent concurrent access by multiple thereads in pipeline
@@ -67,6 +68,19 @@ static inline void checkErr(cl_int err, cl_int success, const char * name) {
   }
 }
 
+/************************* Policies *************************************/
+void llvm_visc_policy_init() {
+  policy = new NodePolicy(); // installs the node-name based policy
+}
+
+void llvm_visc_policy_clear() {
+  delete policy; policy = NULL; // new'd in init, so delete; Policy needs a virtual dtor
+}
+
+int llvm_visc_policy_getVersion(const char *name, int64_t i) {
+  return policy->getVersion(name, i); // hand the policy's answer back to the caller
+}
+
 /************************* Depth Stack Routines ***************************/
 
 void llvm_visc_x86_dstack_push(unsigned n, unsigned limitX, unsigned iX, unsigned limitY,
diff --git a/llvm/projects/visc-rt/visc-rt.h b/llvm/projects/visc-rt/visc-rt.h
index 00e270d002..20cc6e35a6 100644
--- a/llvm/projects/visc-rt/visc-rt.h
+++ b/llvm/projects/visc-rt/visc-rt.h
@@ -15,7 +15,7 @@
 
 #include "llvm/SupportVISC/VISCHint.h"
 #include "llvm/SupportVISC/VISCTimer.h"
-
+#include "policy.h"
 #ifndef DEBUG_BUILD
 #define DEBUG(s) {}
 #else
@@ -28,6 +28,12 @@ using namespace std;
 
 extern "C" {
 
+/************************* Policies *************************************/
+
+void llvm_visc_policy_init();   // creates the global policy object
+void llvm_visc_policy_clear();  // releases the global policy object
+int llvm_visc_policy_getVersion(const char *, int64_t);  // version for (node name, iteration)
+
 /********************* DFG Depth Stack **********************************/
 class DFGDepth {
   private:
-- 
GitLab