diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt
index a189075fff07ece58c2da01909fde4f77cf8954d..9dd826ef4371298bb880fad5de9094335fb86779 100644
--- a/llvm/lib/Transforms/CMakeLists.txt
+++ b/llvm/lib/Transforms/CMakeLists.txt
@@ -21,3 +21,6 @@ add_subdirectory(ApproxScheduler)
 add_subdirectory(GenVISC)
 add_subdirectory(MergeDFN)
 add_subdirectory(FuseHPVMTensorNodes)
+add_subdirectory(ReplaceIntrinsics)
+add_subdirectory(DFG2LLVM_X86_dsoc)
+add_subdirectory(InlineTensorCalls)
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/CMakeLists.txt b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..75569adddae7232ff10988d89f5a7f98626a12c9
--- /dev/null
+++ b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( DFG2LLVM_X86_dsoc
+  DFG2LLVM_X86_dsoc.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fbe5e4f6bd836a31550b784d8a88730a6984a7be
--- /dev/null
+++ b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
@@ -0,0 +1,2128 @@
+//===----------------------- DFG2LLVM_X86_dsoc.cpp ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "DFG2LLVM_X86"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+// VISC Command line option to use timer or not
+static cl::opt<bool>
+VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
+// Command line option to enable device abstraction or not
+static cl::opt<bool>
+DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
+                   cl::desc("Enable visc device abstraction"));
+
+
+namespace {
+
+// Helper Functions
+static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
+  if (!isa<CallInst>(I))
+    return false;
+  CallInst *CI = cast<CallInst>(I);
+  return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion");
+}
+
+CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
+  for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) {
+    Instruction *I = &*ib;
+    if (isVISCCall_llvm_visc_policy_getVersion(I))
+      return cast<CallInst>(I);
+  }
+  return NULL;
+}
+
+// DFG2LLVM_X86 - The first implementation.
+struct DFG2LLVM_X86 : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_X86() :DFG2LLVM(ID) {}
+
+private:
+  // Member variables
+
+  // Functions
+
+public:
+  bool runOnModule(Module &M);
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_X86 : public CodeGenTraversal {
+
+private:
+  //Member variables
+
+  Constant* malloc;
+  // VISC Runtime API
+  Constant* llvm_visc_x86_launch;
+  Constant* llvm_visc_x86_wait;
+  Constant* llvm_visc_x86_argument_ptr;
+
+  Constant* llvm_visc_streamLaunch;
+  Constant* llvm_visc_streamPush;
+  Constant* llvm_visc_streamPop;
+  Constant* llvm_visc_streamWait;
+  Constant* llvm_visc_createBindInBuffer;
+  Constant* llvm_visc_createBindOutBuffer;
+  Constant* llvm_visc_createEdgeBuffer;
+  Constant* llvm_visc_createLastInputBuffer;
+  Constant* llvm_visc_createThread;
+  //Constant* llvm_visc_freeThreads;
+  Constant* llvm_visc_bufferPush;
+  Constant* llvm_visc_bufferPop;
+  Constant* llvm_visc_x86_dstack_push;
+  Constant* llvm_visc_x86_dstack_pop;
+  Constant* llvm_visc_x86_getDimLimit;
+  Constant* llvm_visc_x86_getDimInstance;
+
+  //Functions
+  std::vector<IntrinsicInst*>* getUseList(Value* LI);
+  Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
+  void addDoWhileLoop(Instruction*, Instruction*, Value*);
+  void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
+  Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
+  Argument* getArgumentFromEnd(Function* F, unsigned offset);
+  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                      Instruction* InsertBefore);
+  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  StructType* getArgumentListStructTy(DFNode*);
+  Function* createFunctionFilter(DFNode* C);
+  void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>,
+                      Value*, Value*, Instruction*);
+  Function* createLaunchFunction(DFInternalNode*);
+  Function* createPushFunction(DFInternalNode*);
+  Function* createPopFunction(DFInternalNode*);
+  Function* createWaitFunction(DFInternalNode*);
+
+  // Virtual Functions
+  void init() {
+    VISCTimer = VISCTimer_X86;
+    TargetName = "X86";
+  }
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+  Function* codeGenStreamPush(DFInternalNode* N);
+  Function* codeGenStreamPop(DFInternalNode* N);
+
+public:
+  // Constructor
+  CGT_X86(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
+    init();
+    initRuntimeAPI();
+  }
+
+  void codeGenLaunch(DFInternalNode* Root);
+  void codeGenLaunchStreaming(DFInternalNode* Root);
+};
+
+bool DFG2LLVM_X86::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_X86 PASS\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  //DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+  // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_X86 *CGTVisitor = new CGT_X86(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+    // Now replace the launch intrinsic with a pthread-based launch call.
+    // TODO: Later on, we might like to do this in a separate pass, which
+    // would give us the flexibility to switch between fully static code
+    // generation for the DFG and a customized runtime+scheduler.
+
+    // Do streaming code generation if the root node is streaming; do the
+    // usual launch code generation otherwise
+    if(rootNode->isChildGraphStreaming())
+      CGTVisitor->codeGenLaunchStreaming(rootNode);
+    else
+      CGTVisitor->codeGenLaunch(rootNode);
+  }
+
+  delete CGTVisitor;
+  return true;
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+void CGT_X86::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
+
+  // FIXME: hardcoded path to 'build_dsoc' - this should probably come from an environment variable
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/../build_dsoc/projects/visc-rt/visc-rt.ll";
+
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+
+  if(runtimeModule == NULL)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  DECLARE(llvm_visc_x86_launch);
+  DECLARE(malloc);
+  DECLARE(llvm_visc_x86_wait);
+  DECLARE(llvm_visc_x86_argument_ptr);
+  DECLARE(llvm_visc_streamLaunch);
+  DECLARE(llvm_visc_streamPush);
+  DECLARE(llvm_visc_streamPop);
+  DECLARE(llvm_visc_streamWait);
+  DECLARE(llvm_visc_createBindInBuffer);
+  DECLARE(llvm_visc_createBindOutBuffer);
+  DECLARE(llvm_visc_createEdgeBuffer);
+  DECLARE(llvm_visc_createLastInputBuffer);
+  DECLARE(llvm_visc_createThread);
+  //DECLARE(llvm_visc_freeThreads);
+  DECLARE(llvm_visc_bufferPush);
+  DECLARE(llvm_visc_bufferPop);
+  DECLARE(llvm_visc_x86_dstack_push);
+  DECLARE(llvm_visc_x86_dstack_pop);
+  DECLARE(llvm_visc_x86_getDimLimit);
+  DECLARE(llvm_visc_x86_getDimInstance);
+
+  // Get or insert timerAPI functions as well if you plan to use timers
+  initTimerAPI();
+
+  // Insert init context in main
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  DEBUG(errs() << "Inserting x86 timer initialization\n");
+  Instruction* I = cast<Instruction>(*VI->user_begin());
+  initializeTimerSet(I);
+  switchToTimer(visc_TimerID_NONE, I);
+  // Insert code for initializing the scheduling policy
+  Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init",
+    runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()));
+  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  DEBUG(errs() << *IPCallInst << "\n");
+
+  // If device abstraction is enabled, we add a runtime call to start the
+  // device status simulation
+  if (DeviceAbstraction) {
+    Function *ID =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_start",
+        runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType()));
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    DEBUG(errs() << *IDCallInst << "\n");
+  }
+
+  // Insert print instruction at visc exit
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
+
+  // Insert code for clearing the scheduling policy
+  I = cast<Instruction>(*VC->user_begin());
+  IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear",
+    runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()));
+  IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  DEBUG(errs() << *IPCallInst << "\n");
+
+  DEBUG(errs() << "Inserting x86 timer print\n");
+  printTimerSet(I);
+
+  // If device abstraction is enabled, we add a runtime call to end the
+  // device status simulation
+  if (DeviceAbstraction) {
+    Function *ID =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_end",
+        runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType()));
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    DEBUG(errs() << *IDCallInst << "\n");
+  }
+
+}
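+
+// Illustrative effect on the host code (a sketch, not emitted verbatim):
+// before the call to llvm.visc.init we now have timer initialization and a
+// call to @llvm_visc_policy_init() (plus @llvm_visc_deviceAbstraction_start()
+// under -visc-eda); before llvm.visc.cleanup we have a call to
+// @llvm_visc_policy_clear(), timer printing, and optionally
+// @llvm_visc_deviceAbstraction_end().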
+
+/* Returns a vector of all intrinsic uses (wait/push/pop) of the graph ID
+ */
+std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
+  std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>();
+  // It must have been loaded from memory somewhere
+  for(Value::user_iterator ui = GraphID->user_begin(),
+      ue = GraphID->user_end(); ui!=ue; ++ui) {
+    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
+      UseList->push_back(waitI);
+    }
+    //else if (PHINode* PN = dyn_cast<PHINode>(*ui)){
+      //errs() << "Found PhiNode use of graphID\n";
+      //std::vector<IntrinsicInst*>* phiUseList  = getUseList(PN);
+      //UseList->insert(UseList->end(), phiUseList->begin(), phiUseList->end());
+      //free(phiUseList);
+    //}
+    else {
+      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
+    }
+  }
+  return UseList;
+}
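+
+// For a graph handle returned by the launch intrinsic, this collects the
+// intrinsic calls that consume it -- in practice the llvm.visc.wait,
+// llvm.visc.push and llvm.visc.pop uses handled by the callers below.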
+
+/* Traverse the function argument list in reverse order to get the argument at
+ * a distance offset from the end of the argument list of function F
+ */
+Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
+  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0)
+         && "Invalid offset to access arguments!");
+  Function::arg_iterator e = F->arg_end();
+  // arg_end() points one past the last argument; step back to the last one.
+  e--;
+  Argument* arg;
+  for( ; offset != 0; e--) {
+    offset--;
+    arg = &*e;
+  }
+  return arg;
+}
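+
+// Usage sketch: getArgumentFromEnd(F, 1) yields the last argument of F,
+// getArgumentFromEnd(F, 2) the one before it, and so on; offset must be
+// between 1 and the number of parameters.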
+
+/* Add a while loop around the region starting at BodyStart
+ * Algorithm:
+ * (1) Split the basic block containing CondBlockStart into entry, condition,
+ * while.body and while.end blocks.
+ * (2) Replace the terminator of the condition block with a conditional
+ * branch that exits to while.end when TerminationCond is true and enters
+ * while.body otherwise.
+ * (3) Make while.body branch back unconditionally to the condition block.
+ */
+void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
+                          Instruction* BodyEnd, Value* TerminationCond) {
+  BasicBlock* Entry = CondBlockStart->getParent();
+  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
+  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
+  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
+
+  // Replace the terminator instruction of conditional with new conditional
+  // branch which goes to while.body if true and branches to while.end otherwise
+  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
+  ReplaceInstWithInst(CondBlock->getTerminator(), BI);
+
+  // While Body should jump to condition block
+  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
+  ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
+
+}
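+
+// Resulting CFG (sketch):
+//   entry -> condition
+//   condition -> while.end (TerminationCond true) | while.body (false)
+//   while.body -> condition
+// i.e. TerminationCond encodes "done", not "continue".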
+
+Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+                                          BasicBlock *Body) {
+  Module *M = Entry->getParent()->getParent();
+  Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+  // Insert a PHI instruction at the beginning of the condition block
+  Instruction *IB = Cond->getFirstNonPHI();
+  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
+
+  ConstantInt *IConst =
+    ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+  Instruction *CounterIncr =
+    BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                            "cnt_incr", Body->getTerminator());
+
+  // Set incoming values for Phi node
+  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
+  CounterPhi->addIncoming(IConst, Entry);
+  CounterPhi->addIncoming(CounterIncr, Body);
+
+  // Return the pointer to the created PHI node in the corresponding argument
+  return CounterPhi;
+}
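+
+// The counter IR produced above looks roughly like (illustrative):
+//   cond:  %cnt = phi i64 [ 0, %entry ], [ %cnt_incr, %body ]
+//   body:  %cnt_incr = add nsw i64 %cnt, 1
+// so the body observes 0 on the first iteration.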
+
+/* Add a do-while loop around the region [From, To)
+ * Algorithm:
+ * (1) Split the basic block of instruction From into an entry block and a
+ * for.body block, and split again at To to obtain for.end.
+ * (2) Replace the terminator of for.body with a conditional branch which
+ * exits to for.end when TerminationCond is true and repeats for.body
+ * otherwise, so the body executes at least once.
+ */
+void CGT_X86::addDoWhileLoop(Instruction* From, Instruction* To, Value* TerminationCond) {
+  BasicBlock* Entry = From->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(From, "for.body");
+
+  // To Instruction should also belong to the same basic block as the From basic
+  // block will have a terminator instruction
+  assert(To->getParent() == ForBody
+         && "To Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(To, "for.end");
+
+  // Replace the terminator instruction of for.body with new conditional
+  // branch which loops over body if true and branches to for.end otherwise
+  BranchInst* BI = BranchInst::Create(ForEnd, ForBody, TerminationCond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+}
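+
+// Resulting CFG (sketch): entry falls through into for.body, whose new
+// terminator is `br i1 %TerminationCond, label %for.end, label %for.body`;
+// the body therefore runs at least once and repeats while the condition is
+// false.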
+
+/* Add a counted loop around the instruction I
+ * Algorithm:
+ * (1) Split the basic block of instruction I into three parts, where the
+ * middle block/body contains instruction I.
+ * (2) Add a phi node for the index variable before instruction I, with an
+ * incoming edge from the predecessor.
+ * (3) Add increment and compare instructions on the index variable.
+ * (4) Replace the terminator/branch instruction of the body with a
+ * conditional branch which loops over the body if true and goes to the end
+ * if false.
+ * (5) Update the phi node of the body.
+ */
+Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
+  BasicBlock* Entry = I->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");
+
+  BasicBlock::iterator i(I);
+  ++i;
+  Instruction* NextI = &*i;
+  // Next Instruction should also belong to the same basic block as the basic
+  // block will have a terminator instruction
+  assert(NextI->getParent() == ForBody
+         && "Next Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
+
+
+  // Add Phi Node for index variable
+  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
+                                      2, "index."+indexName, I);
+
+  // Add incoming edge to phi
+  IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
+                        Entry);
+  // Increment index variable
+  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
+                             IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
+                             "index."+indexName+".inc", ForBody->getTerminator());
+
+  // Compare index variable with limit
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
+                                  limit, "cond."+indexName, ForBody->getTerminator());
+
+  // Replace the terminator instruction of for.body with new conditional
+  // branch which loops over body if true and branches to for.end otherwise
+  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+  // Add incoming edge to phi node in body
+  IndexPhi->addIncoming(IndexInc, ForBody);
+  return IndexPhi;
+}
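+
+// Illustrative IR emitted around I (names assume indexName == "x"):
+//   for.body:
+//     %index.x = phi i64 [ 0, %entry ], [ %index.x.inc, %for.body ]
+//     ... I ...
+//     %index.x.inc = add i64 %index.x, 1
+//     %cond.x = icmp ult i64 %index.x.inc, %limit
+//     br i1 %cond.x, label %for.body, label %for.end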
+
+// Returns a packed struct type. The struct type is created by packing the
+// input types, output types and the isLastInput buffer type. All streaming
+// inputs/outputs are converted to i8*, since this is the type of buffer
+// handles.
+StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
+  std::vector<Type*> TyList;
+  // Input types
+  Function* CF = C->getFuncPointer();
+  for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
+      ai != ae; ++ai) {
+    if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
+      TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+    else 
+      TyList.push_back(ai->getType());
+  }
+  // Output Types
+  StructType* OutStructTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
+    // All outputs of a node are streaming edges
+    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() 
+        && "All output edges of child node have to be streaming");
+    TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+  }
+  // isLastInput buffer element
+  TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+
+  StructType* STy = StructType::create(CF->getContext(), TyList,
+                        Twine("struct.thread."+CF->getName()).str(), true);
+  return STy;
+
+}
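+
+// Example (illustrative): for a child node with signature
+//   { i32 } @f(i32 %a, float %b), where %b arrives on a streaming edge,
+// this returns the packed type
+//   %struct.thread.f = type <{ i32, i8*, i8*, i8* }>
+// i.e. non-streaming input, streaming-input buffer handle, output buffer
+// handle, isLastInput buffer handle.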
+
+void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
+                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
+                              Instruction* IB) {
+  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Create a filter/pipeline function for the child node
+  Function* C_Pipeline = createFunctionFilter(C);
+  Function* CF = C->getFuncPointer();
+
+  // Get module context and i32 0 constant, as they would be frequently used in
+  // this function.
+  LLVMContext& Ctx = IB->getParent()->getContext();
+  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  // Marshall arguments
+  // Create a packed struct type with inputs of C followed by outputs and then
+  // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
+  // by i8*
+  //
+  StructType* STy = getArgumentListStructTy(C);
+  // Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
+  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
+                                  C->getFuncPointer()->getName()+".inputs", IB);
+  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
+  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
+  // Insert elements in the struct
+  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Marshall Inputs
+  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
+    // Create constant int (i)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             Struct->getName()+".arg_"+Twine(i),
+                             IB);
+    DFEdge* E = C->getInDFEdgeAt(i);
+    if (E->getSourceDF()->isEntryNode()) {
+      // This is a Bind Input Edge
+      if(E->isStreamingEdge()) {
+        // Streaming Bind Input edge. Get buffer corresponding to it
+        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
+        new StoreInst(EdgeBufferMap[E], GEP, IB);
+      }
+      else {
+        // Non-streaming Bind edge
+        new StoreInst(Args[i], GEP, IB);
+      }
+    }
+    else {
+      // This is an edge between siblings.
+      // It must be a streaming edge, as we assume that all edges between two
+      // nodes in a DFG are streaming.
+      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
+      new StoreInst(EdgeBufferMap[E], GEP, IB);
+    }
+  }
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
+  // Marshall Outputs
+  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  for(unsigned i = 0; i < numOutputs; i++ ) {
+    // Create constant int (i+numInputs)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             Struct->getName()+".out_"+Twine(i),
+                             IB);
+    DFEdge* E = C->getOutDFEdgeAt(i);
+    assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes");
+    assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!");
+    new StoreInst(EdgeBufferMap[E], GEP, IB);
+  }
+  // Marshall last argument. isLastInput buffer
+  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Create constant int (numInputs + numOutputs)
+  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
+  // Get Element pointer instruction
+  Value* GEPIndices[] = { IntZero, Int_index };
+  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                           ArrayRef<Value*>(GEPIndices, 2),
+                           Struct->getName()+".isLastInput", IB);
+  new StoreInst(isLastInputBuffer, GEP, IB);
+
+  // Struct now points to heap memory with all the arguments packed.
+  // Call the runtime to create the thread with these arguments
+  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << *llvm_visc_createThread << "\n");
+  DEBUG(errs() << *graphID->getType() << "\n");
+  DEBUG(errs() << *C_Pipeline->getType() << "\n");
+  DEBUG(errs() << *Struct->getType() << "\n");
+  // Bitcast AI to i8*
+  CastInst* BI  = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
+  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
+  CallInst* CreateThread = CallInst::Create(llvm_visc_createThread,
+                                            ArrayRef<Value*>(CreateThreadArgs, 3),
+                                            "",
+                                            IB);
+
+}
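+
+// Net effect (sketch): heap-allocate the packed argument struct, store each
+// input value / buffer handle plus the isLastInput handle into its slots,
+// then call @llvm_visc_createThread(graphID, f_Pipeline, args-as-i8*) so the
+// runtime runs the child's pipeline function on its own thread.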
+
+Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Streaming Launch Function\n");
+  // Get Function associated with Node N
+  Function* NF = N->getFuncPointer();
+
+  // Map from Streaming edge to buffer 
+  DenseMap<DFEdge*, Value*> EdgeBufferMap;
+
+  /* Now we have all the global declarations necessary to generate the Launch
+  * function, a pointer to which can be passed to pthread utils to execute
+  * the DFG. The Launch function takes two inputs: i8* data.addr and i8*
+  * graphID. data.addr is the address of all the input data that needs to be
+  * passed to this function; in our case it contains the input arguments of
+  * the Root function in the correct order.
+  * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
+  * (2) Extract each of inputs from data.addr
+  * (3) create Buffers for all the streaming edges
+  *     - Put buffers in the context
+  * (4) Go over each child node
+  *     - marshall its arguments together (use buffers in place of streaming
+  *       arguments)
+  *     - Start the threads
+  * (5) The return value from Root is stored in memory, pointer to which is
+  * passed to pthread_exit call.
+  */
+  // (1) Create Launch Function of type void (i8* args, i8* GraphID)
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
+  FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()),
+                                  ArrayRef<Type*>(ArgTypes, 2), false);
+  Function* LaunchFunc = Function::Create(LaunchFuncTy,
+                                       NF->getLinkage(),
+                                       NF->getName()+".LaunchFunction",
+                                       &M);
+  DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
+  // Give a name to the argument which is used to pass data to this thread
+  Argument* data = &*LaunchFunc->arg_begin();
+  Argument* graphID = &*(++LaunchFunc->arg_begin());
+  data->setName("data.addr");
+  graphID->setName("graphID");
+  // Add a basic block to this empty function and a return null statement to it
+  DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
+  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
+  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
+                                      BB);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
+
+  // (2) Extract each of inputs from data.addr
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  std::vector<Value*> Args;
+
+  for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
+      ai != ae; ++ai) {
+    if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back(Twine(ai->getName()+"_buffer").str());
+      continue;
+    }
+    TyList.push_back(ai->getType());
+    names.push_back(ai->getName());
+  }
+  Args = extractElements(data, TyList, names, RI);
+  DEBUG(errs() <<  "Launch function for " << NF->getName() << *LaunchFunc << "\n");
+  // (3) Create buffers for all the streaming edges
+  for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
+      de = N->getChildGraph()->dfedge_end(); di != de; ++di) {
+    DFEdge* Edge = *di;
+    DEBUG(errs() << *Edge->getType() << "\n");
+    Value* size = ConstantExpr::getSizeOf(Edge->getType());
+    Value* CallArgs[] = {graphID, size};
+    if (Edge->isStreamingEdge()) {
+      CallInst* CI;
+      // Create a buffer call
+      if(Edge->getSourceDF()->isEntryNode()) {
+        // Bind Input Edge
+        Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()),
+                                  Edge->getSourcePosition());
+        Value* BindInCallArgs[] = {graphID, size, Int_ArgNo};
+        CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3),
+                              "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      else if(Edge->getDestDF()->isExitNode()) {
+        // Bind Output Edge
+        CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2),
+                              "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      else {
+        // Streaming Edge
+        CI = CallInst::Create(llvm_visc_createEdgeBuffer,
+                              ArrayRef<Value*>(CallArgs, 2),
+                              Edge->getSourceDF()->getFuncPointer()->getName()+"."
+                              +Edge->getDestDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      EdgeBufferMap[Edge] = CI;
+    }
+  }
+  // Create buffer for isLastInput for all the child nodes
+  DFGraph* G = N->getChildGraph();
+  DenseMap<DFNode*, Value*> NodeLastInputMap;
+  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
+    DFNode* child = *ci;
+    if(child->isDummyNode())
+      continue;
+    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
+    Value* CallArgs[] = {graphID, size};
+    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
+                              "BindIn.isLastInput."+child->getFuncPointer()->getName(),
+                              RI);
+    NodeLastInputMap[child] = CI;
+  }
+  DEBUG(errs() <<  "Start Each child node filter\n");
+  // (4) Marshall arguments for each child node and start the thread with its
+  //     pipeline function
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+    
+    // Marshall all the arguments for this node into an i8*
+    // Pass to the runtime to create the thread
+    // Start the thread for child node C
+    startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI);
+  }
+
+  DEBUG(errs() << "Launch function:\n");
+  DEBUG(errs() << *LaunchFunc << "\n");
+
+  return LaunchFunc;
+}
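+
+// Shape of the generated function (sketch):
+//   define void @Root.LaunchFunction(i8* %data.addr, i8* %graphID) {
+//     ; unpack the non-streaming root arguments from %data.addr,
+//     ; create one runtime buffer per streaming edge and one isLastInput
+//     ; buffer per child, then start one thread per child node
+//     ret void
+//   }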
+
+
+Function* CGT_X86::createPushFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Push function\n");
+  // TODO: unimplemented stub; return null instead of an uninitialized pointer
+  Function* PushFunc = nullptr;
+  return PushFunc;
+}
+
+Function* CGT_X86::createPopFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Pop function\n");
+  // TODO: unimplemented stub; return null instead of an uninitialized pointer
+  Function* PopFunc = nullptr;
+  return PopFunc;
+}
+
+Function* CGT_X86::createWaitFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Wait function\n");
+  // TODO: unimplemented stub; return null instead of an uninitialized pointer
+  Function* WaitFunc = nullptr;
+  return WaitFunc;
+}
+
+/* This function does the steps necessary to launch a streaming graph
+ * Steps:
+ * - Create a Pipeline/Filter function for each node in the child graph of Root
+ * - Create the functions DFGLaunch, DFGPush, DFGPop, DFGWait
+ * - Modify each of the intrinsics in host code:
+ *   Launch, Push, Pop, Wait
+ */
+void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
+  IntrinsicInst* LI = Root->getInstruction();
+  Function* RootLaunch = createLaunchFunction(Root);
+  //Function* RootPush = createPushFunction(Root);
+  //Function* RootPop = createPopFunction(Root);
+  //Function* RootWait = createWaitFunction(Root);
+  // Substitute the launch intrinsic in main
+  DEBUG(errs() <<  "Substitute launch intrinsic\n");
+  Value* LaunchInstArgs[] = {RootLaunch,
+                             LI->getArgOperand(1)
+                            };
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
+  //ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait, push and pop intrinsics with streaming runtime calls
+  DEBUG(errs() <<  "Substitute wait, push, pop intrinsics\n");
+  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
+  for(unsigned i=0; i < UseList->size(); ++i) {
+    IntrinsicInst* II = UseList->at(i);
+    CallInst* CI;
+    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
+    switch(II->getIntrinsicID()) {
+    case Intrinsic::visc_wait:
+      CI = CallInst::Create(llvm_visc_streamWait,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_push:
+      CI = CallInst::Create(llvm_visc_streamPush,
+                            ArrayRef<Value*>(PushArgs, 2),
+                            "");
+      break;
+    case Intrinsic::visc_pop:
+      CI = CallInst::Create(llvm_visc_streamPop,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    default:
+      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+    };
+    DEBUG(errs() << "Replace:\n\t" << *II << "\n");
+    ReplaceInstWithInst(II, CI);
+    DEBUG(errs() << "\twith " << *CI << "\n");
+  }
+
+
+}
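+
+// Net rewriting performed here (sketch): the launch intrinsic becomes a call
+// to @llvm_visc_streamLaunch on the new launch function, and every
+// llvm.visc.wait / llvm.visc.push / llvm.visc.pop use of the graph handle
+// becomes @llvm_visc_streamWait / @llvm_visc_streamPush /
+// @llvm_visc_streamPop on the value returned by that call.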
+
+void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
+  // TODO: Place an assert to check if the constant passed by launch intrinsic
+  // as the number of arguments to DFG is same as the number of arguments of the
+  // root of DFG
+  DEBUG(errs() << "Generating Launch Function\n");
+  // Get Launch Instruction
+  IntrinsicInst* LI = Root->getInstruction();
+  switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
+  DEBUG(errs() << "Generating Launch Function\n");
+
+  /* Now we have all the global declarations necessary to generate the Launch
+  * function, a pointer to which can be passed to pthread utils to execute
+  * the DFG. The Launch function has just one input: i8* data.addr.
+  * This is the address of all the input data that needs to be passed to
+  * this function. In our case it contains the input arguments of the Root
+  * function in the correct order.
+  * (1) Create an empty Launch function of type i8*(i8*)
+  * (2) Extract each of inputs from data.addr and pass them as arguments to the
+  * call to Root function
+  * (3) The return value from Root is stored in memory, pointer to which is
+  * passed to pthread_exit call.
+  */
+  // Create Launch Function of type i8*(i8*) which calls the root function
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
+                            ArrayRef<Type*>(i8Ty->getPointerTo()),
+                            false);
+  Function* AppFunc = Function::Create(AppFuncTy,
+                                       Root->getFuncPointer()->getLinkage(),
+                                       "LaunchDataflowGraph",
+                                       &M);
+  DEBUG(errs() << "Generating Launch Function\n");
+  // Give a name to the argument which is used to pass data to this thread
+  Value* data = &*AppFunc->arg_begin();
+  data->setName("data.addr");
+  // Add a basic block to this empty function and a return null statement to it
+  BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
+  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
+                                      Constant::getNullValue(AppFunc->getReturnType()),
+                                      BB);
+  switchToTimer(visc_TimerID_ARG_UNPACK, RI);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
+  // Find the X86 function generated for Root
+//  Function* RootF_X86 = Root->getGenFunc();
+  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
+  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "Error: Generated Function for Root node with no x86 wrapper\n");
+
+  // Generate a call to RootF_X86 with null parameters for now
+  std::vector<Value*>Args;
+  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
+  }
+  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);
+
+  // Extract input data from i8* data.addr and patch them to correct argument of
+  // call to RootF_X86. For each argument
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
+      ai != ae; ++ai) {
+    TyList.push_back(ai->getType());
+    names.push_back(ai->getName());
+  }
+  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
+  // Patch the elements to the call arguments
+  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
+    CI->setArgOperand(i, elements[i]);
+
+  // Add timers around Call to RootF_X86 function
+  switchToTimer(visc_TimerID_COMPUTATION, CI);
+  switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
+
+  // Code for returning the output
+  CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
+                             CI->getType()->getPointerTo(),
+                             CI->getName()+".addr",
+                             RI);
+  new StoreInst(CI, OutputAddrCast, RI);
+  switchToTimer(visc_TimerID_NONE, RI);
+
+  DEBUG(errs() << "Application specific function:\n");
+  DEBUG(errs() << *AppFunc << "\n");
+
+  // Substitute the launch intrinsic in main
+  Value* LaunchInstArgs[] = {AppFunc,
+                             LI->getArgOperand(1)
+                            };
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
+  //ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait, push and pop intrinsics with x86-specific runtime calls
+  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
+  for(unsigned i=0; i < UseList->size(); ++i) {
+    IntrinsicInst* II = UseList->at(i);
+    CallInst* CI;
+    switch(II->getIntrinsicID()) {
+    case Intrinsic::visc_wait:
+      CI = CallInst::Create(llvm_visc_x86_wait,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_push:
+      CI = CallInst::Create(llvm_visc_bufferPush,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_pop:
+      CI = CallInst::Create(llvm_visc_bufferPop,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    default:
+      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+    };
+    ReplaceInstWithInst(II, CI);
+    DEBUG(errs() << *CI << "\n");
+  }
+
+}
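+
+// Generated wrapper (sketch):
+//   define i8* @LaunchDataflowGraph(i8* %data.addr) {
+//     ; unpack the root arguments from %data.addr
+//     %out = call %rootRetTy @Root_X86(...)
+//     ; store %out back through a cast of %data.addr
+//     ret i8* null
+//   }
+// with the host-side launch/wait intrinsics rewritten to
+// @llvm_visc_x86_launch / @llvm_visc_x86_wait runtime calls.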
+
+Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
+  // TODO: Assumption is that each input port of a node has just one
+  // incoming edge. May change later on.
+
+  // Find the incoming edge at the requested input port
+  DFEdge* E = Child->getInDFEdgeAt(i);
+  assert(E && "No incoming edge or binding for input element!");
+  // Find the Source DFNode associated with the incoming edge
+  DFNode* SrcDF = E->getSourceDF();
+
+  // If Source DFNode is a dummyNode, edge is from parent. Get the
+  // argument from argument list of this internal node
+  Value* inputVal;
+  if(SrcDF->isEntryNode()) {
+    inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
+    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+  }
+  else {
+    // edge is from a sibling
+    // Check - code should already be generated for this source dfnode
+    assert(OutputMap.count(SrcDF)
+           && "Source node call not found. Dependency violation!");
+
+    // Find CallInst associated with the Source DFNode using OutputMap
+    Value* CI = OutputMap[SrcDF];
+
+    // Extract element at source position from this call instruction
+    std::vector<unsigned> IndexList;
+    IndexList.push_back(E->getSourcePosition());
+    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                           "", InsertBefore);
+    inputVal = EI;
+  }
+  return inputVal;
+}
+
+void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
+                              ValueToValueMapTy &VMap,Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+//  Function* CF_X86 = C->getGenFunc();
+  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(CF_X86 != NULL
+         && "Found leaf node for which code generation has not happened yet!\n");
+  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "The generated function to be called from x86 backend is not an x86 function\n");
+  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
+
+  std::vector<Value*> Args;
+  // Create argument list to pass to call instruction
+  // First find the correct values using the edges
+  // The remaining six values are inserted as constants for now.
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(getInValueAt(C, i, F_X86, IB));
+  }
+
+  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
+  for(unsigned j=0; j<6; j++)
+    Args.push_back(I64Zero);
+
+  errs() << "Gen Function type: " << *CF_X86->getType() << "\n";
+  errs() << "Node Function type: " << *CF->getType() << "\n";
+  errs() << "Arguments: " << Args.size() << "\n";
+
+  // Call the F_X86 function associated with this node
+  CallInst* CI = CallInst::Create(CF_X86, Args,
+                                  CF_X86->getName()+"_output",
+                                  IB);
+  DEBUG(errs() << *CI << "\n");
+  OutputMap[C] = CI;
+
+  // Find num of dimensions this node is replicated in.
+  // Based on number of dimensions, insert loop instructions
+  std::string varNames[3] = {"x", "y", "z"};
+  unsigned numArgs = CI->getNumArgOperands();
+  for(unsigned j=0; j < C->getNumOfDim(); j++) {
+    Value* indexLimit = NULL;
+    // Limit can either be a constant or an argument of the internal node.
+    // In case of constant we can use that constant value directly in the
+    // new F_X86 function. In case of an argument, we need to get the mapped
+    // value using VMap
+    if(isa<Constant>(C->getDimLimits()[j])) {
+      indexLimit = C->getDimLimits()[j];
+      DEBUG(errs() << "In Constant case:\n"
+             << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    else {
+      indexLimit = VMap[C->getDimLimits()[j]];
+      DEBUG(errs() << "In VMap case:"
+             <<"  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    assert(indexLimit && "Invalid dimension limit!");
+    // Insert loop
+    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
+    DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
+    // Insert index variable and limit arguments
+    CI->setArgOperand(numArgs-6+j, indexVar);
+    CI->setArgOperand(numArgs-3+j, indexLimit);
+  }
+  // Insert call to runtime to push the dim limits and instanceID on the depth
+  // stack
+  Value* args[] = {
+    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
+    CI->getArgOperand(numArgs-3+0), // limitX
+    CI->getArgOperand(numArgs-6+0), // iX
+    CI->getArgOperand(numArgs-3+1), // limitY
+    CI->getArgOperand(numArgs-6+1), // iY
+    CI->getArgOperand(numArgs-3+2), // limitZ
+    CI->getArgOperand(numArgs-6+2)  // iZ
+  };
+
+  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
+  DEBUG(errs() << "Push on stack: " << *Push << "\n");
+  // Insert call to runtime to pop the dim limits and instanceID from the depth
+  // stack
+  BasicBlock::iterator i(CI);
+  ++i;
+  Instruction* NextI = &*i;
+  // Next Instruction should also belong to the same basic block as the basic
+  // block will have a terminator instruction
+  assert(NextI->getParent() == CI->getParent()
+         && "Next Instruction should also belong to the same basic block!");
+
+  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
+  DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
+  DEBUG(errs() << *CI->getParent()->getParent());
+}
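+
+// For a one-dimensional child replicated %limitX times, the emitted region
+// looks roughly like (illustrative):
+//   for.body:
+//     %index.x = phi i64 [ 0, ... ], [ %index.x.inc, %for.body ]
+//     call @llvm_visc_x86_dstack_push(i32 1, %limitX, %index.x, 0, 0, 0, 0)
+//     %out = call @f_X86(<args>, %index.x, 0, 0, %limitX, 0, 0)
+//     call @llvm_visc_x86_dstack_pop()
+//     ; increment / compare / back edge inserted by addLoop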
+
+/* This function takes a DFNode and creates a filter function for it. By filter
+ * function we mean a function which keeps getting input from the input buffers,
+ * applies the node function to the inputs, and then pushes data onto the
+ * output buffers
+ */
+// Create a function with void* (void*) type.
+// Create a new basic block
+// Add a return instruction to the basic block
+// extract arguments from the aggregate data input. Type list would be
+// Replace the streaming inputs with i8* types signifying handle to
+// corresponding buffers
+// Add a boolean argument isLastInput
+// Add runtime API calls to get input for each of the streaming inputs
+// Add a call to the generated function of the child node
+// Add runtime API calls to push output for each of the streaming outputs
+// Add a loop around the basic block, which exits once isLastInput is set
+
+Function* CGT_X86::createFunctionFilter(DFNode* C) {
+  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");
+
+  /* Create a function with same argument list as child.*/
+  DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
+  // Get the generated function for child node
+  Function* CF = C->getFuncPointer();
+  // Create Filter Function of type i8*(i8*) which calls the root function
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
+                                ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                false);
+  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
+                          CF->getLinkage(),
+                          CF->getName()+"_Pipeline",
+                          &M);
+  DEBUG(errs() << "Generating Pipline Function\n");
+  // Give a name to the argument which is used to pass data to this thread
+  Value* data = &*CF_Pipeline->arg_begin();
+  data->setName("data.addr");
+  // Create a new basic block
+  DEBUG(errs() << "\tCreate new BB and add a return function\n");
+  // Add a basic block to this empty function
+  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
+  // Add a return instruction to the basic block
+  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
+                                      UndefValue::get(CF_Pipeline->getReturnType()), BB);
+
+
+  /* Extract the elements from the aggregate argument to the function.
+   * Replace the streaming inputs with i8* types signifying handle to
+   * corresponding buffers
+   * Add outputs to the list as well
+   * Add isLastInput to the list
+   */
+  DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
+  // These Args will be used when passing arguments to the generated function
+  // inside loop, and reading outputs as well.
+  std::vector<Value*> Args;
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  // Adding inputs
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back((Twine(i->getName())+"_buffer").str());
+    }
+    else {
+      TyList.push_back(i->getType());
+      names.push_back(i->getName());
+    }
+  }
+  // Adding outputs. FIXME: We assume all outputs to be streaming edges,
+  // because we only get their buffer handles
+  StructType* RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
+    TyList.push_back(i8Ty->getPointerTo());
+    names.push_back("out");
+  }
+  /* Add a boolean argument isLastInput */
+  DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n");
+  TyList.push_back(i8Ty->getPointerTo());
+  names.push_back("isLastInput_buffer");
+
+  // Extract the inputs, outputs and isLastInput from the aggregate argument
+  Args = extractElements(data, TyList, names, RI);
+  for(unsigned i=0; i<Args.size(); i++) {
+    DEBUG(errs() << *Args[i] << "\n");
+  }
+
+  // Split the Args vector into, input output and isLastInput
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = RetTy->getNumElements();
+  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
+  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
+
+  /* Add runtime API calls to get input for each of the streaming input edges */
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
+  // First read the termination condition variable isLastInput
+  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
+                                        ArrayRef<Value*>(isLastInput),
+                                        "",
+                                        RI);
+
+  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
+                 Type::getInt64Ty(CF_Pipeline->getContext()),
+                 false,
+                 "isLastInput",
+                 RI);
+  isLastInput = BI;
+  // Create a loop termination condition
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
+      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
+      RI);
+
+  // Get input from buffers of all the incoming streaming edges
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop,
+                                            ArrayRef<Value*>(InputArgs[i->getArgNo()]),
+                                            "",
+                                            RI);
+      CastInst* BI;
+      if(i->getType()->isPointerTy()) {
+        BI = CastInst::Create(CastInst::IntToPtr,
+                              bufferIn,
+                              i->getType(),
+                              i->getName()+".addr",
+                              RI);
+      }
+      else if(i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn,
+                                    i->getType(),
+                                    i->getName()+".addr",
+                                    RI);
+      }
+      else {
+        BI = CastInst::CreateIntegerCast(bufferIn,
+                                         i->getType(),
+                                         false,
+                                         i->getName()+".addr",
+                                         RI);
+      }
+      // Replace the argument in Args vector. We would be using the vector as
+      // parameters passed to the call
+      InputArgs[i->getArgNo()] = BI;
+    }
+  }
+  /* Add a call to the generated function of the child node */
+  DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
+//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+//                                  C->getGenFunc()->getName()+".output", RI);
+  Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
+  DEBUG(errs() << "Type: "
+               << *CGenF->getType()
+               << "\n");
+  CallInst* CI = CallInst::Create(CGenF,
+                                  InputArgs,
+                                  CGenF->getName()+".output",
+                                  RI);
+
+  /* Add runtime API calls to push output for each of the streaming outputs */
+  // FIXME: Assumption
+  // All edges between siblings are streaming edges
+  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n");
+  for (unsigned i=0; i< numOutputs; i++) {
+    // Extract output
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i),
+                           "",RI);
+    // Convert to i64
+    CastInst* BI;
+    if(EI->getType()->isPointerTy())
+      BI = CastInst::Create(CastInst::PtrToInt,EI,
+                            Type::getInt64Ty(CF_Pipeline->getContext()),
+                            "",
+                            RI);
+    else
+      BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()),
+                                       false, "", RI);
+    // Push to Output buffer
+    Value* bufferOutArgs[] = {OutputArgs[i], BI};
+    CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush,
+                                           ArrayRef<Value*>(bufferOutArgs, 2),
+                                           "",
+                                           RI);
+  }
+
+  // Add a loop around the basic block, which exits once the popped
+  // isLastInput value is non-zero
+  //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond);
+//  addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(),
+//                RI, Cond);
+
+  // Pointers to keep the created loop structure
+  BasicBlock *EntryBB, *CondBB, *BodyBB;
+  Instruction *CondStartI = cast<Instruction>(isLastInputPop);
+  Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
+  EntryBB = CondStartI->getParent();
+
+  addWhileLoop(CondStartI, BodyStartI, RI, Cond);
+  CondBB = CondStartI->getParent();
+  BodyBB = CI->getParent();
+  Instruction *CntI = NULL;
+  CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF);
+
+  // If the node function calls the visc runtime call to get policy, we update
+  // it with the counter information. This means we need to pass an additional
+  // argument to the generated function, that is the iteration number, and then
+  // use it as an argument to the policy_getVersion call 
+  if (GetPolicyCI) {
+    CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB);
+    assert(CntI && "Counter instruction not found\n");
+
+    // Create new function type (with additional argument for iteration number)
+    Type *NewRetTy = CGenF->getFunctionType()->getReturnType();
+    std::vector<Type*> NewArgTypes;
+    for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end();
+         ai != ae ; ++ai) {
+      NewArgTypes.push_back(ai->getType());
+    }
+    NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
+    FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false);
+    Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false);
+    // At least one (the last) argument exists (we added it)
+    Function::arg_iterator ae = NewCGenF->arg_end();
+    --ae;
+    Argument *CntArg = &*ae;
+    CntArg->setName("iteration");
+    // Replace the old cpu gen func with this one
+    C->addGenFunc(NewCGenF, visc::CPU_TARGET, true);
+
+    // Add counter to the actual parameter list, to create the new call
+    InputArgs.push_back(CntI);
+    CallInst* newCI = CallInst::Create(NewCGenF,
+                                       InputArgs,
+                                       NewCGenF->getName()+".output");
+    ReplaceInstWithInst(CI, newCI);
+
+    // Set second operand of the policy_getVersion call to the last function
+    // argument
+    GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF);
+    GetPolicyCI->setArgOperand(1, CntArg);
+  }
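+  // Illustration of the rewrite above (hypothetical IR, names abbreviated):
+  //   before: %out = call @leaf(args...)              ; policy call uses i64 -1
+  //   after:  %out = call @leaf.c(args..., i64 %cnt)  ; policy call uses %cnt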
+
+  // Return the Function pointer
+  DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
+  DEBUG(errs() << *CF_Pipeline << "\n");
+  return CF_Pipeline;
+}
+
+void CGT_X86::codeGen(DFInternalNode* N) {
+  // Check if N is root node and its graph is streaming. We do not do codeGen
+  // for Root in such a case
+  if(N->isRoot() && N->isChildGraphStreaming())
+    return;
+
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+//  if(N->getGenFunc() != NULL)
+//    return;
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
+
+  // Sort children in topological order before code generation
+  N->getChildGraph()->sortChildren();
+
+  // Only process if all children have a CPU x86 function;
+  // otherwise skip to the end.
+  bool codeGen = true;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+
+    if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
+      errs() << "No CPU x86 version for child node "
+             << C->getFuncPointer()->getName()
+             << "\n  Skip code gen for parent node "
+             << N->getFuncPointer()->getName() << "\n";
+      codeGen = false;
+    }
+  }
+
+  if (codeGen) {
+    Function* F = N->getFuncPointer();
+    // Create a clone of F with no instructions. Only the type is the same as
+    // F, without the extra arguments.
+    Function* F_X86;
+  
+    // Clone the function, if we are seeing this function for the first time. We
+    // only need a clone in terms of type.
+    ValueToValueMapTy VMap;
+  
+    // Create new function with the same type
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+
+    // Loop over the arguments, copying the names of arguments over.
+    Function::arg_iterator dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName()); // Copy the name over...
+      // Increment dest iterator
+      ++dest_iterator;
+    }
+
+    // Add a basic block to this empty function
+    BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
+    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
+                                        UndefValue::get(F_X86->getReturnType()), BB);
+
+    // Add Index and Dim arguments, except for the root node and for nodes
+    // whose parent's child graph is streaming.
+    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+      F_X86 = addIdxDimArgs(F_X86);
+
+    BB = &*F_X86->begin();
+    RI = cast<ReturnInst>(BB->getTerminator());
+  
+    //Add generated function info to DFNode
+//    N->setGenFunc(F_X86, visc::CPU_TARGET);
+    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+    // Loop over the arguments, to create the VMap.
+    dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      // Add mapping and increment dest iterator
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+
+    // Iterate over children in topological order
+    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+      DFNode* C = *ci;
+      // Skip dummy node call
+      if (C->isDummyNode())
+        continue;
+  
+      // Create calls to CPU function of child node
+      invokeChild_X86(C, F_X86, VMap, RI);
+  
+    }
+ 
+    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+    // Generate code for output bindings
+    // Get Exit node
+    DFNode* C = N->getChildGraph()->getExit();
+    // Get OutputType of this node
+    StructType* OutTy = N->getOutputType();
+    Value *retVal = UndefValue::get(F_X86->getReturnType());
+    // Find all the input edges to exit node
+    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+      DEBUG(errs() << "Output Edge " << i << "\n");
+      // Find the incoming edge at the requested input port
+      DFEdge* E = C->getInDFEdgeAt(i);
+  
+      assert(E && "No Binding for output element!");
+      // Find the Source DFNode associated with the incoming edge
+      DFNode* SrcDF = E->getSourceDF();
+  
+      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+  
+      // If Source DFNode is a dummyNode, edge is from parent. Get the
+      // argument from argument list of this internal node
+      Value* inputVal;
+      if(SrcDF->isEntryNode()) {
+        inputVal = getArgumentAt(F_X86, i);
+        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+      }
+      else {
+        // Edge is from an internal node.
+        // Check - code should already be generated for this source dfnode
+        assert(OutputMap.count(SrcDF)
+               && "Source node call not found. Dependency violation!");
+  
+        // Find Output Value associated with the Source DFNode using OutputMap
+        Value* CI = OutputMap[SrcDF];
+  
+        // Extract element at source position from this call instruction
+        std::vector<unsigned> IndexList;
+        IndexList.push_back(E->getSourcePosition());
+        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                               "",RI);
+        inputVal = EI;
+      }
+      std::vector<unsigned> IdxList;
+      IdxList.push_back(i);
+      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+    }
+    DEBUG(errs() << "Extracted all\n");
+    retVal->setName("output");
+    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReplaceInstWithInst(RI, newRI);
+
+  }
+
+  //-------------------------------------------------------------------------//
+  // Here, we need to check whether this node (N) has more than one version.
+  // If so, we query the policy and emit a call to each version.
+  // If not, we see which version exists, check that it is in fact an x86
+  // function, and save it as the CPU_TARGET function.
+
+  // TODO: visc_id per node, so we can use this for id for policies
+  // For now, use node function name and change it later
+  Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+  Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+  Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+  bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+  bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+  bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+  errs() << "Node: " << N->getFuncPointer()->getName()
+                     << " with tag " << N->getTag() << "\n";
+  errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
+  errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
+  errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
+  errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
+
+
+  if (N->getTag() == visc::None) {
+    // No code is available for this node. This usually means that either:
+    // - the accelerator backends mapped this node to an intermediate node,
+    //   and thus they have not produced a genFunc, or
+    // - a child node had no CPU hint, so no code gen for CPU could
+    //   take place.
+    errs() << "No GenFunc - Skipping CPU code generation for node "
+           << N->getFuncPointer()->getName() << "\n";
+  } else if (viscUtils::isSingleTargetTag(N->getTag())) {
+    // There is a single version for this node according to code gen hints.
+    // Therefore, we do not need to check the policy, we simply use the
+    // available implementation, whichever target it is for.
+
+    // Sanity check - to be removed TODO
+    switch (N->getTag()) {
+      case visc::CPU_TARGET:
+        assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+        break;
+      case visc::GPU_TARGET:
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+        break;
+      case visc::SPIR_TARGET:
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && "");
+        break;
+      default:
+        assert(false && "Unreachable: we checked that tag was single target!\n");
+        break;
+    }
+
+    // If device abstraction is enabled, then we may need to edit the node 
+    // function. In case this is a GPU or SPIR gen func, we issue a call to
+    // the runtime that waits for the device to be available
+    if (DeviceAbstraction) {
+      Function *NodeGenFunc = NULL;
+      switch (N->getTag()) {
+        case visc::GPU_TARGET:
+          NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
+          break;
+        case visc::SPIR_TARGET:
+          NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET);
+          break;
+        default:
+          break;
+      }
+
+      if (NodeGenFunc) {
+        // If we found a function to edit, we add the call to the runtime as
+        // its first statement
+        BasicBlock *BB = &*NodeGenFunc->begin();
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI());
+      }
+
+    }
+
+    Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::None);
+    N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+    N->setTag(visc::CPU_TARGET);
+
+    // Sanity checks - to be removed TODO
+    CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+    GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+    CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+    GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+    errs() << "After editing\n";
+    errs() << "Node: " << N->getFuncPointer()->getName()
+                       << " with tag " << N->getTag() << "\n";
+    errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
+    errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
+    errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
+    errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
+
+    //  assert(false && "got to the point where we have to select\n");
+  } else {
+    // We have more than one target
+
+    errs() << "Node Name (for policy) : "
+           << N->getFuncPointer()->getName() << "\n";
+
+    Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+    Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
+
+    bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+    bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
+
+    // These assertions express what we can support with the current runtime.
+    // Code generation works the same way even for other target combinations.
+    // For now, we want either CPU and GPU, or CPU and SPIR
+    assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n");
+    assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) &&
+           "Generated functions without appropriate x86 wrapper\n");
+
+    FunctionType *FT = CF->getFunctionType();
+    if (GF)
+      assert(FT == GF->getFunctionType() &&
+             "Type mismatch between generated functions for GPU and CPU targets.\n");
+    if (SF)
+      assert(FT == SF->getFunctionType() &&
+             "Type mismatch between generated functions for SPIR and CPU targets.\n");
+
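+    // Sketch of the wrapper emitted below (hypothetical IR, assuming CPU and
+    // GPU versions; types and names abbreviated):
+    //   entry:       %v = call i32 @llvm_visc_policy_getVersion(i8* %name, i64 -1)
+    //                %c = icmp eq i32 %v, 0
+    //                br i1 %c, label %version_cpu, label %not_cpu
+    //   version_cpu: %r = call @CF(args...) ; ret %r
+    //   not_cpu:     %g = icmp eq i32 %v, 1
+    //                br i1 %g, label %version_gpu, label %not_gpu
+    //   version_gpu: %r = call @GF(args...) ; ret %r
+    //   not_gpu:     ret undef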
+    // Code generation of wrapper function
+    Function *F_wrapper;
+    ValueToValueMapTy VMap;
+    F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M);
+
+    // Copy argument names over
+    Function::arg_iterator dest_iterator = F_wrapper->arg_begin();
+    for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName());
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+    // Gather all arguments of wrapper in a vector, to prepare the call to
+    // the individual gen functions
+    std::vector<Value *> GenFuncCallArgs;
+    for (Function::arg_iterator i = F_wrapper->arg_begin(), e = F_wrapper->arg_end();
+         i != e; ++i) {
+      GenFuncCallArgs.push_back(&*i);
+    }
+
+    BasicBlock *BBcurrent, *BBtrue, *BBfalse;
+
+    BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper);
+
+    StringRef FName = N->getFuncPointer()->getName();
+    size_t nameSize = FName.size()+1;
+    std::vector<Constant *> NameV;
+    for (char c: FName) {
+      NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c));
+    }
+    NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0'));
+    ArrayType *NameType =
+      ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize);
+    AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent);
+    Constant *NameConst = ConstantArray::get(NameType, NameV);
+    StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent);
+    CastInst *BI = BitCastInst::CreatePointerCast(AI,
+                     Type::getInt8PtrTy(M.getContext()), "", BBcurrent);
+    std::vector<Value *> Args;
+    Args.push_back(BI);
+    Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true));
+    Function *RTF =
+      cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion",
+      runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType()));
+    CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent);
+
+    ConstantInt *CmpConst =
+      ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true);
+    CmpInst *CmpI = CmpInst::Create(Instruction::ICmp,
+                                    CmpInst::ICMP_EQ,
+                                    RTFInst, CmpConst,
+                                    "", BBcurrent);
+
+    BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper);
+    BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper);
+    BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+    CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue);
+    ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (GF) {
+      // We have a GPU version. Generate policy check and call
+      CmpConst =
+         ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue =  BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+      
+      GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+      if (DeviceAbstraction) {
+        // Prepare arguments and function for call to wait for device runtime call
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+      }
+    }
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (SF) {
+      // We have a SPIR version. Generate policy check and call
+      CmpConst =
+         ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue =  BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+      
+      GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+      if (DeviceAbstraction) {
+        // Prepare arguments and function for call to wait for device runtime call
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+      }
+    }
+
+    RI = ReturnInst::Create(M.getContext(),
+                            UndefValue::get(FT->getReturnType()), BBfalse);
+
+    // Now, make this wrapper the node's CPU gen func.
+    // Remove all other versions and update the tag.
+    N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::CPU_TARGET);
+
+    // assert(false && "got to the point where we have to combine\n");
+  }
+
+}
+
+// Code generation for leaf nodes
+void CGT_X86::codeGen(DFLeafNode* N) {
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // At this point, the X86 backend does not support code generation for
+  // the case where allocation node is used, so we skip. This means that a
+  // CPU version will not be created, and therefore code generation will
+  // only succeed if another backend (nvptx or spir) has been invoked to
+  // generate a node function for the node including the allocation node.
+  if (N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
+
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+//  if(N->getGenFunc() != NULL)
+//    return;
+
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
+
+    errs() << "Check for cudnn or promise hint for node "
+           << N->getFuncPointer()->getName() <<  "\n";
+
+    switch (N->getTag()) {
+       case visc::CUDNN_TARGET: {
+         errs() << "CUDNN hint found. Store CUDNN function as CPU function.\n";
+         // Make sure there is a generated x86 function for cudnn
+         assert(N->getGenFuncForTarget(visc::CUDNN_TARGET) && "");
+         assert(N->hasX86GenFuncForTarget(visc::CUDNN_TARGET) && "");
+         // Store the CUDNN x86 function as the CPU generated function
+         Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+         // after adding the required number of arguments
+         if (!N->getParent()->isChildGraphStreaming())
+           Ftmp = addIdxDimArgs(Ftmp);
+
+         N->removeGenFuncForTarget(visc::CUDNN_TARGET);
+         N->setTag(visc::None);
+         N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+         N->setTag(visc::CPU_TARGET);
+         break;
+         }
+       case visc::PROMISE_TARGET: {
+         errs() << "Promise hint found. Store PROMISE function as CPU function.\n";
+         // Make sure there is a generated x86 function for promise
+         assert(N->getGenFuncForTarget(visc::PROMISE_TARGET) && "");
+         assert(N->hasX86GenFuncForTarget(visc::PROMISE_TARGET) && "");
+         // Store the PROMISE x86 function as the CPU generated function
+         Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+         // after adding the required number of arguments
+         if (!N->getParent()->isChildGraphStreaming())
+           Ftmp = addIdxDimArgs(Ftmp);
+
+         N->setTag(visc::None);
+         N->removeGenFuncForTarget(visc::PROMISE_TARGET);
+         N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+         N->setTag(visc::CPU_TARGET);
+         break;
+         }
+       case visc::GPU_TARGET:
+         // A leaf node should not have an x86 function for GPU
+         // by design of DFG2LLVM_NVPTX backend
+         assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+         break;
+       case visc::SPIR_TARGET:
+         // A leaf node should not have an x86 function for SPIR
+         // by design of DFG2LLVM_SPIR backend
+         assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+         break;
+       default:
+         break;
+    }
+
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
+
+  std::vector<IntrinsicInst *> IItoRemove;
+  std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
+  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+
+  // Clone the function, if we are seeing this function for the first time.
+  Function *F_X86;
+  ValueToValueMapTy VMap;
+  F_X86 = CloneFunction(F, VMap);
+  F_X86->removeFromParent();
+  // Insert the cloned function into the module
+  M.getFunctionList().push_back(F_X86);
+
+  // Add the new index/dim arguments to the argument list. Add arguments only
+  // if the child graph of the parent node is not streaming.
+  if(!N->getParent()->isChildGraphStreaming())
+    F_X86 = addIdxDimArgs(F_X86);
+
+  // Add generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+  /*** FIXME: HACK FOR DSSOC DEMO -- BEGIN ***/
+  /* This part of the code is meant to handle turning the CPU backend into an
+   "accelerator" backend for ApproxHPVM. For this reason, the HPVM runtime
+   needs to be essentially deactivated.                                      */
+
+  /* We look into the leaf node's function for calls to functions whose names
+   start with "tensor". These are the functions with which we replaced the
+   ApproxHPVM intrinsics, and for which we have LLVM implementations. If found,
+   it means we are dealing with an ApproxHPVM program.                       */
+  bool isApproxHPVMnode = false;
+  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
+    Instruction *I = &(*i);
+    DEBUG(errs() << *I << "\n");
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      Function *Callee = CI->getCalledFunction();
+      // Guard against indirect calls, where getCalledFunction() is null
+      if (Callee && Callee->getName().startswith("tensor")) {
+        isApproxHPVMnode = true;
+        break;
+      }
+    }
+  }
+
+  /* As in the CUDNN backend, we remove the in/out/inout attributes of tensor
+   operations, aiming to deactivate the HPVM runtime calls. This has been
+   tested through the CUDNN backend for internal node codegen, and should
+   ensure that no llvm_visc_x86_argument_ptr calls are inserted in the
+   generated function for leaf node codegen as well.                        */
+
+  /* Removing HPVM in/out/inout function attributes */
+  if (isApproxHPVMnode) {
+    for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); ai != ae; ai++) {
+      Argument *Arg = &*ai;
+      if(Arg->hasAttribute(Attribute::In))
+        Arg->removeAttr(Attribute::In);
+      if(Arg->hasAttribute(Attribute::Out))
+        Arg->removeAttr(Attribute::Out);
+      if(Arg->hasAttribute(Attribute::InOut))
+        Arg->removeAttr(Attribute::InOut);    
+    }
+  } else {
+    DEBUG(errs() << "Not an ApproxHPVM node: no attribute removal\n");
+  }
+
+  /*** FIXME: HACK FOR DSSOC DEMO -- END ***/
+
+  // Go through the arguments: any pointer argument with the 'in' attribute
+  // needs an x86_argument_ptr call to get the x86 pointer of the argument.
+  // Insert these calls in a new BB which would dominate all other BBs
+  // Create new BB
+  BasicBlock* EntryBB = &*F_X86->begin();
+  BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
+  BranchInst* Terminator = BranchInst::Create(EntryBB, BB);
+  // Insert calls
+  for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
+        ai != ae; ++ai) {
+    if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) {
+      assert(ai->getType()->isPointerTy()
+          && "Only pointer arguments can have visc in/out attributes ");
+      Function::arg_iterator aiNext = ai;
+      ++aiNext;
+      Argument* size = &*aiNext;
+      assert(size->getType() == Type::getInt64Ty(M.getContext())
+          && "Next argument after a pointer should be an i64 type");
+      CastInst* BI = BitCastInst::CreatePointerCast(&*ai,
+                                                    Type::getInt8PtrTy(M.getContext()),
+                                                    ai->getName()+".i8ptr",
+                                                    Terminator);
+      Value* ArgPtrCallArgs[] = {BI, size};
+      CallInst::Create(llvm_visc_x86_argument_ptr,
+                       ArrayRef<Value*>(ArgPtrCallArgs, 2),
+                       "", Terminator);
+
+    }
+  }
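+  // The new entry block now looks roughly like (hypothetical IR, for a single
+  // 'in' pointer argument %a with size argument %bytes_a):
+  //   getVISCPtrArgs:
+  //     %a.i8ptr = bitcast %T* %a to i8*
+  //     call @llvm_visc_x86_argument_ptr(i8* %a.i8ptr, i64 %bytes_a)
+  //     br label %entry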
+  errs() << *BB << "\n";
+
+  // Go through all the instructions
+  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
+    Instruction *I = &(*i);
+    DEBUG(errs() << *I << "\n");
+    // Leaf nodes should not contain VISC graph intrinsics or launch
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+
+    if (BuildDFG::isViscQueryIntrinsic(I)) {
+      IntrinsicInst* II = cast<IntrinsicInst>(I);
+      IntrinsicInst* ArgII;
+      DFNode* ArgDFNode;
+
+      /***********************************************************************
+      *                        Handle VISC Query intrinsics                  *
+      ***********************************************************************/
+      switch (II->getIntrinsicID()) {
+      /**************************** llvm.visc.getNode() *******************/
+      case Intrinsic::visc_getNode: {
+        // add mapping <intrinsic, this node> to the node-specific map
+        Leaf_HandleToDFNodeMap[II] = N;
+        IItoRemove.push_back(II);
+        break;
+      }
+      /************************* llvm.visc.getParentNode() ****************/
+      case Intrinsic::visc_getParentNode: {
+        // get the parent node of the arg node
+        // get argument node
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        // get the parent node of the arg node
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+        // Add mapping <intrinsic, parent node> to the node-specific map
+        // the argument node must have been added to the map, or else the
+        // code could not refer to it
+        Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
+        IItoRemove.push_back(II);
+        break;
+      }
+      /*************************** llvm.visc.getNumDims() *****************/
+      case Intrinsic::visc_getNumDims: {
+        // get node from map
+        // get the appropriate field
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim();
+        IntegerType* IntTy = Type::getInt32Ty(M.getContext());
+        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+
+        II->replaceAllUsesWith(numOfDimConstant);
+        IItoRemove.push_back(II);
+        break;
+      }
+      /*********************** llvm.visc.getNodeInstanceID() **************/
+      case Intrinsic::visc_getNodeInstanceID_x:
+      case Intrinsic::visc_getNodeInstanceID_y:
+      case Intrinsic::visc_getNodeInstanceID_z: {
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+
+        // The dfnode argument should be an ancestor of this leaf node or
+        // the leaf node itself
+        int parentLevel = N->getAncestorHops(ArgDFNode);
+        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
+               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+
+        // Get specified dimension
+        // (dim = 0) => x
+        // (dim = 1) => y
+        // (dim = 2) => z
+        int dim = (int) (II->getIntrinsicID() -
+                         Intrinsic::visc_getNodeInstanceID_x);
+        assert((dim >= 0) && (dim < 3)
+               && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!");
+
+        // For immediate ancestor, use the extra argument introduced in
+        // F_X86
+        int numParamsF = F->getFunctionType()->getNumParams();
+        int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
+        assert((numParamsF_X86 - numParamsF == 6)
+               && "Difference of arguments between function and its clone is not 6!");
+
+        if(parentLevel == 0) {
+          // Case when the query is for this node itself
+          unsigned offset = 3 + (3-dim);
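+          // (Assuming addIdxDimArgs appended idx.x, idx.y, idx.z, dim.x,
+          // dim.y, dim.z: e.g. dim = 0 (x) gives offset 6, i.e. the 6th
+          // argument from the end, which is idx.x.)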
+          // Traverse argument list of F_X86 in reverse order to find the
+          // correct index or dim argument.
+          Argument* indexVal = getArgumentFromEnd(F_X86, offset);
+          assert(indexVal && "Index argument not found. Invalid offset!");
+
+          DEBUG(errs() << *II << " replaced with " << *indexVal << "\n");
+
+          II->replaceAllUsesWith(indexVal);
+          IItoRemove.push_back(II);
+        }
+        else {
+          // Case when query is for an ancestor
+          Value* args[] = {
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
+          };
+          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance,
+                                          ArrayRef<Value*>(args, 2),
+                                          "nodeInstanceID", II);
+          DEBUG(errs() << *II << " replaced with " << *CI << "\n");
+          II->replaceAllUsesWith(CI);
+          IItoRemove.push_back(II);
+        }
+        break;
+      }
+      /********************** llvm.visc.getNumNodeInstances() *************/
+      case Intrinsic::visc_getNumNodeInstances_x:
+      case Intrinsic::visc_getNumNodeInstances_y:
+      case Intrinsic::visc_getNumNodeInstances_z: {
+
+        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
+        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
+
+        // The dfnode argument should be an ancestor of this leaf node or
+        // the leaf node itself
+        int parentLevel = N->getAncestorHops(ArgDFNode);
+        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
+               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+
+        // Get specified dimension
+        // (dim = 0) => x
+        // (dim = 1) => y
+        // (dim = 2) => z
+        int dim = (int) (II->getIntrinsicID() -
+                         Intrinsic::visc_getNumNodeInstances_x);
+        assert((dim >= 0) && (dim < 3)
+               && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!");
+
+        // For immediate ancestor, use the extra argument introduced in
+        // F_X86
+        int numParamsF = F->getFunctionType()->getNumParams();
+        int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
+        assert((numParamsF_X86 - numParamsF == 6)
+               && "Difference of arguments between function and its clone is not 6!");
+
+        if(parentLevel == 0) {
+          // Case when the query is for this node itself
+          unsigned offset = 3 - dim;
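+          // (Under the same argument layout assumption as above: e.g.
+          // dim = 0 (x) gives offset 3, the 3rd argument from the end,
+          // which is dim.x.)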
+          // Traverse argument list of F_X86 in reverse order to find the
+          // correct index or dim argument.
+          Argument* limitVal = getArgumentFromEnd(F_X86, offset);
+          assert(limitVal && "Limit argument not found. Invalid offset!");
+
+          DEBUG(errs() << *II << " replaced with " <<  *limitVal << "\n");
+
+          II->replaceAllUsesWith(limitVal);
+          IItoRemove.push_back(II);
+        }
+        else {
+          // Case when query is for an ancestor
+          Value* args[] = {
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
+          };
+          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit,
+                                          ArrayRef<Value*>(args, 2),
+                                          "numNodeInstances", II);
+          DEBUG(errs() << *II << " replaced with " << *CI << "\n");
+          II->replaceAllUsesWith(CI);
+          IItoRemove.push_back(II);
+        }
+
+        break;
+      }
+      default:
+        DEBUG(errs() << "Found unknown intrinsic with ID = " <<
+              II->getIntrinsicID() << "\n");
+        assert(false && "Unknown VISC Intrinsic!");
+        break;
+      }
+
+    } else {
+      //TODO: how to handle address space qualifiers in load/store
+    }
+
+  }
+
+  // TODO:
+  // When to replace the uses?
+  // In which order is it safe to replace the instructions in IItoReplace?
+  // Probably in the reverse order of the vectors.
+  // It might be a good idea to keep them in one vector and check the type
+  // using dyn_cast to determine whether to replace with an instruction or
+  // a value.
+
+
+  //TODO: maybe leave these instructions to be removed by a later DCE pass
+  for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
+       i != IItoRemove.end(); ++i) {
+    (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType()));
+    (*i)->eraseFromParent();
+  }
+
+  DEBUG(errs() << *F_X86);
+}
+
+} // End of namespace
+
+char DFG2LLVM_X86::ID = 0;
+static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86-dsoc",
+                                    "Dataflow Graph to LLVM for X86 backend (DSOCC version)",
+                                    false /* does not modify the CFG */,
+                                    true /* transformation, not just analysis */);
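+
+// Example invocation (hypothetical paths and flags; assumes the BuildDFG
+// plugin and this loadable module were built in-tree):
+//   opt -load LLVMBuildDFG.so -load DFG2LLVM_X86_dsoc.so \
+//       -dfg2llvm-x86-dsoc in.ll -S -o out.ll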
+
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a6c4de95376cb517de25482ecf74f0782c479004
--- /dev/null
+++ b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt ---------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_X86_dsoc
+parent = Transforms
+
diff --git a/llvm/lib/Transforms/InlineTensorCalls/CMakeLists.txt b/llvm/lib/Transforms/InlineTensorCalls/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..51f321884fe7f9cb56e11df573bb837dde89434e
--- /dev/null
+++ b/llvm/lib/Transforms/InlineTensorCalls/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( InlineTensorCalls
+  InlineTensorCalls.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.cpp b/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d31434341cf65939768d0acb7a0051d453909971
--- /dev/null
+++ b/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.cpp
@@ -0,0 +1,77 @@
+//=== InlineApproxHPVMCalls.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "INLINE_APPROXHPVM_CALLS"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+#include "llvm/IR/InstIterator.h"
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/InlineCost.h"
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/ADT/SetVector.h"
+#include <sstream>
+
+using namespace llvm;
+
+
+namespace {
+
+  struct InlineApproxHPVMCalls : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    InlineApproxHPVMCalls() : ModulePass(ID) {}
+
+    bool runOnModule(Module &M) override {
+
+      InlineFunctionInfo IFI;
+      SmallSetVector<CallSite, 16> Calls;
+      bool Changed = false;
+      for (Function &F : M) {
+        if (!F.isDeclaration() && F.getName().startswith("tensor")) {
+          Calls.clear();
+
+          for (User *U : F.users())
+            if (auto CS = CallSite(U))
+              if (CS.getCalledFunction() == &F)
+                Calls.insert(CS);
+
+          for (CallSite CS : Calls)
+            // FIXME: We really shouldn't be able to fail to inline at this point!
+            // We should do something to log or check the inline failures here.
+            Changed |= InlineFunction(CS, IFI);
+        }
+      }
+
+      // Report whether the module was actually modified
+      return Changed;
+    }
+
+  };
+
+
+} // End of namespace
+
+char InlineApproxHPVMCalls::ID = 0;
+static RegisterPass<InlineApproxHPVMCalls> X("inline-tensor-calls",
+					     "Inline ApproxHPVM tensor library function calls (CPU version)",
+					     true /* modifies the CFG */,
+					     true /* transformation,   *
+						   * not just analysis */);
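+
+// Example invocation (hypothetical path; assumes the module was built in-tree
+// and the tensor implementations are already linked into linked.ll):
+//   opt -load InlineTensorCalls.so -inline-tensor-calls linked.ll -S -o out.ll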
+
diff --git a/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.exports b/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.exports
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/llvm/lib/Transforms/InlineTensorCalls/LLVMBuild.txt b/llvm/lib/Transforms/InlineTensorCalls/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8fff7891af1b6b81fd642bb1300a23c2caca6918
--- /dev/null
+++ b/llvm/lib/Transforms/InlineTensorCalls/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/InlineTensorCalls/LLVMBuild.txt ---------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = InlineTensorCalls
+parent = Transforms
+
diff --git a/llvm/lib/Transforms/ReplaceIntrinsics/CMakeLists.txt b/llvm/lib/Transforms/ReplaceIntrinsics/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0bfb2bf2219ba1278d14b78e7ee5dc0a0abd2702
--- /dev/null
+++ b/llvm/lib/Transforms/ReplaceIntrinsics/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( ReplaceIntrinsics
+  ReplaceIntrinsics.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/llvm/lib/Transforms/ReplaceIntrinsics/LLVMBuild.txt b/llvm/lib/Transforms/ReplaceIntrinsics/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6450fa1714de0200ce18848919d69cff895848d0
--- /dev/null
+++ b/llvm/lib/Transforms/ReplaceIntrinsics/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/ReplaceIntrinsics/LLVMBuild.txt ---------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = ReplaceIntrinsics
+parent = Transforms
+
diff --git a/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.cpp b/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef649d8e170451d5bf2f133d6113cbfeb30f046e
--- /dev/null
+++ b/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.cpp
@@ -0,0 +1,516 @@
+//=== ReplaceApproxHPVMIntrinsicsWithFCalls.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "REPLACE_APPROXHPVM_INTRINSICS_WITH_FCALLS"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm-c/Core.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h"
+#include <sstream>
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+// TODO: We still need in place analysis, if calls have the same interface
+using namespace inplacedfg;
+
+namespace {
+// Helper class declarations
+
+// Replace ApproxHPVM intrinsics with LLVM function calls,
+// aiming to go through the CPU backend code generation.
+
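+// For example (hypothetical IR, types elided), a leaf-node intrinsic such as
+//   %r = call @llvm.visc.tensor.mul(%a, %b)
+// is rewritten into
+//   %r = call @tensorGemmCPU(%a, %b)
+// by the leaf-node code generation below.
+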
+struct DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls() : DFG2LLVM(ID) {}
+private:
+
+public:
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<BuildDFG>();
+    AU.addRequired<InPlaceDFGAnalysisWrapper>();
+    AU.addPreserved<BuildDFG>();
+    AU.addPreserved<InPlaceDFGAnalysisWrapper>();
+  }
+
+  bool runOnModule(Module &M);
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_ReplaceApproxHPVMIntrinsicsWithFCalls : public CodeGenTraversal {
+
+private:
+  //Member variables
+  InPlaceDFGAnalysis::InPlaceDFGParameter *IPP;
+
+  // VISC Runtime API and Tensor runtime API
+
+  /* TODO: I believe that TensorRt is not needed, since we will have llvm
+   implementations linked in, so init and cleanup calls can be removed and
+   relevant code also, but I leave it in for now until verified. */
+  Constant* llvm_hpvm_initTensorRt;
+  Constant* llvm_hpvm_cleanupTensorRt;
+//  Constant* hpvm_request_tensor; DONE: request tensor will not be used
+
+  // Functions
+  bool isValidOperandForInPlaceOperation(Value *Op, Function *Fgen, DFNode *N);
+
+  // Virtual Functions
+  void init();
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+
+  // Constructor
+  CGT_ReplaceApproxHPVMIntrinsicsWithFCalls(Module &_M, BuildDFG &_DFG, InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP)
+  : CodeGenTraversal(_M, _DFG), IPP(&_IPP) {
+    initRuntimeAPI();
+  }
+
+};
+
+bool CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::isValidOperandForInPlaceOperation(Value *Op,
+                                                  Function *Fgen,
+                                                  DFNode *N) {
+  // We only expect the if branch to be taken
+  if (Argument *Arg = dyn_cast<Argument>(Op)) {
+    DEBUG(errs() << *Arg << "\t: argument, candidate for in place\n");
+    assert((Arg->getParent() == Fgen) &&
+          "Extra Parameter in body of Function\n");
+    // Candidate parameter is a function argument
+    // In this case, consult the result of in place analysis
+    // Find position in arg list
+    unsigned pos = Arg->getArgNo();
+    // If this parameter cannot be used for in place operation
+    // code gen cannot continue
+    if (IPP->at(N)[pos]) {
+      DEBUG(errs() << *Arg << "\t: argument, suitable for in place\n");
+      return true;
+    } else {
+      DEBUG(errs() << *Arg << "\t: argument, not suitable for in place\n");
+      return false;
+    }
+  }
+  else {
+    // If it is not an argument, then it needs to be the result of
+    // another intrinsic. These are new objects that are allocated,
+    // and consumed by next intrinsic. Alternatively, the intrinsic
+    // could have been replaced by a call to an LLVM function.
+    // We do not expect a merge pass to have run before the replacement pass,
+    // therefore we do not expect to go in the else branch.
+    DEBUG(errs() << *Op << "\t: Test for result of intrinsic operation\n");
+    if (dyn_cast<IntrinsicInst>(Op)) {
+      DEBUG(errs() << *Op << "\t: local, suitable for in place\n");
+      return true;
+    } else if (CallInst *CI = dyn_cast<CallInst>(Op)) {
+      // Guard against indirect calls, where getCalledFunction() is null
+      Function *Callee = CI->getCalledFunction();
+      if (Callee && Callee->getName().startswith("tensor"))
+        return true;
+      else
+        return false;
+    }
+    else {
+      DEBUG(errs() << *Op << "\t: local, not suitable for in place\n");
+      return false;
+    }
+  }
+}
+
+
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::init() {
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n");
+
+  // FIXME: set correct path
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_cpu_runtime.ll";
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  if(runtimeModule == nullptr)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n");
+
+  // Get or insert Global declarations for
+  // - initialization
+  // - cleanup
+  // - request a tensor
+  DECLARE(llvm_hpvm_initTensorRt);
+  DECLARE(llvm_hpvm_cleanupTensorRt);
+//  DECLARE(hpvm_request_tensor);
+
+  // Find visc.init and visc.cleanup calls, and add placeholder methods
+  // for initialization and cleanup of the hpvm tensor runtime
+
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n");
+  InitCall = cast<Instruction>(*VI->user_begin());
+  CallInst::Create(llvm_hpvm_initTensorRt,
+                   ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)),
+                   "", InitCall);
+
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n");
+  CleanupCall = cast<Instruction>(*VC->user_begin());
+  CallInst::Create(llvm_hpvm_cleanupTensorRt, ArrayRef<Value*>(), "", CleanupCall);
+
+}
+
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::codeGen(DFInternalNode* N) {
+  errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n";
+  errs () << "Skipping internal node\n";
+}
+
+  
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::codeGen(DFLeafNode* N) {
+
+  // Skip if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // Abort if it is an allocation node
+  if(N->isAllocationNode()) {
+    assert(false && "Allocation Node not expected in ApproxHPVM");
+    return;
+  }
+
+  // Search for intrinsic only if it has the right hint
+  if (!checkPreferredTarget(N, visc::CPU_TARGET)) {
+    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+    return;
+  }
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+  errs()<<"function name = "<< F->getName()<<"\n";
+
+  std::vector<IntrinsicInst *> IItoRemove;
+
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    if (BuildDFG::isViscIntrinsic(I)) {
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+      assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")
+        && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n");
+      /********************* Handle VISC Tensor intrinsics ********************/
+      // We replace them with calls to functions with implementations at the LLVM level
+      switch (II->getIntrinsicID()) {
+
+      case Intrinsic::visc_tensor_convolution:
+      { /* llvm.hpvm.tensor.convolution */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor convolution \n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+        Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+
+        Constant* conv_mode = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+        Constant* conv_precision = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+
+        Args.push_back(conv_mode);
+        Args.push_back(conv_precision);
+
+        // Create function call
+        Constant* tensorConvolutionCPU;
+        DECLARE(tensorConvolutionCPU);
+
+        CallInst* CI = CallInst::Create(tensorConvolutionCPU,
+                                        Args, "", II);
+        // We can replace the call to hpvm.tensor.convolution with the LLVM call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_mul:
+      { /* llvm.hpvm.tensor.mul */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor mul\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+
+        // Create function call
+        Constant* tensorGemmCPU;
+        DECLARE(tensorGemmCPU);
+	
+        CallInst* CI = CallInst::Create(tensorGemmCPU,
+                                        Args, "", II);
+        // We can replace the call to hpvm.tensor.mul with the LLVM call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_add:
+      { /* llvm.hpvm.tensor.add */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor add\n");
+        // Tensor add(a,b) is in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+
+        // FIXME: re-enable the in-place check below once verified
+        //assert(inplace &&
+        //       "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+
+        // Create function call
+        Constant* tensorAddCPU;
+        DECLARE(tensorAddCPU);
+        CallInst::Create(tensorAddCPU, Args, "", II);
+        // We can replace the call to hpvm.tensor.add with the 1st argument
+        // that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_pool_max:
+      case Intrinsic::visc_tensor_pool_mean:
+      { /* llvm.visc.tensor.pool.max / llvm.visc.tensor.pool.mean */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor pooling\n");
+        // Tensor pooling is treated as in place for its input argument.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list - tensorPooling(input, poolFunction, window_height, window_width, vertical_pad, horizontal_pad,
+	//                               vertical_stride, horizontal_stride);
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        // Select the pooling function: 0 for max pool, 1 for mean pool.
+        int pool_type = 0;
+        if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean)
+          pool_type = 1;
+
+        Constant* constPoolType =
+            ConstantInt::get(Type::getInt32Ty(M.getContext()), pool_type);
+        Args.push_back(constPoolType);
+        Args.push_back(II->getOperand(1));
+        Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+        Args.push_back(II->getOperand(6));
+
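+        // Presumed runtime shape (assumed from the argument list above):
+        //   void *tensorPoolingCPU(void *input, int poolFunction,
+        //                          int window_height, int window_width,
+        //                          int vertical_pad, int horizontal_pad,
+        //                          int vertical_stride, int horizontal_stride);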
+        // Create function call
+        Constant* tensorPoolingCPU;
+        DECLARE(tensorPoolingCPU);
+        CallInst* CI = CallInst::Create(tensorPoolingCPU, Args, "", II);
+
+        // Replace intrinsic result uses with the result of the LLVM call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_relu:
+      case Intrinsic::visc_tensor_clipped_relu:
+      case Intrinsic::visc_tensor_tanh:
+      { /* llvm.visc.tensor.relu, llvm.visc.tensor.clipped.relu,
+           llvm.visc.tensor.tanh */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor activation functions\n");
+        // These activation functions are in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
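+        // All three activations lower to a single-argument, in-place CPU
+        // runtime call (presumed shape: void tensor<Act>CPU(void *input);
+        // "<Act>" is shorthand here, not an actual runtime symbol).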
+        if (II->getIntrinsicID() == Intrinsic::visc_tensor_relu) {
+          // Create function call
+          Constant* tensorReluCPU;
+          DECLARE(tensorReluCPU);
+          CallInst::Create(tensorReluCPU, Args, "", II);
+        }
+        else if (II->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu) {
+          // Create function call
+          Constant* tensorRelu2CPU;
+          DECLARE(tensorRelu2CPU);
+          CallInst::Create(tensorRelu2CPU, Args, "", II);
+        }
+        else if (II->getIntrinsicID() == Intrinsic::visc_tensor_tanh) {
+          // Create function call
+          Constant* tensorTanhCPU;
+          DECLARE(tensorTanhCPU);
+          CallInst::Create(tensorTanhCPU, Args, "", II);
+        }
+
+        // We can replace the call to the activation intrinsic with the 1st
+        // argument that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_softmax:
+      { /* llvm.visc.tensor.softmax */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor softmax\n");
+        // Tensor softmax(a) is in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
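+        // Presumed runtime shape (assumed, not taken from a header):
+        //   void tensorSoftmaxCPU(void *input); // operates in place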
+        // Create function call
+        Constant* tensorSoftmaxCPU;
+        DECLARE(tensorSoftmaxCPU);
+        CallInst::Create(tensorSoftmaxCPU, Args, "", II);
+        // We can replace the call to hpvm.tensor.softmax with the 1st argument
+        // that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      default:
+        llvm_unreachable("Unknown VISC Intrinsic!");
+        break;
+
+      }
+
+    }
+
+  }
+
+  // We need to do this explicitly: DCE pass may not remove them.
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around.
+  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
+       re = IItoRemove.rend(); ri != re; ++ri) {
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
+    (*ri)->eraseFromParent();
+  }
+
+  return;
+}
+
+bool DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls PASS\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // Get the In Place Analysis Results
+  InPlaceDFGAnalysis::InPlaceDFGParameter IPP =
+    (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP();
+  // Print results
+  printInPlaceDFGParameter(IPP);
+
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+ 
+  // Visitor for Code Generation Graph Traversal
+  CGT_ReplaceApproxHPVMIntrinsicsWithFCalls *CGTVisitor =
+    new CGT_ReplaceApproxHPVMIntrinsicsWithFCalls(M, DFG, IPP);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode: Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+  }
+
+  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+  delete CGTVisitor;
+
+  return true;
+}
+
+} // End of namespace
+
+char DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls::ID = 0;
+static RegisterPass<DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls> X("replace-intrinsics",
+                                      "Replace ApproxHPVM intrinsics with LLVM calls",
+                                      false /* does not modify the CFG */,
+                                      true /* transformation, not just analysis */);
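+
+// Typical invocation (illustrative; mirrors the lenet_dsoc Makefile):
+//   opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so \
+//       -load ReplaceIntrinsics.so -inplace -replace-intrinsics \
+//       input.visc.ll -o output.bc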
+
diff --git a/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.exports b/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.exports
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile
index 03b3941834b462c130adab73e739b41f68cd9f05..7be710803ab2a9bf884c33c215464199f3f28217 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile
@@ -15,18 +15,17 @@ APP = lenet
 TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include
 TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include
 TENSOR_RT_SRC_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/src
-# FIXME: Fix this path to be in the BUILD directories (Currently source directory)
-#PATH_TO_CPU_TensorRt = $(HPVM_BUILD_DIR)/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime_cpu.ll
 
 
 CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH)  -fno-exceptions -ffast-math -std=c++11 -O3
-CCFLAGS += -DDEVICE=CUDNN_TARGET
 LINKER_FLAGS = -lpthread -lcudart -lcurand -lOpenCL
 
 HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib
 
 
-VISC_OPTFLAGS = -load  $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load  $(HPVM_LIB_DIR)/ReplaceIntrinsics.so   -load  $(HPVM_LIB_DIR)/InlineTensorCalls.so    -load  $(HPVM_LIB_DIR)/DFG2LLVM_X86_dsoc.so -load  $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace   -dfg2llvm-x86-dsoc  -clearDFG
+OPTFLAGS1 = -load  $(HPVM_LIB_DIR)/LLVMBuildDFG.so   -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so  -load  $(HPVM_LIB_DIR)/ReplaceIntrinsics.so    -load  $(HPVM_LIB_DIR)/DFG2LLVM_X86_dsoc.so    -load  $(HPVM_LIB_DIR)/LLVMClearDFG.so  -inplace  -replace-intrinsics  -dfg2llvm-x86-dsoc  -clearDFG
+
+OPTFLAGS2 = -load  $(HPVM_LIB_DIR)/InlineTensorCalls.so  -inline-tensor-calls
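+
+# OPTFLAGS1 lowers the ApproxHPVM tensor intrinsics to CPU runtime calls and
+# clears the DFG; OPTFLAGS2 runs only after tensor_cpu_runtime.bc has been
+# linked in, so -inline-tensor-calls can inline those runtime calls.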
 
 
 TARGET = $(BUILD_DIR)/$(APP).opt.bc
@@ -45,13 +44,16 @@ $(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp
 $(BUILD_DIR)/%.visc.ll: $(BUILD_DIR)/%.ll
 	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $< -S -o $@
 
 $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.visc.ll
+	$(OPT) $(OPTFLAGS1) $<  -o $@
 	$(CC) -emit-llvm  -c  $(TENSOR_RT_SRC_DIR)/tensor_cpu_runtime.cc  -o  $(BUILD_DIR)/tensor_cpu_runtime.bc
 	$(OPT) -always-inline $(BUILD_DIR)/tensor_cpu_runtime.bc  -o  $(BUILD_DIR)/tensor_cpu_runtime.bc
-	$(LLVM_LINK) $<  $(BUILD_DIR)/tensor_cpu_runtime.bc -o  $(BUILD_DIR)/lenet_tensor_rt.bc 
-	$(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/lenet_tensor_rt.bc -o $@
-	$(LLVM_LINK) $@ $(VISC_RT_PATH) -o $(BUILD_DIR)/lenet_linked.bc
-	$(CC) $(BUILD_DIR)/lenet_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/lenet_linked $(LINKER_FLAGS)
+	$(LLVM_LINK) $@ $(BUILD_DIR)/tensor_cpu_runtime.bc  -o  $(BUILD_DIR)/lenet_tensor_rt.bc
+	$(OPT) $(OPTFLAGS2) $(BUILD_DIR)/lenet_tensor_rt.bc  -o  $(BUILD_DIR)/lenet_inline.bc
+	#$(LLVM_LINK) $@ $(VISC_RT_PATH) -o $(BUILD_DIR)/lenet_linked.bc
+	#$(CC) $(BUILD_DIR)/lenet_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/lenet_linked $(LINKER_FLAGS)
 
 $(BUILD_DIR):
 	mkdir -p $@
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/approximate.py b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/approximate.py
new file mode 100644
index 0000000000000000000000000000000000000000..c77d43bf8ff554b77623ed0ea291d4590e52cb3a
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/approximate.py
@@ -0,0 +1,87 @@
+
+
+import os
+import sys
+import subprocess
+
+
+# Configuration variables - change for each benchmark
+bench_build_dir = os.environ["LLVM_SRC_ROOT"] + "/test/VISC/DNN_Benchmarks/benchmarks/lenet/build/"
+visc_file_name = "lenet.visc.ll" 
+num_tests = 10
+threshold_accuracy = 98.0
+binary_name = "./lenet_tune"
+result_dir = "./opentuner_test_result"
+num_flags = 14 # FIXME: Auto-extract the number of tensor ops from the bitcode file
+error_range = 9
+
+
+def change_dir():
+    os.chdir(bench_build_dir)
+    print(os.getcwd())
+
+def setup_env():
+    os.environ["LD_LIBRARY_PATH"] = os.environ["LD_LIBRARY_PATH"] + ":" + os.environ["LLVM_BUILD_ROOT"] + "/lib" 
+    print(os.environ["LD_LIBRARY_PATH"])
+
+    
+def build_binaries():
+    subprocess.call("make", shell=True)
+    
+
+def run_autotuner():
+
+    # Change build directory to benchmark build directory
+    change_dir()
+
+    LLVM_SRC_ROOT = os.environ["LLVM_SRC_ROOT"]
+    autotuner_cmd = "python  " + LLVM_SRC_ROOT + "/projects/hpvm-tensor-rt/opentuner/autotuner/approxhpvm_tuner.py " + \
+                    " --test-limit " + str(num_tests) + \
+                    " --accuracy " + str(threshold_accuracy) + \
+                    " --binary " + str(binary_name) + \
+                    " --result-dir " + str(result_dir) + \
+                    " --num-flags " + str(num_flags) + \
+                    " --error-range " + str(error_range)  
+
+    print(autotuner_cmd)
+    
+    subprocess.call(autotuner_cmd, shell=True)
+    
+
+def add_approx_info():
+
+    # Change directory and setup env variables
+    change_dir()
+    setup_env()
+
+    subprocess.call("which opt", shell=True)
+
+    approxinfo_cmd = "opt  -load LLVMBuildDFG.so  -load InsertApproxInfo.so  -insert-approxinfo --results-dir " + \
+                     result_dir + " " + \
+                     visc_file_name + " -S -o " + visc_file_name + "_approx.ll"
+
+    print(approxinfo_cmd)
+    subprocess.call(approxinfo_cmd, shell=True)
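+
+    # Note: with the defaults above, the output is named
+    # "lenet.visc.ll_approx.ll"; run_scheduler() reads that same file.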
+
+
+
+    
+def run_scheduler():
+
+    change_dir()
+    setup_env()
+
+    sched_cmd = "opt  -load LLVMBuildDFG.so  -load ApproxScheduler.so  -approx-scheduler --category quad --rank 4 " + \
+                     visc_file_name + "_approx.ll" + " -S -o " + visc_file_name + "_sched_out.ll"
+    print(sched_cmd)
+    subprocess.call(sched_cmd, shell=True)
+    
+    
+    
+if __name__ == "__main__":
+
+    #build_binaries()
+    #run_autotuner()
+    #add_approx_info()
+    run_scheduler()
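+    # Full pipeline order (earlier steps are currently disabled above):
+    #   build_binaries() -> run_autotuner() -> add_approx_info() -> run_scheduler()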
+    
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/opentuner_run.sh b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/opentuner_run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..13209394589933ad648e8a9ede1b6fc3b013a264
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/opentuner_run.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+BUILD_DIR=${LLVM_SRC_ROOT}/test/VISC/DNN_Benchmarks/benchmarks/lenet/build/
+cd $BUILD_DIR
+python  ${LLVM_SRC_ROOT}/projects/hpvm-tensor-rt/opentuner/autotuner/approxhpvm_tuner.py --test-limit 300 --accuracy 86.7 --binary ./lenet_tune --result-dir ./opentuner_test_result --num-flags 14 --error-range 9
+
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/setup_tyler_paths.sh b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/setup_tyler_paths.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3548f182f198724600aee855b66169a1bdf12a3a
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/setup_tyler_paths.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# CUDNN Path setup
+module load cuda-toolkit/9.1
+export CUDA_INCLUDE_PATH=/software/cuda-9.1/include
+export CUDNN_PATH=/software/cuda-9.1/lib64/
+export LIBRARY_PATH=/software/cuda-9.1/lib64/:$LIBRARY_PATH
+export LD_LIBRARY_PATH=/software/cuda-9.1/lib64/:$LD_LIBRARY_PATH
+
+# HPVM Path setup
+export CPATH=$CPATH:/home/hsharif3/anaconda2/include/
+export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH
+export LLVM_BUILD_ROOT=/home/hsharif3/Gitlab/hpvm/build/
+export LLVM_SRC_ROOT=/home/hsharif3/Gitlab/hpvm/llvm/
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp
index 1746fc13dc4809f8c3d806fa144903fac50f3315..67213d38302982ee677ec0337aad5728d6de27ea 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp
@@ -10,7 +10,7 @@
 #include <tensorUtils.h> 
 
 void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); 
@@ -18,7 +18,7 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -26,7 +26,7 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_2_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_relu(t1); 
@@ -34,7 +34,7 @@ void var_2_node(void* t1, size_t bytes_t1) {
 }
 
 void var_3_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); 
@@ -42,7 +42,7 @@ void var_3_node(void* t1, size_t bytes_t1) {
 }
 
 void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); 
@@ -50,7 +50,7 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -58,7 +58,7 @@ void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_6_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_relu(t1); 
@@ -66,7 +66,7 @@ void var_6_node(void* t1, size_t bytes_t1) {
 }
 
 void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 1, 1, 2, 2); 
@@ -74,7 +74,7 @@ void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -82,7 +82,7 @@ void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_9_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_relu(t1); 
@@ -90,7 +90,7 @@ void var_9_node(void* t1, size_t bytes_t1) {
 }
 
 void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_mul(t1, t2); 
@@ -98,7 +98,7 @@ void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -106,7 +106,7 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_12_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_relu(t1); 
@@ -114,7 +114,7 @@ void var_12_node(void* t1, size_t bytes_t1) {
 }
 
 void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_mul(t1, t2); 
@@ -122,7 +122,7 @@ void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -130,7 +130,7 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_15_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_relu(t1); 
@@ -138,7 +138,7 @@ void var_15_node(void* t1, size_t bytes_t1) {
 }
 
 void var_16_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::CPU_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_softmax(t1);