From 024890d34228c3e5237adadc215905be6cf7c7b5 Mon Sep 17 00:00:00 2001
From: Akash Kothari <akashk4@tyler.cs.illinois.edu>
Date: Mon, 21 Dec 2020 09:18:20 -0600
Subject: [PATCH] Remove the dsoc x86 passes

---
 lib/DFG2LLVM_X86_dsoc/CMakeLists.txt        |   13 -
 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports  |    0
 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp | 2128 -------------------
 lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt         |   22 -
 4 files changed, 2163 deletions(-)
 delete mode 100644 lib/DFG2LLVM_X86_dsoc/CMakeLists.txt
 delete mode 100644 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports
 delete mode 100644 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
 delete mode 100644 lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt

diff --git a/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt b/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt
deleted file mode 100644
index 75569addda..0000000000
--- a/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-if(WIN32 OR CYGWIN)
-  set(LLVM_LINK_COMPONENTS Core Support)
-endif()
-
-add_llvm_loadable_module( DFG2LLVM_X86_dsoc 
-  DFG2LLVM_X86_dsoc.cpp
-
-  DEPENDS
-  intrinsics_gen
-  PLUGIN_TOOL
-  opt
-  )
-
diff --git a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
deleted file mode 100644
index fbe5e4f6bd..0000000000
--- a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
+++ /dev/null
@@ -1,2128 +0,0 @@
-//===-------------------------- DFG2LLVM_X86.cpp --------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "DFG2LLVM_X86"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Linker/Linker.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/SupportVISC/DFG2LLVM.h"
-
-using namespace llvm;
-using namespace builddfg;
-using namespace dfg2llvm;
-
-// VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
-// Command line option to enable device abstraction or not
-static cl::opt<bool>
-DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
-                   cl::desc("Enable visc device abstraction"));
-
-
-namespace {
-
-// Helper Functions
-static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
-  if (!isa<CallInst>(I))
-    return false;
-  CallInst *CI = cast<CallInst>(I);
-  return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion");
-}
-
-CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
-  for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) {
-    Instruction *I = &*ib;
-    if (isVISCCall_llvm_visc_policy_getVersion(I))
-      return cast<CallInst>(I);
-  }
-  return NULL;
-}
-
-// DFG2LLVM_X86 - The first implementation.
-struct DFG2LLVM_X86 : public DFG2LLVM {
-  static char ID; // Pass identification, replacement for typeid
-  DFG2LLVM_X86() :DFG2LLVM(ID) {}
-
-private:
-  // Member variables
-
-  // Functions
-
-public:
-  bool runOnModule(Module &M);
-};
-
-// Visitor for Code generation traversal (tree traversal for now)
-class CGT_X86 : public CodeGenTraversal {
-
-private:
-  //Member variables
-
-  Constant* malloc;
-  // VISC Runtime API
-  Constant* llvm_visc_x86_launch;
-  Constant* llvm_visc_x86_wait;
-  Constant* llvm_visc_x86_argument_ptr;
-
-  Constant* llvm_visc_streamLaunch;
-  Constant* llvm_visc_streamPush;
-  Constant* llvm_visc_streamPop;
-  Constant* llvm_visc_streamWait;
-  Constant* llvm_visc_createBindInBuffer;
-  Constant* llvm_visc_createBindOutBuffer;
-  Constant* llvm_visc_createEdgeBuffer;
-  Constant* llvm_visc_createLastInputBuffer;
-  Constant* llvm_visc_createThread;
-  //Constant* llvm_visc_freeThreads;
-  Constant* llvm_visc_bufferPush;
-  Constant* llvm_visc_bufferPop;
-  Constant* llvm_visc_x86_dstack_push;
-  Constant* llvm_visc_x86_dstack_pop;
-  Constant* llvm_visc_x86_getDimLimit;
-  Constant* llvm_visc_x86_getDimInstance;
-
-  //Functions
-  std::vector<IntrinsicInst*>* getUseList(Value* LI);
-  Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
-  void addDoWhileLoop(Instruction*, Instruction*, Value*);
-  void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
-  Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
-  Argument* getArgumentFromEnd(Function* F, unsigned offset);
-  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
-                      Instruction* InsertBefore);
-  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
-                       Instruction* InsertBefore);
-  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
-                       Instruction* InsertBefore);
-  StructType* getArgumentListStructTy(DFNode*);
-  Function* createFunctionFilter(DFNode* C);
-  void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>,
-                      Value*, Value*, Instruction*);
-  Function* createLaunchFunction(DFInternalNode*);
-  Function* createPushFunction(DFInternalNode*);
-  Function* createPopFunction(DFInternalNode*);
-  Function* createWaitFunction(DFInternalNode*);
-
-  // Virtual Functions
-  void init() {
-    VISCTimer = VISCTimer_X86;
-    TargetName = "X86";
-  }
-  void initRuntimeAPI();
-  void codeGen(DFInternalNode* N);
-  void codeGen(DFLeafNode* N);
-  Function* codeGenStreamPush(DFInternalNode* N);
-  Function* codeGenStreamPop(DFInternalNode* N);
-
-public:
-  // Constructor
-  CGT_X86(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
-    init();
-    initRuntimeAPI();
-  }
-
-  void codeGenLaunch(DFInternalNode* Root);
-  void codeGenLaunchStreaming(DFInternalNode* Root);
-};
-
-bool DFG2LLVM_X86::runOnModule(Module &M) {
-  errs() << "\nDFG2LLVM_X86 PASS\n";
-
-  // Get the BuildDFG Analysis Results:
-  // - Dataflow graph
-  // - Maps from i8* hansles to DFNode and DFEdge
-  BuildDFG &DFG = getAnalysis<BuildDFG>();
-
-  //DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode*> Roots = DFG.getRoots();
-  // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-  // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
-
-  // Visitor for Code Generation Graph Traversal
-  CGT_X86 *CGTVisitor = new CGT_X86(M, DFG);
-
-  // Iterate over all the DFGs and produce code for each one of them
-  for (auto rootNode: Roots) {
-    // Initiate code generation for root DFNode
-    CGTVisitor->visit(rootNode);
-    // Go ahead and replace the launch intrinsic with pthread call, otherwise return now.
-    // TODO: Later on, we might like to do this in a separate pass, which would
-    // allow us the flexibility to switch between complete static code generation
-    // for DFG or having a customized runtime+scheduler
-    
-    // Do streaming code generation if root node is streaming. Usual otherwise
-    if(rootNode->isChildGraphStreaming())
-      CGTVisitor->codeGenLaunchStreaming(rootNode);
-    else
-      CGTVisitor->codeGenLaunch(rootNode);
-  }
-
-  delete CGTVisitor;
-  return true;
-}
-
-// Initialize the VISC runtime API. This makes it easier to insert these calls
-void CGT_X86::initRuntimeAPI() {
-
-  // Load Runtime API Module
-  SMDiagnostic Err;
-
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
-  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
-
-  // FIXME: hardcoded path to 'build_dsoc' - should probably be a environment variable
-  Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot+"/../build_dsoc/projects/visc-rt/visc-rt.ll";
-
-  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-
-  if(runtimeModule == NULL)
-    DEBUG(errs() << Err.getMessage());
-  else
-    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
-
-  // Get or insert the global declarations for launch/wait functions
-  DECLARE(llvm_visc_x86_launch);
-  DECLARE(malloc);
-  DECLARE(llvm_visc_x86_wait);
-  DECLARE(llvm_visc_x86_argument_ptr);
-  DECLARE(llvm_visc_streamLaunch);
-  DECLARE(llvm_visc_streamPush);
-  DECLARE(llvm_visc_streamPop);
-  DECLARE(llvm_visc_streamWait);
-  DECLARE(llvm_visc_createBindInBuffer);
-  DECLARE(llvm_visc_createBindOutBuffer);
-  DECLARE(llvm_visc_createEdgeBuffer);
-  DECLARE(llvm_visc_createLastInputBuffer);
-  DECLARE(llvm_visc_createThread);
-  //DECLARE(llvm_visc_freeThreads);
-  DECLARE(llvm_visc_bufferPush);
-  DECLARE(llvm_visc_bufferPop);
-  DECLARE(llvm_visc_x86_dstack_push);
-  DECLARE(llvm_visc_x86_dstack_pop);
-  DECLARE(llvm_visc_x86_getDimLimit);
-  DECLARE(llvm_visc_x86_getDimInstance);
-
-  // Get or insert timerAPI functions as well if you plan to use timers
-  initTimerAPI();
-
-  // Insert init context in main
-  Function* VI = M.getFunction("llvm.visc.init");
-  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
-  DEBUG(errs() << "Inserting x86 timer initialization\n");
-  Instruction* I = cast<Instruction>(*VI->user_begin());
-  initializeTimerSet(I);
-  switchToTimer(visc_TimerID_NONE, I);
-  // Insert code for initializing the sceduling policy
-  Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init",
-    runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()));
-  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
-  DEBUG(errs() << *IPCallInst << "\n");
-
-  // If device abstraction is enabled, we add a runtime call to start the
-  // device status simulation
-  if (DeviceAbstraction) {
-    Function *ID =
-      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_start",
-        runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType()));
-    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
-    DEBUG(errs() << *IDCallInst << "\n");
-  }
-
-  // Insert print instruction at visc exit
-  Function* VC = M.getFunction("llvm.visc.cleanup");
-  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
-
-  // Insert code for clearing the sceduling policy
-  I = cast<Instruction>(*VC->user_begin());
-  IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear",
-    runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()));
-  IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
-  errs() << *IPCallInst << "\n";
-
-  DEBUG(errs() << "Inserting x86 timer print\n");
-  printTimerSet(I);
-
-  // If device abstraction is enabled, we add a runtime call to end the
-  // device status simulation
-  if (DeviceAbstraction) {
-    Function *ID =
-      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_end",
-        runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType()));
-    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
-    DEBUG(errs() << *IDCallInst << "\n");
-  }
-
-}
-
-/* Returns vector of all wait instructions
- */
-std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
-  std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>();
-  // It must have been loaded from memory somewhere
-  for(Value::user_iterator ui = GraphID->user_begin(),
-      ue = GraphID->user_end(); ui!=ue; ++ui) {
-    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
-      UseList->push_back(waitI);
-    }
-    //else if (PHINode* PN = dyn_cast<PHINode>(*ui)){
-      //errs() << "Found PhiNode use of graphID\n";
-      //std::vector<IntrinsicInst*>* phiUseList  = getUseList(PN);
-      //UseList->insert(UseList->end(), phiUseList->begin(), phiUseList->end());
-      //free(phiUseList);
-    //}
-    else {
-      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
-    }
-  }
-  return UseList;
-}
-
-/* Traverse the function argument list in reverse order to get argument at a
- * distance offset fromt he end of argument list of function F
- */
-Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
-  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0)
-         && "Invalid offset to access arguments!");
-  Function::arg_iterator e = F->arg_end();
-  // Last element of argument iterator is dummy. Skip it.
-  e--;
-  Argument* arg;
-  for( ; offset != 0; e--) {
-    offset--;
-    arg = &*e;
-  }
-  return arg;
-}
-
-/* Add Loop around the instruction I
- * Algorithm:
- * (1) Split the basic block of instruction I into three parts, where the
- * middleblock/body would contain instruction I.
- * (2) Add phi node before instruction I. Add incoming edge to phi node from
- * predecessor
- * (3) Add increment and compare instruction to index variable
- * (4) Replace terminator/branch instruction of body with conditional branch
- * which loops over bidy if true and goes to end if false
- * (5) Update phi node of body
- */
-void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
-                          Instruction* BodyEnd, Value* TerminationCond) {
-  BasicBlock* Entry = CondBlockStart->getParent();
-  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
-  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
-  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
-
-  // Replace the terminator instruction of conditional with new conditional
-  // branch which goes to while.body if true and branches to while.end otherwise
-  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
-  ReplaceInstWithInst(CondBlock->getTerminator(), BI);
-
-  // While Body should jump to condition block
-  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
-  ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
-
-}
-
-Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
-                                          BasicBlock *Body) {
-  Module *M = Entry->getParent()->getParent();
-  Type *Int64Ty = Type::getInt64Ty(M->getContext());
-
-  // Insert a PHI instruction at the beginning of the condition block
-  Instruction *IB = Cond->getFirstNonPHI();
-  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
-
-  ConstantInt *IConst =
-    ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
-  Instruction *CounterIncr =
-    BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
-                                            "cnt_incr", Body->getTerminator());
-
-  // Set incoming values for Phi node
-  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
-  CounterPhi->addIncoming(IConst, Entry);
-  CounterPhi->addIncoming(CounterIncr, Body);
-
-  // Return the pointer to the created PHI node in the corresponding argument
-  return CounterPhi;
-}
-
-/* Add Loop around the instruction I
- * Algorithm:
- * (1) Split the basic block of instruction I into three parts, where the
- * middleblock/body would contain instruction I.
- * (2) Add phi node before instruction I. Add incoming edge to phi node from
- * predecessor
- * (3) Add increment and compare instruction to index variable
- * (4) Replace terminator/branch instruction of body with conditional branch
- * which loops over bidy if true and goes to end if false
- * (5) Update phi node of body
- */
-void CGT_X86::addDoWhileLoop(Instruction* From, Instruction* To, Value* TerminationCond) {
-  BasicBlock* Entry = From->getParent();
-  BasicBlock* ForBody = Entry->splitBasicBlock(From, "for.body");
-
-  // To Instruction should also belong to the same basic block as the From basic
-  // block will have a terminator instruction
-  assert(To->getParent() == ForBody
-         && "To Instruction should also belong to the same basic block!");
-  BasicBlock* ForEnd = ForBody->splitBasicBlock(To, "for.end");
-
-  // Replace the terminator instruction of for.body with new conditional
-  // branch which loops over body if true and branches to for.end otherwise
-  BranchInst* BI = BranchInst::Create(ForEnd, ForBody, TerminationCond);
-  ReplaceInstWithInst(ForBody->getTerminator(), BI);
-
-}
-
-/* Add Loop around the instruction I
- * Algorithm:
- * (1) Split the basic block of instruction I into three parts, where the
- * middleblock/body would contain instruction I.
- * (2) Add phi node before instruction I. Add incoming edge to phi node from
- * predecessor
- * (3) Add increment and compare instruction to index variable
- * (4) Replace terminator/branch instruction of body with conditional branch
- * which loops over bidy if true and goes to end if false
- * (5) Update phi node of body
- */
-Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
-  BasicBlock* Entry = I->getParent();
-  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");
-
-  BasicBlock::iterator i(I);
-  ++i;
-  Instruction* NextI = &*i;
-  // Next Instruction should also belong to the same basic block as the basic
-  // block will have a terminator instruction
-  assert(NextI->getParent() == ForBody
-         && "Next Instruction should also belong to the same basic block!");
-  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
-
-
-  // Add Phi Node for index variable
-  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
-                                      2, "index."+indexName, I);
-
-  // Add incoming edge to phi
-  IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
-                        Entry);
-  // Increment index variable
-  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
-                             IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
-                             "index."+indexName+".inc", ForBody->getTerminator());
-
-  // Compare index variable with limit
-  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
-                                  limit, "cond."+indexName, ForBody->getTerminator());
-
-  // Replace the terminator instruction of for.body with new conditional
-  // branch which loops over body if true and branches to for.end otherwise
-  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
-  ReplaceInstWithInst(ForBody->getTerminator(), BI);
-
-  // Add incoming edge to phi node in body
-  IndexPhi->addIncoming(IndexInc, ForBody);
-  return IndexPhi;
-}
-
-// Returns a packed struct type. The structtype is created by packing the input
-// types, output types and isLastInput buffer type. All the streaming
-// inputs/outputs are converted to i8*, since this is the type of buffer
-// handles.
-StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
-  std::vector<Type*> TyList;
-  // Input types
-  Function* CF = C->getFuncPointer();
-  for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
-      ai != ae; ++ai) {
-    if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
-      TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
-    else 
-      TyList.push_back(ai->getType());
-  }
-  // Output Types
-  StructType* OutStructTy = cast<StructType>(CF->getReturnType());
-  for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
-    // All outputs of a node are streaming edge
-    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() 
-        && "All output edges of child node have to be streaming");
-    TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
-  }
-  // isLastInput buffer element
-  TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
-
-  StructType* STy = StructType::create(CF->getContext(), TyList,
-                        Twine("struct.thread."+CF->getName()).str(), true);
-  return STy;
-
-}
-
-void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
-                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
-                              Instruction* IB) {
-  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
-  // Create a filter/pipeline function for the child node
-  Function* C_Pipeline = createFunctionFilter(C);
-  Function* CF = C->getFuncPointer();
-
-  // Get module context and i32 0 constant, as they would be frequently used in
-  // this function.
-  LLVMContext& Ctx = IB->getParent()->getContext();
-  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
-
-  // Marshall arguments
-  // Create a packed struct type with inputs of C followed by outputs and then
-  // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
-  // by i8*
-  //
-  StructType* STy = getArgumentListStructTy(C);
-  // Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
-  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
-                                  C->getFuncPointer()->getName()+".inputs", IB);
-  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
-  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
-  // Insert elements in the struct
-  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
-  // Marshall Inputs
-  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
-    // Create constant int (i)
-    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
-    // Get Element pointer instruction
-    Value* GEPIndices[] = { IntZero, Int_i };
-    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                             ArrayRef<Value*>(GEPIndices, 2),
-                             Struct->getName()+".arg_"+Twine(i),
-                             IB);
-    DFEdge* E = C->getInDFEdgeAt(i);
-    if (E->getSourceDF()->isEntryNode()) {
-      // This is a Bind Input Edge
-      if(E->isStreamingEdge()) {
-        // Streaming Bind Input edge. Get buffer corresponding to it
-        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
-        new StoreInst(EdgeBufferMap[E], GEP, IB);
-      }
-      else {
-        // Non-streaming Bind edge
-        new StoreInst(Args[i], GEP, IB);
-      }
-    }
-    else {
-      // This is an edge between siblings. 
-      // This must be an streaming edge. As it is our assumption that all edges
-      // between two nodes in a DFG are streaming.
-      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
-      new StoreInst(EdgeBufferMap[E], GEP, IB);
-    }
-  }
-  unsigned numInputs = CF->getFunctionType()->getNumParams();
-  unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
-  // Marshall Outputs
-  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
-  for(unsigned i = 0; i < numOutputs; i++ ) {
-    // Create constant int (i+numInputs)
-    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
-    // Get Element pointer instruction
-    Value* GEPIndices[] = { IntZero, Int_i };
-    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                             ArrayRef<Value*>(GEPIndices, 2),
-                             Struct->getName()+".out_"+Twine(i),
-                             IB);
-    DFEdge* E = C->getOutDFEdgeAt(i);
-    assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes");
-    assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!");
-    new StoreInst(EdgeBufferMap[E], GEP, IB);
-  }
-  // Marshall last argument. isLastInput buffer
-  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
-  // Create constant int (i+numInputs)
-  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
-  // Get Element pointer instruction
-  Value* GEPIndices[] = { IntZero, Int_index };
-  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                           ArrayRef<Value*>(GEPIndices, 2),
-                           Struct->getName()+".isLastInput", IB);
-  new StoreInst(isLastInputBuffer, GEP, IB);
-
-  // AllocaInst AI points to memory with all the arguments packed
-  // Call runtime to create the thread with these arguments
-  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
-  DEBUG(errs() << *llvm_visc_createThread << "\n");
-  DEBUG(errs() << *graphID->getType() << "\n");
-  DEBUG(errs() << *C_Pipeline->getType() << "\n");
-  DEBUG(errs() << *Struct->getType() << "\n");
-  // Bitcast AI to i8*
-  CastInst* BI  = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
-  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
-  CallInst* CreateThread = CallInst::Create(llvm_visc_createThread,
-                                            ArrayRef<Value*>(CreateThreadArgs, 3),
-                                            "",
-                                            IB);
-
-}
-
-Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
-  DEBUG(errs() << "Generating Streaming Launch Function\n");
-  // Get Function associated with Node N
-  Function* NF = N->getFuncPointer();
-
-  // Map from Streaming edge to buffer 
-  DenseMap<DFEdge*, Value*> EdgeBufferMap;
-
-  /* Now we have all the necessary global declarations necessary to generate the
-  * Launch function, pointer to which can be passed to pthread utils to execute
-  * DFG. The Launch function has just one input: i8* data.addr
-  * This is the address of the all the input data that needs to be passed to
-  * this function. In our case it contains the input arguments of the Root
-  * function in the correct order.
-  * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
-  * (2) Extract each of inputs from data.addr
-  * (3) create Buffers for all the streaming edges
-  *     - Put buffers in the context
-  * (4) Go over each child node
-  *     - marshall its arguments together (use buffers in place of streaming
-  *       arguments)
-  *     - Start the threads
-  * (5) The return value from Root is stored in memory, pointer to which is
-  * passed to pthread_exit call.
-  */
-  // (1) Create Launch Function of type void (i8* args, i8* GraphID)
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
-  FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()),
-                                  ArrayRef<Type*>(ArgTypes, 2), false);
-  Function* LaunchFunc = Function::Create(LaunchFuncTy,
-                                       NF->getLinkage(),
-                                       NF->getName()+".LaunchFunction",
-                                       &M);
-  DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
-  // Give a name to the argument which is used pass data to this thread
-  Argument* data = &*LaunchFunc->arg_begin();
-  Argument* graphID = &*(++LaunchFunc->arg_begin());
-  data->setName("data.addr");
-  graphID->setName("graphID");
-  // Add a basic block to this empty function and a return null statement to it
-  DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
-  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
-  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
-                                      BB);
-
-  DEBUG(errs() << "Created Empty Launch Function\n");
-
-  // (2) Extract each of inputs from data.addr
-  std::vector<Type*> TyList;
-  std::vector<std::string> names;
-  std::vector<Value*> Args;
-
-  for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
-      ai != ae; ++ai) {
-    if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) {
-      TyList.push_back(i8Ty->getPointerTo());
-      names.push_back(Twine(ai->getName()+"_buffer").str());
-      continue;
-    }
-    TyList.push_back(ai->getType());
-    names.push_back(ai->getName());
-  }
-  Args = extractElements(data, TyList, names, RI);
-  DEBUG(errs() <<  "Launch function for " << NF->getName() << *LaunchFunc << "\n");
-  // (3) Create buffers for all the streaming edges
-  for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
-      de = N->getChildGraph()->dfedge_end(); di != de; ++di) {
-    DFEdge* Edge = *di;
-    DEBUG(errs() << *Edge->getType() << "\n");
-    Value* size = ConstantExpr::getSizeOf(Edge->getType());
-    Value* CallArgs[] = {graphID, size};
-    if (Edge->isStreamingEdge()) {
-      CallInst* CI;
-      // Create a buffer call
-      if(Edge->getSourceDF()->isEntryNode()) {
-        // Bind Input Edge
-        Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()),
-                                  Edge->getSourcePosition());
-        Value* BindInCallArgs[] = {graphID, size, Int_ArgNo};
-        CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3),
-                              "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(),
-                              RI);
-      }
-      else if(Edge->getDestDF()->isExitNode()) {
-        // Bind Output Edge
-        CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2),
-                              "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(),
-                              RI);
-      }
-      else {
-        // Streaming Edge
-        CI = CallInst::Create(llvm_visc_createEdgeBuffer,
-                              ArrayRef<Value*>(CallArgs, 2),
-                              Edge->getSourceDF()->getFuncPointer()->getName()+"."
-                              +Edge->getDestDF()->getFuncPointer()->getName(),
-                              RI);
-      }
-      EdgeBufferMap[Edge] = CI;
-    }
-  }
-  // Create buffer for isLastInput for all the child nodes
-  DFGraph* G = N->getChildGraph();
-  DenseMap<DFNode*, Value*> NodeLastInputMap;
-  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
-    DFNode* child = *ci;
-    if(child->isDummyNode())
-      continue;
-    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
-    Value* CallArgs[] = {graphID, size};
-    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
-                              "BindIn.isLastInput."+child->getFuncPointer()->getName(),
-                              RI);
-    NodeLastInputMap[child] = CI;
-  }
-  DEBUG(errs() <<  "Start Each child node filter\n");
-  // (4) Marshall arguments for each child node and start the thread with its
-  //     pipeline funtion
-  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-    DFNode* C = *ci;
-    // Skip dummy node call
-    if (C->isDummyNode())
-      continue;
-    
-    // Marshall all the arguments for this node into an i8*
-    // Pass to the runtime to create the thread
-    // Start the thread for child node C
-    startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI);
-  }
-
-  DEBUG(errs() << "Launch function:\n");
-  DEBUG(errs() << *LaunchFunc << "\n");
-
-  return LaunchFunc;
-}
-
-
-Function* CGT_X86::createPushFunction(DFInternalNode* N) {
-  DEBUG(errs() << "Generating Push function\n");
-  Function* PushFunc;
-  return PushFunc;
-}
-
-Function* CGT_X86::createPopFunction(DFInternalNode* N) {
-  DEBUG(errs() << "Generating Pop function\n");
-  Function* PushFunc;
-  return PushFunc;
-}
-
-Function* CGT_X86::createWaitFunction(DFInternalNode* N) {
-  DEBUG(errs() << "Generating Wait function\n");
-  Function* PushFunc;
-  return PushFunc;
-}
-/* This fuction does the steps necessary to launch a streaming graph
- * Steps
- * Create Pipeline/Filter function for each node in child graph of Root
- * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait
- * Modify each of the instrinsic in host code
- * Launch, Push, Pop, Wait
- */
-void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
-  IntrinsicInst* LI = Root->getInstruction();
-  Function* RootLaunch = createLaunchFunction(Root);
-  //Function* RootPush = createPushFunction(Root);
-  //Function* RootPop = createPopFunction(Root);
-  //Function* RootWait = createWaitFunction(Root);
-  // Substitute launch intrinsic main
-  DEBUG(errs() <<  "Substitute launch intrinsic\n");
-  Value* LaunchInstArgs[] = {RootLaunch,
-                             LI->getArgOperand(1)
-                            };
-  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
-                                          ArrayRef<Value*>(LaunchInstArgs,2),
-                                          "graph"+Root->getFuncPointer()->getName(), LI);
-  //ReplaceInstWithInst(LI, LaunchInst);
-
-  DEBUG(errs() << *LaunchInst << "\n");
-  // Replace all wait instructions with x86 specific wait instructions
-  DEBUG(errs() <<  "Substitute wait, push, pop intrinsics\n");
-  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
-  for(unsigned i=0; i < UseList->size(); ++i) {
-    IntrinsicInst* II = UseList->at(i);
-    CallInst* CI;
-    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
-    switch(II->getIntrinsicID()) {
-    case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_streamWait,
-                            ArrayRef<Value*>(LaunchInst),
-                            "");
-      break;
-    case Intrinsic::visc_push:
-      CI = CallInst::Create(llvm_visc_streamPush,
-                            ArrayRef<Value*>(PushArgs, 2),
-                            "");
-      break;
-    case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_streamPop,
-                            ArrayRef<Value*>(LaunchInst),
-                            "");
-      break;
-    default:
-      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
-    };
-    DEBUG(errs() << "Replace:\n\t" << *II << "\n");
-    ReplaceInstWithInst(II, CI);
-    DEBUG(errs() << "\twith " << *CI << "\n");
-  }
-
-
-}
-
-void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
-  // TODO: Place an assert to check if the constant passed by launch intrinsic
-  // as the number of arguments to DFG is same as the number of arguments of the
-  // root of DFG
-  DEBUG(errs() << "Generating Launch Function\n");
-  // Get Launch Instruction
-  IntrinsicInst* LI = Root->getInstruction();
-  switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
-  DEBUG(errs() << "Generating Launch Function\n");
-
-  /* Now we have all the necessary global declarations necessary to generate the
-  * Launch function, pointer to which can be passed to pthread utils to execute
-  * DFG. The Launch function has just one input: i8* data.addr
-  * This is the address of the all the input data that needs to be passed to
-  * this function. In our case it contains the input arguments of the Root
-  * function in the correct order.
-  * (1) Create an empty Launch function of type i8*(i8*)
-  * (2) Extract each of inputs from data.addr and pass them as arguments to the
-  * call to Root function
-  * (3) The return value from Root is stored in memory, pointer to which is
-  * passed to pthread_exit call.
-  */
-  // Create Launch Function of type i8*(i8*) which calls the root function
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
-                            ArrayRef<Type*>(i8Ty->getPointerTo()),
-                            false);
-  Function* AppFunc = Function::Create(AppFuncTy,
-                                       Root->getFuncPointer()->getLinkage(),
-                                       "LaunchDataflowGraph",
-                                       &M);
-  DEBUG(errs() << "Generating Launch Function\n");
-  // Give a name to the argument which is used pass data to this thread
-  Value* data = &*AppFunc->arg_begin();
-  data->setName("data.addr");
-  // Add a basic block to this empty function and a return null statement to it
-  BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
-  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
-                                      Constant::getNullValue(AppFunc->getReturnType()),
-                                      BB);
-  switchToTimer(visc_TimerID_ARG_UNPACK, RI);
-
-  DEBUG(errs() << "Created Empty Launch Function\n");
-  // Find the X86 function generated for Root and
-//  Function* RootF_X86 = Root->getGenFunc();
-  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
-  assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
-  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
-         "Error: Generated Function for Root node with no x86 wrapper\n");
-
-  // Generate a call to RootF_X86 with null parameters for now
-  std::vector<Value*>Args;
-  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
-    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
-  }
-  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);
-
-  // Extract input data from i8* data.addr and patch them to correct argument of
-  // call to RootF_X86. For each argument
-  std::vector<Type*> TyList;
-  std::vector<std::string> names;
-  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
-      ai != ae; ++ai) {
-    TyList.push_back(ai->getType());
-    names.push_back(ai->getName());
-  }
-  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
-  // Patch the elements to the call arguments
-  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
-    CI->setArgOperand(i, elements[i]);
-
-  // Add timers around Call to RootF_X86 function
-  switchToTimer(visc_TimerID_COMPUTATION, CI);
-  switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
-
-  // Code for returning the output
-  CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
-                             CI->getType()->getPointerTo(),
-                             CI->getName()+".addr",
-                             RI);
-  new StoreInst(CI, OutputAddrCast, RI);
-  switchToTimer(visc_TimerID_NONE, RI);
-
-  DEBUG(errs() << "Application specific function:\n");
-  DEBUG(errs() << *AppFunc << "\n");
-
-  // Substitute launch intrinsic main
-  Value* LaunchInstArgs[] = {AppFunc,
-                             LI->getArgOperand(1)
-                            };
-  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
-                                          ArrayRef<Value*>(LaunchInstArgs,2),
-                                          "graph"+Root->getFuncPointer()->getName(), LI);
-  //ReplaceInstWithInst(LI, LaunchInst);
-
-  DEBUG(errs() << *LaunchInst << "\n");
-  // Replace all wait instructions with x86 specific wait instructions
-  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
-  for(unsigned i=0; i < UseList->size(); ++i) {
-    IntrinsicInst* II = UseList->at(i);
-    CallInst* CI;
-    switch(II->getIntrinsicID()) {
-    case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_x86_wait,
-                            ArrayRef<Value*>(LaunchInst),
-                            "");
-      break;
-    case Intrinsic::visc_push:
-      CI = CallInst::Create(llvm_visc_bufferPush,
-                            ArrayRef<Value*>(LaunchInst),
-                            "");
-      break;
-    case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_bufferPop,
-                            ArrayRef<Value*>(LaunchInst),
-                            "");
-      break;
-    default:
-      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
-    };
-    ReplaceInstWithInst(II, CI);
-    DEBUG(errs() << *CI << "\n");
-  }
-
-}
-
-Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
-  // TODO: Assumption is that each input port of a node has just one
-  // incoming edge. May change later on.
-
-  // Find the incoming edge at the requested input port
-  DFEdge* E = Child->getInDFEdgeAt(i);
-  assert(E && "No incoming edge or binding for input element!");
-  // Find the Source DFNode associated with the incoming edge
-  DFNode* SrcDF = E->getSourceDF();
-
-  // If Source DFNode is a dummyNode, edge is from parent. Get the
-  // argument from argument list of this internal node
-  Value* inputVal;
-  if(SrcDF->isEntryNode()) {
-    inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
-    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-  }
-  else {
-    // edge is from a sibling
-    // Check - code should already be generated for this source dfnode
-    assert(OutputMap.count(SrcDF)
-           && "Source node call not found. Dependency violation!");
-
-    // Find CallInst associated with the Source DFNode using OutputMap
-    Value* CI = OutputMap[SrcDF];
-
-    // Extract element at source position from this call instruction
-    std::vector<unsigned> IndexList;
-    IndexList.push_back(E->getSourcePosition());
-    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                           "", InsertBefore);
-    inputVal = EI;
-  }
-  return inputVal;
-}
-
-void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
-                              ValueToValueMapTy &VMap,Instruction* IB) {
-  Function* CF = C->getFuncPointer();
-
-//  Function* CF_X86 = C->getGenFunc();
-  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
-  assert(CF_X86 != NULL
-         && "Found leaf node for which code generation has not happened yet!\n");
-  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
-         "The generated function to be called from x86 backend is not an x86 function\n");
-  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
-
-  std::vector<Value*> Args;
-  // Create argument list to pass to call instruction
-  // First find the correct values using the edges
-  // The remaing six values are inserted as constants for now.
-  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
-    Args.push_back(getInValueAt(C, i, F_X86, IB));
-  }
-
-  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
-  for(unsigned j=0; j<6; j++)
-    Args.push_back(I64Zero);
-
-  errs() << "Gen Function type: " << *CF_X86->getType() << "\n";
-  errs() << "Node Function type: " << *CF->getType() << "\n";
-  errs() << "Arguments: " << Args.size() << "\n";
-
-  // Call the F_X86 function associated with this node
-  CallInst* CI = CallInst::Create(CF_X86, Args,
-                                  CF_X86->getName()+"_output",
-                                  IB);
-  DEBUG(errs() << *CI << "\n");
-  OutputMap[C] = CI;
-
-  // Find num of dimensions this node is replicated in.
-  // Based on number of dimensions, insert loop instructions
-  std::string varNames[3] = {"x", "y", "z"};
-  unsigned numArgs = CI->getNumArgOperands();
-  for(unsigned j=0; j < C->getNumOfDim(); j++) {
-    Value* indexLimit = NULL;
-    // Limit can either be a constant or an arguement of the internal node.
-    // In case of constant we can use that constant value directly in the
-    // new F_X86 function. In case of an argument, we need to get the mapped
-    // value using VMap
-    if(isa<Constant>(C->getDimLimits()[j])) {
-      indexLimit = C->getDimLimits()[j];
-      DEBUG(errs() << "In Constant case:\n"
-             << "  indexLimit type = " << *indexLimit->getType() << "\n");
-    }
-    else {
-      indexLimit = VMap[C->getDimLimits()[j]];
-      DEBUG(errs() << "In VMap case:"
-             <<"  indexLimit type = " << *indexLimit->getType() << "\n");
-    }
-    assert(indexLimit && "Invalid dimension limit!");
-    // Insert loop
-    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
-    DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
-    // Insert index variable and limit arguments
-    CI->setArgOperand(numArgs-6+j, indexVar);
-    CI->setArgOperand(numArgs-3+j, indexLimit);
-  }
-  // Insert call to runtime to push the dim limits and instanceID on the depth
-  // stack
-  Value* args[] = {
-    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
-    CI->getArgOperand(numArgs-3+0), // limitX
-    CI->getArgOperand(numArgs-6+0), // iX
-    CI->getArgOperand(numArgs-3+1), // limitY
-    CI->getArgOperand(numArgs-6+1), // iY
-    CI->getArgOperand(numArgs-3+2), // limitZ
-    CI->getArgOperand(numArgs-6+2)  // iZ
-  };
-
-  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
-  DEBUG(errs() << "Push on stack: " << *Push << "\n");
-  // Insert call to runtime to pop the dim limits and instanceID from the depth
-  // stack
-  BasicBlock::iterator i(CI);
-  ++i;
-  Instruction* NextI = &*i;
-  // Next Instruction should also belong to the same basic block as the basic
-  // block will have a terminator instruction
-  assert(NextI->getParent() == CI->getParent()
-         && "Next Instruction should also belong to the same basic block!");
-
-  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
-  DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
-  DEBUG(errs() << *CI->getParent()->getParent());
-}
-
-/* This function takes a DFNode, and creates a filter function for it. By filter
- * function we mean a function which keeps on getting input from input buffers,
- * applying the function on the inputs and then pushes data on output buffers
- */
-// Create a function with void* (void*) type.
-// Create a new basic block
-// Add a return instruction to the basic block
-// extract arguments from the aggregate data input. Type list would be
-// Replace the streaming inputs with i8* types signifying handle to
-// corresponding buffers
-// Add a boolean argument isLastInput
-// Add runtime API calls to get input for each of the streaming inputs
-// Add a call to the generated function of the child node
-// Add runtime API calls to push output for each of the streaming outputs
-// Add loop around the basic block, which exits the loop if isLastInput is false
-
-Function* CGT_X86::createFunctionFilter(DFNode* C) {
-  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");
-
-  /* Create a function with same argument list as child.*/
-  DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
-  // Get the generated function for child node
-  Function* CF = C->getFuncPointer();
-  // Create Filter Function of type i8*(i8*) which calls the root function
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
-                                ArrayRef<Type*>(i8Ty->getPointerTo()),
-                                false);
-  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
-                          CF->getLinkage(),
-                          CF->getName()+"_Pipeline",
-                          &M);
-  DEBUG(errs() << "Generating Pipline Function\n");
-  // Give a name to the argument which is used pass data to this thread
-  Value* data = &*CF_Pipeline->arg_begin();
-  data->setName("data.addr");
-  // Create a new basic block
-  DEBUG(errs() << "\tCreate new BB and add a return function\n");
-  // Add a basic block to this empty function
-  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
-  // Add a return instruction to the basic block
-  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
-                                      UndefValue::get(CF_Pipeline->getReturnType()), BB);
-
-
-  /* Extract the elements from the aggregate argument to the function.
-   * Replace the streaming inputs with i8* types signifying handle to
-   * corresponding buffers
-   * Add outputs to the list as well
-   * Add isLastInput to the list
-   */
-  DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
-  // These Args will be used when passing arguments to the generated function
-  // inside loop, and reading outputs as well.
-  std::vector<Value*> Args;
-  std::vector<Type*> TyList;
-  std::vector<std::string> names;
-  // Adding inputs
-  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
-       i != e; ++i) {
-    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
-      TyList.push_back(i8Ty->getPointerTo());
-      names.push_back((Twine(i->getName())+"_buffer").str());
-    }
-    else {
-      TyList.push_back(i->getType());
-      names.push_back(i->getName());
-    }
-  }
-  // Adding outputs. FIXME: Since we assume all outputs to be streaming edges,
-  // because we get there buffer handles
-  StructType* RetTy = cast<StructType>(CF->getReturnType());
-  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
-    TyList.push_back(i8Ty->getPointerTo());
-    names.push_back("out");
-  }
-  /* Add a boolean argument isLastInput */
-  DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n");
-  TyList.push_back(i8Ty->getPointerTo());
-  names.push_back("isLastInput_buffer");
-
-  // Extract the inputs, outputs and
-  Args = extractElements(data, TyList, names, RI);
-  for(unsigned i=0; i<Args.size(); i++) {
-    DEBUG(errs() << *Args[i] << "\n");
-  }
-
-  // Split the Args vector into, input output and isLastInput
-  unsigned numInputs = CF->getFunctionType()->getNumParams();
-  unsigned numOutputs = RetTy->getNumElements();
-  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
-  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
-  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
-
-  /* Add runtime API calls to get input for each of the streaming input edges */
-  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
-  // First read the termination condition variable islastInput
-  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
-                                        ArrayRef<Value*>(isLastInput),
-                                        "",
-                                        RI);
-
-  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
-                 Type::getInt64Ty(CF_Pipeline->getContext()),
-                 false,
-                 "isLastInput",
-                 RI);
-  isLastInput = BI;
-  // Create a loop termination condition
-  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
-      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
-      RI);
-
-  // Get input from buffers of all the incoming streaming edges
-  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
-       i != e; ++i) {
-    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
-      CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop,
-                                            ArrayRef<Value*>(InputArgs[i->getArgNo()]),
-                                            "",
-                                            RI);
-      CastInst* BI;
-      if(i->getType()->isPointerTy()) {
-        BI = CastInst::Create(CastInst::IntToPtr,
-                              bufferIn,
-                              i->getType(),
-                              i->getName()+".addr",
-                              RI);
-      }
-      else if(i->getType()->isFloatTy()) {
-        BI = CastInst::CreateFPCast(bufferIn,
-                                    i->getType(),
-                                    i->getName()+".addr",
-                                    RI);
-      }
-      else {
-        BI = CastInst::CreateIntegerCast(bufferIn,
-                                         i->getType(),
-                                         false,
-                                         i->getName()+".addr",
-                                         RI);
-      }
-      // Replace the argument in Args vector. We would be using the vector as
-      // parameters passed to the call
-      InputArgs[i->getArgNo()] = BI;
-    }
-  }
-  /* Add a call to the generated function of the child node */
-  DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
-//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
-//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
-//                                  C->getGenFunc()->getName()+".output", RI);
-  Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
-  DEBUG(errs() << "Type: "
-               << *CGenF->getType()
-               << "\n");
-  CallInst* CI = CallInst::Create(CGenF,
-                                  InputArgs,
-                                  CGenF->getName()+".output",
-                                  RI);
-
-  /* Add runtime API calls to push output for each of the streaming outputs */
-  // FIXME: Assumption
-  // All edges between siblings are streaming edges
-  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n");
-  for (unsigned i=0; i< numOutputs; i++) {
-    // Extract output
-    ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i),
-                           "",RI);
-    // Convert to i64
-    CastInst* BI;
-    if(EI->getType()->isPointerTy())
-      BI = CastInst::Create(CastInst::PtrToInt,EI,
-                            Type::getInt64Ty(CF_Pipeline->getContext()),
-                            "",
-                            RI);
-    else
-      BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()),
-                                       false, "", RI);
-    // Push to Output buffer
-    Value* bufferOutArgs[] = {OutputArgs[i], BI};
-    CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush,
-                                           ArrayRef<Value*>(bufferOutArgs, 2),
-                                           "",
-                                           RI);
-  }
-
-  // Add loop around the basic block, which exits the loop if isLastInput is false
-  //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond);
-//  addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(),
-//                RI, Cond);
-
-  // Add loop around the basic block, which exits the loop if isLastInput is false
-  // Pointers to keep the created loop structure
-  BasicBlock *EntryBB, *CondBB, *BodyBB;
-  Instruction *CondStartI = cast<Instruction>(isLastInputPop);
-  Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
-  EntryBB = CondStartI->getParent();
-
-  addWhileLoop(CondStartI, BodyStartI, RI, Cond);
-  CondBB = CondStartI->getParent();
-  BodyBB = CI->getParent();
-  Instruction *CntI = NULL;
-  CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF);
-
-  // If the node function calls the visc runtime call to get policy, we update
-  // it with the counter information. This means we need to pass an additional
-  // argument to the generated function, that is the iteration number, and then
-  // use it as an argument to the policy_getVersion call 
-  if (GetPolicyCI) {
-    CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB);
-    assert(CntI && "Counter instruction not found\n");
-
-    // Create new function type (with additional argument for iteration number)
-    Type *NewRetTy = CGenF->getFunctionType()->getReturnType();
-    std::vector<Type*> NewArgTypes;
-    for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end();
-         ai != ae ; ++ai) {
-      NewArgTypes.push_back(ai->getType());
-    }
-    NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
-    FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false);
-    Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false);
-    // At least one (the last) argument exists (we added it)
-    Function::arg_iterator ae = NewCGenF->arg_end();
-    --ae;
-    Argument *CntArg = &*ae;
-    CntArg->setName("iteration");
-    // Replace the old cpu gen func with this one
-    C->addGenFunc(NewCGenF, visc::CPU_TARGET, true);
-
-    // Add counter to the actual parameter list, to create the new call
-    InputArgs.push_back(CntI);
-    CallInst* newCI = CallInst::Create(NewCGenF,
-                                       InputArgs,
-                                       NewCGenF->getName()+".output");
-    ReplaceInstWithInst(CI, newCI);
-
-    // Set second operand of the policy_getVersion call to the last function
-    // argument
-    GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF);
-    GetPolicyCI->setArgOperand(1, CntArg);
-  }
-
-  // Return the Function pointer
-  DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
-  DEBUG(errs() << *CF_Pipeline << "\n");
-  return CF_Pipeline;
-}
-
-void CGT_X86::codeGen(DFInternalNode* N) {
-  // Check if N is root node and its graph is streaming. We do not do codeGen
-  // for Root in such a case
-  if(N->isRoot() && N->isChildGraphStreaming())
-    return;
-
-  // Check if clone already exists. If it does, it means we have visited this
-  // function before and nothing else needs to be done for this leaf node.
-//  if(N->getGenFunc() != NULL)
-//    return;
-  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
-    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
-              " : skipping it\n";
-    return;
-  }
-
-  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
-         "Error: Visiting a node for which code already generated\n");
-
-  // Sort children in topological order before code generation
-  N->getChildGraph()->sortChildren();
-
-  // Only process if all children have a CPU x86 function
-  // Otherwise skip to end
-  bool codeGen = true;
-  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-    DFNode* C = *ci;
-    // Skip dummy node call
-    if (C->isDummyNode())
-      continue;
-
-    if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
-      errs() << "No CPU x86 version for child node "
-             << C->getFuncPointer()->getName()
-             << "\n  Skip code gen for parent node "
-             << N->getFuncPointer()->getName() << "\n";
-      codeGen = false;
-    }
-  }
-
-  if (codeGen) {
-    Function* F = N->getFuncPointer();
-    // Create of clone of F with no instructions. Only the type is the same as F
-    // without the extra arguments.
-    Function* F_X86;
-  
-    // Clone the function, if we are seeing this function for the first time. We
-    // only need a clone in terms of type.
-    ValueToValueMapTy VMap;
-  
-    // Create new function with the same type
-    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
-
-    // Loop over the arguments, copying the names of arguments over.
-    Function::arg_iterator dest_iterator = F_X86->arg_begin();
-    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
-         i != e; ++i) {
-      dest_iterator->setName(i->getName()); // Copy the name over...
-      // Increment dest iterator
-      ++dest_iterator;
-    }
-
-    // Add a basic block to this empty function
-    BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
-    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
-                                        UndefValue::get(F_X86->getReturnType()), BB);
-
-    // Add Index and Dim arguments except for the root node and the child graph of
-    // parent node is not streaming
-    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
-      F_X86 = addIdxDimArgs(F_X86);
-
-    BB = &*F_X86->begin();
-    RI = cast<ReturnInst>(BB->getTerminator());
-  
-    //Add generated function info to DFNode
-//    N->setGenFunc(F_X86, visc::CPU_TARGET);
-    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
-
-    // Loop over the arguments, to create the VMap.
-    dest_iterator = F_X86->arg_begin();
-    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
-         i != e; ++i) {
-      // Add mapping and increment dest iterator
-      VMap[&*i] = &*dest_iterator;
-      ++dest_iterator;
-    }
-
-    // Iterate over children in topological order
-    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-      DFNode* C = *ci;
-      // Skip dummy node call
-      if (C->isDummyNode())
-        continue;
-  
-      // Create calls to CPU function of child node
-      invokeChild_X86(C, F_X86, VMap, RI);
-  
-    }
- 
-    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
-    // Generate code for output bindings
-    // Get Exit node
-    DFNode* C = N->getChildGraph()->getExit();
-    // Get OutputType of this node
-    StructType* OutTy = N->getOutputType();
-    Value *retVal = UndefValue::get(F_X86->getReturnType());
-    // Find all the input edges to exit node
-    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
-      DEBUG(errs() << "Output Edge " << i << "\n");
-      // Find the incoming edge at the requested input port
-      DFEdge* E = C->getInDFEdgeAt(i);
-  
-      assert(E && "No Binding for output element!");
-      // Find the Source DFNode associated with the incoming edge
-      DFNode* SrcDF = E->getSourceDF();
-  
-      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
-  
-      // If Source DFNode is a dummyNode, edge is from parent. Get the
-      // argument from argument list of this internal node
-      Value* inputVal;
-      if(SrcDF->isEntryNode()) {
-        inputVal = getArgumentAt(F_X86, i);
-        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-      }
-      else {
-        // edge is from a internal node
-        // Check - code should already be generated for this source dfnode
-        assert(OutputMap.count(SrcDF)
-               && "Source node call not found. Dependency violation!");
-  
-        // Find Output Value associated with the Source DFNode using OutputMap
-        Value* CI = OutputMap[SrcDF];
-  
-        // Extract element at source position from this call instruction
-        std::vector<unsigned> IndexList;
-        IndexList.push_back(E->getSourcePosition());
-        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                               "",RI);
-        inputVal = EI;
-      }
-      std::vector<unsigned> IdxList;
-      IdxList.push_back(i);
-      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
-    }
-    DEBUG(errs() << "Extracted all\n");
-    retVal->setName("output");
-    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
-    ReplaceInstWithInst(RI, newRI);
-
-  }
-
-  //-------------------------------------------------------------------------//
-  // Here, we need to check if this node (N) has more than one versions
-  // If so, we query the policy and have a call to each version
-  // If not, we see which version exists, check that it is in fact an x86
-  // function and save it as the CPU_TARGET function
-
-  // TODO: visc_id per node, so we can use this for id for policies
-  // For now, use node function name and change it later
-  Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-  Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-  Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
-
-  bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-  bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-  bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
-
-  errs() << "Node: " << N->getFuncPointer()->getName()
-                     << " with tag " << N->getTag() << "\n";
-  errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
-  errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
-  errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
-  errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
-  errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
-  errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
-
-
-  if (N->getTag() == visc::None) {
-    // No code is available for this node. This (usually) means that this
-    // node is a node that
-    // - from the accelerator backends has been mapped to an intermediate
-    // node, and thus they have not produced a genFunc
-    // - a child node had no CPU hint, thus no code gen for CPU could 
-    // take place
-    errs() << "No GenFunc - Skipping CPU code generation for node "
-           << N->getFuncPointer()->getName() << "\n";
-  } else if (viscUtils::isSingleTargetTag(N->getTag())) {
-    // There is a single version for this node according to code gen hints.
-    // Therefore, we do not need to check the policy, we simply use the
-    // available implementation, whichever target it is for.
-
-    // Sanity check - to be removed TODO
-    switch (N->getTag()) {
-      case visc::CPU_TARGET:
-        assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
-        assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
-        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
-        break;
-      case visc::GPU_TARGET:
-        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
-        assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
-        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
-        break;
-      case visc::SPIR_TARGET:
-        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && "");
-        assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && "");
-        break;
-      default:
-        assert(false && "Unreachable: we checked that tag was single target!\n");
-        break;
-    }
-
-    // If device abstraction is enabled, then we may need to edit the node 
-    // function. In case this is a GPU or SPIR gen func, we issue a call to
-    // the runtime that waits for the device to be available
-    if (DeviceAbstraction) {
-      Function *NodeGenFunc = NULL;
-      switch (N->getTag()) {
-        case visc::GPU_TARGET:
-          NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
-          break;
-        case visc::SPIR_TARGET:
-          NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET);
-          break;
-        default:
-          break;
-      }
-
-      if (NodeGenFunc) {
-        // If we found a function to edit, we add the call to the runtime as
-        // its first statement
-        BasicBlock *BB = &*NodeGenFunc->begin();
-        std::vector<Value *> Args; // TODO: add the device type as argument?
-        Function *RTF =
-          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
-          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
-        CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI());
-      }
-
-    }
-
-    Function *Ftmp = N->getGenFuncForTarget(N->getTag());
-    N->removeGenFuncForTarget(visc::GPU_TARGET);
-    N->removeGenFuncForTarget(visc::SPIR_TARGET);
-    N->setTag(visc::None);
-    N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
-    N->setTag(visc::CPU_TARGET);
-
-    // Sanity checks - to be removed TODO
-    CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-    GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-    SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
-
-    CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-    GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-    SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
-
-    errs() << "After editing\n";
-    errs() << "Node: " << N->getFuncPointer()->getName()
-                       << " with tag " << N->getTag() << "\n";
-    errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n";
-    errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
-    errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
-    errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
-    errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
-    errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
-
-    //  assert(false && "got to the point where we have to select\n");
-  } else {
-    // We have more than one targets
-    
-    errs() << "Node Name (for policy) : "
-           << N->getFuncPointer()->getName() << "\n";
-
-    Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-    Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-    Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
-
-    bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-    bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-    bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
-
-    // These assertions express what we can support with the current runtime.
-    // Code generation works the same way even for other target combinations.
-    // For now, we want either CPU and GPU, or CPU and SPIR
-    assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n");
-    assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) &&
-           "Generated functions without appropriate x86 wrapper\n");
-
-    FunctionType *FT = CF->getFunctionType();
-    if (GF)
-      assert(FT == GF->getFunctionType() &&
-             "Type mismatch between generated functions for GPU and CPU targets.\n");
-    if (SF)
-      assert(FT == SF->getFunctionType() &&
-             "Type mismatch between generated functions for SPIR and CPU targets.\n");
-
-    // Code generation of wrapper function
-    Function *F_wrapper;
-    ValueToValueMapTy VMap;
-    F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M);
-
-    // Copy argument names over
-    Function::arg_iterator dest_iterator = F_wrapper->arg_begin();
-    for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
-         i != e; ++i) {
-      dest_iterator->setName(i->getName());
-      VMap[&*i] = &*dest_iterator;
-      ++dest_iterator;
-    }
-    // Gather all arguments of wrapper in a vector, to prepare the call to
-    // the individual gen functions
-    std::vector<Value *> GenFuncCallArgs;
-    for (Function::arg_iterator i = F_wrapper->arg_begin(), e = F_wrapper->arg_end();
-         i != e; ++i) {
-      GenFuncCallArgs.push_back(&*i);
-    }
-
-    BasicBlock *BBcurrent, *BBtrue, *BBfalse;
-
-    BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper);
-
-    StringRef FName = N->getFuncPointer()->getName();
-    size_t nameSize = FName.size()+1;
-    std::vector<Constant *> NameV;
-    for (char c: FName) {
-      NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c));
-    }
-    NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0'));
-    ArrayType *NameType =
-      ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize);
-    AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent);
-    Constant *NameConst = ConstantArray::get(NameType, NameV);
-    StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent);
-    CastInst *BI = BitCastInst::CreatePointerCast(AI,
-                     Type::getInt8PtrTy(M.getContext()), "", BBcurrent);
-    std::vector<Value *> Args;
-    Args.push_back(BI);
-    Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true));
-    Function *RTF =
-      cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion",
-      runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType()));
-    CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent);
-
-    ConstantInt *CmpConst =
-      ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true);
-    CmpInst *CmpI = CmpInst::Create(Instruction::ICmp,
-                                    CmpInst::ICMP_EQ,
-                                    RTFInst, CmpConst,
-                                    "", BBcurrent);
-
-    BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper);
-    BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper);
-    BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
-
-    CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue);
-    ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
-
-    // Switch basic block pointers
-    BBcurrent = BBfalse;
-    if (GF) {
-      // We have a GPU version. Generate policy check and call
-      CmpConst =
-         ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true);
-      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-                             RTFInst, CmpConst, "", BBcurrent);
-      BBtrue =  BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper);
-      BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper);
-      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
-      
-      GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue);
-      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
-
-      if (DeviceAbstraction) {
-        // Prepare arguments and function for call to wait for device runtime call
-        std::vector<Value *> Args; // TODO: add the device type as argument?
-        Function *RTF =
-          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
-          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
-        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
-      }
-    }
-
-    // Switch basic block pointers
-    BBcurrent = BBfalse;
-    if (SF) {
-      // We have a GPU version. Generate policy check and call
-      CmpConst =
-         ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
-      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-                             RTFInst, CmpConst, "", BBcurrent);
-      BBtrue =  BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
-      BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
-      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
-      
-      GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
-      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
-
-      if (DeviceAbstraction) {
-        // Prepare arguments and function for call to wait for device runtime call
-        std::vector<Value *> Args; // TODO: add the device type as argument?
-        Function *RTF =
-          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
-          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
-        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
-      }
-    }
-
-    RI = ReturnInst::Create(M.getContext(),
-                            UndefValue::get(FT->getReturnType()), BBfalse);
-
-    // Now, make the node cpu gen func to be this one
-    // Remove all other versions and update the tag
-    N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
-    N->removeGenFuncForTarget(visc::GPU_TARGET);
-    N->removeGenFuncForTarget(visc::SPIR_TARGET);
-    N->setTag(visc::CPU_TARGET);
-
-    // assert(false && "got to the point where we have to combine\n");
-  }
-
-}
-
-// Code generation for leaf nodes
-void CGT_X86::codeGen(DFLeafNode* N) {
-  // Skip code generation if it is a dummy node
-  if(N->isDummyNode()) {
-    DEBUG(errs() << "Skipping dummy node\n");
-    return;
-  }
-
-  // At this point, the X86 backend does not support code generation for
-  // the case where allocation node is used, so we skip. This means that a
-  // CPU version will not be created, and therefore code generation will
-  // only succeed if another backend (nvptx or spir) has been invoked to
-  // generate a node function for the node including the allocation node.
-  if (N->isAllocationNode()) {
-    DEBUG(errs() << "Skipping allocation node\n");
-    return;
-  }
-
-  // Check if clone already exists. If it does, it means we have visited this
-  // function before and nothing else needs to be done for this leaf node.
-//  if(N->getGenFunc() != NULL)
-//    return;
-
-  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
-    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
-              " : skipping it\n";
-
-    errs() << "Check for cudnn or promise hint for node "
-           << N->getFuncPointer()->getName() <<  "\n";
-
-    switch (N->getTag()) {
-       case visc::CUDNN_TARGET: {
-          errs() << "CUDNN hint found. Store CUDNN function as CPU funtion.\n";
-         // Make sure there is a generated x86 function for cudnn
-         assert(N->getGenFuncForTarget(visc::CUDNN_TARGET) && "");
-         assert(N->hasX86GenFuncForTarget(visc::CUDNN_TARGET) && "");
-         // Store the CUDNN x86 function as the CPU generated function
-         Function *Ftmp = N->getGenFuncForTarget(N->getTag());
-         // after adding the required number of arguments
-         if (!N->getParent()->isChildGraphStreaming())
-           Ftmp = addIdxDimArgs(Ftmp);
-
-         N->removeGenFuncForTarget(visc::CUDNN_TARGET);
-         N->setTag(visc::None);
-         N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
-         N->setTag(visc::CPU_TARGET);
-         break;
-         }
-       case visc::PROMISE_TARGET: {
-          errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n";
-         // Make sure there is a generated x86 function for promise
-         assert(N->getGenFuncForTarget(visc::PROMISE_TARGET) && "");
-         assert(N->hasX86GenFuncForTarget(visc::PROMISE_TARGET) && "");
-         // Store the PROMISE x86 function as the CPU generated function
-         Function *Ftmp = N->getGenFuncForTarget(N->getTag());
-         // after adding the required number of arguments
-         if (!N->getParent()->isChildGraphStreaming())
-           Ftmp = addIdxDimArgs(Ftmp);
-
-         N->setTag(visc::None);
-         N->removeGenFuncForTarget(visc::PROMISE_TARGET);
-         N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
-         N->setTag(visc::CPU_TARGET);
-         break;
-         }
-       case visc::GPU_TARGET:
-         // A leaf node should not have an x86 function for GPU
-         // by design of DFG2LLVM_NVPTX backend
-         assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-         break;
-       case visc::SPIR_TARGET:
-         // A leaf node should not have an x86 function for SPIR
-         // by design of DFG2LLVM_SPIR backend
-         assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
-         break;
-       default:
-         break;
-    }
-
-    return;
-  }
-
-  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
-         "Error: Visiting a node for which code already generated\n");
-
-  std::vector<IntrinsicInst *> IItoRemove;
-  std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
-  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
-
-  // Get the function associated woth the dataflow node
-  Function *F = N->getFuncPointer();
-
-  // Clone the function, if we are seeing this function for the first time.
-  Function *F_X86;
-  ValueToValueMapTy VMap;
-  F_X86 = CloneFunction(F, VMap);
-  F_X86->removeFromParent();
-  // Insert the cloned function into the module
-  M.getFunctionList().push_back(F_X86);
-
-  // Add the new argument to the argument list. Add arguments only if the cild
-  // graph of parent node is not streaming
-  if(!N->getParent()->isChildGraphStreaming())
-    F_X86 = addIdxDimArgs(F_X86);
-
-  // Add generated function info to DFNode
-//  N->setGenFunc(F_X86, visc::CPU_TARGET);
-  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
-
-  /*** FIXME: HACK FOR DSSOC DEMO -- BEGIN ***/
-  /* This part of the code is meant to handle turning the CPU backend into an
-   "accelerator" backend for ApproxHPVM. For this reason, the HPVM runtime
-   needs to be essentially deactivated.                                      */
-
-  /* We look into the leaf node's function for function call starting from
-   "tensor". These are functions with which we replaced the ApproxHPVM
-   intrinsics, and for which we have LLVM implementations. If found, it means
-   we are dealing with an AproxHPVM program.                                 */
-  bool isApproxHPVMnode = false;
-  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
-    Instruction *I = &(*i);
-    DEBUG(errs() << *I << "\n");
-
-    if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      if ((CI->getCalledFunction()->getName()).startswith("tensor")) {
-        isApproxHPVMnode = true;
-        break;
-      }
-    }
-  }
-
-  /*As in CUDNN backend, we remove the in out attributes of tensor operations,
-   aiming to deactivate the HPVM runtime calls. This has been tested through
-   CUDNN backend for the internal node codegen, and should ensure that code
-   does not insert llvm_visc_x86_argument_ptr in the generated function for
-   leaf node codegen as well.                                                */
-
-  /* Removing HPVM in/out/inout function attributes */
-  if (isApproxHPVMnode) {
-    for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); ai != ae; ai++) {
-      Argument *Arg = &*ai;
-      if(Arg->hasAttribute(Attribute::In))
-        Arg->removeAttr(Attribute::In);
-      if(Arg->hasAttribute(Attribute::Out))
-        Arg->removeAttr(Attribute::Out);
-      if(Arg->hasAttribute(Attribute::InOut))
-        Arg->removeAttr(Attribute::InOut);    
-    }
-  }else{
-    printf("****** NO REMOVEAL *** \n\n");
-  }
-
-  /*** FIXME: HACK FOR DSSOC DEMO -- END ***/
-
-  // Go through the arguments, and any pointer arguments with in attribute need
-  // to have x86_argument_ptr call to get the x86 ptr of the argument
-  // Insert these calls in a new BB which would dominate all other BBs
-  // Create new BB
-  BasicBlock* EntryBB = &*F_X86->begin();
-  BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
-  BranchInst* Terminator = BranchInst::Create(EntryBB, BB);
-  // Insert calls
-  for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
-        ai != ae; ++ai) {
-    if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) {
-      assert(ai->getType()->isPointerTy()
-          && "Only pointer arguments can have visc in/out attributes ");
-      Function::arg_iterator aiNext = ai;
-      ++aiNext;
-      Argument* size = &*aiNext;
-      assert(size->getType() == Type::getInt64Ty(M.getContext())
-          && "Next argument after a pointer should be an i64 type");
-      CastInst* BI = BitCastInst::CreatePointerCast(&*ai,
-                                                    Type::getInt8PtrTy(M.getContext()),
-                                                    ai->getName()+".i8ptr",
-                                                    Terminator);
-      Value* ArgPtrCallArgs[] = {BI, size};
-      CallInst::Create(llvm_visc_x86_argument_ptr,
-                                              ArrayRef<Value*>(ArgPtrCallArgs, 2),
-                                              "",
-                                              Terminator);
-
-    }
-  }
-  errs() << *BB << "\n";
-
-  // Go through all the instructions
-  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
-    Instruction *I = &(*i);
-    DEBUG(errs() << *I << "\n");
-    // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
-
-    if (BuildDFG::isViscQueryIntrinsic(I)) {
-      IntrinsicInst* II = cast<IntrinsicInst>(I);
-      IntrinsicInst* ArgII;
-      DFNode* ArgDFNode;
-
-      /***********************************************************************
-      *                        Handle VISC Query intrinsics                  *
-      ***********************************************************************/
-      switch (II->getIntrinsicID()) {
-      /**************************** llvm.visc.getNode() *******************/
-      case Intrinsic::visc_getNode: {
-        // add mapping <intrinsic, this node> to the node-specific map
-        Leaf_HandleToDFNodeMap[II] = N;
-        IItoRemove.push_back(II);
-        break;
-      }
-      /************************* llvm.visc.getParentNode() ****************/
-      case Intrinsic::visc_getParentNode: {
-        // get the parent node of the arg node
-        // get argument node
-        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-        // get the parent node of the arg node
-        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-        // Add mapping <intrinsic, parent node> to the node-specific map
-        // the argument node must have been added to the map, orelse the
-        // code could not refer to it
-        Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
-        IItoRemove.push_back(II);
-        break;
-      }
-      /*************************** llvm.visc.getNumDims() *****************/
-      case Intrinsic::visc_getNumDims: {
-        // get node from map
-        // get the appropriate field
-        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-        int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim();
-        IntegerType* IntTy = Type::getInt32Ty(M.getContext());
-        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
-
-        II->replaceAllUsesWith(numOfDimConstant);
-        IItoRemove.push_back(II);
-        break;
-      }
-      /*********************** llvm.visc.getNodeInstanceID() **************/
-      case Intrinsic::visc_getNodeInstanceID_x:
-      case Intrinsic::visc_getNodeInstanceID_y:
-      case Intrinsic::visc_getNodeInstanceID_z: {
-        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-
-        // The dfnode argument should be an ancestor of this leaf node or
-        // the leaf node itself
-        int parentLevel = N->getAncestorHops(ArgDFNode);
-        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
-               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
-
-        // Get specified dimension
-        // (dim = 0) => x
-        // (dim = 1) => y
-        // (dim = 2) => z
-        int dim = (int) (II->getIntrinsicID() -
-                         Intrinsic::visc_getNodeInstanceID_x);
-        assert((dim >= 0) && (dim < 3)
-               && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!");
-
-        // For immediate ancestor, use the extra argument introduced in
-        // F_X86
-        int numParamsF = F->getFunctionType()->getNumParams();
-        int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
-        assert((numParamsF_X86 - numParamsF == 6)
-               && "Difference of arguments between function and its clone is not 6!");
-
-        if(parentLevel == 0) {
-          // Case when the query is for this node itself
-          unsigned offset = 3 + (3-dim);
-          // Traverse argument list of F_X86 in reverse order to find the
-          // correct index or dim argument.
-          Argument* indexVal = getArgumentFromEnd(F_X86, offset);
-          assert(indexVal && "Index argument not found. Invalid offset!");
-
-          DEBUG(errs() << *II << " replaced with " << *indexVal << "\n");
-
-          II->replaceAllUsesWith(indexVal);
-          IItoRemove.push_back(II);
-        }
-        else {
-          // Case when query is for an ancestor
-          Value* args[] = {
-            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
-            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
-          };
-          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance,
-                                          ArrayRef<Value*>(args, 2),
-                                          "nodeInstanceID", II);
-          DEBUG(errs() << *II << " replaced with " << *CI << "\n");
-          II->replaceAllUsesWith(CI);
-          IItoRemove.push_back(II);
-        }
-        break;
-      }
-      /********************** llvm.visc.getNumNodeInstances() *************/
-      case Intrinsic::visc_getNumNodeInstances_x:
-      case Intrinsic::visc_getNumNodeInstances_y:
-      case Intrinsic::visc_getNumNodeInstances_z: {
-
-        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-
-        // The dfnode argument should be an ancestor of this leaf node or
-        // the leaf node itself
-        int parentLevel = N->getAncestorHops(ArgDFNode);
-        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
-               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
-
-        // Get specified dimension
-        // (dim = 0) => x
-        // (dim = 1) => y
-        // (dim = 2) => z
-        int dim = (int) (II->getIntrinsicID() -
-                         Intrinsic::visc_getNumNodeInstances_x);
-        assert((dim >= 0) && (dim < 3)
-               && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!");
-
-        // For immediate ancestor, use the extra argument introduced in
-        // F_X86
-        int numParamsF = F->getFunctionType()->getNumParams();
-        int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
-        assert((numParamsF_X86 - numParamsF == 6)
-               && "Difference of arguments between function and its clone is not 6!");
-
-        if(parentLevel == 0) {
-          // Case when the query is for this node itself
-          unsigned offset = 3 - dim;
-          // Traverse argument list of F_X86 in reverse order to find the
-          // correct index or dim argument.
-          Argument* limitVal = getArgumentFromEnd(F_X86, offset);
-          assert(limitVal && "Limit argument not found. Invalid offset!");
-
-          DEBUG(errs() << *II << " replaced with " <<  *limitVal << "\n");
-
-          II->replaceAllUsesWith(limitVal);
-          IItoRemove.push_back(II);
-        }
-        else {
-          // Case when query is from the ancestor
-          Value* args[] = {
-            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
-            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
-          };
-          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit,
-                                          ArrayRef<Value*>(args, 2),
-                                          "numNodeInstances", II);
-          DEBUG(errs() << *II << " replaced with " << *CI << "\n");
-          II->replaceAllUsesWith(CI);
-          IItoRemove.push_back(II);
-        }
-
-        break;
-      }
-      default:
-        DEBUG(errs() << "Found unknown intrinsic with ID = " <<
-              II->getIntrinsicID() << "\n");
-        assert(false && "Unknown VISC Intrinsic!");
-        break;
-      }
-
-    } else {
-      //TODO: how to handle address space qualifiers in load/store
-    }
-
-  }
-
-  //TODO:
-  // When to replace the uses?
-  // In which order is it safe to replace the instructions in
-  // IItoReplace?
-  // Probably in the reverse order in the vectors
-  // It is a good idea to have them in one vector and chech the type
-  // using dyn_cast in order to determine if we replace with inst or value
-
-
-  //TODO: maybe leave these instructions to be removed by a later DCE pass
-  for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
-       i != IItoRemove.end(); ++i) {
-    (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType()));
-    (*i)->eraseFromParent();
-  }
-
-  DEBUG(errs() << *F_X86);
-}
-
-} // End of namespace
-
-char DFG2LLVM_X86::ID = 0;
-static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86-dsoc",
-                                    "Dataflow Graph to LLVM for X86 backend (DSOCC version)",
-                                    false /* does not modify the CFG */,
-                                    true /* transformation, not just analysis */);
-
diff --git a/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt b/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt
deleted file mode 100644
index a6c4de9537..0000000000
--- a/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = DFG2LLVM_X86_dsoc
-parent = Transforms
-
-- 
GitLab