diff --git a/README.md b/README.md
index d285c6e1f6852b9f49d02ae91bd193dde07e79fc..5d493035026121da19c8f95ddbfda9da8a77f6a0 100644
--- a/README.md
+++ b/README.md
@@ -39,13 +39,6 @@ Some common options that can be used with CMake are:
 
 * -DLLVM_ENABLE_ASSERTIONS=On --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types).
 
-## Building hpvm runtime
-HPVM also includes a runtime library which comprises of low-level, target-specific wrappers required by HPVM's code generation.
-```shell
-cd projects/visc-rt
-make
-cd ..
-```
 To use hpvm to compile benchmarks set environment variable `LLVM_SRC_ROOT` to llvm directory in your local repository
 ```shell
 export LLVM_SRC_ROOT=<full path to hpvm>/llvm
diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index d830574c54da3600a452768cd3731a7bc9bc86f5..6498b46cd9a56ad69df35d4497b463b9dda98c87 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -8,55 +8,34 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "DFG2LLVM_X86"
-#include "SupportVISC/DFG2LLVM.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Linker/Linker.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
+#include "SupportVISC/DFG2LLVM.h"
 
 using namespace llvm;
 using namespace builddfg;
 using namespace dfg2llvm;
 
 // VISC Command line option to use timer or not
-static cl::opt<bool> VISCTimer_X86("visc-timers-x86",
-                                   cl::desc("Enable visc timers"));
-// Command line option to enable device abstraction or not
 static cl::opt<bool>
-    DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
-                      cl::desc("Enable visc device abstraction"));
+VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
 
 namespace {
 
-// Helper Functions
-static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
-  if (!isa<CallInst>(I))
-    return false;
-  CallInst *CI = cast<CallInst>(I);
-  return (CI->getCalledValue()->stripPointerCasts()->getName())
-      .equals("llvm_visc_policy_getVersion");
-}
-
-CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
-  for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) {
-    Instruction *I = &*ib;
-    if (isVISCCall_llvm_visc_policy_getVersion(I))
-      return cast<CallInst>(I);
-  }
-  return NULL;
-}
 
 // DFG2LLVM_X86 - The first implementation.
 struct DFG2LLVM_X86 : public DFG2LLVM {
   static char ID; // Pass identification, replacement for typeid
-  DFG2LLVM_X86() : DFG2LLVM(ID) {}
+  DFG2LLVM_X86() :DFG2LLVM(ID) {}
 
 private:
   // Member variables
@@ -71,7 +50,7 @@ public:
 class CGT_X86 : public CodeGenTraversal {
 
 private:
-  // Member variables
+  //Member variables
 
   FunctionCallee malloc;
   // VISC Runtime API
@@ -88,46 +67,41 @@ private:
   FunctionCallee llvm_visc_createEdgeBuffer;
   FunctionCallee llvm_visc_createLastInputBuffer;
   FunctionCallee llvm_visc_createThread;
-  // Constant* llvm_visc_freeThreads;
   FunctionCallee llvm_visc_bufferPush;
   FunctionCallee llvm_visc_bufferPop;
   FunctionCallee llvm_visc_x86_dstack_push;
   FunctionCallee llvm_visc_x86_dstack_pop;
   FunctionCallee llvm_visc_x86_getDimLimit;
   FunctionCallee llvm_visc_x86_getDimInstance;
-
-  // Functions
-  std::vector<IntrinsicInst *> *getUseList(Value *LI);
-  Value *addLoop(Instruction *I, Value *limit, const Twine &indexName = "");
-  void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *);
+  
+  //Functions
+  std::vector<IntrinsicInst*>* getUseList(Value* LI);
+  Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
+  void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
   Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
-  Argument *getArgumentFromEnd(Function *F, unsigned offset);
-  Value *getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86,
-                      Instruction *InsertBefore);
-  void invokeChild_X86(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap,
-                       Instruction *InsertBefore);
-  void invokeChild_PTX(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap,
-                       Instruction *InsertBefore);
-  StructType *getArgumentListStructTy(DFNode *);
-  Function *createFunctionFilter(DFNode *C);
-  void startNodeThread(DFNode *, std::vector<Value *>,
-                       DenseMap<DFEdge *, Value *>, Value *, Value *,
-                       Instruction *);
-  Function *createLaunchFunction(DFInternalNode *);
-  Function *createPushFunction(DFInternalNode *);
-  Function *createPopFunction(DFInternalNode *);
-  Function *createWaitFunction(DFInternalNode *);
-
+  Argument* getArgumentFromEnd(Function* F, unsigned offset);
+  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                      Instruction* InsertBefore);
+  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  StructType* getArgumentListStructTy(DFNode*);
+  Function* createFunctionFilter(DFNode* C);
+  void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>,
+                      Value*, Value*, Instruction*);
+  Function* createLaunchFunction(DFInternalNode*);
+  
   // Virtual Functions
   void init() {
     VISCTimer = VISCTimer_X86;
     TargetName = "X86";
   }
   void initRuntimeAPI();
-  void codeGen(DFInternalNode *N);
-  void codeGen(DFLeafNode *N);
-  Function *codeGenStreamPush(DFInternalNode *N);
-  Function *codeGenStreamPop(DFInternalNode *N);
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+  Function* codeGenStreamPush(DFInternalNode* N);
+  Function* codeGenStreamPop(DFInternalNode* N);
 
 public:
   // Constructor
@@ -136,20 +110,20 @@ public:
     initRuntimeAPI();
   }
 
-  void codeGenLaunch(DFInternalNode *Root);
-  void codeGenLaunchStreaming(DFInternalNode *Root);
+  void codeGenLaunch(DFInternalNode* Root);
+  void codeGenLaunchStreaming(DFInternalNode* Root);
 };
 
 bool DFG2LLVM_X86::runOnModule(Module &M) {
-  DEBUG(errs() << "\nDFG2LLVM_X86 PASS\n");
+  errs() << "\nDFG2LLVM_X86 PASS\n";
 
   // Get the BuildDFG Analysis Results:
   // - Dataflow graph
   // - Maps from i8* hansles to DFNode and DFEdge
   BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-  // DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode *> Roots = DFG.getRoots();
+  //DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
   // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
   // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
@@ -157,17 +131,16 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   CGT_X86 *CGTVisitor = new CGT_X86(M, DFG);
 
   // Iterate over all the DFGs and produce code for each one of them
-  for (auto rootNode : Roots) {
+  for (auto &rootNode: Roots) {
     // Initiate code generation for root DFNode
     CGTVisitor->visit(rootNode);
-    // Go ahead and replace the launch intrinsic with pthread call, otherwise
-    // return now.
+    // Go ahead and replace the launch intrinsic with pthread call, otherwise return now.
     // TODO: Later on, we might like to do this in a separate pass, which would
-    // allow us the flexibility to switch between complete static code
-    // generation for DFG or having a customized runtime+scheduler
-
+    // allow us the flexibility to switch between complete static code generation
+    // for DFG or having a customized runtime+scheduler
+    
     // Do streaming code generation if root node is streaming. Usual otherwise
-    if (rootNode->isChildGraphStreaming())
+    if(rootNode->isChildGraphStreaming())
       CGTVisitor->codeGenLaunchStreaming(rootNode);
     else
       CGTVisitor->codeGenLaunch(rootNode);
@@ -183,7 +156,7 @@ void CGT_X86::initRuntimeAPI() {
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
   assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
@@ -212,7 +185,6 @@ void CGT_X86::initRuntimeAPI() {
   DECLARE(llvm_visc_createEdgeBuffer);
   DECLARE(llvm_visc_createLastInputBuffer);
   DECLARE(llvm_visc_createThread);
-  // DECLARE(llvm_visc_freeThreads);
   DECLARE(llvm_visc_bufferPush);
   DECLARE(llvm_visc_bufferPop);
   DECLARE(llvm_visc_x86_dstack_push);
@@ -224,66 +196,28 @@ void CGT_X86::initRuntimeAPI() {
   initTimerAPI();
 
   // Insert init context in main
-  Function *VI = M.getFunction("llvm.visc.init");
+  Function* VI = M.getFunction("llvm.visc.init");
   assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
   DEBUG(errs() << "Inserting x86 timer initialization\n");
-  Instruction *I = cast<Instruction>(*VI->user_begin());
+  Instruction* I = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(I);
   switchToTimer(visc_TimerID_NONE, I);
-  // Insert code for initializing the sceduling policy
-  FunctionCallee IP = M.getOrInsertFunction(
-      "llvm_visc_policy_init",
-      runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType());
-  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I);
-  DEBUG(errs() << *IPCallInst << "\n");
-
-  // If device abstraction is enabled, we add a runtime call to start the
-  // device status simulation
-  if (DeviceAbstraction) {
-    FunctionCallee ID = M.getOrInsertFunction(
-        "llvm_visc_deviceAbstraction_start",
-        runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")
-            ->getFunctionType());
-    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I);
-    DEBUG(errs() << *IDCallInst << "\n");
-  }
-
   // Insert print instruction at visc exit
-  Function *VC = M.getFunction("llvm.visc.cleanup");
+  Function* VC = M.getFunction("llvm.visc.cleanup");
   assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
 
-  // Insert code for clearing the sceduling policy
-  I = cast<Instruction>(*VC->user_begin());
-  IP = M.getOrInsertFunction(
-      "llvm_visc_policy_clear",
-      runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType());
-  IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I);
-  DEBUG(errs() << *IPCallInst << "\n");
-
   DEBUG(errs() << "Inserting x86 timer print\n");
   printTimerSet(I);
-
-  // If device abstraction is enabled, we add a runtime call to end the
-  // device status simulation
-  if (DeviceAbstraction) {
-    FunctionCallee ID = M.getOrInsertFunction(
-        "llvm_visc_deviceAbstraction_end",
-        runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")
-            ->getFunctionType());
-    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I);
-    DEBUG(errs() << *IDCallInst << "\n");
-  }
 }
 
 /* Returns vector of all wait instructions
  */
-std::vector<IntrinsicInst *> *CGT_X86::getUseList(Value *GraphID) {
-  std::vector<IntrinsicInst *> *UseList = new std::vector<IntrinsicInst *>();
+std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
+  std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>();
   // It must have been loaded from memory somewhere
-  for (Value::user_iterator ui = GraphID->user_begin(),
-                            ue = GraphID->user_end();
-       ui != ue; ++ui) {
-    if (IntrinsicInst *waitI = dyn_cast<IntrinsicInst>(*ui)) {
+  for(Value::user_iterator ui = GraphID->user_begin(),
+      ue = GraphID->user_end(); ui!=ue; ++ui) {
+    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
       UseList->push_back(waitI);
     } else {
       llvm_unreachable("Error: Operation on Graph ID not supported!\n");
@@ -295,14 +229,14 @@ std::vector<IntrinsicInst *> *CGT_X86::getUseList(Value *GraphID) {
 /* Traverse the function argument list in reverse order to get argument at a
  * distance offset fromt he end of argument list of function F
  */
-Argument *CGT_X86::getArgumentFromEnd(Function *F, unsigned offset) {
-  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) &&
-         "Invalid offset to access arguments!");
+Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
+  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0)
+         && "Invalid offset to access arguments!");
   Function::arg_iterator e = F->arg_end();
   // Last element of argument iterator is dummy. Skip it.
   e--;
-  Argument *arg;
-  for (; offset != 0; e--) {
+  Argument* arg;
+  for( ; offset != 0; e--) {
     offset--;
     arg = &*e;
   }
@@ -320,24 +254,25 @@ Argument *CGT_X86::getArgumentFromEnd(Function *F, unsigned offset) {
  * which loops over bidy if true and goes to end if false
  * (5) Update phi node of body
  */
-void CGT_X86::addWhileLoop(Instruction *CondBlockStart, Instruction *BodyStart,
-                           Instruction *BodyEnd, Value *TerminationCond) {
-  BasicBlock *Entry = CondBlockStart->getParent();
-  BasicBlock *CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
-  BasicBlock *WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
-  BasicBlock *WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
+void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
+                          Instruction* BodyEnd, Value* TerminationCond) {
+  BasicBlock* Entry = CondBlockStart->getParent();
+  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
+  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
+  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
 
   // Replace the terminator instruction of conditional with new conditional
   // branch which goes to while.body if true and branches to while.end otherwise
-  BranchInst *BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
+  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
   ReplaceInstWithInst(CondBlock->getTerminator(), BI);
 
   // While Body should jump to condition block
-  BranchInst *UnconditionalBranch = BranchInst::Create(CondBlock);
+  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
   ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
+
 }
 
-Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
                                           BasicBlock *Body) {
   Module *M = Entry->getParent()->getParent();
   Type *Int64Ty = Type::getInt64Ty(M->getContext());
@@ -347,10 +282,10 @@ Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
   PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
 
   ConstantInt *IConst =
-      ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+    ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
   Instruction *CounterIncr =
-      BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
-                                "cnt_incr", Body->getTerminator());
+    BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                            "cnt_incr", Body->getTerminator());
 
   // Set incoming values for Phi node
   IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
@@ -372,40 +307,39 @@ Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
  * which loops over bidy if true and goes to end if false
  * (5) Update phi node of body
  */
-Value *CGT_X86::addLoop(Instruction *I, Value *limit, const Twine &indexName) {
-  BasicBlock *Entry = I->getParent();
-  BasicBlock *ForBody = Entry->splitBasicBlock(I, "for.body");
+Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
+  BasicBlock* Entry = I->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");
 
   BasicBlock::iterator i(I);
   ++i;
-  Instruction *NextI = &*i;
+  Instruction* NextI = &*i;
   // Next Instruction should also belong to the same basic block as the basic
   // block will have a terminator instruction
-  assert(NextI->getParent() == ForBody &&
-         "Next Instruction should also belong to the same basic block!");
-  BasicBlock *ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
+  assert(NextI->getParent() == ForBody
+         && "Next Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
+
 
   // Add Phi Node for index variable
-  PHINode *IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), 2,
-                                      "index." + indexName, I);
+  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
+                                      2, "index."+indexName, I);
 
   // Add incoming edge to phi
   IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
                         Entry);
   // Increment index variable
-  BinaryOperator *IndexInc = BinaryOperator::Create(
-      Instruction::Add, IndexPhi,
-      ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
-      "index." + indexName + ".inc", ForBody->getTerminator());
+  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
+                             IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
+                             "index."+indexName+".inc", ForBody->getTerminator());
 
   // Compare index variable with limit
-  CmpInst *Cond =
-      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, limit,
-                      "cond." + indexName, ForBody->getTerminator());
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
+                                  limit, "cond."+indexName, ForBody->getTerminator());
 
   // Replace the terminator instruction of for.body with new conditional
   // branch which loops over body if true and branches to for.end otherwise
-  BranchInst *BI = BranchInst::Create(ForBody, ForEnd, Cond);
+  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
   ReplaceInstWithInst(ForBody->getTerminator(), BI);
 
   // Add incoming edge to phi node in body
@@ -417,274 +351,260 @@ Value *CGT_X86::addLoop(Instruction *I, Value *limit, const Twine &indexName) {
 // types, output types and isLastInput buffer type. All the streaming
 // inputs/outputs are converted to i8*, since this is the type of buffer
 // handles.
-StructType *CGT_X86::getArgumentListStructTy(DFNode *C) {
-  std::vector<Type *> TyList;
+StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
+  std::vector<Type*> TyList;
   // Input types
-  Function *CF = C->getFuncPointer();
-  for (Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
-       ai != ae; ++ai) {
-    if (C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
+  Function* CF = C->getFuncPointer();
+  for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
+      ai != ae; ++ai) {
+    if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
       TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
-    else
+    else 
       TyList.push_back(ai->getType());
   }
   // Output Types
-  StructType *OutStructTy = cast<StructType>(CF->getReturnType());
+  StructType* OutStructTy = cast<StructType>(CF->getReturnType());
   for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
     // All outputs of a node are streaming edge
-    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() &&
-           "All output edges of child node have to be streaming");
+    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() 
+        && "All output edges of child node have to be streaming");
     TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
   }
   // isLastInput buffer element
   TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
 
-  StructType *STy =
-      StructType::create(CF->getContext(), TyList,
-                         Twine("struct.thread." + CF->getName()).str(), true);
+  StructType* STy = StructType::create(CF->getContext(), TyList,
+                        Twine("struct.thread."+CF->getName()).str(), true);
   return STy;
+
 }
 
-void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args,
-                              DenseMap<DFEdge *, Value *> EdgeBufferMap,
-                              Value *isLastInputBuffer, Value *graphID,
-                              Instruction *IB) {
-  DEBUG(errs() << "Starting Pipeline for child node: "
-               << C->getFuncPointer()->getName() << "\n");
+void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
+                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
+                              Instruction* IB) {
+  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
   // Create a filter/pipeline function for the child node
-  Function *C_Pipeline = createFunctionFilter(C);
-  Function *CF = C->getFuncPointer();
+  Function* C_Pipeline = createFunctionFilter(C);
+  Function* CF = C->getFuncPointer();
 
   // Get module context and i32 0 constant, as they would be frequently used in
   // this function.
-  LLVMContext &Ctx = IB->getParent()->getContext();
-  Constant *IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+  LLVMContext& Ctx = IB->getParent()->getContext();
+  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
 
   // Marshall arguments
   // Create a packed struct type with inputs of C followed by outputs and then
   // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
   // by i8*
   //
-  StructType *STy = getArgumentListStructTy(C);
+  StructType* STy = getArgumentListStructTy(C);
   // Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
-  CallInst *CI =
-      CallInst::Create(malloc, ArrayRef<Value *>(ConstantExpr::getSizeOf(STy)),
-                       C->getFuncPointer()->getName() + ".inputs", IB);
-  CastInst *Struct = BitCastInst::CreatePointerCast(
-      CI, STy->getPointerTo(), CI->getName() + ".i8ptr", IB);
-  // AllocaInst* AI = new AllocaInst(STy,
-  // C->getFuncPointer()->getName()+".inputs", IB);
+  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
+                                  C->getFuncPointer()->getName()+".inputs", IB);
+  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
+  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
   // Insert elements in the struct
-  DEBUG(errs() << "Marshall inputs for child node: "
-               << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
   // Marshall Inputs
-  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
+  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
     // Create constant int (i)
-    Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
     // Get Element pointer instruction
-    Value *GEPIndices[] = {IntZero, Int_i};
-    GetElementPtrInst *GEP = GetElementPtrInst::Create(
-        nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
-        Struct->getName() + ".arg_" + Twine(i), IB);
-    DFEdge *E = C->getInDFEdgeAt(i);
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             Struct->getName()+".arg_"+Twine(i),
+                             IB);
+    DFEdge* E = C->getInDFEdgeAt(i);
     if (E->getSourceDF()->isEntryNode()) {
       // This is a Bind Input Edge
-      if (E->isStreamingEdge()) {
+      if(E->isStreamingEdge()) {
         // Streaming Bind Input edge. Get buffer corresponding to it
-        assert(EdgeBufferMap.count(E) &&
-               "No mapping buffer for a Streaming Bind DFEdge!");
+        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
         new StoreInst(EdgeBufferMap[E], GEP, IB);
-      } else {
+      }
+      else {
         // Non-streaming Bind edge
         new StoreInst(Args[i], GEP, IB);
       }
-    } else {
-      // This is an edge between siblings.
+    }
+    else {
+      // This is an edge between siblings. 
       // This must be an streaming edge. As it is our assumption that all edges
       // between two nodes in a DFG are streaming.
-      assert(EdgeBufferMap.count(E) &&
-             "No mapping buffer for a Streaming DFEdge!");
+      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
       new StoreInst(EdgeBufferMap[E], GEP, IB);
     }
   }
   unsigned numInputs = CF->getFunctionType()->getNumParams();
   unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
   // Marshall Outputs
-  DEBUG(errs() << "Marshall outputs for child node: "
-               << C->getFuncPointer()->getName() << "\n");
-  for (unsigned i = 0; i < numOutputs; i++) {
+  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  for(unsigned i = 0; i < numOutputs; i++ ) {
     // Create constant int (i+numInputs)
-    Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i + numInputs);
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
     // Get Element pointer instruction
-    Value *GEPIndices[] = {IntZero, Int_i};
-    GetElementPtrInst *GEP = GetElementPtrInst::Create(
-        nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
-        Struct->getName() + ".out_" + Twine(i), IB);
-    DFEdge *E = C->getOutDFEdgeAt(i);
-    assert(E->isStreamingEdge() &&
-           "Output Edge must be streaming of all nodes");
-    assert(EdgeBufferMap.count(E) &&
-           "No mapping buffer for a Out Streaming DFEdge!");
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                             ArrayRef<Value*>(GEPIndices, 2),
+                             Struct->getName()+".out_"+Twine(i),
+                             IB);
+    DFEdge* E = C->getOutDFEdgeAt(i);
+    assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes");
+    assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!");
     new StoreInst(EdgeBufferMap[E], GEP, IB);
   }
   // Marshall last argument. isLastInput buffer
-  DEBUG(errs() << "Marshall isLastInput for child node: "
-               << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
   // Create constant int (i+numInputs)
-  Constant *Int_index =
-      ConstantInt::get(Type::getInt32Ty(Ctx), numInputs + numOutputs);
+  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
   // Get Element pointer instruction
-  Value *GEPIndices[] = {IntZero, Int_index};
-  GetElementPtrInst *GEP = GetElementPtrInst::Create(
-      nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
-      Struct->getName() + ".isLastInput", IB);
+  Value* GEPIndices[] = { IntZero, Int_index };
+  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                           ArrayRef<Value*>(GEPIndices, 2),
+                           Struct->getName()+".isLastInput", IB);
   new StoreInst(isLastInputBuffer, GEP, IB);
 
   // AllocaInst AI points to memory with all the arguments packed
   // Call runtime to create the thread with these arguments
-  DEBUG(errs() << "Start Thread for child node: "
-               << C->getFuncPointer()->getName() << "\n");
-  // DEBUG(errs() << *llvm_visc_createThread << "\n");
+  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
+// DEBUG(errs() << *llvm_visc_createThread << "\n");
   DEBUG(errs() << *graphID->getType() << "\n");
   DEBUG(errs() << *C_Pipeline->getType() << "\n");
   DEBUG(errs() << *Struct->getType() << "\n");
   // Bitcast AI to i8*
-  CastInst *BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx),
-                                                Struct->getName(), IB);
-  Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI};
-  CallInst *CreateThread = CallInst::Create(
-      llvm_visc_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB);
+  CastInst* BI  = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
+  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
+  CallInst::Create(llvm_visc_createThread, ArrayRef<Value*>(CreateThreadArgs, 3), "", IB);
+
 }
 
-Function *CGT_X86::createLaunchFunction(DFInternalNode *N) {
+Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
   DEBUG(errs() << "Generating Streaming Launch Function\n");
   // Get Function associated with Node N
-  Function *NF = N->getFuncPointer();
+  Function* NF = N->getFuncPointer();
 
-  // Map from Streaming edge to buffer
-  DenseMap<DFEdge *, Value *> EdgeBufferMap;
+  // Map from Streaming edge to buffer 
+  DenseMap<DFEdge*, Value*> EdgeBufferMap;
 
   /* Now we have all the necessary global declarations necessary to generate the
-   * Launch function, pointer to which can be passed to pthread utils to execute
-   * DFG. The Launch function has just one input: i8* data.addr
-   * This is the address of the all the input data that needs to be passed to
-   * this function. In our case it contains the input arguments of the Root
-   * function in the correct order.
-   * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
-   * (2) Extract each of inputs from data.addr
-   * (3) create Buffers for all the streaming edges
-   *     - Put buffers in the context
-   * (4) Go over each child node
-   *     - marshall its arguments together (use buffers in place of streaming
-   *       arguments)
-   *     - Start the threads
-   * (5) The return value from Root is stored in memory, pointer to which is
-   * passed to pthread_exit call.
-   */
+  * Launch function, pointer to which can be passed to pthread utils to execute
+  * DFG. The Launch function has just one input: i8* data.addr
+  * This is the address of the all the input data that needs to be passed to
+  * this function. In our case it contains the input arguments of the Root
+  * function in the correct order.
+  * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
+  * (2) Extract each of inputs from data.addr
+  * (3) create Buffers for all the streaming edges
+  *     - Put buffers in the context
+  * (4) Go over each child node
+  *     - marshall its arguments together (use buffers in place of streaming
+  *       arguments)
+  *     - Start the threads
+  * (5) The return value from Root is stored in memory, pointer to which is
+  * passed to pthread_exit call.
+  */
   // (1) Create Launch Function of type void (i8* args, i8* GraphID)
-  Type *i8Ty = Type::getInt8Ty(M.getContext());
-  Type *ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
-  FunctionType *LaunchFuncTy = FunctionType::get(
-      Type::getVoidTy(NF->getContext()), ArrayRef<Type *>(ArgTypes, 2), false);
-  Function *LaunchFunc = Function::Create(
-      LaunchFuncTy, NF->getLinkage(), NF->getName() + ".LaunchFunction", &M);
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
+  FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()),
+                                  ArrayRef<Type*>(ArgTypes, 2), false);
+  Function* LaunchFunc = Function::Create(LaunchFuncTy,
+                                       NF->getLinkage(),
+                                       NF->getName()+".LaunchFunction",
+                                       &M);
   DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Argument *data = &*LaunchFunc->arg_begin();
+  Argument* data = &*LaunchFunc->arg_begin();
   // NOTE-HS: Check correctness with Maria
-  Argument *graphID = &*(LaunchFunc->arg_begin() + 1);
+  Argument* graphID = &*(LaunchFunc->arg_begin() + 1);
   data->setName("data.addr");
   graphID->setName("graphID");
   // Add a basic block to this empty function and a return null statement to it
   DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
-  BasicBlock *BB =
-      BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
-  ReturnInst *RI = ReturnInst::Create(LaunchFunc->getContext(), BB);
+  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
+  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
+                                      BB);
 
   DEBUG(errs() << "Created Empty Launch Function\n");
 
   // (2) Extract each of inputs from data.addr
-  std::vector<Type *> TyList;
+  std::vector<Type*> TyList;
   std::vector<std::string> names;
-  std::vector<Value *> Args;
+  std::vector<Value*> Args;
 
   for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
-       ai != ae; ++ai) {
-    if (N->getChildGraph()
-            ->getEntry()
-            ->getOutDFEdgeAt(ai->getArgNo())
-            ->isStreamingEdge()) {
+      ai != ae; ++ai) {
+    if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) {
       TyList.push_back(i8Ty->getPointerTo());
-      names.push_back(Twine(ai->getName() + "_buffer").str());
+      names.push_back(Twine(ai->getName()+"_buffer").str());
       continue;
     }
     TyList.push_back(ai->getType());
     names.push_back(ai->getName());
   }
   Args = extractElements(data, TyList, names, RI);
-  DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc
-               << "\n");
+  DEBUG(errs() <<  "Launch function for " << NF->getName() << *LaunchFunc << "\n");
   // (3) Create buffers for all the streaming edges
-  for (DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
-                                de = N->getChildGraph()->dfedge_end();
-       di != de; ++di) {
-    DFEdge *Edge = *di;
+  for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
+      de = N->getChildGraph()->dfedge_end(); di != de; ++di) {
+    DFEdge* Edge = *di;
     DEBUG(errs() << *Edge->getType() << "\n");
-    Value *size = ConstantExpr::getSizeOf(Edge->getType());
-    Value *CallArgs[] = {graphID, size};
+    Value* size = ConstantExpr::getSizeOf(Edge->getType());
+    Value* CallArgs[] = {graphID, size};
     if (Edge->isStreamingEdge()) {
-      CallInst *CI;
+      CallInst* CI;
       // Create a buffer call
-      if (Edge->getSourceDF()->isEntryNode()) {
+      if(Edge->getSourceDF()->isEntryNode()) {
         // Bind Input Edge
-        Constant *Int_ArgNo = ConstantInt::get(
-            Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition());
-        Value *BindInCallArgs[] = {graphID, size, Int_ArgNo};
-        CI = CallInst::Create(
-            llvm_visc_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3),
-            "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI);
-      } else if (Edge->getDestDF()->isExitNode()) {
+        Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()),
+                                  Edge->getSourcePosition());
+        Value* BindInCallArgs[] = {graphID, size, Int_ArgNo};
+        CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3),
+                              "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      else if(Edge->getDestDF()->isExitNode()) {
         // Bind Output Edge
-        CI = CallInst::Create(
-            llvm_visc_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2),
-            "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI);
-      } else {
+        CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2),
+                              "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      else {
         // Streaming Edge
-        CI = CallInst::Create(
-            llvm_visc_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2),
-            Edge->getSourceDF()->getFuncPointer()->getName() + "." +
-                Edge->getDestDF()->getFuncPointer()->getName(),
-            RI);
+        CI = CallInst::Create(llvm_visc_createEdgeBuffer,
+                              ArrayRef<Value*>(CallArgs, 2),
+                              Edge->getSourceDF()->getFuncPointer()->getName()+"."
+                              +Edge->getDestDF()->getFuncPointer()->getName(),
+                              RI);
       }
       EdgeBufferMap[Edge] = CI;
     }
   }
   // Create buffer for isLastInput for all the child nodes
-  DFGraph *G = N->getChildGraph();
-  DenseMap<DFNode *, Value *> NodeLastInputMap;
-  for (DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce;
-       ++ci) {
-    DFNode *child = *ci;
-    if (child->isDummyNode())
+  DFGraph* G = N->getChildGraph();
+  DenseMap<DFNode*, Value*> NodeLastInputMap;
+  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
+    DFNode* child = *ci;
+    if(child->isDummyNode())
       continue;
-    Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
-    Value *CallArgs[] = {graphID, size};
-    CallInst *CI = CallInst::Create(
-        llvm_visc_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2),
-        "BindIn.isLastInput." + child->getFuncPointer()->getName(), RI);
+    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
+    Value* CallArgs[] = {graphID, size};
+    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
+                              "BindIn.isLastInput."+child->getFuncPointer()->getName(),
+                              RI);
     NodeLastInputMap[child] = CI;
   }
-  DEBUG(errs() << "Start Each child node filter\n");
+  DEBUG(errs() <<  "Start Each child node filter\n");
   // (4) Marshall arguments for each child node and start the thread with its
   //     pipeline funtion
-  for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-                                  ce = N->getChildGraph()->end();
-       ci != ce; ++ci) {
-    DFNode *C = *ci;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
     // Skip dummy node call
     if (C->isDummyNode())
       continue;
-
+    
     // Marshall all the arguments for this node into an i8*
     // Pass to the runtime to create the thread
     // Start the thread for child node C
@@ -697,23 +617,7 @@ Function *CGT_X86::createLaunchFunction(DFInternalNode *N) {
   return LaunchFunc;
 }
 
-Function *CGT_X86::createPushFunction(DFInternalNode *N) {
-  DEBUG(errs() << "Generating Push function\n");
-  Function *PushFunc;
-  return PushFunc;
-}
-
-Function *CGT_X86::createPopFunction(DFInternalNode *N) {
-  DEBUG(errs() << "Generating Pop function\n");
-  Function *PushFunc;
-  return PushFunc;
-}
 
-Function *CGT_X86::createWaitFunction(DFInternalNode *N) {
-  DEBUG(errs() << "Generating Wait function\n");
-  Function *PushFunc;
-  return PushFunc;
-}
 /* This fuction does the steps necessary to launch a streaming graph
  * Steps
  * Create Pipeline/Filter function for each node in child graph of Root
@@ -721,162 +625,167 @@ Function *CGT_X86::createWaitFunction(DFInternalNode *N) {
  * Modify each of the instrinsic in host code
  * Launch, Push, Pop, Wait
  */
-void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) {
-  IntrinsicInst *LI = Root->getInstruction();
-  Function *RootLaunch = createLaunchFunction(Root);
-  // Function* RootPush = createPushFunction(Root);
-  // Function* RootPop = createPopFunction(Root);
-  // Function* RootWait = createWaitFunction(Root);
+void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
+  IntrinsicInst* LI = Root->getInstruction();
+  Function* RootLaunch = createLaunchFunction(Root);
   // Substitute launch intrinsic main
-  DEBUG(errs() << "Substitute launch intrinsic\n");
-  Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)};
-  CallInst *LaunchInst = CallInst::Create(
-      llvm_visc_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2),
-      "graph" + Root->getFuncPointer()->getName(), LI);
-  // ReplaceInstWithInst(LI, LaunchInst);
+  DEBUG(errs() <<  "Substitute launch intrinsic\n");
+  Value* LaunchInstArgs[] = {RootLaunch,
+                             LI->getArgOperand(1)
+                            };
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
 
   DEBUG(errs() << *LaunchInst << "\n");
   // Replace all wait instructions with x86 specific wait instructions
-  DEBUG(errs() << "Substitute wait, push, pop intrinsics\n");
-  std::vector<IntrinsicInst *> *UseList = getUseList(LI);
-  for (unsigned i = 0; i < UseList->size(); ++i) {
-    IntrinsicInst *II = UseList->at(i);
-    CallInst *CI;
-    Value *PushArgs[] = {LaunchInst, II->getOperand(1)};
-    switch (II->getIntrinsicID()) {
+  DEBUG(errs() <<  "Substitute wait, push, pop intrinsics\n");
+  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
+  for(unsigned i=0; i < UseList->size(); ++i) {
+    IntrinsicInst* II = UseList->at(i);
+    CallInst* CI;
+    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
+    switch(II->getIntrinsicID()) {
     case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_streamWait, ArrayRef<Value *>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_streamWait,
+                            ArrayRef<Value*>(LaunchInst),
                             "");
       break;
     case Intrinsic::visc_push:
       CI = CallInst::Create(llvm_visc_streamPush,
-                            ArrayRef<Value *>(PushArgs, 2), "");
+                            ArrayRef<Value*>(PushArgs, 2),
+                            "");
       break;
     case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_streamPop, ArrayRef<Value *>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_streamPop,
+                            ArrayRef<Value*>(LaunchInst),
                             "");
       break;
     default:
-      llvm_unreachable(
-          "GraphID is used by an instruction other than wait, push, pop");
+      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
     };
     DEBUG(errs() << "Replace:\n\t" << *II << "\n");
     ReplaceInstWithInst(II, CI);
     DEBUG(errs() << "\twith " << *CI << "\n");
   }
+
+
 }
 
-void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
+void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
   // TODO: Place an assert to check if the constant passed by launch intrinsic
   // as the number of arguments to DFG is same as the number of arguments of the
   // root of DFG
   DEBUG(errs() << "Generating Launch Function\n");
   // Get Launch Instruction
-  IntrinsicInst *LI = Root->getInstruction();
+  IntrinsicInst* LI = Root->getInstruction();
   switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
   DEBUG(errs() << "Generating Launch Function\n");
 
   /* Now we have all the necessary global declarations necessary to generate the
-   * Launch function, pointer to which can be passed to pthread utils to execute
-   * DFG. The Launch function has just one input: i8* data.addr
-   * This is the address of the all the input data that needs to be passed to
-   * this function. In our case it contains the input arguments of the Root
-   * function in the correct order.
-   * (1) Create an empty Launch function of type i8*(i8*)
-   * (2) Extract each of inputs from data.addr and pass them as arguments to the
-   * call to Root function
-   * (3) The return value from Root is stored in memory, pointer to which is
-   * passed to pthread_exit call.
-   */
+  * Launch function, pointer to which can be passed to pthread utils to execute
+  * DFG. The Launch function has just one input: i8* data.addr
+  * This is the address of the all the input data that needs to be passed to
+  * this function. In our case it contains the input arguments of the Root
+  * function in the correct order.
+  * (1) Create an empty Launch function of type i8*(i8*)
+  * (2) Extract each of inputs from data.addr and pass them as arguments to the
+  * call to Root function
+  * (3) The return value from Root is stored in memory, pointer to which is
+  * passed to pthread_exit call.
+  */
   // Create Launch Function of type i8*(i8*) which calls the root function
-  Type *i8Ty = Type::getInt8Ty(M.getContext());
-  FunctionType *AppFuncTy = FunctionType::get(
-      i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
-  Function *AppFunc =
-      Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(),
-                       "LaunchDataflowGraph", &M);
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
+                            ArrayRef<Type*>(i8Ty->getPointerTo()),
+                            false);
+  Function* AppFunc = Function::Create(AppFuncTy,
+                                       Root->getFuncPointer()->getLinkage(),
+                                       "LaunchDataflowGraph",
+                                       &M);
   DEBUG(errs() << "Generating Launch Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Value *data = &*AppFunc->arg_begin();
+  Value* data = &*AppFunc->arg_begin();
   data->setName("data.addr");
   // Add a basic block to this empty function and a return null statement to it
   BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
-  ReturnInst *RI =
-      ReturnInst::Create(AppFunc->getContext(),
-                         Constant::getNullValue(AppFunc->getReturnType()), BB);
+  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
+                                      Constant::getNullValue(AppFunc->getReturnType()),
+                                      BB);
   switchToTimer(visc_TimerID_ARG_UNPACK, RI);
 
   DEBUG(errs() << "Created Empty Launch Function\n");
   // Find the X86 function generated for Root and
-  //  Function* RootF_X86 = Root->getGenFunc();
-  Function *RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
+//  Function* RootF_X86 = Root->getGenFunc();
+  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
   assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
   assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
          "Error: Generated Function for Root node with no x86 wrapper\n");
 
   // Generate a call to RootF_X86 with null parameters for now
-  std::vector<Value *> Args;
-  for (unsigned i = 0; i < RootF_X86->getFunctionType()->getNumParams(); i++) {
-    Args.push_back(
-        Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
+  std::vector<Value*>Args;
+  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
   }
-  CallInst *CI =
-      CallInst::Create(RootF_X86, Args, RootF_X86->getName() + ".output", RI);
+  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);
 
   // Extract input data from i8* data.addr and patch them to correct argument of
   // call to RootF_X86. For each argument
-  std::vector<Type *> TyList;
+  std::vector<Type*> TyList;
   std::vector<std::string> names;
-  for (Function::arg_iterator ai = RootF_X86->arg_begin(),
-                              ae = RootF_X86->arg_end();
-       ai != ae; ++ai) {
+  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
+      ai != ae; ++ai) {
     TyList.push_back(ai->getType());
     names.push_back(ai->getName());
   }
-  std::vector<Value *> elements = extractElements(data, TyList, names, CI);
+  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
   // Patch the elements to the call arguments
-  for (unsigned i = 0; i < CI->getNumArgOperands(); i++)
+  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
     CI->setArgOperand(i, elements[i]);
 
   // Add timers around Call to RootF_X86 function
   switchToTimer(visc_TimerID_COMPUTATION, CI);
   switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
 
-  StructType *RootRetTy =
-      cast<StructType>(RootF_X86->getFunctionType()->getReturnType());
+  StructType *RootRetTy = cast<StructType>(RootF_X86->getFunctionType()->getReturnType());
 
-  // if Root has non empty return
+  // if Root has non empty return 
   if (RootRetTy->getNumElements()) {
     // We can't access the type of the arg struct - build it
-    std::vector<Type *> TyList;
-    for (Function::arg_iterator ai = RootF_X86->arg_begin(),
-                                ae = RootF_X86->arg_end();
-         ai != ae; ++ai) {
+    std::vector<Type*> TyList;
+    for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
+        ai != ae; ++ai) {
       TyList.push_back(ai->getType());
     }
     TyList.push_back(CI->getType());
 
-    StructType *ArgStructTy = StructType::create(
-        M.getContext(), ArrayRef<Type *>(TyList),
-        (RootF_X86->getName() + ".arg.struct.ty").str(), true);
+    StructType* ArgStructTy = StructType::create(M.getContext(),
+                                                 ArrayRef<Type*>(TyList),
+                                 (RootF_X86->getName()+".arg.struct.ty").str(), true);
 
     // Cast the data pointer to the type of the arg struct
-    CastInst *OutputAddrCast = CastInst::CreatePointerCast(
-        data, ArgStructTy->getPointerTo(), "argStructCast.addr", RI);
+    CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
+                                 ArgStructTy->getPointerTo(),
+                                 "argStructCast.addr",
+                                 RI);
 
     // Result struct is the last element of the packed struct passed to launch
     unsigned outStructIdx = ArgStructTy->getNumElements() - 1;
 
-    ConstantInt *IntZero =
-        ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
-    ConstantInt *IntIdx =
-        ConstantInt::get(Type::getInt32Ty(M.getContext()), outStructIdx);
+    ConstantInt *IntZero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+    ConstantInt *IntIdx = ConstantInt::get(Type::getInt32Ty(M.getContext()),
+                                          outStructIdx);
 
-    Value *GEPIIdxList[] = {IntZero, IntIdx};
+    Value* GEPIIdxList[] = { IntZero,
+                             IntIdx
+                           };
     // Get data pointer to the last element of struct - result field
-    GetElementPtrInst *OutGEPI = GetElementPtrInst::Create(
-        ArgStructTy, OutputAddrCast, ArrayRef<Value *>(GEPIIdxList, 2),
-        CI->getName() + ".addr", RI);
+    GetElementPtrInst *OutGEPI =
+      GetElementPtrInst::Create(ArgStructTy,
+                                OutputAddrCast,
+                                ArrayRef<Value*>(GEPIIdxList, 2),
+                                CI->getName()+".addr",
+                                RI);
     // Store result there
     new StoreInst(CI, OutGEPI, RI);
   } else {
@@ -885,8 +794,10 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
     // We were casting the data pointer to the result type of Root, and
     // returning result there. This would work at the LLVM level, but not
     // at the C level, thus the rewrite.
-    CastInst *OutputAddrCast = CastInst::CreatePointerCast(
-        data, CI->getType()->getPointerTo(), CI->getName() + ".addr", RI);
+    CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
+                               CI->getType()->getPointerTo(),
+                               CI->getName()+".addr",
+                               RI);
     new StoreInst(CI, OutputAddrCast, RI);
   }
 
@@ -896,109 +807,114 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
   DEBUG(errs() << *AppFunc << "\n");
 
   // Substitute launch intrinsic main
-  Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)};
-  CallInst *LaunchInst = CallInst::Create(
-      llvm_visc_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2),
-      "graph" + Root->getFuncPointer()->getName(), LI);
-  // ReplaceInstWithInst(LI, LaunchInst);
+  Value* LaunchInstArgs[] = {AppFunc,
+                             LI->getArgOperand(1)
+                            };
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
+  //ReplaceInstWithInst(LI, LaunchInst);
 
   DEBUG(errs() << *LaunchInst << "\n");
   // Replace all wait instructions with x86 specific wait instructions
-  std::vector<IntrinsicInst *> *UseList = getUseList(LI);
-  for (unsigned i = 0; i < UseList->size(); ++i) {
-    IntrinsicInst *II = UseList->at(i);
-    CallInst *CI;
-    switch (II->getIntrinsicID()) {
+  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
+  for(unsigned i=0; i < UseList->size(); ++i) {
+    IntrinsicInst* II = UseList->at(i);
+    CallInst* CI;
+    switch(II->getIntrinsicID()) {
     case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_x86_wait, ArrayRef<Value *>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_x86_wait,
+                            ArrayRef<Value*>(LaunchInst),
                             "");
       break;
     case Intrinsic::visc_push:
-      CI = CallInst::Create(llvm_visc_bufferPush, ArrayRef<Value *>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_bufferPush,
+                            ArrayRef<Value*>(LaunchInst),
                             "");
       break;
     case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_bufferPop, ArrayRef<Value *>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_bufferPop,
+                            ArrayRef<Value*>(LaunchInst),
                             "");
       break;
     default:
-      llvm_unreachable(
-          "GraphID is used by an instruction other than wait, push, pop");
+      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
     };
     ReplaceInstWithInst(II, CI);
     DEBUG(errs() << *CI << "\n");
   }
+
 }
 
-Value *CGT_X86::getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86,
-                             Instruction *InsertBefore) {
+Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
   // TODO: Assumption is that each input port of a node has just one
   // incoming edge. May change later on.
 
   // Find the incoming edge at the requested input port
-  DFEdge *E = Child->getInDFEdgeAt(i);
+  DFEdge* E = Child->getInDFEdgeAt(i);
   assert(E && "No incoming edge or binding for input element!");
   // Find the Source DFNode associated with the incoming edge
-  DFNode *SrcDF = E->getSourceDF();
+  DFNode* SrcDF = E->getSourceDF();
 
   // If Source DFNode is a dummyNode, edge is from parent. Get the
   // argument from argument list of this internal node
-  Value *inputVal;
-  if (SrcDF->isEntryNode()) {
+  Value* inputVal;
+  if(SrcDF->isEntryNode()) {
     inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
-    DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
-  } else {
+    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+  }
+  else {
     // edge is from a sibling
     // Check - code should already be generated for this source dfnode
-    assert(OutputMap.count(SrcDF) &&
-           "Source node call not found. Dependency violation!");
+    assert(OutputMap.count(SrcDF)
+           && "Source node call not found. Dependency violation!");
 
     // Find CallInst associated with the Source DFNode using OutputMap
-    Value *CI = OutputMap[SrcDF];
+    Value* CI = OutputMap[SrcDF];
 
     // Extract element at source position from this call instruction
     std::vector<unsigned> IndexList;
     IndexList.push_back(E->getSourcePosition());
-    DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
-    ExtractValueInst *EI =
-        ExtractValueInst::Create(CI, IndexList, "", InsertBefore);
+    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                           "", InsertBefore);
     inputVal = EI;
   }
   return inputVal;
 }
 
-void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86,
-                              ValueToValueMapTy &VMap, Instruction *IB) {
-  Function *CF = C->getFuncPointer();
+void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
+                              ValueToValueMapTy &VMap,Instruction* IB) {
+  Function* CF = C->getFuncPointer();
 
-  //  Function* CF_X86 = C->getGenFunc();
+//  Function* CF_X86 = C->getGenFunc();
   Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
-  assert(CF_X86 != NULL &&
-         "Found leaf node for which code generation has not happened yet!\n");
+  assert(CF_X86 != NULL
+         && "Found leaf node for which code generation has not happened yet!\n");
   assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
-         "The generated function to be called from x86 backend is not an x86 "
-         "function\n");
+         "The generated function to be called from x86 backend is not an x86 function\n");
   DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
 
-  std::vector<Value *> Args;
+  std::vector<Value*> Args;
   // Create argument list to pass to call instruction
   // First find the correct values using the edges
   // The remaing six values are inserted as constants for now.
-  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
     Args.push_back(getInValueAt(C, i, F_X86, IB));
   }
 
-  Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
-  for (unsigned j = 0; j < 6; j++)
+  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
+  for(unsigned j=0; j<6; j++)
     Args.push_back(I64Zero);
 
-  DEBUG(errs() << "Gen Function type: " << *CF_X86->getType() << "\n");
-  DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n");
-  DEBUG(errs() << "Arguments: " << Args.size() << "\n");
+  errs() << "Gen Function type: " << *CF_X86->getType() << "\n";
+  errs() << "Node Function type: " << *CF->getType() << "\n";
+  errs() << "Arguments: " << Args.size() << "\n";
 
   // Call the F_X86 function associated with this node
-  CallInst *CI =
-      CallInst::Create(CF_X86, Args, CF_X86->getName() + "_output", IB);
+  CallInst* CI = CallInst::Create(CF_X86, Args,
+                                  CF_X86->getName()+"_output",
+                                  IB);
   DEBUG(errs() << *CI << "\n");
   OutputMap[C] = CI;
 
@@ -1006,56 +922,55 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86,
   // Based on number of dimensions, insert loop instructions
   std::string varNames[3] = {"x", "y", "z"};
   unsigned numArgs = CI->getNumArgOperands();
-  for (unsigned j = 0; j < C->getNumOfDim(); j++) {
-    Value *indexLimit = NULL;
+  for(unsigned j=0; j < C->getNumOfDim(); j++) {
+    Value* indexLimit = NULL;
     // Limit can either be a constant or an arguement of the internal node.
     // In case of constant we can use that constant value directly in the
     // new F_X86 function. In case of an argument, we need to get the mapped
     // value using VMap
-    if (isa<Constant>(C->getDimLimits()[j])) {
+    if(isa<Constant>(C->getDimLimits()[j])) {
       indexLimit = C->getDimLimits()[j];
       DEBUG(errs() << "In Constant case:\n"
-                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
-    } else {
+             << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    else {
       indexLimit = VMap[C->getDimLimits()[j]];
       DEBUG(errs() << "In VMap case:"
-                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
+             <<"  indexLimit type = " << *indexLimit->getType() << "\n");
     }
     assert(indexLimit && "Invalid dimension limit!");
     // Insert loop
-    Value *indexVar = addLoop(CI, indexLimit, varNames[j]);
+    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
     DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
     // Insert index variable and limit arguments
-    CI->setArgOperand(numArgs - 6 + j, indexVar);
-    CI->setArgOperand(numArgs - 3 + j, indexLimit);
+    CI->setArgOperand(numArgs-6+j, indexVar);
+    CI->setArgOperand(numArgs-3+j, indexLimit);
   }
   // Insert call to runtime to push the dim limits and instanceID on the depth
   // stack
-  Value *args[] = {
-      ConstantInt::get(Type::getInt32Ty(CI->getContext()),
-                       C->getNumOfDim()), // numDim
-      CI->getArgOperand(numArgs - 3 + 0), // limitX
-      CI->getArgOperand(numArgs - 6 + 0), // iX
-      CI->getArgOperand(numArgs - 3 + 1), // limitY
-      CI->getArgOperand(numArgs - 6 + 1), // iY
-      CI->getArgOperand(numArgs - 3 + 2), // limitZ
-      CI->getArgOperand(numArgs - 6 + 2)  // iZ
+  Value* args[] = {
+    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
+    CI->getArgOperand(numArgs-3+0), // limitX
+    CI->getArgOperand(numArgs-6+0), // iX
+    CI->getArgOperand(numArgs-3+1), // limitY
+    CI->getArgOperand(numArgs-6+1), // iY
+    CI->getArgOperand(numArgs-3+2), // limitZ
+    CI->getArgOperand(numArgs-6+2)  // iZ
   };
 
-  CallInst *Push = CallInst::Create(llvm_visc_x86_dstack_push,
-                                    ArrayRef<Value *>(args, 7), "", CI);
+  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
   DEBUG(errs() << "Push on stack: " << *Push << "\n");
   // Insert call to runtime to pop the dim limits and instanceID from the depth
   // stack
   BasicBlock::iterator i(CI);
   ++i;
-  Instruction *NextI = &*i;
+  Instruction* NextI = &*i;
   // Next Instruction should also belong to the same basic block as the basic
   // block will have a terminator instruction
-  assert(NextI->getParent() == CI->getParent() &&
-         "Next Instruction should also belong to the same basic block!");
+  assert(NextI->getParent() == CI->getParent()
+         && "Next Instruction should also belong to the same basic block!");
 
-  CallInst *Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
+  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
   DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
   DEBUG(errs() << *CI->getParent()->getParent());
 }
@@ -1076,33 +991,34 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86,
 // Add runtime API calls to push output for each of the streaming outputs
 // Add loop around the basic block, which exits the loop if isLastInput is false
 
-Function *CGT_X86::createFunctionFilter(DFNode *C) {
-  DEBUG(errs() << "*********Creating Function filter for "
-               << C->getFuncPointer()->getName() << "*****\n");
+Function* CGT_X86::createFunctionFilter(DFNode* C) {
+  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");
 
   /* Create a function with same argument list as child.*/
   DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
   // Get the generated function for child node
-  Function *CF = C->getFuncPointer();
+  Function* CF = C->getFuncPointer();
   // Create Filter Function of type i8*(i8*) which calls the root function
-  Type *i8Ty = Type::getInt8Ty(M.getContext());
-  FunctionType *CF_PipelineTy = FunctionType::get(
-      i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
-  Function *CF_Pipeline = Function::Create(CF_PipelineTy, CF->getLinkage(),
-                                           CF->getName() + "_Pipeline", &M);
-  DEBUG(errs() << "Generating Pipline Function\n");
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
+                                ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                false);
+  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
+                          CF->getLinkage(),
+                          CF->getName()+"_Pipeline",
+                          &M);
+  DEBUG(errs() << "Generating Pipeline Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Value *data = &*CF_Pipeline->arg_begin();
+  Value* data = &*CF_Pipeline->arg_begin();
   data->setName("data.addr");
   // Create a new basic block
   DEBUG(errs() << "\tCreate new BB and add a return function\n");
   // Add a basic block to this empty function
-  BasicBlock *BB =
-      BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
+  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
   // Add a return instruction to the basic block
-  ReturnInst *RI =
-      ReturnInst::Create(CF_Pipeline->getContext(),
-                         UndefValue::get(CF_Pipeline->getReturnType()), BB);
+  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
+                                      UndefValue::get(CF_Pipeline->getReturnType()), BB);
+
 
   /* Extract the elements from the aggregate argument to the function.
    * Replace the streaming inputs with i8* types signifying handle to
@@ -1113,24 +1029,25 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
   DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
   // These Args will be used when passing arguments to the generated function
   // inside loop, and reading outputs as well.
-  std::vector<Value *> Args;
-  std::vector<Type *> TyList;
+  std::vector<Value*> Args;
+  std::vector<Type*> TyList;
   std::vector<std::string> names;
   // Adding inputs
-  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
-       ++i) {
-    if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
       TyList.push_back(i8Ty->getPointerTo());
-      names.push_back((Twine(i->getName()) + "_buffer").str());
-    } else {
+      names.push_back((Twine(i->getName())+"_buffer").str());
+    }
+    else {
       TyList.push_back(i->getType());
       names.push_back(i->getName());
     }
   }
   // Adding outputs. FIXME: Since we assume all outputs to be streaming edges,
   // because we get there buffer handles
-  StructType *RetTy = cast<StructType>(CF->getReturnType());
-  for (unsigned i = 0; i < RetTy->getNumElements(); i++) {
+  StructType* RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
     TyList.push_back(i8Ty->getPointerTo());
     names.push_back("out");
   }
@@ -1139,54 +1056,66 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
   TyList.push_back(i8Ty->getPointerTo());
   names.push_back("isLastInput_buffer");
 
-  // Extract the inputs, outputs and
+  // Extract the inputs, outputs 
   Args = extractElements(data, TyList, names, RI);
-  for (unsigned i = 0; i < Args.size(); i++) {
+  for(unsigned i=0; i<Args.size(); i++) {
     DEBUG(errs() << *Args[i] << "\n");
   }
 
   // Split the Args vector into, input output and isLastInput
   unsigned numInputs = CF->getFunctionType()->getNumParams();
   unsigned numOutputs = RetTy->getNumElements();
-  std::vector<Value *> InputArgs(Args.begin(), Args.begin() + numInputs);
-  std::vector<Value *> OutputArgs(Args.begin() + numInputs,
-                                  Args.begin() + numInputs + numOutputs);
-  Instruction *isLastInput = cast<Instruction>(Args[Args.size() - 1]);
+  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
+  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
 
   /* Add runtime API calls to get input for each of the streaming input edges */
-  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the "
-                  "streaming input edges\n");
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
   // First read the termination condition variable islastInput
-  CallInst *isLastInputPop = CallInst::Create(
-      llvm_visc_bufferPop, ArrayRef<Value *>(isLastInput), "", RI);
-
-  CastInst *BI = BitCastInst::CreateIntegerCast(
-      isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false,
-      "isLastInput", RI);
+  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
+                                        ArrayRef<Value*>(isLastInput),
+                                        "",
+                                        RI);
+
+  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
+                 Type::getInt64Ty(CF_Pipeline->getContext()),
+                 false,
+                 "isLastInput",
+                 RI);
   isLastInput = BI;
   // Create a loop termination condition
-  CmpInst *Cond = CmpInst::Create(
-      Instruction::ICmp, CmpInst::ICMP_NE, isLastInput,
-      Constant::getNullValue(Type::getInt64Ty(CF->getContext())),
-      "isLastInputNotZero", RI);
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
+      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
+      RI);
 
   // Get input from buffers of all the incoming streaming edges
-  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
-       ++i) {
-    if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
-      CallInst *bufferIn =
-          CallInst::Create(llvm_visc_bufferPop,
-                           ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI);
-      CastInst *BI;
-      if (i->getType()->isPointerTy()) {
-        BI = CastInst::Create(CastInst::IntToPtr, bufferIn, i->getType(),
-                              i->getName() + ".addr", RI);
-      } else if (i->getType()->isFloatTy()) {
-        BI = CastInst::CreateFPCast(bufferIn, i->getType(),
-                                    i->getName() + ".addr", RI);
-      } else {
-        BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), false,
-                                         i->getName() + ".addr", RI);
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop,
+                                            ArrayRef<Value*>(InputArgs[i->getArgNo()]),
+                                            "",
+                                            RI);
+      CastInst* BI;
+      if(i->getType()->isPointerTy()) {
+        BI = CastInst::Create(CastInst::IntToPtr,
+                              bufferIn,
+                              i->getType(),
+                              i->getName()+".addr",
+                              RI);
+      }
+      else if(i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn,
+                                    i->getType(),
+                                    i->getName()+".addr",
+                                    RI);
+      }
+      else {
+        BI = CastInst::CreateIntegerCast(bufferIn,
+                                         i->getType(),
+                                         false,
+                                         i->getName()+".addr",
+                                         RI);
       }
       // Replace the argument in Args vector. We would be using the vector as
       // parameters passed to the call
@@ -1195,88 +1124,52 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
   }
   /* Add a call to the generated function of the child node */
   DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
-  //  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
-  //  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
-  //                                  C->getGenFunc()->getName()+".output", RI);
+//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+//                                  C->getGenFunc()->getName()+".output", RI);
   Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
-  DEBUG(errs() << "Type: " << *CGenF->getType() << "\n");
-  CallInst *CI =
-      CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI);
+  DEBUG(errs() << "Type: "
+               << *CGenF->getType()
+               << "\n");
+  CallInst* CI = CallInst::Create(CGenF,
+                                  InputArgs,
+                                  CGenF->getName()+".output",
+                                  RI);
 
   /* Add runtime API calls to push output for each of the streaming outputs */
   // FIXME: Assumption
   // All edges between siblings are streaming edges
-  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the "
-                  "streaming outputs\n");
-  for (unsigned i = 0; i < numOutputs; i++) {
+  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n");
+  for (unsigned i=0; i< numOutputs; i++) {
     // Extract output
-    ExtractValueInst *EI =
-        ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), "", RI);
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i),
+                           "",RI);
     // Convert to i64
-    CastInst *BI;
-    if (EI->getType()->isPointerTy())
-      BI =
-          CastInst::Create(CastInst::PtrToInt, EI,
-                           Type::getInt64Ty(CF_Pipeline->getContext()), "", RI);
+    CastInst* BI;
+    if(EI->getType()->isPointerTy())
+      BI = CastInst::Create(CastInst::PtrToInt,EI,
+                            Type::getInt64Ty(CF_Pipeline->getContext()),
+                            "",
+                            RI);
     else
-      BI = CastInst::CreateIntegerCast(
-          EI, Type::getInt64Ty(CF_Pipeline->getContext()), false, "", RI);
+      BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()),
+                                       false, "", RI);
     // Push to Output buffer
-    Value *bufferOutArgs[] = {OutputArgs[i], BI};
-    CallInst *bufferOut = CallInst::Create(
-        llvm_visc_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI);
+    Value* bufferOutArgs[] = {OutputArgs[i], BI};
+    CallInst::Create(llvm_visc_bufferPush,
+                                           ArrayRef<Value*>(bufferOutArgs, 2),
+                                           "",
+                                           RI);
   }
 
-  // Add loop around the basic block, which exits the loop if isLastInput is
-  // false Pointers to keep the created loop structure
+  // Add loop around the basic block, which exits the loop if isLastInput is false
+  // Pointers to keep the created loop structure
   BasicBlock *EntryBB, *CondBB, *BodyBB;
   Instruction *CondStartI = cast<Instruction>(isLastInputPop);
   Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
   EntryBB = CondStartI->getParent();
 
   addWhileLoop(CondStartI, BodyStartI, RI, Cond);
-  CondBB = CondStartI->getParent();
-  BodyBB = CI->getParent();
-  Instruction *CntI = NULL;
-  CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF);
-
-  // If the node function calls the visc runtime call to get policy, we update
-  // it with the counter information. This means we need to pass an additional
-  // argument to the generated function, that is the iteration number, and then
-  // use it as an argument to the policy_getVersion call
-  if (GetPolicyCI) {
-    CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB);
-    assert(CntI && "Counter instruction not found\n");
-
-    // Create new function type (with additional argument for iteration number)
-    Type *NewRetTy = CGenF->getFunctionType()->getReturnType();
-    std::vector<Type *> NewArgTypes;
-    for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end();
-         ai != ae; ++ai) {
-      NewArgTypes.push_back(ai->getType());
-    }
-    NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
-    FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false);
-    Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false);
-    // At least one (the last) argument exists (we added it)
-    Function::arg_iterator ae = NewCGenF->arg_end();
-    --ae;
-    Argument *CntArg = &*ae;
-    CntArg->setName("iteration");
-    // Replace the old cpu gen func with this one
-    C->addGenFunc(NewCGenF, visc::CPU_TARGET, true);
-
-    // Add counter to the actual parameter list, to create the new call
-    InputArgs.push_back(CntI);
-    CallInst *newCI =
-        CallInst::Create(NewCGenF, InputArgs, NewCGenF->getName() + ".output");
-    ReplaceInstWithInst(CI, newCI);
-
-    // Set second operand of the policy_getVersion call to the last function
-    // argument
-    GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF);
-    GetPolicyCI->setArgOperand(1, CntArg);
-  }
 
   // Return the Function pointer
   DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
@@ -1284,19 +1177,19 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
   return CF_Pipeline;
 }
 
-void CGT_X86::codeGen(DFInternalNode *N) {
+void CGT_X86::codeGen(DFInternalNode* N) {
   // Check if N is root node and its graph is streaming. We do not do codeGen
   // for Root in such a case
-  if (N->isRoot() && N->isChildGraphStreaming())
+  if(N->isRoot() && N->isChildGraphStreaming())
     return;
 
   // Check if clone already exists. If it does, it means we have visited this
   // function before and nothing else needs to be done for this leaf node.
-  //  if(N->getGenFunc() != NULL)
-  //    return;
+//  if(N->getGenFunc() != NULL)
+//    return;
   if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
-    DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
-                 << " : skipping it\n");
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
     return;
   }
 
@@ -1309,10 +1202,9 @@ void CGT_X86::codeGen(DFInternalNode *N) {
   // Only process if all children have a CPU x86 function
   // Otherwise skip to end
   bool codeGen = true;
-  for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-                                  ce = N->getChildGraph()->end();
-       ci != ce; ++ci) {
-    DFNode *C = *ci;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
     // Skip dummy node call
     if (C->isDummyNode())
       continue;
@@ -1327,18 +1219,17 @@ void CGT_X86::codeGen(DFInternalNode *N) {
   }
 
   if (codeGen) {
-    Function *F = N->getFuncPointer();
+    Function* F = N->getFuncPointer();
     // Create of clone of F with no instructions. Only the type is the same as F
     // without the extra arguments.
-    Function *F_X86;
-
+    Function* F_X86;
+  
     // Clone the function, if we are seeing this function for the first time. We
     // only need a clone in terms of type.
     ValueToValueMapTy VMap;
-
+  
     // Create new function with the same type
-    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(),
-                             F->getName(), &M);
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
 
     // Loop over the arguments, copying the names of arguments over.
     Function::arg_iterator dest_iterator = F_X86->arg_begin();
@@ -1351,19 +1242,19 @@ void CGT_X86::codeGen(DFInternalNode *N) {
 
     // Add a basic block to this empty function
     BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
-    ReturnInst *RI = ReturnInst::Create(
-        F_X86->getContext(), UndefValue::get(F_X86->getReturnType()), BB);
+    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
+                                        UndefValue::get(F_X86->getReturnType()), BB);
 
-    // Add Index and Dim arguments except for the root node and the child graph
-    // of parent node is not streaming
-    if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+    // Add Index and Dim arguments except for the root node and the child graph of
+    // parent node is not streaming
+    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
       F_X86 = addIdxDimArgs(F_X86);
 
     BB = &*F_X86->begin();
     RI = cast<ReturnInst>(BB->getTerminator());
-
-    // Add generated function info to DFNode
-    //    N->setGenFunc(F_X86, visc::CPU_TARGET);
+  
+    //Add generated function info to DFNode
+//    N->setGenFunc(F_X86, visc::CPU_TARGET);
     N->addGenFunc(F_X86, visc::CPU_TARGET, true);
 
     // Loop over the arguments, to create the VMap.
@@ -1376,59 +1267,59 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     }
 
     // Iterate over children in topological order
-    for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-                                    ce = N->getChildGraph()->end();
-         ci != ce; ++ci) {
-      DFNode *C = *ci;
+    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+      DFNode* C = *ci;
       // Skip dummy node call
       if (C->isDummyNode())
         continue;
-
+  
       // Create calls to CPU function of child node
       invokeChild_X86(C, F_X86, VMap, RI);
+  
     }
-
+ 
     DEBUG(errs() << "*** Generating epilogue code for the function****\n");
     // Generate code for output bindings
     // Get Exit node
-    DFNode *C = N->getChildGraph()->getExit();
+    DFNode* C = N->getChildGraph()->getExit();
     // Get OutputType of this node
-    StructType *OutTy = N->getOutputType();
+    StructType* OutTy = N->getOutputType();
     Value *retVal = UndefValue::get(F_X86->getReturnType());
     // Find all the input edges to exit node
-    for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
+    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
       DEBUG(errs() << "Output Edge " << i << "\n");
       // Find the incoming edge at the requested input port
-      DFEdge *E = C->getInDFEdgeAt(i);
-
+      DFEdge* E = C->getInDFEdgeAt(i);
+  
       assert(E && "No Binding for output element!");
       // Find the Source DFNode associated with the incoming edge
-      DFNode *SrcDF = E->getSourceDF();
-
-      DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
-                   << "\n");
-
+      DFNode* SrcDF = E->getSourceDF();
+  
+      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+  
       // If Source DFNode is a dummyNode, edge is from parent. Get the
       // argument from argument list of this internal node
-      Value *inputVal;
-      if (SrcDF->isEntryNode()) {
+      Value* inputVal;
+      if(SrcDF->isEntryNode()) {
         inputVal = getArgumentAt(F_X86, i);
-        DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
-      } else {
+        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+      }
+      else {
         // edge is from a internal node
         // Check - code should already be generated for this source dfnode
-        assert(OutputMap.count(SrcDF) &&
-               "Source node call not found. Dependency violation!");
-
+        assert(OutputMap.count(SrcDF)
+               && "Source node call not found. Dependency violation!");
+  
         // Find Output Value associated with the Source DFNode using OutputMap
-        Value *CI = OutputMap[SrcDF];
-
+        Value* CI = OutputMap[SrcDF];
+  
         // Extract element at source position from this call instruction
         std::vector<unsigned> IndexList;
         IndexList.push_back(E->getSourcePosition());
-        DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI
-                     << "\n");
-        ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
+        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                               "",RI);
         inputVal = EI;
       }
       std::vector<unsigned> IdxList;
@@ -1437,8 +1328,9 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     }
     DEBUG(errs() << "Extracted all\n");
     retVal->setName("output");
-    ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
     ReplaceInstWithInst(RI, newRI);
+
   }
 
   //-------------------------------------------------------------------------//
@@ -1455,22 +1347,24 @@ void CGT_X86::codeGen(DFInternalNode *N) {
   bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
   bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
 
-  DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
-               << N->getTag() << "\n");
-  DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
+	DEBUG(errs() << "Before editing\n");
+  DEBUG(errs() << "Node: " << N->getFuncPointer()->getName()
+                     << " with tag " << N->getTag() << "\n");
+  DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n");
   DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n");
-  DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
+  DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n");
   DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
 
+
   if (N->getTag() == visc::None) {
     // No code is available for this node. This (usually) means that this
     // node is a node that
     // - from the accelerator backends has been mapped to an intermediate
     // node, and thus they have not produced a genFunc
-    // - a child node had no CPU hint, thus no code gen for CPU could
+    // - a child node had no CPU hint, thus no code gen for CPU could 
     // take place
     DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node "
-                 << N->getFuncPointer()->getName() << "\n");
+           << N->getFuncPointer()->getName() << "\n");
   } else if (viscUtils::isSingleTargetTag(N->getTag())) {
     // There is a single version for this node according to code gen hints.
     // Therefore, we do not need to check the policy, we simply use the
@@ -1478,226 +1372,55 @@ void CGT_X86::codeGen(DFInternalNode *N) {
 
     // Sanity check - to be removed TODO
     switch (N->getTag()) {
-    case visc::CPU_TARGET:
-      assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
-      assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
-      assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
-      assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-      break;
-    case visc::GPU_TARGET:
-      assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
-      assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
-      assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
-      assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
-      break;
-    default:
-      assert(false && "Unreachable: we checked that tag was single target!\n");
-      break;
-    }
-
-    // If device abstraction is enabled, then we may need to edit the node
-    // function. In case this is a GPU or SPIR gen func, we issue a call to
-    // the runtime that waits for the device to be available
-    if (DeviceAbstraction) {
-      Function *NodeGenFunc = NULL;
-      switch (N->getTag()) {
+      case visc::CPU_TARGET:
+        assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
+        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+        break;
       case visc::GPU_TARGET:
-        NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
+        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+        assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
+        assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
         break;
       default:
+        assert(false && "Unreachable: we checked that tag was single target!\n");
         break;
-      }
-
-      if (NodeGenFunc) {
-        // If we found a function to edit, we add the call to the runtime as
-        // its first statement
-        BasicBlock *BB = &*NodeGenFunc->begin();
-        std::vector<Value *> Args; // TODO: add the device type as argument?
-        FunctionCallee RTF = M.getOrInsertFunction(
-            "llvm_visc_deviceAbstraction_waitOnDeviceStatus",
-            runtimeModule
-                ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")
-                ->getFunctionType());
-        CallInst *RTFInst =
-            CallInst::Create(RTF, Args, "", BB->getFirstNonPHI());
-      }
-    }
-
-    Function *Ftmp = N->getGenFuncForTarget(N->getTag());
-    N->removeGenFuncForTarget(visc::GPU_TARGET);
-    N->setTag(visc::None);
-    N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
-    N->setTag(visc::CPU_TARGET);
-
-    // Sanity checks - to be removed TODO
-    CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-    GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-
-    CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-    GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-
-    DEBUG(errs() << "After editing\n");
-    DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
-                 << N->getTag() << "\n");
-    DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
-    DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n");
-    DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
-    DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
-
-    DEBUG(errs() << "Node Name (for policy) : "
-                 << N->getFuncPointer()->getName() << "\n");
-
-    Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-    Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-
-    bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-    bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-
-    // These assertions express what we can support with the current runtime.
-    // Code generation works the same way even for other target combinations.
-    // For now, we want either CPU and GPU, or CPU and SPIR
-    assert((CF && (GF || !GF)) && "Invalid target selection\n");
-    assert((CFx86 && (GFx86 || !GFx86)) &&
-           "Generated functions without appropriate x86 wrapper\n");
-
-    FunctionType *FT = CF->getFunctionType();
-    if (GF)
-      assert(FT == GF->getFunctionType() &&
-             "Type mismatch between generated functions for GPU and CPU "
-             "targets.\n");
-
-    // Code generation of wrapper function
-    Function *F_wrapper;
-    ValueToValueMapTy VMap;
-    F_wrapper =
-        Function::Create(FT, CF->getLinkage(), CF->getName() + "_wrapper", &M);
-
-    // Copy argument names over
-    Function::arg_iterator dest_iterator = F_wrapper->arg_begin();
-    for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
-         ++i) {
-      dest_iterator->setName(i->getName());
-      VMap[&*i] = &*dest_iterator;
-      ++dest_iterator;
-    }
-    // Gather all arguments of wrapper in a vector, to prepare the call to
-    // the individual gen functions
-    std::vector<Value *> GenFuncCallArgs;
-    for (Function::arg_iterator i = F_wrapper->arg_begin(),
-                                e = F_wrapper->arg_end();
-         i != e; ++i) {
-      GenFuncCallArgs.push_back(&*i);
-    }
-
-    BasicBlock *BBcurrent, *BBtrue, *BBfalse;
-
-    BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper);
-
-    StringRef FName = N->getFuncPointer()->getName();
-    size_t nameSize = FName.size() + 1;
-    std::vector<Constant *> NameV;
-    for (char c : FName) {
-      NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c));
-    }
-    NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0'));
-    ArrayType *NameType =
-        ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize);
-    AllocaInst *AI = new AllocaInst(NameType, 0, nullptr, "", BBcurrent);
-    Constant *NameConst = ConstantArray::get(NameType, NameV);
-    StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent);
-    CastInst *BI = BitCastInst::CreatePointerCast(
-        AI, Type::getInt8PtrTy(M.getContext()), "", BBcurrent);
-    std::vector<Value *> Args;
-    Args.push_back(BI);
-    Args.push_back(
-        ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true));
-    FunctionCallee RTF = M.getOrInsertFunction(
-        "llvm_visc_policy_getVersion",
-        runtimeModule->getFunction("llvm_visc_policy_getVersion")
-            ->getFunctionType());
-    CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent);
-
-    ConstantInt *CmpConst =
-        ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true);
-    CmpInst *CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-                                    RTFInst, CmpConst, "", BBcurrent);
-
-    BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper);
-    BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper);
-    BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
-
-    CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue);
-    ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
-
-    // Switch basic block pointers
-    BBcurrent = BBfalse;
-    if (GF) {
-      // We have a GPU version. Generate policy check and call
-      CmpConst = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true);
-      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, RTFInst,
-                             CmpConst, "", BBcurrent);
-      BBtrue = BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper);
-      BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper);
-      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
-
-      GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue);
-      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
-
-      if (DeviceAbstraction) {
-        // Prepare arguments and function for call to wait for device runtime
-        // call
-        std::vector<Value *> Args; // TODO: add the device type as argument?
-        FunctionCallee RTF = M.getOrInsertFunction(
-            "llvm_visc_deviceAbstraction_waitOnDeviceStatus",
-            runtimeModule
-                ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")
-                ->getFunctionType());
-        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
-      }
     }
+		
+    N->addGenFunc(N->getGenFuncForTarget(N->getTag()),
+				visc::CPU_TARGET,
+				true);
+		N->removeGenFuncForTarget(visc::GPU_TARGET);
+		N->setTag(visc::CPU_TARGET);
+
+		// Sanity checks - to be removed TODO
+		CF = N->getGenFuncForTarget(visc::CPU_TARGET);
+		GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+
+		CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
+		GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+
+		DEBUG(errs() << "After editing\n");
+		DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() 
+                        << " with tag " << N->getTag() << "\n");
+		DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n");
+		DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n");
+		DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n");
+		DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
+
+   } 
+   else {
+    assert(false && "Multiple tags unsupported!");
+   }
 
-    // Switch basic block pointers
-    BBcurrent = BBfalse;
-    // if (SF) {
-    // We have a GPU version. Generate policy check and call
-    //   CmpConst =
-    //    ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
-    // CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-    //                     RTFInst, CmpConst, "", BBcurrent);
-    // BBtrue =  BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
-    // BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
-    // BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
-
-    // GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
-    // RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
-
-    // if (DeviceAbstraction) {
-    // Prepare arguments and function for call to wait for device runtime call
-    //  std::vector<Value *> Args; // TODO: add the device type as argument?
-    // FunctionCallee RTF =
-    //  M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
-    // runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType());
-    // CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
-    // }
-    // }
-
-    RI = ReturnInst::Create(M.getContext(),
-                            UndefValue::get(FT->getReturnType()), BBfalse);
-
-    // Now, make the node cpu gen func to be this one
-    // Remove all other versions and update the tag
-    N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
-    N->removeGenFuncForTarget(visc::GPU_TARGET);
-    N->setTag(visc::CPU_TARGET);
-
-    // assert(false && "got to the point where we have to combine\n");
-  }
 }
 
 // Code generation for leaf nodes
-void CGT_X86::codeGen(DFLeafNode *N) {
+void CGT_X86::codeGen(DFLeafNode* N) {
   // Skip code generation if it is a dummy node
-  if (N->isDummyNode()) {
+  if(N->isDummyNode()) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
@@ -1714,24 +1437,21 @@ void CGT_X86::codeGen(DFLeafNode *N) {
 
   // Check if clone already exists. If it does, it means we have visited this
   // function before and nothing else needs to be done for this leaf node.
-  //  if(N->getGenFunc() != NULL)
-  //    return;
+//  if(N->getGenFunc() != NULL)
+//    return;
 
   if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
-    DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
-                 << " : skipping it\n");
-
-    DEBUG(errs() << "Check for cudnn or promise hint for node "
-                 << N->getFuncPointer()->getName() << "\n");
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
 
     switch (N->getTag()) {
-    case visc::GPU_TARGET:
-      // A leaf node should not have an x86 function for GPU
-      // by design of DFG2LLVM_NVPTX backend
-      assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-      break;
-    default:
-      break;
+       case visc::GPU_TARGET:
+         // A leaf node should not have an x86 function for GPU
+         // by design of DFG2LLVM_NVPTX backend
+         assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "Leaf node not expected to have GPU GenFunc");
+         break;
+       default:
+         break;
     }
 
     return;
@@ -1741,7 +1461,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
          "Error: Visiting a node for which code already generated\n");
 
   std::vector<IntrinsicInst *> IItoRemove;
-  std::vector<std::pair<IntrinsicInst *, Value *>> IItoReplace;
+  std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
   BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
 
   // Get the function associated woth the dataflow node
@@ -1757,61 +1477,61 @@ void CGT_X86::codeGen(DFLeafNode *N) {
 
   // Add the new argument to the argument list. Add arguments only if the cild
   // graph of parent node is not streaming
-  if (!N->getParent()->isChildGraphStreaming())
+  if(!N->getParent()->isChildGraphStreaming())
     F_X86 = addIdxDimArgs(F_X86);
 
   // Add generated function info to DFNode
-  //  N->setGenFunc(F_X86, visc::CPU_TARGET);
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
   N->addGenFunc(F_X86, visc::CPU_TARGET, true);
 
   // Go through the arguments, and any pointer arguments with in attribute need
   // to have x86_argument_ptr call to get the x86 ptr of the argument
   // Insert these calls in a new BB which would dominate all other BBs
   // Create new BB
-  BasicBlock *EntryBB = &*F_X86->begin();
-  BasicBlock *BB =
-      BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
-  BranchInst *Terminator = BranchInst::Create(EntryBB, BB);
+  BasicBlock* EntryBB = &*F_X86->begin();
+  BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
+  BranchInst* Terminator = BranchInst::Create(EntryBB, BB);
   // Insert calls
-  for (Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
-       ai != ae; ++ai) {
-    if (F_X86->getAttributes().hasAttribute(ai->getArgNo() + 1,
-                                            Attribute::In)) {
-      assert(ai->getType()->isPointerTy() &&
-             "Only pointer arguments can have visc in/out attributes ");
+  for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
+        ai != ae; ++ai) {
+    if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) {
+      assert(ai->getType()->isPointerTy()
+          && "Only pointer arguments can have visc in/out attributes ");
       Function::arg_iterator aiNext = ai;
       ++aiNext;
-      Argument *size = &*aiNext;
-      assert(size->getType() == Type::getInt64Ty(M.getContext()) &&
-             "Next argument after a pointer should be an i64 type");
-      CastInst *BI = BitCastInst::CreatePointerCast(
-          &*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr",
-          Terminator);
-      Value *ArgPtrCallArgs[] = {BI, size};
+      Argument* size = &*aiNext;
+      assert(size->getType() == Type::getInt64Ty(M.getContext())
+          && "Next argument after a pointer should be an i64 type");
+      CastInst* BI = BitCastInst::CreatePointerCast(&*ai,
+                                                    Type::getInt8PtrTy(M.getContext()),
+                                                    ai->getName()+".i8ptr",
+                                                    Terminator);
+      Value* ArgPtrCallArgs[] = {BI, size};
       CallInst::Create(llvm_visc_x86_argument_ptr,
-                       ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator);
+                                              ArrayRef<Value*>(ArgPtrCallArgs, 2),
+                                              "",
+                                              Terminator);
+
     }
   }
-  DEBUG(errs() << *BB << "\n");
+  errs() << *BB << "\n";
 
   // Go through all the instructions
   for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
     Instruction *I = &(*i);
     DEBUG(errs() << *I << "\n");
     // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) &&
-           "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) &&
-           "VISC graph intrinsic within a leaf dataflow node!");
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
 
     if (BuildDFG::isViscQueryIntrinsic(I)) {
-      IntrinsicInst *II = cast<IntrinsicInst>(I);
-      IntrinsicInst *ArgII;
-      DFNode *ArgDFNode;
+      IntrinsicInst* II = cast<IntrinsicInst>(I);
+      IntrinsicInst* ArgII;
+      DFNode* ArgDFNode;
 
       /***********************************************************************
-       *                        Handle VISC Query intrinsics                  *
-       ***********************************************************************/
+      *                        Handle VISC Query intrinsics                  *
+      ***********************************************************************/
       switch (II->getIntrinsicID()) {
       /**************************** llvm.visc.getNode() *******************/
       case Intrinsic::visc_getNode: {
@@ -1840,9 +1560,8 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         // get the appropriate field
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim();
-        IntegerType *IntTy = Type::getInt32Ty(M.getContext());
-        ConstantInt *numOfDimConstant =
-            ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
+        IntegerType* IntTy = Type::getInt32Ty(M.getContext());
+        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
 
         II->replaceAllUsesWith(numOfDimConstant);
         IItoRemove.push_back(II);
@@ -1858,46 +1577,46 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         // The dfnode argument should be an ancestor of this leaf node or
         // the leaf node itself
         int parentLevel = N->getAncestorHops(ArgDFNode);
-        assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) &&
-               "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
+               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
 
         // Get specified dimension
         // (dim = 0) => x
         // (dim = 1) => y
         // (dim = 2) => z
-        int dim =
-            (int)(II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x);
-        assert((dim >= 0) && (dim < 3) &&
-               "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic "
-               "ID!");
+        int dim = (int) (II->getIntrinsicID() -
+                         Intrinsic::visc_getNodeInstanceID_x);
+        assert((dim >= 0) && (dim < 3)
+               && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!");
 
         // For immediate ancestor, use the extra argument introduced in
         // F_X86
         int numParamsF = F->getFunctionType()->getNumParams();
         int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
-        assert(
-            (numParamsF_X86 - numParamsF == 6) &&
-            "Difference of arguments between function and its clone is not 6!");
+        assert((numParamsF_X86 - numParamsF == 6)
+               && "Difference of arguments between function and its clone is not 6!");
 
-        if (parentLevel == 0) {
+        if(parentLevel == 0) {
           // Case when the query is for this node itself
-          unsigned offset = 3 + (3 - dim);
+          unsigned offset = 3 + (3-dim);
           // Traverse argument list of F_X86 in reverse order to find the
           // correct index or dim argument.
-          Argument *indexVal = getArgumentFromEnd(F_X86, offset);
+          Argument* indexVal = getArgumentFromEnd(F_X86, offset);
           assert(indexVal && "Index argument not found. Invalid offset!");
 
           DEBUG(errs() << *II << " replaced with " << *indexVal << "\n");
 
           II->replaceAllUsesWith(indexVal);
           IItoRemove.push_back(II);
-        } else {
+        }
+        else {
           // Case when query is for an ancestor
-          Value *args[] = {
-              ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
-              ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
-          CallInst *CI = CallInst::Create(llvm_visc_x86_getDimInstance,
-                                          ArrayRef<Value *>(args, 2),
+          Value* args[] = {
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
+          };
+          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance,
+                                          ArrayRef<Value*>(args, 2),
                                           "nodeInstanceID", II);
           DEBUG(errs() << *II << " replaced with " << *CI << "\n");
           II->replaceAllUsesWith(CI);
@@ -1916,46 +1635,46 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         // The dfnode argument should be an ancestor of this leaf node or
         // the leaf node itself
         int parentLevel = N->getAncestorHops(ArgDFNode);
-        assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) &&
-               "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
+               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
 
         // Get specified dimension
         // (dim = 0) => x
         // (dim = 1) => y
         // (dim = 2) => z
-        int dim =
-            (int)(II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x);
-        assert((dim >= 0) && (dim < 3) &&
-               "Invalid dimension for getNumNodeInstances_[xyz]. Check "
-               "Intrinsic ID!");
+        int dim = (int) (II->getIntrinsicID() -
+                         Intrinsic::visc_getNumNodeInstances_x);
+        assert((dim >= 0) && (dim < 3)
+               && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!");
 
         // For immediate ancestor, use the extra argument introduced in
         // F_X86
         int numParamsF = F->getFunctionType()->getNumParams();
         int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
-        assert(
-            (numParamsF_X86 - numParamsF == 6) &&
-            "Difference of arguments between function and its clone is not 6!");
+        assert((numParamsF_X86 - numParamsF == 6)
+               && "Difference of arguments between function and its clone is not 6!");
 
-        if (parentLevel == 0) {
+        if(parentLevel == 0) {
           // Case when the query is for this node itself
           unsigned offset = 3 - dim;
           // Traverse argument list of F_X86 in reverse order to find the
           // correct index or dim argument.
-          Argument *limitVal = getArgumentFromEnd(F_X86, offset);
+          Argument* limitVal = getArgumentFromEnd(F_X86, offset);
           assert(limitVal && "Limit argument not found. Invalid offset!");
 
-          DEBUG(errs() << *II << " replaced with " << *limitVal << "\n");
+          DEBUG(errs() << *II << " replaced with " <<  *limitVal << "\n");
 
           II->replaceAllUsesWith(limitVal);
           IItoRemove.push_back(II);
-        } else {
+        }
+        else {
           // Case when query is from the ancestor
-          Value *args[] = {
-              ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
-              ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
-          CallInst *CI = CallInst::Create(llvm_visc_x86_getDimLimit,
-                                          ArrayRef<Value *>(args, 2),
+          Value* args[] = {
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
+          };
+          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit,
+                                          ArrayRef<Value*>(args, 2),
                                           "numNodeInstances", II);
           DEBUG(errs() << *II << " replaced with " << *CI << "\n");
           II->replaceAllUsesWith(CI);
@@ -1965,26 +1684,20 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         break;
       }
       default:
-        DEBUG(errs() << "Found unknown intrinsic with ID = "
-                     << II->getIntrinsicID() << "\n");
+        DEBUG(errs() << "Found unknown intrinsic with ID = " <<
+              II->getIntrinsicID() << "\n");
         assert(false && "Unknown VISC Intrinsic!");
         break;
       }
 
     } else {
-      // TODO: how to handle address space qualifiers in load/store
     }
+
   }
 
-  // TODO:
-  // When to replace the uses?
-  // In which order is it safe to replace the instructions in
-  // IItoReplace?
-  // Probably in the reverse order in the vectors
-  // It is a good idea to have them in one vector and chech the type
-  // using dyn_cast in order to determine if we replace with inst or value
 
-  // TODO: maybe leave these instructions to be removed by a later DCE pass
+
+  // Remove them in reverse order
   for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
        i != IItoRemove.end(); ++i) {
     (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType()));
@@ -1997,7 +1710,8 @@ void CGT_X86::codeGen(DFLeafNode *N) {
 } // End of namespace
 
 char DFG2LLVM_X86::ID = 0;
-static RegisterPass<DFG2LLVM_X86>
-    X("dfg2llvm-x86", "Dataflow Graph to LLVM for X86 backend",
-      false /* does not modify the CFG */,
-      true /* transformation, not just analysis */);
+static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86",
+                                    "Dataflow Graph to LLVM for X86 backend",
+                                    false /* does not modify the CFG */,
+                                    true /* transformation, not just analysis */);
+
diff --git a/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt b/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt
deleted file mode 100644
index 7069470a1a6f8b1a49eea2824f27204ebdf3fb26..0000000000000000000000000000000000000000
--- a/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-10
-10 15 10 16 15 30 15 25 20 15
diff --git a/hpvm/projects/visc-rt/device_abstraction.h b/hpvm/projects/visc-rt/device_abstraction.h
deleted file mode 100644
index 7e77d100deb6b23b6ed9ca994796cd1cb108b0d4..0000000000000000000000000000000000000000
--- a/hpvm/projects/visc-rt/device_abstraction.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef __DEVICE_ABSTRACTION__
-#define __DEVICE_ABSTRACTION__
-
-#include <fstream>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <thread>
-#include <time.h>
-#include <vector>
-
-#define MIN_INTERVAL 2
-#define MAX_INTERVAL 8
-#define NUM_INTERVALS 10
-
-// Device status variable: true if the device is available for use
-volatile bool deviceStatus = true;
-// Intervals at which to change the device status
-std::vector<unsigned> Intervals;
-
-// Set to true when program execution ends and so we can end the device
-// simulation
-volatile bool executionEnd = false;
-
-void initializeDeviceStatusIntervals() {
-
-  unsigned sz = 0;
-  unsigned tmp = 0;
-
-  const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/"
-                   "deviceStatusSwitchIntervals.txt";
-  std::ifstream infile;
-  infile.open(fn);
-  if (!infile.is_open()) {
-    std::cout << "Failed to open " << fn << " for reading\n";
-    return;
-  }
-  infile >> sz;
-
-  if (sz) {
-    // We have data. Read them into the vector
-    for (unsigned i = 0; i < sz; i++) {
-      infile >> tmp;
-      Intervals.push_back(tmp);
-    }
-    infile.close();
-  } else {
-    // We have no data. Create random data and write them into the file
-    infile.close();
-    std::ofstream outfile;
-    outfile.open(fn);
-    if (!outfile.is_open()) {
-      std::cout << "Failed to open " << fn << " for writing\n";
-      return;
-    }
-    sz = 1 + rand() % NUM_INTERVALS;
-    outfile << sz;
-    for (unsigned i = 0; i < sz; i++) {
-      Intervals.push_back(MIN_INTERVAL +
-                          rand() % (MAX_INTERVAL - MIN_INTERVAL));
-      outfile << Intervals[i];
-    }
-    outfile.close();
-  }
-
-  return;
-}
-
-void updateDeviceStatus() {
-
-  unsigned i = 0;
-  while (!executionEnd) {
-    std::this_thread::sleep_for(std::chrono::seconds(Intervals[i]));
-    deviceStatus = !deviceStatus;
-    std::cout << "Changed device status to " << deviceStatus << "\n";
-    i = (i + 1) % Intervals.size();
-  }
-}
-
-#endif // __DEVICE_ABSTRACTION__
diff --git a/hpvm/projects/visc-rt/makefile b/hpvm/projects/visc-rt/makefile
deleted file mode 100644
index adcc6323356d2537eca6ed653cad6d17a1d1ef0e..0000000000000000000000000000000000000000
--- a/hpvm/projects/visc-rt/makefile
+++ /dev/null
@@ -1,29 +0,0 @@
-#LLVM_SRC_ROOT = 
-LLVM_BUILD_ROOT = ${LLVM_SRC_ROOT}/../build/
-
-CUDA_INC_PATH = /software/cuda-9.1/include/CL/
-
-
-ifeq ($(NUM_CORES),)
-  NUM_CORES=1
-endif
-
-CPP_FLAGS = -I$(LLVM_SRC_ROOT)/include -I$(LLVM_BUILD_ROOT)/include -I$(CUDA_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
-TARGET:=visc-rt
-
-LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang
-LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++
-
-OPTS =
-
-ifeq ($(DEBUG),1)
-  OPTS+=-DDEBUG_BUILD
-endif
-
-all: $(TARGET:%=%.ll)
-
-$(TARGET:%=%.ll):%.ll:%.cpp %.h
-	$(LLVM_CXX) -DNUM_CORES=$(NUM_CORES) -O3 -S -emit-llvm $(CPP_FLAGS) $(OPTS) $< -o $@
-
-clean :
-	rm -f $(TARGET).ll
diff --git a/hpvm/projects/visc-rt/policy.h b/hpvm/projects/visc-rt/policy.h
deleted file mode 100644
index d50e65868b376bfbcc3d4bd00d4919db677722b8..0000000000000000000000000000000000000000
--- a/hpvm/projects/visc-rt/policy.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef __POLICY__
-#define __POLICY__
-
-#include "device_abstraction.h"
-#include <string>
-
-/************************* Policies *************************************/
-class Policy {
-public:
-  virtual int getVersion(const char *, int64_t) = 0;
-  virtual ~Policy(){};
-};
-
-class ConstPolicy : public Policy {
-public:
-  ConstPolicy(int deviceID) : deviceID(deviceID) {}
-
-  int getVersion(const char *, int64_t) override { return deviceID; }
-
-private:
-  int deviceID;
-};
-
-class NodePolicy : public Policy {
-  virtual int getVersion(const char *name, int64_t it) override {
-    std::string s(name);
-    // std::string NodeNames[1] = {
-    // "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" };
-    std::string NodeNames[] = {
-        "WrapperGaussianSmoothing_cloned",
-        "WrapperlaplacianEstimate_cloned",
-        "WrapperComputeZeroCrossings_cloned",
-        "WrapperComputeGradient_cloned",
-        "WrapperComputeMaxGradient_cloned",
-        "WrapperRejectZeroCrossings_cloned",
-    };
-    // if (!s.compare(NodeNames[4])) {
-    //  std::cout << s << ": CPU" << "\n";
-    //  return 0;
-    //}
-    return 2;
-  }
-};
-
-class IterationPolicy : public Policy {
-  virtual int getVersion(const char *name, int64_t it) override {
-    if ((it % 10 == 0) || (it % 10 == 1))
-      return 0;
-    else
-      return 2;
-  }
-};
-
-class DeviceStatusPolicy : public Policy {
-  virtual int getVersion(const char *name, int64_t it) override {
-    if (deviceStatus) {
-      // std::cout << "Returning GPU\n";
-      return 2;
-    } else {
-      // std::cout << "Returning CPU\n";
-      return 0;
-    }
-  }
-};
-
-/* ------------------------------------------------------------------------- */
-// Added for the CFAR interactive policy demo.
-
-class InteractivePolicy : public Policy {
-private:
-  // 0 :for CPU, 1 for GPU, 2 for Vector
-  unsigned int userTargetDeviceChoice;
-  // Used to end thread execution
-  bool end;
-  // Thread that will update userTargetDeviceChoice
-  std::thread userTargetDeviceChoiceThread;
-  // Thread function
-  void updateUserTargetChoice() {
-    while (!end) {
-      std::cout << "Select target device (0 for CPU, 1 fpr GPU): ";
-      std::cin >> userTargetDeviceChoice;
-      if (userTargetDeviceChoice > 1) {
-        std::cout << "Invalid target device. Selecting GPU instead.\n";
-        userTargetDeviceChoice = 1;
-      }
-    }
-  }
-
-public:
-  // Inherited method, erquired for every policy object
-  virtual int getVersion(const char *name, int64_t it) {
-    return userTargetDeviceChoice;
-  }
-
-  InteractivePolicy() {
-    userTargetDeviceChoice = 1;
-    end = false;
-    userTargetDeviceChoiceThread =
-        std::thread(&InteractivePolicy::updateUserTargetChoice, this);
-  }
-
-  ~InteractivePolicy() {
-    end = true;
-    userTargetDeviceChoiceThread.join();
-  }
-};
-
-#endif // __POLICY__
diff --git a/hpvm/projects/visc-rt/visc-rt.cpp b/hpvm/projects/visc-rt/visc-rt.cpp
index 53d3b516f22b59857b1a17aecba32a6b723998f0..df5b1b80f7ae71ca49f461a50f36f81064028ef9 100644
--- a/hpvm/projects/visc-rt/visc-rt.cpp
+++ b/hpvm/projects/visc-rt/visc-rt.cpp
@@ -52,7 +52,6 @@ cl_context globalOCLContext;
 cl_device_id *clDevices;
 cl_command_queue globalCommandQue;
 
-Policy *policy = NULL;
 MemTracker MTracker;
 vector<DFGDepth> DStack;
 // Mutex to prevent concurrent access by multiple thereads in pipeline
@@ -69,55 +68,6 @@ static inline void checkErr(cl_int err, cl_int success, const char *name) {
   }
 }
 
-/************************* Policies *************************************/
-void llvm_visc_policy_init() {
-  cout << "Initializing policy object ...\n";
-  //  policy = new NodePolicy();
-  //  policy = new IterationPolicy();
-  //  policy = new DeviceStatusPolicy();
-  // policy = new InteractivePolicy();
-  policy = new ConstPolicy(0);
-  cout << "DONE: Initializing policy object.\n";
-}
-
-void llvm_visc_policy_clear() {
-  if (policy)
-    free(policy);
-}
-
-int llvm_visc_policy_getVersion(const char *name, int64_t i) {
-  return policy->getVersion(name, i);
-}
-
-/******************** Device Abstraction ********************************/
-std::thread deviceStatusThread;
-
-void llvm_visc_deviceAbstraction_start() {
-  cout << "Starting device status simulation ...\n";
-  // Initialize vector with points where ti switch device status
-  initializeDeviceStatusIntervals();
-  // Create a thread that performs the changes
-  deviceStatusThread = std::thread(updateDeviceStatus);
-  cout << "Started device status simulation thread ...\n";
-  return;
-}
-
-void llvm_visc_deviceAbstraction_end() {
-  cout << "Ending device status simulation thread ...\n";
-  // Set the variable that allows the thread to know that execution has ended
-  executionEnd = true;
-  // Wait for the thread that manages device status to terminate
-  deviceStatusThread.join();
-  cout << "Ended device status simulation.\n";
-  return;
-}
-
-void llvm_visc_deviceAbstraction_waitOnDeviceStatus() {
-  while (!deviceStatus) {
-  };
-  return;
-}
-
 /************************* Depth Stack Routines ***************************/
 
 void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
diff --git a/hpvm/projects/visc-rt/visc-rt.h b/hpvm/projects/visc-rt/visc-rt.h
index 3ad315768bf90584a68c1d620ac68936e62a17f0..d9d946f1da14245f8cde426e7b5ea92f791537f5 100644
--- a/hpvm/projects/visc-rt/visc-rt.h
+++ b/hpvm/projects/visc-rt/visc-rt.h
@@ -15,8 +15,6 @@
 
 #include "../../include/SupportVISC/VISCHint.h"
 #include "../../include/SupportVISC/VISCTimer.h"
-#include "device_abstraction.h"
-#include "policy.h"
 
 #ifndef DEBUG_BUILD
 #define DEBUG(s)                                                               \
@@ -29,17 +27,6 @@ using namespace std;
 
 extern "C" {
 
-/************************* Policies *************************************/
-
-void llvm_visc_policy_init();
-void llvm_visc_policy_clear();
-int llvm_visc_policy_getVersion(const char *, int64_t);
-
-/******************** Device Abstraction ********************************/
-void llvm_visc_deviceAbstraction_start();
-void llvm_visc_deviceAbstraction_end();
-void llvm_visc_deviceAbstraction_waitOnDeviceStatus();
-
 /********************* DFG Depth Stack **********************************/
 class DFGDepth {
 private:
diff --git a/hpvm/test/README.md b/hpvm/test/README.md
index b391bd2186b696685adb2645c288c01ea7006850..e709ef04195c90e0f91c2a4b4a4b1d2f0b716d1f 100644
--- a/hpvm/test/README.md
+++ b/hpvm/test/README.md
@@ -4,9 +4,7 @@ Tests are provided, along with a template Makefile for user projects.
 ## Parboil
 Several tests from the [parboil suite](http://impact.crhc.illinois.edu/parboil/parboil.aspx) have been ported to HPVM.
 To run one of these tests, navigate to its directory under `parboil/benchmarks/`.
-Tests may be built for the cpu or gpu with hpvm, and openCL versions are provided for comparison.
-Check under the `src/` directory in each benchmark to see which versions are available,
-denoted by the names of the subdirectories.
+Tests may be built for the cpu or gpu with hpvm.
 ```
 # sgemm example
 cd parboil/benchmarks/sgemm
@@ -16,9 +14,6 @@ make run TARGET=seq VERSION=visc
 # HPVM gpu
 make TARGET=gpu VERSION=visc
 make run TARGET=gpu VERSION=visc
-# openCL
-make VERSION=opencl_base
-make run VERSION=opencl_base
 ```
 
 ## Cava
diff --git a/hpvm/test/hpvm-cava/Makefile b/hpvm/test/hpvm-cava/Makefile
index 24027cb1a9f78599f2a41a5c1f960701c112395e..0054af8c4d9cc39c21b00e73a5b53c8ac2a089b8 100644
--- a/hpvm/test/hpvm-cava/Makefile
+++ b/hpvm/test/hpvm-cava/Makefile
@@ -52,7 +52,7 @@ OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
 LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
 
-VISC_RT_PATH = $(VISC_BUILD_DIR)/tools/hpvm/projects/visc-rt
+VISC_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/visc-rt
 
 VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc
 
@@ -82,9 +82,7 @@ OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
 TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
 KERNEL = $(TEST_OBJS).kernels.ll
 
-ifeq ($(TARGET),seq)
-else
-  KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll
+ifeq ($(TARGET),gpu)
   KERNEL_OCL = $(TEST_OBJS).kernels.cl
 endif
 
@@ -102,7 +100,7 @@ default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(EXE)
 #default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(SPIR_ASSEMBLY) $(AOC_CL) $(AOCL_ASSEMBLY) $(EXE)
 
 $(KERNEL_OCL) : $(KERNEL)
-	$(OCLBE) --debug $< -o $@
+	$(OCLBE) $< -o $@
 
 $(EXE) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
@@ -111,7 +109,7 @@ $(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
 $(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
-	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+	$(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -123,6 +121,6 @@ $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c
 	$(CC) $(CFLAGS) -emit-llvm -S -o $@ $<
 
 $(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
-	$(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@
+	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
 
 ## END HPVM MAKEFILE
diff --git a/hpvm/test/hpvm-cava/Makefile.config b/hpvm/test/hpvm-cava/Makefile.config
index ffb2942911313421ce9b9186a392578d7bcaf4c7..1bdb62dec493fc63e581ae8204ea736c45ed5f7d 100644
--- a/hpvm/test/hpvm-cava/Makefile.config
+++ b/hpvm/test/hpvm-cava/Makefile.config
@@ -3,23 +3,21 @@ CUDA_LIB_PATH=$(CUDA_PATH)/lib64
 OPENCL_PATH=/software/cuda-9.1
 OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64
 
-LLVM_SRC_ROOT=/home/aejjeh/work_dir/hpvm-release/hpvm/llvm/
-# NOTE: You may need to configure this based on your root path.
-VISC_SRC_ROOT=$(LLVM_SRC_ROOT)
+LLVM_SRC_ROOT=/home/aejjeh/work_dir/hpvm-reorg-9-temp/hpvm/llvm/
 
-VISC_BUILD_DIR =$(VISC_SRC_ROOT)/../build
-CC = $(VISC_BUILD_DIR)/bin/clang
-PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
-OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe
+LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build
+CC = $(LLVM_BUILD_DIR)/bin/clang
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include
+OCLBE = $(LLVM_BUILD_DIR)/bin/llvm-cbe
 
-CXX = $(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
+CXX = $(LLVM_BUILD_DIR)/bin/clang++
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include
 
-LINKER = $(VISC_BUILD_DIR)/bin/clang++
+LINKER = $(LLVM_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH)
 
-LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib
-LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin
+LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib
+LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin
 
 OPT = $(LLVM_BIN_PATH)/opt
 LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link
diff --git a/hpvm/test/hpvm-cava/Makefile.config.example b/hpvm/test/hpvm-cava/Makefile.config.example
index 269f0b7df273c958f0cd20a0f935716a329e00ae..2627ca508f17acb96c858bf4473eed4d89ebec20 100644
--- a/hpvm/test/hpvm-cava/Makefile.config.example
+++ b/hpvm/test/hpvm-cava/Makefile.config.example
@@ -1,23 +1,21 @@
-CUDA_PATH=/usr/local/cuda 
-CUDA_LIB_PATH=/usr/local/cuda/lib64
-OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64
-OPENCL_LIB_PATH=$(OPENCL_PATH)/lib
+CUDA_PATH=/software/cuda-9.1
+CUDA_LIB_PATH=$(CUDA_PATH)/lib64
+OPENCL_PATH=/software/cuda-9.1
+OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64
 
-# NOTE: You may need to configure this based on your root path.
-VISC_SRC_ROOT=$(LLVM_SRC_ROOT)
+LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build
+CC = $(LLVM_BUILD_DIR)/bin/clang
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include
+OCLBE = $(LLVM_BUILD_DIR)/bin/llvm-cbe
 
-VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build
-CC = $(VISC_BUILD_DIR)/bin/clang
-PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
+CXX = $(LLVM_BUILD_DIR)/bin/clang++
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include
 
-CXX = $(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
-
-LINKER = $(VISC_BUILD_DIR)/bin/clang++
+LINKER = $(LLVM_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH)
 
-LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib
-LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin
+LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib
+LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin
 
 OPT = $(LLVM_BIN_PATH)/opt
 LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/Makefile b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/Makefile
deleted file mode 100644
index 25131be39c23baa95b34e95444f7e19e1d03b389..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=main.o lbm.o ocl.o
-APP_CUDALDFLAGS=-lm
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-KERNEL_OBJS=kernel_offline.nvptx.s
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel.cl b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel.cl
deleted file mode 100644
index 3f34ea5ef25943ac7eeb18eead429f70ec4cf807..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel.cl
+++ /dev/null
@@ -1,176 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef LBM_KERNEL_CL
-#define LBM_KERNEL_CL
-
-#include "layout_config.h"
-#include "lbm_macros.h"
-/******************************************************************************/
-
-__kernel void performStreamCollide_kernel( __global float* srcGrid, __global float* dstGrid )
-{
-	srcGrid += MARGIN;
-	dstGrid += MARGIN;
-
-
-	//Using some predefined macros here.  Consider this the declaration 
-        //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
-
-        SWEEP_VAR
-	SWEEP_X = get_local_id(0);
-	SWEEP_Y = get_group_id(0);
-	SWEEP_Z = get_group_id(1);
-	
-	float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB;
-	float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ;
-	float tempSB, tempET, tempEB, tempWT, tempWB ;
-
-	//Load all of the input fields
-	//This is a gather operation of the SCATTER preprocessor variable
-        // is undefined in layout_config.h, or a "local" read otherwise
-	tempC = SRC_C(srcGrid);
-
-	tempN = SRC_N(srcGrid);
-	tempS = SRC_S(srcGrid);
-	tempE = SRC_E(srcGrid);
-	tempW = SRC_W(srcGrid);
-	tempT = SRC_T(srcGrid);
-	tempB = SRC_B(srcGrid);
-
-	tempNE = SRC_NE(srcGrid);
-	tempNW = SRC_NW(srcGrid);
-	tempSE = SRC_SE(srcGrid);
-	tempSW = SRC_SW(srcGrid);
-	tempNT = SRC_NT(srcGrid);
-	tempNB = SRC_NB(srcGrid);
-	tempST = SRC_ST(srcGrid);
-	tempSB = SRC_SB(srcGrid);
-	tempET = SRC_ET(srcGrid);
-	tempEB = SRC_EB(srcGrid);
-	tempWT = SRC_WT(srcGrid);
-	tempWB = SRC_WB(srcGrid);
-
-	//Test whether the cell is fluid or obstacle
-	if(as_uint(LOCAL(srcGrid,FLAGS)) & (OBSTACLE)) {
-		
-		//Swizzle the inputs: reflect any fluid coming into this cell 
-		// back to where it came from
-		temp_swp = tempN ; tempN = tempS ; tempS = temp_swp ;
-		temp_swp = tempE ; tempE = tempW ; tempW = temp_swp;
-		temp_swp = tempT ; tempT = tempB ; tempB = temp_swp;
-		temp_swp = tempNE; tempNE = tempSW ; tempSW = temp_swp;
-		temp_swp = tempNW; tempNW = tempSE ; tempSE = temp_swp;
-		temp_swp = tempNT ; tempNT = tempSB ; tempSB = temp_swp; 
-		temp_swp = tempNB ; tempNB = tempST ; tempST = temp_swp;
-		temp_swp = tempET ; tempET= tempWB ; tempWB = temp_swp;
-		temp_swp = tempEB ; tempEB = tempWT ; tempWT = temp_swp;
-	}
-	else {
- 
-                //The math meat of LBM: ignore for optimization
-	        float ux, uy, uz, rho, u2;
-		float temp1, temp2, temp_base;
-		rho = tempC + tempN
-			+ tempS + tempE
-			+ tempW + tempT
-			+ tempB + tempNE
-			+ tempNW + tempSE
-			+ tempSW + tempNT
-			+ tempNB + tempST
-			+ tempSB + tempET
-			+ tempEB + tempWT
-			+ tempWB;
-
-		ux = + tempE - tempW
-			+ tempNE - tempNW
-			+ tempSE - tempSW
-			+ tempET + tempEB
-			- tempWT - tempWB;
-
-		uy = + tempN - tempS
-			+ tempNE + tempNW
-			- tempSE - tempSW
-			+ tempNT + tempNB
-			- tempST - tempSB;
-
-		uz = + tempT - tempB
-			+ tempNT - tempNB
-			+ tempST - tempSB
-			+ tempET - tempEB
-			+ tempWT - tempWB;		
-		
-		ux /= rho;
-		uy /= rho;
-		uz /= rho;
-
-		if(as_uint(LOCAL(srcGrid,FLAGS)) & (ACCEL)) {
-
-			ux = 0.005f;
-			uy = 0.002f;
-			uz = 0.000f;
-		}
-
-		u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f;
-		temp_base = OMEGA*rho;
-		temp1 = DFL1*temp_base;
-
-		//Put the output values for this cell in the shared memory
-		temp_base = OMEGA*rho;
-		temp1 = DFL1*temp_base;
-		temp2 = 1.0f-OMEGA;
-		tempC = temp2*tempC + temp1*(                                 - u2);
-	        temp1 = DFL2*temp_base;	
-		tempN = temp2*tempN + temp1*(       uy*(4.5f*uy       + 3.0f) - u2);
-		tempS = temp2*tempS + temp1*(       uy*(4.5f*uy       - 3.0f) - u2);
-		tempT = temp2*tempT + temp1*(       uz*(4.5f*uz       + 3.0f) - u2);
-		tempB = temp2*tempB + temp1*(       uz*(4.5f*uz       - 3.0f) - u2);
-		tempE = temp2*tempE + temp1*(       ux*(4.5f*ux       + 3.0f) - u2);
-		tempW = temp2*tempW + temp1*(       ux*(4.5f*ux       - 3.0f) - u2);
-		temp1 = DFL3*temp_base;
-		tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2);
-		tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2);
-		tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2);
-		tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2);
-		tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2);
-		tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2);
-		tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2);
-		tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2);
-		tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2);
-		tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2);
-		tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2);
-		tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2);
-	}
-
-	//Write the results computed above
-	//This is a scatter operation of the SCATTER preprocessor variable
-        // is defined in layout_config.h, or a "local" write otherwise
-	DST_C ( dstGrid ) = tempC;
-
-	DST_N ( dstGrid ) = tempN; 
-	DST_S ( dstGrid ) = tempS;
-	DST_E ( dstGrid ) = tempE;
-	DST_W ( dstGrid ) = tempW;
-	DST_T ( dstGrid ) = tempT;
-	DST_B ( dstGrid ) = tempB;
-
-	DST_NE( dstGrid ) = tempNE;
-	DST_NW( dstGrid ) = tempNW;
-	DST_SE( dstGrid ) = tempSE;
-	DST_SW( dstGrid ) = tempSW;
-	DST_NT( dstGrid ) = tempNT;
-	DST_NB( dstGrid ) = tempNB;
-	DST_ST( dstGrid ) = tempST;
-	DST_SB( dstGrid ) = tempSB;
-	DST_ET( dstGrid ) = tempET;
-	DST_EB( dstGrid ) = tempEB;
-	DST_WT( dstGrid ) = tempWT;
-	DST_WB( dstGrid ) = tempWB;
-}
-
-#endif // LBM_KERNEL_CL
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl
deleted file mode 100644
index 3f34ea5ef25943ac7eeb18eead429f70ec4cf807..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl
+++ /dev/null
@@ -1,176 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef LBM_KERNEL_CL
-#define LBM_KERNEL_CL
-
-#include "layout_config.h"
-#include "lbm_macros.h"
-/******************************************************************************/
-
-__kernel void performStreamCollide_kernel( __global float* srcGrid, __global float* dstGrid )
-{
-	srcGrid += MARGIN;
-	dstGrid += MARGIN;
-
-
-	//Using some predefined macros here.  Consider this the declaration 
-        //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
-
-        SWEEP_VAR
-	SWEEP_X = get_local_id(0);
-	SWEEP_Y = get_group_id(0);
-	SWEEP_Z = get_group_id(1);
-	
-	float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB;
-	float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ;
-	float tempSB, tempET, tempEB, tempWT, tempWB ;
-
-	//Load all of the input fields
-	//This is a gather operation of the SCATTER preprocessor variable
-        // is undefined in layout_config.h, or a "local" read otherwise
-	tempC = SRC_C(srcGrid);
-
-	tempN = SRC_N(srcGrid);
-	tempS = SRC_S(srcGrid);
-	tempE = SRC_E(srcGrid);
-	tempW = SRC_W(srcGrid);
-	tempT = SRC_T(srcGrid);
-	tempB = SRC_B(srcGrid);
-
-	tempNE = SRC_NE(srcGrid);
-	tempNW = SRC_NW(srcGrid);
-	tempSE = SRC_SE(srcGrid);
-	tempSW = SRC_SW(srcGrid);
-	tempNT = SRC_NT(srcGrid);
-	tempNB = SRC_NB(srcGrid);
-	tempST = SRC_ST(srcGrid);
-	tempSB = SRC_SB(srcGrid);
-	tempET = SRC_ET(srcGrid);
-	tempEB = SRC_EB(srcGrid);
-	tempWT = SRC_WT(srcGrid);
-	tempWB = SRC_WB(srcGrid);
-
-	//Test whether the cell is fluid or obstacle
-	if(as_uint(LOCAL(srcGrid,FLAGS)) & (OBSTACLE)) {
-		
-		//Swizzle the inputs: reflect any fluid coming into this cell 
-		// back to where it came from
-		temp_swp = tempN ; tempN = tempS ; tempS = temp_swp ;
-		temp_swp = tempE ; tempE = tempW ; tempW = temp_swp;
-		temp_swp = tempT ; tempT = tempB ; tempB = temp_swp;
-		temp_swp = tempNE; tempNE = tempSW ; tempSW = temp_swp;
-		temp_swp = tempNW; tempNW = tempSE ; tempSE = temp_swp;
-		temp_swp = tempNT ; tempNT = tempSB ; tempSB = temp_swp; 
-		temp_swp = tempNB ; tempNB = tempST ; tempST = temp_swp;
-		temp_swp = tempET ; tempET= tempWB ; tempWB = temp_swp;
-		temp_swp = tempEB ; tempEB = tempWT ; tempWT = temp_swp;
-	}
-	else {
- 
-                //The math meat of LBM: ignore for optimization
-	        float ux, uy, uz, rho, u2;
-		float temp1, temp2, temp_base;
-		rho = tempC + tempN
-			+ tempS + tempE
-			+ tempW + tempT
-			+ tempB + tempNE
-			+ tempNW + tempSE
-			+ tempSW + tempNT
-			+ tempNB + tempST
-			+ tempSB + tempET
-			+ tempEB + tempWT
-			+ tempWB;
-
-		ux = + tempE - tempW
-			+ tempNE - tempNW
-			+ tempSE - tempSW
-			+ tempET + tempEB
-			- tempWT - tempWB;
-
-		uy = + tempN - tempS
-			+ tempNE + tempNW
-			- tempSE - tempSW
-			+ tempNT + tempNB
-			- tempST - tempSB;
-
-		uz = + tempT - tempB
-			+ tempNT - tempNB
-			+ tempST - tempSB
-			+ tempET - tempEB
-			+ tempWT - tempWB;		
-		
-		ux /= rho;
-		uy /= rho;
-		uz /= rho;
-
-		if(as_uint(LOCAL(srcGrid,FLAGS)) & (ACCEL)) {
-
-			ux = 0.005f;
-			uy = 0.002f;
-			uz = 0.000f;
-		}
-
-		u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f;
-		temp_base = OMEGA*rho;
-		temp1 = DFL1*temp_base;
-
-		//Put the output values for this cell in the shared memory
-		temp_base = OMEGA*rho;
-		temp1 = DFL1*temp_base;
-		temp2 = 1.0f-OMEGA;
-		tempC = temp2*tempC + temp1*(                                 - u2);
-	        temp1 = DFL2*temp_base;	
-		tempN = temp2*tempN + temp1*(       uy*(4.5f*uy       + 3.0f) - u2);
-		tempS = temp2*tempS + temp1*(       uy*(4.5f*uy       - 3.0f) - u2);
-		tempT = temp2*tempT + temp1*(       uz*(4.5f*uz       + 3.0f) - u2);
-		tempB = temp2*tempB + temp1*(       uz*(4.5f*uz       - 3.0f) - u2);
-		tempE = temp2*tempE + temp1*(       ux*(4.5f*ux       + 3.0f) - u2);
-		tempW = temp2*tempW + temp1*(       ux*(4.5f*ux       - 3.0f) - u2);
-		temp1 = DFL3*temp_base;
-		tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2);
-		tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2);
-		tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2);
-		tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2);
-		tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2);
-		tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2);
-		tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2);
-		tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2);
-		tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2);
-		tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2);
-		tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2);
-		tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2);
-	}
-
-	//Write the results computed above
-	//This is a scatter operation of the SCATTER preprocessor variable
-        // is defined in layout_config.h, or a "local" write otherwise
-	DST_C ( dstGrid ) = tempC;
-
-	DST_N ( dstGrid ) = tempN; 
-	DST_S ( dstGrid ) = tempS;
-	DST_E ( dstGrid ) = tempE;
-	DST_W ( dstGrid ) = tempW;
-	DST_T ( dstGrid ) = tempT;
-	DST_B ( dstGrid ) = tempB;
-
-	DST_NE( dstGrid ) = tempNE;
-	DST_NW( dstGrid ) = tempNW;
-	DST_SE( dstGrid ) = tempSE;
-	DST_SW( dstGrid ) = tempSW;
-	DST_NT( dstGrid ) = tempNT;
-	DST_NB( dstGrid ) = tempNB;
-	DST_ST( dstGrid ) = tempST;
-	DST_SB( dstGrid ) = tempSB;
-	DST_ET( dstGrid ) = tempET;
-	DST_EB( dstGrid ) = tempEB;
-	DST_WT( dstGrid ) = tempWT;
-	DST_WB( dstGrid ) = tempWB;
-}
-
-#endif // LBM_KERNEL_CL
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h
deleted file mode 100644
index d44088661d313eeca6d44612549337b5a2630e04..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*############################################################################*/
-
-#ifndef _LAYOUT_CONFIG_H_
-#define _LAYOUT_CONFIG_H_
-
-/*############################################################################*/
-
-// Unchangeable settings: volume simulation size for the given example
-#define SIZE_X (120)
-#define SIZE_Y (120)
-#define SIZE_Z (150)
-
-// Changeable settings
-// Padding in each dimension
-#define PADDING_X (8)
-#define PADDING_Y (0)
-#define PADDING_Z (4)
-
-// Pitch in each dimension
-#define PADDED_X (SIZE_X + PADDING_X)
-#define PADDED_Y (SIZE_Y + PADDING_Y)
-#define PADDED_Z (SIZE_Z + PADDING_Z)
-
-#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
-
-//  Flattening function
-//  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x, y, z, e)                                                 \
-  (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
-
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
-
-// Set this value to 1 for GATHER, or 0 for SCATTER
-#if 1
-#define GATHER
-#else
-#define SCATTER
-#endif
-
-// OpenCL block size (not trivially changeable here)
-#define BLOCK_SIZE SIZE_X
-
-/*############################################################################*/
-
-typedef enum {
-  C = 0,
-  N,
-  S,
-  E,
-  W,
-  T,
-  B,
-  NE,
-  NW,
-  SE,
-  SW,
-  NT,
-  NB,
-  ST,
-  SB,
-  ET,
-  EB,
-  WT,
-  WB,
-  FLAGS,
-  N_CELL_ENTRIES
-} CELL_ENTRIES;
-
-#define N_DISTR_FUNCS FLAGS
-
-typedef enum {
-  OBSTACLE = 1 << 0,
-  ACCEL = 1 << 1,
-  IN_OUT_FLOW = 1 << 2
-} CELL_FLAGS;
-
-#endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c
deleted file mode 100644
index 14ffa4211b3763d7c1c6538e693a76be61a0b158..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c
+++ /dev/null
@@ -1,321 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*############################################################################*/
-
-// includes, system
-#include <CL/cl.h>
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-// includes, project
-#include "layout_config.h"
-#include "lbm.h"
-#include "lbm_macros.h"
-#include "ocl.h"
-
-/******************************************************************************/
-
-void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
-                                     cl_mem dstGrid) {
-
-  cl_int clStatus;
-
-  clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid);
-  CHECK_ERROR("clSetKernelArg")
-
-  clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid);
-  CHECK_ERROR("clSetKernelArg")
-
-  size_t dimBlock[3] = {SIZE_X, 1, 1};
-  size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
-  clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL,
-                                    dimGrid, dimBlock, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueNDRangeKernel")
-
-  clStatus = clFinish(prm->clCommandQueue);
-  CHECK_ERROR("clFinish")
-}
-/*############################################################################*/
-
-void LBM_allocateGrid(float **ptr) {
-  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
-
-  *ptr = (float *)malloc(size);
-  if (!*ptr) {
-    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
-           size / (1024.0 * 1024.0));
-    exit(1);
-  }
-
-  memset(*ptr, 0, size);
-
-  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
-
-  *ptr += MARGIN;
-}
-
-/******************************************************************************/
-
-void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) {
-  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
-  cl_int clStatus;
-  *ptr =
-      clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-}
-
-/*############################################################################*/
-
-void LBM_freeGrid(float **ptr) {
-  free(*ptr - MARGIN);
-  *ptr = NULL;
-}
-
-/******************************************************************************/
-
-void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); }
-
-/*############################################################################*/
-
-void LBM_initializeGrid(LBM_Grid grid) {
-  SWEEP_VAR
-
-  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
-  SRC_C(grid) = DFL1;
-  SRC_N(grid) = DFL2;
-  SRC_S(grid) = DFL2;
-  SRC_E(grid) = DFL2;
-  SRC_W(grid) = DFL2;
-  SRC_T(grid) = DFL2;
-  SRC_B(grid) = DFL2;
-  SRC_NE(grid) = DFL3;
-  SRC_NW(grid) = DFL3;
-  SRC_SE(grid) = DFL3;
-  SRC_SW(grid) = DFL3;
-  SRC_NT(grid) = DFL3;
-  SRC_NB(grid) = DFL3;
-  SRC_ST(grid) = DFL3;
-  SRC_SB(grid) = DFL3;
-  SRC_ET(grid) = DFL3;
-  SRC_EB(grid) = DFL3;
-  SRC_WT(grid) = DFL3;
-  SRC_WB(grid) = DFL3;
-
-  CLEAR_ALL_FLAGS_SWEEP(grid);
-  SWEEP_END
-}
-
-/******************************************************************************/
-
-void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
-                               LBM_Grid h_grid) {
-  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
-  cl_int clStatus;
-  clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
-                                  h_grid - MARGIN, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-}
-
-void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
-                              LBM_Grid h_grid) {
-  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
-  cl_int clStatus;
-  clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
-                                 h_grid - MARGIN, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueReadBuffer")
-}
-
-/*############################################################################*/
-
-void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) {
-  cl_mem aux = *grid1;
-  *grid1 = *grid2;
-  *grid2 = aux;
-}
-
-/*############################################################################*/
-
-void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
-  int x, y, z;
-
-  FILE *file = fopen(filename, "rb");
-
-  for (z = 0; z < SIZE_Z; z++) {
-    for (y = 0; y < SIZE_Y; y++) {
-      for (x = 0; x < SIZE_X; x++) {
-        if (fgetc(file) != '.')
-          SET_FLAG(grid, x, y, z, OBSTACLE);
-      }
-      fgetc(file);
-    }
-    fgetc(file);
-  }
-
-  fclose(file);
-}
-
-/*############################################################################*/
-
-void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
-  int x, y, z;
-
-  for (z = -2; z < SIZE_Z + 2; z++) {
-    for (y = 0; y < SIZE_Y; y++) {
-      for (x = 0; x < SIZE_X; x++) {
-        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
-            z == SIZE_Z - 1) {
-          SET_FLAG(grid, x, y, z, OBSTACLE);
-        } else {
-          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
-              y < SIZE_Y - 2) {
-            SET_FLAG(grid, x, y, z, ACCEL);
-          }
-        }
-      }
-    }
-  }
-}
-
-/*############################################################################*/
-
-void LBM_showGridStatistics(LBM_Grid grid) {
-  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
-  float ux, uy, uz;
-  float minU2 = 1e+30, maxU2 = -1e+30, u2;
-  float minRho = 1e+30, maxRho = -1e+30, rho;
-  float mass = 0;
-
-  SWEEP_VAR
-
-  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
-  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
-        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
-        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
-        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
-        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
-
-  if (rho < minRho)
-    minRho = rho;
-  if (rho > maxRho)
-    maxRho = rho;
-  mass += rho;
-
-  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
-    nObstacleCells++;
-  } else {
-    if (TEST_FLAG_SWEEP(grid, ACCEL))
-      nAccelCells++;
-    else
-      nFluidCells++;
-
-    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
-         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
-         LOCAL(grid, WT) - LOCAL(grid, WB);
-    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
-         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
-         LOCAL(grid, ST) - LOCAL(grid, SB);
-    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
-         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
-         LOCAL(grid, WT) - LOCAL(grid, WB);
-    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
-    if (u2 < minU2)
-      minU2 = u2;
-    if (u2 > maxU2)
-      maxU2 = u2;
-  }
-  SWEEP_END
-
-  printf("LBM_showGridStatistics:\n"
-         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-         "\tminU: %e maxU: %e\n\n",
-         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
-         sqrt(minU2), sqrt(maxU2));
-}
-
-/*############################################################################*/
-
-static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
-  const int litteBigEndianTest = 1;
-  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
-    const char *vPtr = (char *)v;
-    char buffer[sizeof(OUTPUT_PRECISION)];
-    int i;
-
-    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
-      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1];
-
-    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
-  } else { /* little endian */
-    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
-  }
-}
-
-/*############################################################################*/
-
-static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
-  const int litteBigEndianTest = 1;
-  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
-    char *vPtr = (char *)v;
-    char buffer[sizeof(OUTPUT_PRECISION)];
-    int i;
-
-    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
-
-    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
-      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1];
-  } else { /* little endian */
-    fread(v, sizeof(OUTPUT_PRECISION), 1, file);
-  }
-}
-
-/*############################################################################*/
-
-void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
-                            const int binary) {
-  OUTPUT_PRECISION rho, ux, uy, uz;
-
-  FILE *file = fopen(filename, (binary ? "wb" : "w"));
-
-  SWEEP_VAR
-  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
-  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
-        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
-        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
-        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
-        SRC_WB(grid);
-  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
-       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
-  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
-       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
-  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
-       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
-  ux /= rho;
-  uy /= rho;
-  uz /= rho;
-
-  if (binary) {
-    /*
-       fwrite( &ux, sizeof( ux ), 1, file );
-       fwrite( &uy, sizeof( uy ), 1, file );
-       fwrite( &uz, sizeof( uz ), 1, file );
-       */
-    storeValue(file, &ux);
-    storeValue(file, &uy);
-    storeValue(file, &uz);
-  } else
-    fprintf(file, "%e %e %e\n", ux, uy, uz);
-
-  SWEEP_END;
-
-  fclose(file);
-}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h
deleted file mode 100644
index 64a617feb862bdffdcb0c6aa57b0f1b09c26debb..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*############################################################################*/
-
-#ifndef _LBM_H_
-#define _LBM_H_
-
-/*############################################################################*/
-
-void LBM_allocateGrid(float **ptr);
-void LBM_freeGrid(float **ptr);
-void LBM_initializeGrid(LBM_Grid grid);
-void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
-void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
-void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2);
-void LBM_showGridStatistics(LBM_Grid Grid);
-void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
-                            const BOOL binary);
-
-/* OpenCL *********************************************************************/
-
-void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr);
-void OpenCL_LBM_freeGrid(cl_mem ptr);
-void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
-                               LBM_Grid h_grid);
-void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
-                              LBM_Grid h_grid);
-void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
-                                     cl_mem dstGrid);
-
-/*############################################################################*/
-
-#endif /* _LBM_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h
deleted file mode 100644
index 99c50c048a14bb47bb3659b61f088db95706bb0c..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h
+++ /dev/null
@@ -1,198 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef _LBM_MARCOS_H
-#define _LBM_MACROS_H_
-
-#define OMEGA (1.95f)
-
-#define OUTPUT_PRECISION float
-
-#define BOOL int
-#define TRUE (-1)
-#define FALSE (0)
-
-#define DFL1 (1.0f / 3.0f)
-#define DFL2 (1.0f / 18.0f)
-#define DFL3 (1.0f / 36.0f)
-
-/*############################################################################*/
-
-typedef float
-    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid *LBM_GridPtr;
-
-/*############################################################################*/
-
-#define SWEEP_X __temp_x__
-#define SWEEP_Y __temp_y__
-#define SWEEP_Z __temp_z__
-#define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
-
-#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
-  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
-    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
-      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
-
-#define SWEEP_END                                                              \
-  }                                                                            \
-  }                                                                            \
-  }
-
-#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
-#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
-  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
-
-#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
-#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
-#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
-#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
-#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
-#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
-#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
-#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
-#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
-#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
-#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
-#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
-#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
-#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
-#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
-#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
-#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
-#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
-#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
-#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
-
-#ifdef SCATTER
-
-#define SRC_C(g) (LOCAL(g, C))
-#define SRC_N(g) (LOCAL(g, N))
-#define SRC_S(g) (LOCAL(g, S))
-#define SRC_E(g) (LOCAL(g, E))
-#define SRC_W(g) (LOCAL(g, W))
-#define SRC_T(g) (LOCAL(g, T))
-#define SRC_B(g) (LOCAL(g, B))
-#define SRC_NE(g) (LOCAL(g, NE))
-#define SRC_NW(g) (LOCAL(g, NW))
-#define SRC_SE(g) (LOCAL(g, SE))
-#define SRC_SW(g) (LOCAL(g, SW))
-#define SRC_NT(g) (LOCAL(g, NT))
-#define SRC_NB(g) (LOCAL(g, NB))
-#define SRC_ST(g) (LOCAL(g, ST))
-#define SRC_SB(g) (LOCAL(g, SB))
-#define SRC_ET(g) (LOCAL(g, ET))
-#define SRC_EB(g) (LOCAL(g, EB))
-#define SRC_WT(g) (LOCAL(g, WT))
-#define SRC_WB(g) (LOCAL(g, WB))
-
-#define DST_C(g) (NEIGHBOR_C(g, C))
-#define DST_N(g) (NEIGHBOR_N(g, N))
-#define DST_S(g) (NEIGHBOR_S(g, S))
-#define DST_E(g) (NEIGHBOR_E(g, E))
-#define DST_W(g) (NEIGHBOR_W(g, W))
-#define DST_T(g) (NEIGHBOR_T(g, T))
-#define DST_B(g) (NEIGHBOR_B(g, B))
-#define DST_NE(g) (NEIGHBOR_NE(g, NE))
-#define DST_NW(g) (NEIGHBOR_NW(g, NW))
-#define DST_SE(g) (NEIGHBOR_SE(g, SE))
-#define DST_SW(g) (NEIGHBOR_SW(g, SW))
-#define DST_NT(g) (NEIGHBOR_NT(g, NT))
-#define DST_NB(g) (NEIGHBOR_NB(g, NB))
-#define DST_ST(g) (NEIGHBOR_ST(g, ST))
-#define DST_SB(g) (NEIGHBOR_SB(g, SB))
-#define DST_ET(g) (NEIGHBOR_ET(g, ET))
-#define DST_EB(g) (NEIGHBOR_EB(g, EB))
-#define DST_WT(g) (NEIGHBOR_WT(g, WT))
-#define DST_WB(g) (NEIGHBOR_WB(g, WB))
-
-#else /* GATHER */
-
-#define SRC_C(g) (NEIGHBOR_C(g, C))
-#define SRC_N(g) (NEIGHBOR_S(g, N))
-#define SRC_S(g) (NEIGHBOR_N(g, S))
-#define SRC_E(g) (NEIGHBOR_W(g, E))
-#define SRC_W(g) (NEIGHBOR_E(g, W))
-#define SRC_T(g) (NEIGHBOR_B(g, T))
-#define SRC_B(g) (NEIGHBOR_T(g, B))
-#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
-#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
-#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
-#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
-#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
-#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
-#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
-#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
-#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
-#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
-#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
-#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
-
-#define DST_C(g) (LOCAL(g, C))
-#define DST_N(g) (LOCAL(g, N))
-#define DST_S(g) (LOCAL(g, S))
-#define DST_E(g) (LOCAL(g, E))
-#define DST_W(g) (LOCAL(g, W))
-#define DST_T(g) (LOCAL(g, T))
-#define DST_B(g) (LOCAL(g, B))
-#define DST_NE(g) (LOCAL(g, NE))
-#define DST_NW(g) (LOCAL(g, NW))
-#define DST_SE(g) (LOCAL(g, SE))
-#define DST_SW(g) (LOCAL(g, SW))
-#define DST_NT(g) (LOCAL(g, NT))
-#define DST_NB(g) (LOCAL(g, NB))
-#define DST_ST(g) (LOCAL(g, ST))
-#define DST_SB(g) (LOCAL(g, SB))
-#define DST_ET(g) (LOCAL(g, ET))
-#define DST_EB(g) (LOCAL(g, EB))
-#define DST_WT(g) (LOCAL(g, WT))
-#define DST_WB(g) (LOCAL(g, WB))
-
-#endif /* GATHER */
-
-#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
-#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g, f)                                                   \
-  {                                                                            \
-    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
-    (*_aux_) |= (f);                                                           \
-  }
-#define CLEAR_FLAG_SWEEP(g, f)                                                 \
-  {                                                                            \
-    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
-    (*_aux_) &= ~(f);                                                          \
-  }
-#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
-  {                                                                            \
-    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
-    (*_aux_) = 0;                                                              \
-  }
-
-#define TEST_FLAG(g, x, y, z, f)                                               \
-  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g, x, y, z, f)                                                \
-  {                                                                            \
-    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
-    (*_aux_) |= (f);                                                           \
-  }
-#define CLEAR_FLAG(g, x, y, z, f)                                              \
-  {                                                                            \
-    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
-    (*_aux_) &= ~(f);                                                          \
-  }
-#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
-  {                                                                            \
-    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
-    (*_aux_) = 0;                                                              \
-  }
-
-/*############################################################################*/
-
-#endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
deleted file mode 100644
index 18320b7394e5d499339ee820a992b00acd9b368e..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*############################################################################*/
-
-#include <CL/cl.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/stat.h>
-
-#include "layout_config.h"
-#include "lbm.h"
-#include "lbm_macros.h"
-#include "main.h"
-#include "ocl.h"
-
-/*############################################################################*/
-
-static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
-
-/*############################################################################*/
-
-struct pb_TimerSet timers;
-int main(int nArgs, char *arg[]) {
-  MAIN_Param param;
-  int t;
-
-  OpenCL_Param prm;
-
-  struct pb_Parameters *params;
-  params = pb_ReadParameters(&nArgs, arg);
-
-  // Setup TEMP datastructures
-  MAIN_parseCommandLine(nArgs, arg, &param, params);
-  MAIN_printInfo(&param);
-
-  /*MAIN_initialize( &param, &prm ); */ // This has been inlined
-
-  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
-
-  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-  // Setup TEMP datastructures
-  LBM_allocateGrid((float **)&TEMP_srcGrid);
-  LBM_allocateGrid((float **)&TEMP_dstGrid);
-  LBM_initializeGrid(TEMP_srcGrid);
-  LBM_initializeGrid(TEMP_dstGrid);
-
-  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-  if (param.obstacleFilename != NULL) {
-    LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename);
-    LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename);
-  }
-
-  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
-  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
-
-  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-  LBM_showGridStatistics(TEMP_srcGrid);
-
-  pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  OpenCL_initialize(&prm);
-
-  // Setup DEVICE datastructures
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid);
-  OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid);
-
-  // Initialize DEVICE datastructures
-  OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
-  OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
-
-  for (t = 1; t <= param.nTimeSteps; t++) {
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
-
-    /*if( (t & 63) == 0 ) {*/
-    /*printf( "timestep: %i\n", t );*/
-#if 0
-            CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
-            LBM_showGridStatistics( *TEMP_srcGrid );
-#endif
-    /*}*/
-  }
-
-  /*MAIN_finalize( &param, &prm );*/ // inlined
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
-  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
-
-  clReleaseProgram(prm.clProgram);
-  clReleaseKernel(prm.clKernel);
-  clReleaseCommandQueue(prm.clCommandQueue);
-  clReleaseContext(prm.clContext);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-
-  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-  LBM_showGridStatistics(TEMP_srcGrid);
-  LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE);
-
-  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-  LBM_freeGrid((float **)&TEMP_srcGrid);
-  LBM_freeGrid((float **)&TEMP_dstGrid);
-
-  pb_FreeParameters(params);
-  return 0;
-}
-
-/*############################################################################*/
-
-void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
-                           struct pb_Parameters *params) {
-  struct stat fileStat;
-
-  if (nArgs < 2) {
-    printf("syntax: lbm <time steps>\n");
-    exit(1);
-  }
-
-  param->nTimeSteps = atoi(arg[1]);
-
-  if (params->inpFiles[0] != NULL) {
-    param->obstacleFilename = params->inpFiles[0];
-
-    if (stat(param->obstacleFilename, &fileStat) != 0) {
-      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-             param->obstacleFilename);
-      exit(1);
-    }
-    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
-      printf("MAIN_parseCommandLine:\n"
-             "\tsize of file '%s' is %i bytes\n"
-             "\texpected size is %i bytes\n",
-             param->obstacleFilename, (int)fileStat.st_size,
-             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
-      exit(1);
-    }
-  } else
-    param->obstacleFilename = NULL;
-
-  param->resultFilename = params->outFile;
-}
-
-/*############################################################################*/
-
-void MAIN_printInfo(const MAIN_Param *param) {
-  printf("MAIN_printInfo:\n"
-         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-         "\tnTimeSteps     : %i\n"
-         "\tresult file    : %s\n"
-         "\taction         : %s\n"
-         "\tsimulation type: %s\n"
-         "\tobstacle file  : %s\n\n",
-         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
-         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
-         (param->obstacleFilename == NULL) ? "<none>"
-                                           : param->obstacleFilename);
-}
-
-/*############################################################################*/
-
-void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
-  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  // Setup TEMP datastructures
-  LBM_allocateGrid((float **)&TEMP_srcGrid);
-  LBM_allocateGrid((float **)&TEMP_dstGrid);
-  LBM_initializeGrid(TEMP_srcGrid);
-  LBM_initializeGrid(TEMP_dstGrid);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  if (param->obstacleFilename != NULL) {
-    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
-    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
-  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
-
-  // Setup DEVICE datastructures
-  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
-  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
-
-  // Initialize DEVICE datastructures
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
-  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  LBM_showGridStatistics(TEMP_srcGrid);
-
-  LBM_freeGrid((float **)&TEMP_srcGrid);
-  LBM_freeGrid((float **)&TEMP_dstGrid);
-}
-
-/*############################################################################*/
-
-void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  LBM_Grid TEMP_srcGrid;
-
-  // Setup TEMP datastructures
-  LBM_allocateGrid((float **)&TEMP_srcGrid);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  LBM_showGridStatistics(TEMP_srcGrid);
-
-  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
-
-  LBM_freeGrid((float **)&TEMP_srcGrid);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
-  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
-
-  clReleaseProgram(prm->clProgram);
-  clReleaseKernel(prm->clKernel);
-  clReleaseCommandQueue(prm->clCommandQueue);
-  clReleaseContext(prm->clContext);
-}
-
-void OpenCL_initialize(OpenCL_Param *prm) {
-  cl_int clStatus;
-
-  clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL);
-  CHECK_ERROR("clGetPlatformIDs")
-
-  prm->clCps[0] = CL_CONTEXT_PLATFORM;
-  prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-  prm->clCps[2] = 0;
-
-  clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1,
-                            &(prm->clDevice), NULL);
-  CHECK_ERROR("clGetDeviceIDs")
-
-  prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL,
-                                           NULL, &clStatus);
-  CHECK_ERROR("clCreateContextFromType")
-
-  prm->clCommandQueue = clCreateCommandQueue(
-      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
-
-  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-  prm->clProgram =
-      clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus);
-  CHECK_ERROR("clCreateProgramWithSource")
-
-  char clOptions[100];
-  sprintf(clOptions, "-I src/opencl_nvidia");
-
-  clStatus = clBuildProgram(prm->clProgram, 1, &(prm->clDevice), clOptions,
-                            NULL, NULL);
-  CHECK_ERROR("clBuildProgram")
-
-  prm->clKernel =
-      clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus);
-  CHECK_ERROR("clCreateKernel")
-
-  free((void *)clSource[0]);
-
-  /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s",
-   * "performStreamCollide_kernel", &prm->clContext, &prm->clDevice,
-   * &prm->clProgram, &prm->clKernel);*/
-}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h
deleted file mode 100644
index 9d8e145c93b37488a3826e77b964c56699377d2a..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef _MAIN_H_
-#define _MAIN_H_
-
-/*############################################################################*/
-
-typedef struct {
-  int nTimeSteps;
-  char *resultFilename;
-  char *obstacleFilename;
-} MAIN_Param;
-
-/*############################################################################*/
-
-void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
-                           struct pb_Parameters *);
-void MAIN_printInfo(const MAIN_Param *param);
-void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm);
-void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm);
-
-void OpenCL_initialize(OpenCL_Param *prm);
-
-/*############################################################################*/
-
-#endif /* _MAIN_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c
deleted file mode 100644
index 4f232db0d9776f4f2d0eb4b2444036f35ff27257..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "ocl.h"
-#include <CL/cl.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-char *readFile(char *fileName) {
-  FILE *fp;
-  fp = fopen(fileName, "r");
-
-  if (fp == NULL) {
-    printf("Error 1!\n");
-    return NULL;
-  }
-
-  fseek(fp, 0, SEEK_END);
-  long size = ftell(fp);
-  rewind(fp);
-
-  char *buffer = malloc(sizeof(char) * (size + 1));
-  if (buffer == NULL) {
-    printf("Error 2!\n");
-    fclose(fp);
-    return NULL;
-  }
-
-  size_t res = fread(buffer, 1, size, fp);
-  if (res != size) {
-    printf("Error 3!\n");
-    fclose(fp);
-    return NULL;
-  }
-
-  buffer[size] = 0;
-  fclose(fp);
-  return buffer;
-}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h
deleted file mode 100644
index 5d5d984ba698d6ac71af3e51de3e6724a79135aa..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef __OCLH__
-#define __OCLH__
-
-typedef struct {
-  cl_platform_id clPlatform;
-  cl_context_properties clCps[3];
-  cl_device_id clDevice;
-  cl_context clContext;
-  cl_command_queue clCommandQueue;
-  cl_program clProgram;
-  cl_kernel clKernel;
-} OpenCL_Param;
-
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    printf("Error: %s!\n", errorMessage);                                      \
-    printf("Line: %d\n", __LINE__);                                            \
-    exit(1);                                                                   \
-  }
-
-char *readFile(char *);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/Makefile
deleted file mode 100644
index 36b421ec6f1359114ea0035d21048ab0b95bf30e..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=main.o io.o #compute_gold.o
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-KERNEL_OBJS=kernel_offline.nvptx.s
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc
deleted file mode 100644
index 04744f404ebaf6e669c2bbe91600519742b57dc9..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* I/O routines for reading and writing matrices in column-major
- * layout
- */
-
-#include <fstream>
-#include <iostream>
-#include <vector>
-
-char *readFile(const char *fileName) {
-  std::fstream f(fileName, std::fstream::in);
-  if (!f.good()) {
-    std::cerr << "Error Reading File!!" << std::endl;
-    return NULL;
-  }
-
-  f.seekg(0, std::ios::end);
-  int length = f.tellg();
-  f.seekg(0, std::ios::beg);
-
-  char *buffer;
-
-  if (length > 0) {
-    buffer = new char[length];
-    f.read(buffer, length);
-    buffer[length - 1] = 0;
-  } else {
-    buffer = new char;
-    buffer[0] = 0;
-  }
-
-  f.close();
-
-  return buffer;
-}
-
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                            std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << std::endl;
-  std::fstream f(fn, std::fstream::in);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f >> nr_row;
-  f >> nr_col;
-
-  float data;
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-  while (f.good()) {
-    f >> data;
-    v.push_back(data);
-  }
-  v.pop_back(); // remove the duplicated last element
-  return true;
-}
-
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
-                             std::vector<float> &v) {
-  std::cerr << "Opening file:" << fn << " for write." << std::endl;
-  std::fstream f(fn, std::fstream::out);
-  if (!f.good()) {
-    return false;
-  }
-
-  // Read # of rows and cols
-  f << nr_row << " " << nr_col << " ";
-
-  float data;
-  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
-  for (int i = 0; i < v.size(); ++i) {
-    f << v[i] << ' ';
-  }
-  f << "\n";
-  return true;
-}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl
deleted file mode 100644
index 5ee2fd5df0a57bc59c1f714bc3efb6b3670b0386..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Kernel of dense matrix-matrix multiplication kernel.
- */
-
-__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
-{
-    float c = 0.0f;
-    int m = get_global_id(0);
-    int n = get_global_id(1);
-
-    /*for (int i = 0; i < k; ++i) {*/
-	/*float a = A[m + i * lda]; */
-	/*float b = B[n + i * ldb];*/
-	/*c += a * b;*/
-    /*}*/
-    C[m+n*ldc] = beta + alpha * c;
-}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl
deleted file mode 100644
index f376a27d90003e3c7c18dafb9f64a8b459a40029..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Kernel of dense matrix-matrix multiplication kernel.
- */
-
-__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
-{
-    float c = 0.0f;
-    int m = get_global_id(0);
-    int n = get_global_id(1);
-
-    for (int i = 0; i < k; ++i) {
-	float a = A[m + i * lda]; 
-	float b = B[n + i * ldb];
-	c += a * b;
-    }
-    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
-}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc
deleted file mode 100644
index 5489f6a55ce6e8ba3676b0c98ad4b37ac7f4a7fd..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/*
- * Main entry of dense matrix-matrix multiplication kernel
- */
-
-#include <CL/cl.h>
-#include <iostream>
-#include <malloc.h>
-#include <math.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <vector>
-
-// I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
-                                   std::vector<float> &v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int,
-                                    std::vector<float> &);
-extern char *readFile(const char *);
-
-// Parameters of tile sizes
-#define TILE_SZ 16
-
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    std::cout << errorMessage << ": " << clStatus << " Error!\n";              \
-    std::cout << "Line: " << __LINE__ << "\n";                                 \
-    exit(1);                                                                   \
-  }
-
-void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
-                cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
-                int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue) {
-  if ((transa != 'N') && (transa != 'n')) {
-    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  if ((transb != 'T') && (transb != 't')) {
-    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-    return;
-  }
-
-  // In this code we assume the matrix sizes are multiple of tile size
-  if ((m % TILE_SZ) || (n % TILE_SZ)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of "
-              << TILE_SZ << "; n should be multiple of " << TILE_SZ
-              << std::endl;
-  }
-
-  size_t db[2] = {TILE_SZ, TILE_SZ};
-  size_t dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
-
-  cl_int clStatus;
-
-  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
-  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
-  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
-  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
-  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
-  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
-  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
-  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
-  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
-  CHECK_ERROR("clSetKernelArg")
-
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
-                                    0, NULL, NULL);
-  CHECK_ERROR("clEnqueueNDRangeKernel")
-
-  clStatus = clFinish(clCommandQueue);
-  CHECK_ERROR("clFinish")
-}
-
-int main(int argc, char *argv[]) {
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  size_t A_sz, B_sz, C_sz;
-  int matArow, matAcol;
-  int matBrow, matBcol;
-  std::vector<float> matA, matBT;
-
-  /* Read command line. Expect 3 inputs: A, B and B^T
-     in column-major layout*/
-  params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
-      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
-    fprintf(stderr, "Expecting three input filenames\n");
-    exit(-1);
-  }
-
-  /* Read in data */
-  // load A
-  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
-  // load B^T
-  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
-
-  pb_InitializeTimerSet(&timers);
-
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-  cl_int clStatus;
-  cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
-  CHECK_ERROR("clGetPlatformIDs")
-
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
-                                    (cl_context_properties)clPlatform, 0};
-  cl_context clContext =
-      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
-  CHECK_ERROR("clCreateContextFromType")
-
-  cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
-  CHECK_ERROR("clGetDeviceIDs")
-
-  cl_command_queue clCommandQueue = clCreateCommandQueue(
-      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-
-  const char *clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
-  // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-  // cl_program clProgram =
-  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  cl_kernel clKernel;
-  cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary(
-      "build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT",
-      &clContext, &clDevice, &clProgram, &clKernel);
-  // cl_program clProgram =
-  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  // CHECK_ERROR("clCreateProgramWithSource")
-
-  // char clOptions[50];
-  // sprintf(clOptions,"");
-
-  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  // CHECK_ERROR("clBuildProgram")
-
-  // size_t binarySizes = 0;
-  // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES,
-  // sizeof(size_t), &binarySizes, NULL); CHECK_ERROR("clGetProgramInfo")
-
-  // std::cout << "Binary Size = " << binarySizes << "\n";
-
-  // unsigned char* binaries = (unsigned char*) malloc(binarySizes);
-  // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySizes,
-  // &binaries, NULL); CHECK_ERROR("clGetProgramInfo")
-
-  // std::cout << "Binary = \n" << binaries << "\n";
-
-  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  // CHECK_ERROR("clCreateKernel")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  // copy A to device memory
-  A_sz = matArow * matAcol * sizeof(float);
-  B_sz = matBrow * matBcol * sizeof(float);
-
-  // allocate space for C
-  C_sz = matArow * matBcol * sizeof(float);
-
-  // OpenCL memory allocation
-  std::vector<float> matC(matArow * matBcol);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  cl_mem dA =
-      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  cl_mem dB =
-      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  cl_mem dC =
-      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-
-  // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
-                                  &matA.front(), 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
-                                  &matBT.front(), 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  for (int i = 0; i < matC.size(); i++)
-    matC[i] = 0.0f;
-
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
-                                  &matC.front(), 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-  // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
-             matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
-                      NULL, NULL);
-
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-  clStatus = clReleaseKernel(clKernel);
-  clStatus = clReleaseProgram(clProgram);
-  clStatus = clReleaseMemObject(dA);
-  clStatus = clReleaseMemObject(dB);
-  clStatus = clReleaseMemObject(dC);
-  clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-
-  if (params->outFile) {
-
-    /* Write C to file */
-    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
-  }
-
-  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
-            << std::endl;
-  pb_FreeParameters(params);
-
-  // free((void*)clSource[0]);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/Makefile b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/Makefile
deleted file mode 100644
index 5774b2827bd0588b16cfc32b2ac787346c5b7b1d..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-TOOLS_SRC=common_src/convert-dataset
-SRCDIR_OBJS=main.o gpu_info.o file.o ocl.o
-APP_CUDALDFLAGS=-lm
-APP_CFLAGS=-ffast-math -O3 -I$(TOOLS_SRC)
-APP_CXXFLAGS=-ffast-math -O3
-KERNEL_OBJS=kernel_offline.nvptx.s
-
-include $(TOOLS_SRC)/commontools.mk
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c
deleted file mode 100644
index 22397498f7f43b3f60926bf51c2ddbff91529787..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <endian.h>
-#include <inttypes.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-#error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
-               int *pad, float **h_data, int **h_indices, int **h_ptr,
-               int **h_perm, int **h_nzcnt) {
-  FILE *fid = fopen(fName, "rb");
-
-  if (fid == NULL) {
-    fprintf(stderr, "Cannot open input file\n");
-    exit(-1);
-  }
-
-  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
-  int _len = len[0];
-  int _depth = depth[0];
-  int _dim = dim[0];
-  int _pad = pad[0];
-  int _nzcnt_len = nzcnt_len[0];
-
-  *h_data = (float *)malloc(_len * sizeof(float));
-  fread(*h_data, sizeof(float), _len, fid);
-
-  *h_indices = (int *)malloc(_len * sizeof(int));
-  fread(*h_indices, sizeof(int), _len, fid);
-
-  *h_ptr = (int *)malloc(_depth * sizeof(int));
-  fread(*h_ptr, sizeof(int), _depth, fid);
-
-  *h_perm = (int *)malloc(_dim * sizeof(int));
-  fread(*h_perm, sizeof(int), _dim, fid);
-
-  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
-  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
-
-  fclose(fid);
-}
-
-void input_vec(char *fName, float *h_vec, int dim) {
-  FILE *fid = fopen(fName, "rb");
-  fread(h_vec, sizeof(float), dim, fid);
-  fclose(fid);
-}
-
-void outputData(char *fName, float *h_Ax_vector, int dim) {
-  FILE *fid = fopen(fName, "w");
-  uint32_t tmp32;
-  if (fid == NULL) {
-    fprintf(stderr, "Cannot open output file\n");
-    exit(-1);
-  }
-  tmp32 = dim;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-  fwrite(h_Ax_vector, sizeof(float), dim, fid);
-
-  fclose(fid);
-}
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h
deleted file mode 100644
index 5e38a6875e9e5f8be4d01b68569d80adf8c49548..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-#ifndef __FILEH__
-#define __FILEH__
-
-void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
-               int *pad, float **h_data, int **h_indices, int **h_ptr,
-               int **h_perm, int **h_nzcnt);
-
-void input_vec(char *fNanme, float *h_vec, int dim);
-void outputData(char *fName, float *h_Ax_vector, int dim);
-
-#endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c
deleted file mode 100644
index 90beedd747480ede3fd1e5da4017ed0051e043be..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-#include <endian.h>
-#include <inttypes.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "gpu_info.h"
-
-void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
-                           int major, int minor, int sm) {
-  int max_thread;
-  int max_block = 8;
-  if (major == 1) {
-    if (minor >= 2)
-      max_thread = 1024;
-    else
-      max_thread = 768;
-  } else if (major == 2)
-    max_thread = 1536;
-  else
-    // newer GPU  //keep using 2.0
-    max_thread = 1536;
-
-  int _grid;
-  int _thread;
-
-  if (task * pad > sm * max_thread) {
-    _thread = max_thread / max_block;
-    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
-  } else {
-    _thread = pad;
-    _grid = task * pad;
-  }
-
-  thread[0] = _thread;
-  grid[0] = _grid;
-}
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h
deleted file mode 100644
index ab1af7d0b8ba92f87c643582171e48cee0a9b95e..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef __GPUINFOH__
-#define __GPUINFOH__
-
-void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
-                           int major, int minor, int sm);
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel.cl b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel.cl
deleted file mode 100644
index 8c19a22511cd4c65364cb55c5df1a22d12b182b7..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel.cl
+++ /dev/null
@@ -1,73 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-#define WARP_BITS 5
-
-__kernel void spmv_jds(__global float *dst_vector, __global float *d_data,
-		       __global int *d_index, __global int *d_perm,
-		       __global float *x_vec, const int dim, 
-		       __constant int *jds_ptr_int,
-		       __constant int *sh_zcnt_int)
-{
-	int ix = get_global_id(0);
-	int warp_id=ix>>WARP_BITS;
-
-	if(ix<dim)
-	{
-		float sum=0.0f;
-		int bound=sh_zcnt_int[warp_id];
-		//prefetch 0
-		int j=jds_ptr_int[0]+ix;  
-		float d = d_data[j]; 
-		int i = d_index[j];  
-		float t = x_vec[i];
-		
-		if (bound>1)  //bound >=2
-		{
-			//prefetch 1
-			j=jds_ptr_int[1]+ix;    
-			i =  d_index[j];  
-			int in;
-			float dn;
-			float tn;
-			for(int k=2;k<bound;k++ )
-			{	
-				//prefetch k-1
-				dn = d_data[j]; 
-				//prefetch k
-				j=jds_ptr_int[k]+ix;    
-				in = d_index[j]; 
-				//prefetch k-1
-				tn = x_vec[i];
-				
-				//compute k-2
-				sum += d*t; 
-				//sweep to k
-				i = in;  
-				//sweep to k-1
-				d = dn;
-				t =tn; 
-			}	
-		
-			//fetch last
-			dn = d_data[j];
-			tn = x_vec[i];
-	
-			//compute last-1
-			sum += d*t; 
-			//sweep to last
-			d=dn;
-			t=tn;
-		}
-		//compute last
-		sum += d*t;  // 3 3
-		
-		//write out data
-		dst_vector[d_perm[ix]]=sum; 
-	}
-}
-
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl
deleted file mode 100644
index 9a17a299afdd032fb9c07d8ff5b559055704a573..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/kernel_offline.cl
+++ /dev/null
@@ -1,73 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-#define WARP_BITS 5
-
-__kernel void spmv_jds(__global float *dst_vector, __global float *d_data,
-		       __global int *d_index, __global int *d_perm,
-		       __global float *x_vec, const int dim, 
-		       __constant int *jds_ptr_int,
-		       __constant int *sh_zcnt_int)
-{
-	int ix = get_global_id(0);
-	int warp_id=ix>>WARP_BITS;
-
-	if(ix<dim)
-	{
-		float sum=0.0f;
-		int bound=sh_zcnt_int[warp_id];
-		//prefetch 0
-		int j=jds_ptr_int[0]+ix;  
-		float d = d_data[j]; 
-		int i = d_index[j];  
-		float t = x_vec[i];
-		
-		if (bound>1)  //bound >=2
-		{
-			//prefetch 1
-			j=jds_ptr_int[1]+ix;    
-			i =  d_index[j];  
-			int in;
-			float dn;
-			float tn;
-			for(int k=2;k<bound;k++ )
-			{	
-				//prefetch k-1
-				dn = d_data[j]; 
-				//prefetch k
-				j=jds_ptr_int[k]+ix;    
-				in = d_index[j]; 
-				//prefetch k-1
-				tn = x_vec[i];
-				
-				//compute k-2
-				sum += d*t; 
-				//sweep to k
-				i = in;  
-				//sweep to k-1
-				d = dn;
-				t =tn; 
-			}	
-		
-			//fetch last
-			dn = d_data[j];
-			tn = x_vec[i];
-	
-			//compute last-1
-			sum += d*t; 
-			//sweep to last
-			d=dn;
-			t=tn;
-		}
-		//compute last
-		sum += d*t;  // 3 3
-		
-		//write out data
-		dst_vector[d_perm[ix]]=sum; 
-	}
-}
-
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
deleted file mode 100644
index 343814149aa74139930380c2178e2f447c64e806..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <CL/cl.h>
-#include <CL/cl_ext.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "convert_dataset.h"
-#include "file.h"
-#include "gpu_info.h"
-#include "ocl.h"
-
-static int generate_vector(float *x_vector, int dim) {
-  srand(54321);
-  int i;
-  for (i = 0; i < dim; i++) {
-    x_vector[i] = (rand() / (float)RAND_MAX);
-  }
-  return 0;
-}
-
-int main(int argc, char **argv) {
-  struct pb_TimerSet timers;
-  struct pb_Parameters *parameters;
-
-  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
-         "Wu<wu14@illinois.edu>\n");
-  parameters = pb_ReadParameters(&argc, argv);
-
-  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
-    fprintf(stderr, "Expecting one two filenames\n");
-    exit(-1);
-  }
-
-  // load matrix from files
-  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-  //    &h_data, &h_indices, &h_ptr,
-  //    &h_perm, &h_nzcnt);
-  int col_count;
-
-  // parameters declaration
-  int len;
-  int depth;
-  int dim;
-  int pad = 32;
-  int nzcnt_len;
-
-  // host memory allocation
-  // matrix
-  float *h_data;
-  int *h_indices;
-  int *h_ptr;
-  int *h_perm;
-  int *h_nzcnt;
-
-  // vector
-  float *h_Ax_vector;
-  float *h_x_vector;
-
-  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-             1,                       // row padding
-             pad,                     // warp size
-             1,                       // pack size
-             1,                       // is mirrored?
-             0,                       // binary matrix
-             1,                       // debug level [0:2]
-             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
-             &len, &nzcnt_len, &depth);
-
-  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
-  h_x_vector = (float *)malloc(sizeof(float) * dim);
-  input_vec(parameters->inpFiles[1], h_x_vector, dim);
-
-  pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  cl_int clStatus;
-  cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
-  CHECK_ERROR("clGetPlatformIDs")
-
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
-                                    (cl_context_properties)clPlatform, 0};
-
-  cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
-  CHECK_ERROR("clGetDeviceIDs")
-
-  cl_context clContext =
-      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
-  CHECK_ERROR("clCreateContextFromType")
-
-  cl_command_queue clCommandQueue = clCreateCommandQueue(
-      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-
-  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-  cl_program clProgram =
-      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
-  CHECK_ERROR("clCreateProgramWithSource")
-
-  char clOptions[50];
-  sprintf(clOptions, "");
-  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
-  CHECK_ERROR("clBuildProgram")
-
-  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds", &clStatus);
-  CHECK_ERROR("clCreateKernel")
-  /*cl_kernel clKernel;*/
-  /*cl_program clProgram;*/
-  /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s",
-   * "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);*/
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  // device memory allocation
-  // matrix
-  cl_mem d_data;
-  cl_mem d_indices;
-  cl_mem d_perm;
-  cl_mem d_Ax_vector;
-  cl_mem d_x_vector;
-
-  cl_mem jds_ptr_int;
-  cl_mem sh_zcnt_int;
-
-  OpenCLDeviceProp clDeviceProp;
-  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
-                             sizeof(cl_uint), &(clDeviceProp.major), NULL);
-  CHECK_ERROR("clGetDeviceInfo")
-  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
-                             sizeof(cl_uint), &(clDeviceProp.minor), NULL);
-  CHECK_ERROR("clGetDeviceInfo")
-  clStatus =
-      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
-                      &(clDeviceProp.multiProcessorCount), NULL);
-  CHECK_ERROR("clGetDeviceInfo")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-  // memory allocation
-  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
-                          NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
-                             NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
-                          &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
-                              NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
-                               dim * sizeof(float), NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-
-  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
-                               NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
-                               nzcnt_len * sizeof(int), NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-
-  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
-
-  // memory copy
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
-                                  len * sizeof(float), h_data, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
-                                  len * sizeof(int), h_indices, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
-                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
-                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
-                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus =
-      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
-                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  size_t grid;
-  size_t block;
-
-  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
-                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
-
-  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
-  CHECK_ERROR("clSetKernelArg")
-  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
-  CHECK_ERROR("clSetKernelArg")
-  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
-  CHECK_ERROR("clSetKernelArg")
-  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
-  CHECK_ERROR("clSetKernelArg")
-  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
-  CHECK_ERROR("clSetKernelArg")
-  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
-  CHECK_ERROR("clSetKernelArg")
-
-  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
-  CHECK_ERROR("clSetKernelArg")
-  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
-  CHECK_ERROR("clSetKernelArg")
-
-  // main execution
-
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  int i;
-  for (i = 0; i < 50; i++) {
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
-                                      &block, 0, NULL, NULL);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
-  }
-
-  clStatus = clFinish(clCommandQueue);
-  CHECK_ERROR("clFinish")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-  // HtoD memory copy
-  clStatus =
-      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
-                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueReadBuffer")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  clStatus = clReleaseKernel(clKernel);
-  clStatus = clReleaseProgram(clProgram);
-
-  clStatus = clReleaseMemObject(d_data);
-  clStatus = clReleaseMemObject(d_indices);
-  clStatus = clReleaseMemObject(d_perm);
-  clStatus = clReleaseMemObject(sh_zcnt_int);
-  clStatus = clReleaseMemObject(jds_ptr_int);
-  clStatus = clReleaseMemObject(d_x_vector);
-  clStatus = clReleaseMemObject(d_Ax_vector);
-
-  clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  pb_PrintTimerSet(&timers);
-  if (parameters->outFile) {
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    outputData(parameters->outFile, h_Ax_vector, dim);
-  }
-  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-  // free((void*)clSource[0]);
-
-  free(h_data);
-  free(h_indices);
-  free(h_ptr);
-  free(h_perm);
-  free(h_nzcnt);
-  free(h_Ax_vector);
-  free(h_x_vector);
-
-  pb_FreeParameters(parameters);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c
deleted file mode 100644
index 2990031255acae7fe480b0fe7cdc79db7cb08287..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "ocl.h"
-#include <CL/cl.h>
-#include <stdio.h>
-#include <string.h>
-
-char *readFile(const char *fileName) {
-  FILE *fp;
-  fp = fopen(fileName, "r");
-  if (fp == NULL) {
-    printf("Error 1!\n");
-    exit(1);
-  }
-
-  fseek(fp, 0, SEEK_END);
-  long size = ftell(fp);
-  rewind(fp);
-
-  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
-  if (buffer == NULL) {
-    printf("Error 2!\n");
-    fclose(fp);
-    exit(1);
-  }
-
-  size_t res = fread(buffer, 1, size, fp);
-  if (res != size) {
-    printf("Error 3!\n");
-    fclose(fp);
-    exit(1);
-  }
-
-  buffer[size] = 0;
-  fclose(fp);
-  return buffer;
-}
-
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
-              size_t size) {
-  cl_int clStatus;
-  char *temp = (char *)malloc(size);
-  memset(temp, val, size);
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
-                                  0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-  free(temp);
-}
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h
deleted file mode 100644
index 42ff7b4d1059550293b56325d0cce2afea6c004b..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef __OCLH__
-#define __OCLH__
-
-typedef struct {
-  cl_uint major;
-  cl_uint minor;
-  cl_uint multiProcessorCount;
-} OpenCLDeviceProp;
-
-void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char *readFile(const char *);
-
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    printf("Error: %s!\n", errorMessage);                                      \
-    printf("Line: %d\n", __LINE__);                                            \
-    exit(1);                                                                   \
-  }
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/Makefile b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/Makefile
deleted file mode 100644
index c4ef6c0dac3a1646d63aca309b2d44138abb55d3..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=opencl
-SRCDIR_OBJS=main.o file.o
-APP_CUDALDFLAGS=-lm
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
-KERNEL_OBJS=kernel_offline.nvptx.s
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h
deleted file mode 100644
index 1a682890b3619ef712c5e5e3a7313e325935ec6f..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef _COMMON_H_
-#define _COMMON_H_
-#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
-#endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc
deleted file mode 100644
index 95cd65c4a0e013c60c6edd43077346a7efdad1ae..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <endian.h>
-#include <inttypes.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-#error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
-  FILE *fid = fopen(fName, "r");
-
-  if (fid == NULL) {
-    fprintf(stderr, "Cannot open input file\n");
-    exit(-1);
-  }
-
-  fread(nx, sizeof(int), 1, fid);
-  fread(ny, sizeof(int), 1, fid);
-  fread(nz, sizeof(int), 1, fid);
-  fclose(fid);
-}
-
-extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
-  FILE *fid = fopen(fName, "w");
-  uint32_t tmp32;
-  if (fid == NULL) {
-    fprintf(stderr, "Cannot open output file\n");
-    exit(-1);
-  }
-  tmp32 = nx * ny * nz;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-  fwrite(h_A0, sizeof(float), tmp32, fid);
-
-  fclose(fid);
-}
-
-extern "C" char *readFile(const char *fileName) {
-  FILE *fp;
-  fp = fopen(fileName, "r");
-  if (fp == NULL) {
-    printf("Error 1!\n");
-    exit(1);
-  }
-
-  fseek(fp, 0, SEEK_END);
-  long size = ftell(fp);
-  rewind(fp);
-
-  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
-  if (buffer == NULL) {
-    printf("Error 2!\n");
-    fclose(fp);
-    exit(1);
-  }
-
-  size_t res = fread(buffer, 1, size, fp);
-  if (res != size) {
-    printf("Error 3!\n");
-    fclose(fp);
-    exit(1);
-  }
-
-  buffer[size] = 0;
-  fclose(fp);
-  return buffer;
-}
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h
deleted file mode 100644
index b45c42371bbde3c3a39d88277adf39a8f537baab..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-#ifndef __FILEH__
-#define __FILEH__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inputData(char *fName, int *nx, int *ny, int *nz);
-void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
-char *readFile(const char *fileName);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel.cl b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel.cl
deleted file mode 100644
index 4c5d1263db5948e4e61ea2baa27261613cd0ea06..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include "common.h"
-
-__kernel void naive_kernel(float c0,float c1,__global float* A0,__global float *Anext,int nx,int ny,int nz)
-{
-    	int i = get_global_id(0)+1;
-    	int j = get_global_id(1)+1;
-    	int k = get_global_id(2)+1;
-
-if(i<nx-1)
-{
-		Anext[Index3D (nx, ny, i, j, k)] = c1 *
-		( A0[Index3D (nx, ny, i, j, k + 1)] +
-		  A0[Index3D (nx, ny, i, j, k - 1)] +
-		  A0[Index3D (nx, ny, i, j + 1, k)] +
-		  A0[Index3D (nx, ny, i, j - 1, k)] +
-	 	  A0[Index3D (nx, ny, i + 1, j, k)] +
-		  A0[Index3D (nx, ny, i - 1, j, k)] )
-		- A0[Index3D (nx, ny, i, j, k)] * c0;
-}
-}
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel_offline.cl b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel_offline.cl
deleted file mode 100644
index 4c5d1263db5948e4e61ea2baa27261613cd0ea06..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/kernel_offline.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include "common.h"
-
-__kernel void naive_kernel(float c0,float c1,__global float* A0,__global float *Anext,int nx,int ny,int nz)
-{
-    	int i = get_global_id(0)+1;
-    	int j = get_global_id(1)+1;
-    	int k = get_global_id(2)+1;
-
-if(i<nx-1)
-{
-		Anext[Index3D (nx, ny, i, j, k)] = c1 *
-		( A0[Index3D (nx, ny, i, j, k + 1)] +
-		  A0[Index3D (nx, ny, i, j, k - 1)] +
-		  A0[Index3D (nx, ny, i, j + 1, k)] +
-		  A0[Index3D (nx, ny, i, j - 1, k)] +
-	 	  A0[Index3D (nx, ny, i + 1, j, k)] +
-		  A0[Index3D (nx, ny, i - 1, j, k)] )
-		- A0[Index3D (nx, ny, i, j, k)] * c0;
-}
-}
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c
deleted file mode 100644
index ec47c22227648df094cbf03ea1b667943207207e..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c
+++ /dev/null
@@ -1,231 +0,0 @@
-
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <CL/cl.h>
-#include <parboil.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "common.h"
-#include "file.h"
-
-#define CHECK_ERROR(errorMessage)                                              \
-  if (clStatus != CL_SUCCESS) {                                                \
-    printf("Error: %s!\n", errorMessage);                                      \
-    printf("Line: %d\n", __LINE__);                                            \
-    exit(1);                                                                   \
-  }
-
-static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
-  int s = 0;
-  int i, j, k;
-  for (i = 0; i < nz; i++) {
-    for (j = 0; j < ny; j++) {
-      for (k = 0; k < nx; k++) {
-        fread(A0 + s, sizeof(float), 1, fp);
-        s++;
-      }
-    }
-  }
-  return 0;
-}
-
-int main(int argc, char **argv) {
-  struct pb_TimerSet timers;
-  struct pb_Parameters *parameters;
-
-  printf("OpenCL accelerated 7 points stencil codes****\n");
-  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-  parameters = pb_ReadParameters(&argc, argv);
-
-  // declaration
-  unsigned nx, ny, nz;
-  unsigned size;
-  int iteration;
-  float c0 = 1.0f / 6.0f;
-  float c1 = 1.0f / 6.0f / 6.0f;
-
-  if (argc < 5) {
-    printf("Usage: probe nx ny nz t\n"
-           "nx: the grid size x\n"
-           "ny: the grid size y\n"
-           "nz: the grid size z\n"
-           "t: the iteration time\n");
-    return -1;
-  }
-
-  nx = atoi(argv[1]);
-  if (nx < 1)
-    return -1;
-  ny = atoi(argv[2]);
-  if (ny < 1)
-    return -1;
-  nz = atoi(argv[3]);
-  if (nz < 1)
-    return -1;
-  iteration = atoi(argv[4]);
-  if (iteration < 1)
-    return -1;
-
-  // host data
-  float *h_A0;
-  float *h_Anext;
-  // load data from files
-
-  size = nx * ny * nz;
-
-  h_A0 = (float *)malloc(sizeof(float) * size);
-  h_Anext = (float *)malloc(sizeof(float) * size);
-  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-  FILE *fp = fopen(parameters->inpFiles[0], "rb");
-  read_data(h_A0, nx, ny, nz, fp);
-  fclose(fp);
-
-  pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  cl_int clStatus;
-  cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
-  CHECK_ERROR("clGetPlaformIDs")
-
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
-                                    (cl_context_properties)clPlatform, 0};
-
-  cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
-  CHECK_ERROR("clGetDeviceIDs")
-
-  cl_context clContext =
-      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
-  CHECK_ERROR("clCreateContextFromType")
-
-  cl_command_queue clCommandQueue = clCreateCommandQueue(
-      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-
-  /*cl_program clProgram;*/
-  /*cl_kernel clKernel;*/
-
-  /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s",
-   * "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/
-  const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
-  cl_program clProgram =
-      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
-  CHECK_ERROR("clCreateProgramWithSource")
-
-  char clOptions[50];
-  sprintf(clOptions, "-I src/opencl_base");
-  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
-  CHECK_ERROR("clBuildProgram")
-
-  cl_kernel clKernel = clCreateKernel(clProgram, "naive_kernel", &clStatus);
-  CHECK_ERROR("clCreateKernel")
-
-  // device
-  cl_mem d_A0;
-  cl_mem d_Anext;
-
-  memcpy(h_Anext, h_A0, sizeof(float) * size);
-
-  // memory allocation
-  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
-                        NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
-                           NULL, &clStatus);
-  CHECK_ERROR("clCreateBuffer")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  // memory copy
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
-                                  size * sizeof(float), h_A0, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
-                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  // only use 1D thread block
-  unsigned tx = 256;
-  size_t block[3] = {tx, 1, 1};
-  size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2};
-  // size_t grid[3] = {nx-2,ny-2,nz-2};
-  size_t offset[3] = {1, 1, 1};
-  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
-  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
-
-  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
-  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
-  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
-  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
-  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
-  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
-  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
-  CHECK_ERROR("clSetKernelArg")
-
-  // main execution
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  int t;
-  for (t = 0; t < iteration; t++) {
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
-                                      block, 0, NULL, NULL);
-    // printf("iteration %d\n",t)
-    CHECK_ERROR("clEnqueueNDRangeKernel")
-
-    cl_mem d_temp = d_A0;
-    d_A0 = d_Anext;
-    d_Anext = d_temp;
-
-    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
-    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
-  }
-
-  cl_mem d_temp = d_A0;
-  d_A0 = d_Anext;
-  d_Anext = d_temp;
-
-  clStatus = clFinish(clCommandQueue);
-  CHECK_ERROR("clFinish")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
-                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
-  CHECK_ERROR("clEnqueueReadBuffer")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  clStatus = clReleaseMemObject(d_A0);
-  clStatus = clReleaseMemObject(d_Anext);
-  clStatus = clReleaseKernel(clKernel);
-  clStatus = clReleaseProgram(clProgram);
-  clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext);
-  CHECK_ERROR("clReleaseContext")
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-  pb_PrintTimerSet(&timers);
-
-  if (parameters->outFile) {
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    outputData(parameters->outFile, h_Anext, nx, ny, nz);
-  }
-  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-  // free((void*)clSource[0]);
-
-  free(h_A0);
-  free(h_Anext);
-  pb_FreeParameters(parameters);
-
-  return 0;
-}
diff --git a/hpvm/test/parboil/common/mk/visc.mk b/hpvm/test/parboil/common/mk/visc.mk
index 7ef5b017f054661f2c4b6235bed43839a66d87ae..0a8984deeac5696557f4b6a220b4f0758f5aefcf 100755
--- a/hpvm/test/parboil/common/mk/visc.mk
+++ b/hpvm/test/parboil/common/mk/visc.mk
@@ -172,17 +172,9 @@ clean :
 	if [ -d $(BUILDDIR) ]; then rm -rf $(BUILDDIR); fi
 	if [ -d $(RUNDIR) ]; then rm -rf $(RUNDIR); fi
 
-#$(APP_BINS) : $(KERNEL_OCL) $(BIN)
-	#echo Generating $(APP_BINS) ...
-
-#TODO: Fix this. Remove echo to actually compile kernel to ptx
 $(KERNEL_OCL) : $(KERNEL_OPT)
 	$(OCLBE) $< -o $@
 
-$(SPIR_ASSEMBLY) : $(KERNEL_OPT)
-	python $(PYTHON_LLVM_40_34) $< $(BUILDDIR)/kernel_34.ll
-	$(LLVM_34_AS) $(BUILDDIR)/kernel_34.ll -o $@
-
 $(KERNEL_OPT) : $(KERNEL)
 	$(OPT) $(APP_OPTFLAGS) -S $< -o $@
 
@@ -193,7 +185,7 @@ $(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(VISC_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
 $(HOST) $(KERNEL): $(BUILDDIR)/$(VISC_OBJS)
-	$(OPT) --debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+	$(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST)
 
 $(RUNDIR) :
 	mkdir -p $(RUNDIR)
@@ -212,9 +204,6 @@ $(BUILDDIR)/%.ll : $(SRCDIR)/%.cpp
 
 $(BUILDDIR)/%.visc.ll: $(BUILDDIR)/%.ll
 	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
-	cat $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil/RUN.parboil.script $@ > $@.tmp
-	mv $@.tmp $(BUILDDIR)/$(APP).visc.ll
-	#@cp $(VISC_OBJS) $(BUILDDIR)/$(VISC_OBJS)
 
 $(BUILDDIR)/%.o : $(SRCDIR)/%.c
 	$(CC) $(CFLAGS) -c $< -o $@
diff --git a/hpvm/test/pipeline/Makefile b/hpvm/test/pipeline/Makefile
index 52ac5a9b7e33448249ac3cb74a4bc9a473787d4b..e3572ecdfc4322ecd12c25517880b87f94c0f9e1 100644
--- a/hpvm/test/pipeline/Makefile
+++ b/hpvm/test/pipeline/Makefile
@@ -39,8 +39,8 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
 LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
 
-VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
+VISC_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/visc-rt
+VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc
 
 TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
 
@@ -94,9 +94,6 @@ $(EXE) : $(HOST_LINKED)
 $(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
-	make -C $(LLVM_LIB_PATH)
-
 $(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
 	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
 
diff --git a/hpvm/test/pipeline/Makefile.config b/hpvm/test/pipeline/Makefile.config
index ffb2942911313421ce9b9186a392578d7bcaf4c7..1bdb62dec493fc63e581ae8204ea736c45ed5f7d 100644
--- a/hpvm/test/pipeline/Makefile.config
+++ b/hpvm/test/pipeline/Makefile.config
@@ -3,23 +3,21 @@ CUDA_LIB_PATH=$(CUDA_PATH)/lib64
 OPENCL_PATH=/software/cuda-9.1
 OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64
 
-LLVM_SRC_ROOT=/home/aejjeh/work_dir/hpvm-release/hpvm/llvm/
-# NOTE: You may need to configure this based on your root path.
-VISC_SRC_ROOT=$(LLVM_SRC_ROOT)
+LLVM_SRC_ROOT=/home/aejjeh/work_dir/hpvm-reorg-9-temp/hpvm/llvm/
 
-VISC_BUILD_DIR =$(VISC_SRC_ROOT)/../build
-CC = $(VISC_BUILD_DIR)/bin/clang
-PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
-OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe
+LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build
+CC = $(LLVM_BUILD_DIR)/bin/clang
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include
+OCLBE = $(LLVM_BUILD_DIR)/bin/llvm-cbe
 
-CXX = $(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
+CXX = $(LLVM_BUILD_DIR)/bin/clang++
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include
 
-LINKER = $(VISC_BUILD_DIR)/bin/clang++
+LINKER = $(LLVM_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH)
 
-LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib
-LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin
+LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib
+LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin
 
 OPT = $(LLVM_BIN_PATH)/opt
 LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link
diff --git a/hpvm/test/pipeline/Makefile.config.example b/hpvm/test/pipeline/Makefile.config.example
index 269f0b7df273c958f0cd20a0f935716a329e00ae..2627ca508f17acb96c858bf4473eed4d89ebec20 100644
--- a/hpvm/test/pipeline/Makefile.config.example
+++ b/hpvm/test/pipeline/Makefile.config.example
@@ -1,23 +1,21 @@
-CUDA_PATH=/usr/local/cuda 
-CUDA_LIB_PATH=/usr/local/cuda/lib64
-OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64
-OPENCL_LIB_PATH=$(OPENCL_PATH)/lib
+CUDA_PATH=/software/cuda-9.1
+CUDA_LIB_PATH=$(CUDA_PATH)/lib64
+OPENCL_PATH=/software/cuda-9.1
+OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64
 
-# NOTE: You may need to configure this based on your root path.
-VISC_SRC_ROOT=$(LLVM_SRC_ROOT)
+LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build
+CC = $(LLVM_BUILD_DIR)/bin/clang
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include
+OCLBE = $(LLVM_BUILD_DIR)/bin/llvm-cbe
 
-VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build
-CC = $(VISC_BUILD_DIR)/bin/clang
-PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
+CXX = $(LLVM_BUILD_DIR)/bin/clang++
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I../include -I$(LLVM_BUILD_DIR)/include
 
-CXX = $(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
-
-LINKER = $(VISC_BUILD_DIR)/bin/clang++
+LINKER = $(LLVM_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH)
 
-LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib
-LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin
+LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib
+LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin
 
 OPT = $(LLVM_BIN_PATH)/opt
 LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link
diff --git a/hpvm/test/template/Makefile b/hpvm/test/template/Makefile
index 82d7c61e2b77947da770aab506c45919ee139a63..3aa4bd1d6f2c3f7bb2be07ba5e662c5b6faf1655 100644
--- a/hpvm/test/template/Makefile
+++ b/hpvm/test/template/Makefile
@@ -41,8 +41,8 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
 LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
 
-VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
+VISC_RT_PATH = $(LLVM_BUILD_ROOT)/tools/hpvm/projects/visc-rt
+VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc
 
 TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
 
@@ -69,9 +69,7 @@ OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
 TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
 KERNEL = $(TEST_OBJS).kernels.ll
 
-ifeq ($(TARGET),seq)
-else
-  KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll
+ifeq ($(TARGET),gpu)
   KERNEL_OCL = $(TEST_OBJS).kernels.cl
 endif
 
@@ -88,7 +86,7 @@ endif
 default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(EXE)
 
 $(KERNEL_OCL) : $(KERNEL)
-	$(OCLBE) -debug $< -o $@
+	$(OCLBE) $< -o $@
 
 $(EXE) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
@@ -96,11 +94,8 @@ $(EXE) : $(HOST_LINKED)
 $(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
-	make -C $(LLVM_LIB_PATH)
-
 $(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
-	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+	$(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -112,6 +107,6 @@ $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.cc
 	$(CC) $(CXXFLAGS) -emit-llvm -S -o $@ $<
 
 $(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
-	$(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@
+	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
 
 ## END HPVM MAKEFILE
diff --git a/hpvm/test/template/Makefile.config.example b/hpvm/test/template/Makefile.config.example
index 269f0b7df273c958f0cd20a0f935716a329e00ae..9b182a3b7998f3a302a758796a9fe950740b1366 100644
--- a/hpvm/test/template/Makefile.config.example
+++ b/hpvm/test/template/Makefile.config.example
@@ -1,23 +1,23 @@
-CUDA_PATH=/usr/local/cuda 
-CUDA_LIB_PATH=/usr/local/cuda/lib64
-OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64
-OPENCL_LIB_PATH=$(OPENCL_PATH)/lib
+# These paths should be set to your local directories of CUDA and OpenCL
+CUDA_PATH=/software/cuda-9.1 
+CUDA_LIB_PATH=$(CUDA_PATH)/lib64
+OPENCL_PATH=$(CUDA_PATH)
+OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64
 
-# NOTE: You may need to configure this based on your root path.
-VISC_SRC_ROOT=$(LLVM_SRC_ROOT)
+# These paths should be set based on your HPVM build and source locations.
 
-VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build
-CC = $(VISC_BUILD_DIR)/bin/clang
-PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
+LLVM_BUILD_DIR =$(LLVM_SRC_ROOT)/../build
+CC = $(LLVM_BUILD_DIR)/bin/clang
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(OPENCL_PATH)/include/CL/ -I$(LLVM_BUILD_DIR)/include
 
-CXX = $(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
+CXX = $(LLVM_BUILD_DIR)/bin/clang++
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include  -I$(OPENCL_PATH)/include/CL/ -I$(LLVM_BUILD_DIR)/include
 
-LINKER = $(VISC_BUILD_DIR)/bin/clang++
+LINKER = $(LLVM_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH)
 
-LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib
-LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin
+LLVM_LIB_PATH = $(LLVM_BUILD_DIR)/lib
+LLVM_BIN_PATH = $(LLVM_BUILD_DIR)/bin
 
 OPT = $(LLVM_BIN_PATH)/opt
 LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link
diff --git a/hpvm/test/template/README.md b/hpvm/test/template/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..198604817d5a8463e555451a1188b426ec4e31cd
--- /dev/null
+++ b/hpvm/test/template/README.md
@@ -0,0 +1,37 @@
+Compilation of a project with HPVM is a multi-step process.
+Let's look at the compilation of the `pipeline` test for gpu as an example.
+
+`clang` is used to produce an LLVM IR file that contains the HPVM intrinsics in the form of function calls.
+```
+/.../hpvm/build/bin/clang -Isrc/ -I -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -DDEVICE=GPU_TARGET -emit-llvm -S -o build/main.ll src/main.cc
+```
+
+`opt` is used to invoke the GenVISC pass, which converts the HPVM function calls to LLVM intrinsics.
+```
+/.../hpvm/build/bin/opt -debug-only=genvisc -load LLVMGenVISC.so -genvisc -globaldce -visc-timers-gen build/main.ll -S -o build/main.visc.ll
+```
+
+`opt` is used again to invoke the BuildDFG pass, which converts the textual representation to the internal HPVM representation.
+```
+/.../hpvm/build/bin/opt -debug -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -visc-timers-x86 -visc-timers-ptx -S build/main.visc.ll -o build/pipeline-gpu.host.ll
+```
+
+`llvm-cbe` is a C backend for LLVM. It is used here to create the OpenCL kernel.
+```
+/.../hpvm/build/bin/llvm-cbe -debug build/gpu/main.visc.ll.kernels.ll -o build/gpu/main.visc.ll.kernels.cl
+```
+
+`clang` is used again to compile a separate source file that contains I/O code.
+```
+/.../hpvm/build/bin/clang -Isrc/ -I -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -emit-llvm -S -o build/gpu/io.ll src/io.cc
+```
+
+`llvm-link` is used to link against the VISC runtime.
+```
+/.../hpvm/build/bin/llvm-link build/gpu/pipeline-gpu.host.ll build/gpu/io.ll /.../hpvm/llvm/tools/hpvm/projects/visc-rt/visc-rt.ll -S -o build/gpu/pipeline-gpu.linked.ll
+```
+
+`clang++` is used to do the final linking against OpenCL and emit the binary.
+```
+/.../hpvm/build/bin/clang++ -O3 `pkg-config opencv --libs` -lm -lpthread -lrt -lOpenCL -L/software/cuda-9.1/lib64 build/gpu/pipeline-gpu.linked.ll -o pipeline-gpu
+```
\ No newline at end of file