diff --git a/.gitignore b/.gitignore
index a17e2716a5e90ee10ac32c19c3fc2f29f953f286..0da6a3671489a915ad13194ada7007d94dd13321 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,5 +34,5 @@ hpvm/install/
 hpvm/llvm/
 hpvm/llvm-*.src.tar.xz
 hpvm/llvm-*.src/
-hpvm/projects/visc-rt/visc-rt.ll
+hpvm/projects/hpvm-rt/hpvm-rt.ll
 hpvm/test/**/build/
diff --git a/hpvm/include/BuildDFG/BuildDFG.h b/hpvm/include/BuildDFG/BuildDFG.h
index 28230e135beb68c07c998e607fa3d03d40a66791..ca4c616da5f4076528b1294992ec8ad3ab768809 100644
--- a/hpvm/include/BuildDFG/BuildDFG.h
+++ b/hpvm/include/BuildDFG/BuildDFG.h
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SupportVISC/DFGraph.h"
+#include "SupportHPVM/DFGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -58,10 +58,10 @@ public:
   // Functions
   virtual bool runOnModule(Module &M);
 
-  static bool isViscLaunchIntrinsic(Instruction *I);
-  static bool isViscGraphIntrinsic(Instruction *I);
-  static bool isViscQueryIntrinsic(Instruction *I);
-  static bool isViscIntrinsic(Instruction *I);
+  static bool isHPVMLaunchIntrinsic(Instruction *I);
+  static bool isHPVMGraphIntrinsic(Instruction *I);
+  static bool isHPVMQueryIntrinsic(Instruction *I);
+  static bool isHPVMIntrinsic(Instruction *I);
   static bool isTypeCongruent(Type *L, Type *R);
 
   // TODO: Maybe make these fields const
diff --git a/hpvm/include/GenVISC/GenVISC.h b/hpvm/include/GenHPVM/GenHPVM.h
similarity index 67%
rename from hpvm/include/GenVISC/GenVISC.h
rename to hpvm/include/GenHPVM/GenHPVM.h
index 1db9929be70fdc4335e23d7e879248f0ebb45c07..24798bc2740e2299f67cc7f515437339f2fe8310 100644
--- a/hpvm/include/GenVISC/GenVISC.h
+++ b/hpvm/include/GenHPVM/GenHPVM.h
@@ -1,4 +1,4 @@
-//== GenVISC.h - Header file for "LLVM IR to VISC IR Pass" =//
+//== GenHPVM.h - Header file for "LLVM IR to HPVM IR Pass" =//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SupportVISC/VISCTimer.h"
+#include "SupportHPVM/HPVMTimer.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
@@ -18,24 +18,24 @@
 
 using namespace llvm;
 
-namespace genvisc {
-// GenVISC - The first implementation.
-struct GenVISC : public ModulePass {
+namespace genhpvm {
+// GenHPVM - The first implementation.
+struct GenHPVM : public ModulePass {
   static char ID; // Pass identification, replacement for typeid
-  GenVISC() : ModulePass(ID) {}
+  GenHPVM() : ModulePass(ID) {}
 
 private:
   // Member variables
   Module *M;
-  FunctionCallee llvm_visc_initializeTimerSet;
-  FunctionCallee llvm_visc_switchToTimer;
-  FunctionCallee llvm_visc_printTimerSet;
+  FunctionCallee llvm_hpvm_initializeTimerSet;
+  FunctionCallee llvm_hpvm_switchToTimer;
+  FunctionCallee llvm_hpvm_printTimerSet;
 
   GlobalVariable *TimerSet;
 
   // Functions
   void initializeTimerSet(Instruction *);
-  void switchToTimer(enum visc_TimerID, Instruction *);
+  void switchToTimer(enum hpvm_TimerID, Instruction *);
   void printTimerSet(Instruction *);
   Value *getStringPointer(const Twine &S, Instruction *InsertBefore,
                           const Twine &Name = "");
@@ -45,4 +45,4 @@ public:
   virtual bool runOnModule(Module &M);
 };
 
-} // namespace genvisc
+} // namespace genhpvm
diff --git a/hpvm/include/SupportVISC/DFG2LLVM.h b/hpvm/include/SupportHPVM/DFG2LLVM.h
similarity index 82%
rename from hpvm/include/SupportVISC/DFG2LLVM.h
rename to hpvm/include/SupportHPVM/DFG2LLVM.h
index b9e4cc4158b71ab18fbeadf2e4d094055feb6149..07147c6d909f5352dd886b5f8bc1a2b0ae434ffe 100644
--- a/hpvm/include/SupportVISC/DFG2LLVM.h
+++ b/hpvm/include/SupportHPVM/DFG2LLVM.h
@@ -1,7 +1,7 @@
 #ifndef __DFG2LLVM_H__
 #define __DFG2LLVM_H__
 
-//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===//
+//===---- DFG2LLVM.h - Header file for "HPVM Dataflow Graph to Target" ----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,9 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "BuildDFG/BuildDFG.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCTimer.h"
-#include "SupportVISC/VISCUtils.h"
+#include "SupportHPVM/HPVMHint.h"
+#include "SupportHPVM/HPVMTimer.h"
+#include "SupportHPVM/HPVMUtils.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -26,7 +26,7 @@ using namespace builddfg;
 
 #define TIMER(X)                                                               \
   do {                                                                         \
-    if (VISCTimer) {                                                           \
+    if (HPVMTimer) {                                                           \
       X;                                                                       \
     }                                                                          \
   } while (0)
@@ -37,8 +37,8 @@ using namespace builddfg;
 
 namespace dfg2llvm {
 // Helper Functions
-static inline ConstantInt *getTimerID(Module &, enum visc_TimerID);
-static inline ConstantInt *getTimerID(Module &, enum visc::Target);
+static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID);
+static inline ConstantInt *getTimerID(Module &, enum hpvm::Target);
 
 bool hasAttribute(Function *, unsigned, Attribute::AttrKind);
 
@@ -69,7 +69,7 @@ protected:
   // Member variables
   Module &M;
   BuildDFG &DFG;
-  bool VISCTimer = false;
+  bool HPVMTimer = false;
   std::string TargetName = "None";
 
   // Map from Old function associated with DFNode to new cloned function with
@@ -78,12 +78,12 @@ protected:
   // "Have we visited this function before?")
   DenseMap<DFNode *, Value *> OutputMap;
 
-  // VISC Runtime API
+  // HPVM Runtime API
   std::unique_ptr<Module> runtimeModule;
 
-  FunctionCallee llvm_visc_initializeTimerSet;
-  FunctionCallee llvm_visc_switchToTimer;
-  FunctionCallee llvm_visc_printTimerSet;
+  FunctionCallee llvm_hpvm_initializeTimerSet;
+  FunctionCallee llvm_hpvm_switchToTimer;
+  FunctionCallee llvm_hpvm_printTimerSet;
   GlobalVariable *TimerSet;
   GlobalVariable *GraphIDAddr;
   Instruction *InitCall;
@@ -109,7 +109,7 @@ protected:
 
   // Virtual Functions
   virtual void initializeTimerSet(Instruction *);
-  virtual void switchToTimer(enum visc_TimerID, Instruction *);
+  virtual void switchToTimer(enum hpvm_TimerID, Instruction *);
   virtual void printTimerSet(Instruction *);
 
   virtual ~CodeGenTraversal() {}
@@ -118,9 +118,9 @@ public:
   // Constructor
   CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
 
-  static bool checkPreferredTarget(DFNode *N, visc::Target T);
-  static bool preferredTargetIncludes(DFNode *N, visc::Target T);
-  visc::Target getPreferredTarget(DFNode *N);
+  static bool checkPreferredTarget(DFNode *N, hpvm::Target T);
+  static bool preferredTargetIncludes(DFNode *N, hpvm::Target T);
+  hpvm::Target getPreferredTarget(DFNode *N);
 
   virtual void visit(DFInternalNode *N) {
     // If code has already been generated for this internal node, skip the
@@ -157,25 +157,25 @@ public:
 
 // -------------- CodeGenTraversal Implementation -----------------
 
-bool CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) {
+bool CodeGenTraversal::checkPreferredTarget(DFNode *N, hpvm::Target T) {
   Function *F = N->getFuncPointer();
   Module *M = F->getParent();
   NamedMDNode *HintNode;
   switch (T) {
-  case visc::GPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+  case hpvm::GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu");
     break;
-  case visc::SPIR_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+  case hpvm::SPIR_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_spir");
     break;
-  case visc::CUDNN_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn");
+  case hpvm::CUDNN_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cudnn");
     break;
-  case visc::PROMISE_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_promise");
+  case hpvm::PROMISE_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_promise");
     break;
-  case visc::CPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+  case hpvm::CPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu");
     break;
   default:
     llvm_unreachable("Target Not supported yet!");
@@ -190,37 +190,37 @@ bool CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) {
   return false;
 }
 
-visc::Target CodeGenTraversal::getPreferredTarget(DFNode *N) {
-  return viscUtils::getPreferredTarget(N->getFuncPointer());
+hpvm::Target CodeGenTraversal::getPreferredTarget(DFNode *N) {
+  return hpvmUtils::getPreferredTarget(N->getFuncPointer());
 }
 
-bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, visc::Target T) {
+bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, hpvm::Target T) {
 
   Function *F = N->getFuncPointer();
   Module *M = F->getParent();
   std::vector<NamedMDNode *> HintNode;
   switch (T) {
-  case visc::GPU_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu"));
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
+  case hpvm::GPU_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_gpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu"));
     break;
-  case visc::SPIR_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir"));
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+  case hpvm::SPIR_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_spir"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_spir"));
     break;
-  case visc::CPU_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu"));
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+  case hpvm::CPU_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_spir"));
     break;
-  case visc::CUDNN_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn"));
+  case hpvm::CUDNN_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cudnn"));
     break;
-  case visc::PROMISE_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise"));
+  case hpvm::PROMISE_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_promise"));
     break;
-  case visc::CPU_OR_GPU_TARGET:
-  case visc::CPU_OR_SPIR_TARGET:
+  case hpvm::CPU_OR_GPU_TARGET:
+  case hpvm::CPU_OR_SPIR_TARGET:
     assert(false && "Target should be one of CPU/GPU/SPIR\n");
     break;
   default:
@@ -308,11 +308,11 @@ Function *CodeGenTraversal::addArgument(Function *F, Type *Ty,
   Function *newF = Function::Create(FTy, F->getLinkage(),
                                     F->getName() + "_cloned", F->getParent());
   renameNewArgument(newF, name);
-  newF = viscUtils::cloneFunction(F, newF, false);
+  newF = hpvmUtils::cloneFunction(F, newF, false);
 
   // Check if the function is used by a metadata node
   if (F->isUsedByMetadata()) {
-    viscUtils::fixHintMetadata(*F->getParent(), F, newF);
+    hpvmUtils::fixHintMetadata(*F->getParent(), F, newF);
   }
 
   return newF;
@@ -396,32 +396,32 @@ Argument *CodeGenTraversal::getArgumentAt(Function *F, unsigned offset) {
 }
 
 void CodeGenTraversal::initTimerAPI() {
-  DECLARE(llvm_visc_initializeTimerSet);
-  DECLARE(llvm_visc_switchToTimer);
-  DECLARE(llvm_visc_printTimerSet);
+  DECLARE(llvm_hpvm_initializeTimerSet);
+  DECLARE(llvm_hpvm_switchToTimer);
+  DECLARE(llvm_hpvm_printTimerSet);
 }
 
 // Timer Routines
 // Initialize the timer set
 void CodeGenTraversal::initializeTimerSet(Instruction *InsertBefore) {
-  // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet <<
+  // DEBUG(errs() << "Inserting call to: " << *llvm_hpvm_initializeTimerSet <<
   // "\n");
   TIMER(TimerSet = new GlobalVariable(
             M, Type::getInt8PtrTy(M.getContext()), false,
             GlobalValue::CommonLinkage,
             Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-            Twine("viscTimerSet_") + TargetName);
+            Twine("hpvmTimerSet_") + TargetName);
         DEBUG(errs() << "New global variable: " << *TimerSet << "\n");
 
-        Value *TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
+        Value *TimerSetAddr = CallInst::Create(llvm_hpvm_initializeTimerSet,
                                                None, "", InsertBefore);
         new StoreInst(TimerSetAddr, TimerSet, InsertBefore););
 }
 
-void CodeGenTraversal::switchToTimer(enum visc_TimerID timer,
+void CodeGenTraversal::switchToTimer(enum hpvm_TimerID timer,
                                      Instruction *InsertBefore) {
   Value *switchArgs[] = {TimerSet, getTimerID(M, timer)};
-  TIMER(CallInst::Create(llvm_visc_switchToTimer,
+  TIMER(CallInst::Create(llvm_hpvm_switchToTimer,
                          ArrayRef<Value *>(switchArgs, 2), "", InsertBefore));
 }
 
@@ -430,16 +430,16 @@ void CodeGenTraversal::printTimerSet(Instruction *InsertBefore) {
   TIMER(TimerName =
             getStringPointer(TargetName + Twine("_Timer"), InsertBefore));
   Value *printArgs[] = {TimerSet, TimerName};
-  TIMER(CallInst::Create(llvm_visc_printTimerSet,
+  TIMER(CallInst::Create(llvm_hpvm_printTimerSet,
                          ArrayRef<Value *>(printArgs, 2), "", InsertBefore));
 }
 
 // Implementation of Helper Functions
-static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) {
+static inline ConstantInt *getTimerID(Module &M, enum hpvm_TimerID timer) {
   return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
 }
 
-static inline ConstantInt *getTargetID(Module &M, enum visc::Target T) {
+static inline ConstantInt *getTargetID(Module &M, enum hpvm::Target T) {
   return ConstantInt::get(Type::getInt32Ty(M.getContext()), T);
 }
 
diff --git a/hpvm/include/SupportVISC/DFGTreeTraversal.h b/hpvm/include/SupportHPVM/DFGTreeTraversal.h
similarity index 100%
rename from hpvm/include/SupportVISC/DFGTreeTraversal.h
rename to hpvm/include/SupportHPVM/DFGTreeTraversal.h
diff --git a/hpvm/include/SupportVISC/DFGraph.h b/hpvm/include/SupportHPVM/DFGraph.h
similarity index 94%
rename from hpvm/include/SupportVISC/DFGraph.h
rename to hpvm/include/SupportHPVM/DFGraph.h
index 0c224a344c4ec342f52f4816280e101518ba43dd..d904e2401d7e9a58a38e9bca024de1a437cd56d1 100644
--- a/hpvm/include/SupportVISC/DFGraph.h
+++ b/hpvm/include/SupportHPVM/DFGraph.h
@@ -20,8 +20,8 @@
 #ifndef LLVM_IR_DFGRAPH_H
 #define LLVM_IR_DFGRAPH_H
 
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
+#include "SupportHPVM/HPVMHint.h"
+#include "SupportHPVM/HPVMUtils.h"
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -158,7 +158,7 @@ public:
   }
 };
 
-// DFNode represents a single VISC Dataflow Node in LLVM.
+// DFNode represents a single HPVM Dataflow Node in LLVM.
 //
 // A Dataflow Node basically consists of
 // 1. Pointer to a function describing this dataflow node
@@ -210,8 +210,8 @@ private:
                                   ///< hierarchy
   unsigned Rank;                  ///< Ordering based on toplogical sort
   const DFNodeKind Kind;          ///< Kind of Node Internal/Leaf
-  visc::Target Tag;               ///< Code Generated for which backend
-  visc::Target Hint;              ///< To store preferred backend
+  hpvm::Target Tag;               ///< Code Generated for which backend
+  hpvm::Target Hint;              ///< To store preferred backend
 
 public:
   virtual ~DFNode() {
@@ -287,13 +287,13 @@ public:
 
   DFNodeKind getKind() const { return Kind; }
 
-  DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
+  DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint,
          DFInternalNode *_Parent, unsigned _NumOfDim,
          std::vector<Value *> _DimLimits, DFNodeKind _K);
 
   bool isRoot() const {
     // It is a root node is it was created from a launch intrinsic
-    if (II->getCalledFunction()->getName().equals("llvm.visc.launch")) {
+    if (II->getCalledFunction()->getName().equals("llvm.hpvm.launch")) {
       assert(Level == 0 && "Root node's level is zero.");
       return true;
     }
@@ -326,9 +326,9 @@ public:
 
   unsigned getRank() const { return Rank; }
 
-  void setTag(visc::Target T) { Tag = T; }
+  void setTag(hpvm::Target T) { Tag = T; }
 
-  visc::Target getTag() const { return Tag; }
+  hpvm::Target getTag() const { return Tag; }
 
   void *getProperty(PropertyKind PType) {
     assert(PropertyList.count(PType) == 1 &&
@@ -342,24 +342,24 @@ public:
     PropertyList[PType] = PValue;
   }
 
-  void setGenFunc(Function *F, visc::Target T) {
+  void setGenFunc(Function *F, hpvm::Target T) {
     GenFunc = F;
     Tag = T;
   }
 
   Function *getGenFunc() const { return GenFunc; }
 
-  void setHasX86FuncForTarget(visc::Target T, bool isX86Func) {
+  void setHasX86FuncForTarget(hpvm::Target T, bool isX86Func) {
     switch (T) {
-    case visc::None:
+    case hpvm::None:
       return; // Do nothing.
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       GenFuncInfo.cpu_hasX86Func = isX86Func;
       break;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       GenFuncInfo.gpu_hasX86Func = isX86Func;
       break;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       break;
     default:
       assert(false && "Unknown target\n");
@@ -368,15 +368,15 @@ public:
     return;
   }
 
-  bool hasX86GenFuncForTarget(visc::Target T) const {
+  bool hasX86GenFuncForTarget(hpvm::Target T) const {
     switch (T) {
-    case visc::None:
+    case hpvm::None:
       return false;
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       return GenFuncInfo.cpu_hasX86Func;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       return GenFuncInfo.gpu_hasX86Func;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n");
     default:
       assert(false && "Unknown target\n");
@@ -384,10 +384,10 @@ public:
     return false;
   }
 
-  void addGenFunc(Function *F, visc::Target T, bool isX86Func) {
+  void addGenFunc(Function *F, hpvm::Target T, bool isX86Func) {
 
     switch (T) {
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       if (GenFuncs.CPUGenFunc != NULL) {
         DEBUG(errs() << "Warning: Second generated CPU function for node "
                      << FuncPointer->getName() << "\n");
@@ -395,7 +395,7 @@ public:
       GenFuncs.CPUGenFunc = F;
       GenFuncInfo.cpu_hasX86Func = isX86Func;
       break;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       if (GenFuncs.GPUGenFunc != NULL) {
         DEBUG(errs() << "Warning: Second generated GPU function for node "
                      << FuncPointer->getName() << "\n");
@@ -403,25 +403,25 @@ public:
       GenFuncs.GPUGenFunc = F;
       GenFuncInfo.gpu_hasX86Func = isX86Func;
       break;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       assert(false && "A node function should be set with a tag specifying its \
                 type, not the node hint itself\n");
     default:
       assert(false && "Unknown target for generated function\n");
     }
 
-    Tag = viscUtils::getUpdatedTag(Tag, T);
+    Tag = hpvmUtils::getUpdatedTag(Tag, T);
   }
 
-  Function *getGenFuncForTarget(visc::Target T) const {
+  Function *getGenFuncForTarget(hpvm::Target T) const {
     switch (T) {
-    case visc::None:
+    case hpvm::None:
       return NULL;
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       return GenFuncs.CPUGenFunc;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       return GenFuncs.GPUGenFunc;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       assert(false &&
              "Requesting genarated node function with dual tag instead of \
                 CPU/GPU/SPIR/CUDNN/PROMISE\n");
@@ -431,19 +431,19 @@ public:
     return NULL;
   }
 
-  void removeGenFuncForTarget(visc::Target T) {
+  void removeGenFuncForTarget(hpvm::Target T) {
     switch (T) {
-    case visc::None:
+    case hpvm::None:
       return;
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       GenFuncs.CPUGenFunc = NULL;
       GenFuncInfo.cpu_hasX86Func = false;
       break;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       GenFuncs.GPUGenFunc = NULL;
       GenFuncInfo.gpu_hasX86Func = false;
       break;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       assert(false &&
              "Removing genarated node function with dual tag instead of \
                 CPU/GPU/SPIR/CUDNN/PROMISE\n");
@@ -453,9 +453,9 @@ public:
     return;
   }
 
-  void setTargetHint(visc::Target T) { Hint = T; }
+  void setTargetHint(hpvm::Target T) { Hint = T; }
 
-  visc::Target getTargetHint() const { return Hint; }
+  hpvm::Target getTargetHint() const { return Hint; }
 
   bool isDummyNode() const { return isEntryNode() || isExitNode(); }
 
@@ -496,7 +496,7 @@ private:
   DFGraph *childGraph; ///< Pointer to dataflow graph
 
   // Constructor
-  DFInternalNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+  DFInternalNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
                  DFInternalNode *Parent, int NumOfDim,
                  std::vector<Value *> DimLimits)
       : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits,
@@ -508,7 +508,7 @@ private:
 public:
   static DFInternalNode *
   Create(IntrinsicInst *II, Function *FuncPointer,
-         visc::Target Hint = visc::CPU_TARGET, DFInternalNode *Parent = NULL,
+         hpvm::Target Hint = hpvm::CPU_TARGET, DFInternalNode *Parent = NULL,
          int NumOfDim = 0,
          std::vector<Value *> DimLimits = std::vector<Value *>()) {
 
@@ -539,14 +539,14 @@ class DFLeafNode : public DFNode {
 
 private:
   // Constructor
-  DFLeafNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+  DFLeafNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
              DFInternalNode *Parent, int NumOfDim = 0,
              std::vector<Value *> DimLimits = std::vector<Value *>())
       : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode) {}
 
 public:
   static DFLeafNode *
-  Create(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+  Create(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
          DFInternalNode *Parent, int NumOfDim = 0,
          std::vector<Value *> DimLimits = std::vector<Value *>()) {
     return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits);
@@ -558,7 +558,7 @@ public:
   //  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
 };
 
-// DFEdge represents a single VISC Dataflow Edge in LLVM.
+// DFEdge represents a single HPVM Dataflow Edge in LLVM.
 //
 // A Dataflow Edge basically consists of
 // 1. Pointer to the dataflow node that is the source of this edge
@@ -634,8 +634,8 @@ DFGraph::DFGraph(DFInternalNode *P) {
   Parent = P;
   // Create dummy entry and exit nodes and add them to the graph
   Entry =
-      DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent);
-  Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent);
+      DFLeafNode::Create(NULL, Parent->getFuncPointer(), hpvm::None, Parent);
+  Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), hpvm::None, Parent);
   addChildDFNode(Entry);
   addChildDFNode(Exit);
 }
@@ -655,7 +655,7 @@ bool DFGraph::isStreaming() {
 }
 
 //===--------------------- DFNode Outlined Functions --------------===//
-DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
+DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint,
                DFInternalNode *_Parent, unsigned _NumOfDim,
                std::vector<Value *> _DimLimits, DFNodeKind _K)
     : II(_II), FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim),
@@ -663,7 +663,7 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
 
   Type *Ty = FuncPointer->getFunctionType()->getReturnType();
 
-  // Allow the return type to be void too, in the hVISC IR. If return type is
+  // Allow the return type to be void too, in the HPVM IR. If return type is
   // void, create an empty struct type and keep that as the return type of the
   // node.
   if (Ty->isVoidTy())
@@ -683,7 +683,7 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
   Level = (_Parent) ? _Parent->getLevel() + 1 : 0;
   Rank = 0;
 
-  Tag = visc::None;
+  Tag = hpvm::None;
   GenFuncs.CPUGenFunc = NULL;
   GenFuncs.GPUGenFunc = NULL;
   GenFuncs.SPIRGenFunc = NULL;
diff --git a/hpvm/include/SupportVISC/VISCHint.h b/hpvm/include/SupportHPVM/HPVMHint.h
similarity index 78%
rename from hpvm/include/SupportVISC/VISCHint.h
rename to hpvm/include/SupportHPVM/HPVMHint.h
index 99266b071843ab0417ea73c6e4533dfa381d52cd..1ef4c6eb3b986328080caa9e99e96f444978c03e 100644
--- a/hpvm/include/SupportVISC/VISCHint.h
+++ b/hpvm/include/SupportHPVM/HPVMHint.h
@@ -1,4 +1,4 @@
-//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===//
+//===------------ HPVMHint.h - Header file for "HPVM Hint API" ------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef VISC_HINT_HEADER
-#define VISC_HINT_HEADER
+#ifndef HPVM_HINT_HEADER
+#define HPVM_HINT_HEADER
 
 /************************** Hint Routines ***************************/
 #ifdef __cplusplus
-namespace visc {
+namespace hpvm {
 #endif
 
 enum Target {
@@ -32,4 +32,4 @@ enum Target {
 }
 #endif
 
-#endif // VISC_HINT_HEADER
+#endif // HPVM_HINT_HEADER
diff --git a/hpvm/include/SupportHPVM/HPVMTimer.h b/hpvm/include/SupportHPVM/HPVMTimer.h
new file mode 100644
index 0000000000000000000000000000000000000000..05b24d41d6d50c61cd38b458676dbf79d28a917f
--- /dev/null
+++ b/hpvm/include/SupportHPVM/HPVMTimer.h
@@ -0,0 +1,151 @@
+//===------------ HPVMTimer.h - Header file for "HPVM Timer API" ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HPVM_TIMER_HEADER
+#define HPVM_TIMER_HEADER
+
+/************************** Timer Routines ***************************/
+extern "C" {
+
+/* A time or duration. */
+//#if _POSIX_VERSION >= 200112L
+typedef unsigned long long hpvm_Timestamp; /* time in microseconds */
+//#else
+//# error "Timestamps not implemented"
+//#endif
+
+enum hpvm_TimerState {
+  hpvm_Timer_STOPPED,
+  hpvm_Timer_RUNNING,
+};
+
+struct hpvm_Timer {
+  enum hpvm_TimerState state;
+  hpvm_Timestamp elapsed; /* Amount of time elapsed so far */
+  hpvm_Timestamp init;    /* Beginning of the current time interval,
+                           * if state is RUNNING.  End of the last
+                           * recorded time interval otherwise.  */
+};
+
+/* Reset a timer.
+ * Use this to initialize a timer or to clear
+ * its elapsed time.  The reset timer is stopped.
+ */
+void hpvm_ResetTimer(struct hpvm_Timer *timer);
+
+/* Start a timer.  The timer is set to RUNNING mode and
+ * time elapsed while the timer is running is added to
+ * the timer.
+ * The timer should not already be running.
+ */
+void hpvm_StartTimer(struct hpvm_Timer *timer);
+
+/* Stop a timer.
+ * This stops adding elapsed time to the timer.
+ * The timer should not already be stopped.
+ */
+void hpvm_StopTimer(struct hpvm_Timer *timer);
+
+/* Get the elapsed time in seconds. */
+double hpvm_GetElapsedTime(struct hpvm_Timer *timer);
+
+/* Execution time is assigned to one of these categories. */
+enum hpvm_TimerID {
+  hpvm_TimerID_NONE = 0,
+  hpvm_TimerID_IO,         /* Time spent in input/output */
+  hpvm_TimerID_KERNEL,     /* Time spent computing on the device,
+                            * recorded asynchronously */
+  hpvm_TimerID_COPY,       /* Time spent synchronously moving data
+                            * to/from device and allocating/freeing
+                            * memory on the device */
+  hpvm_TimerID_DRIVER,     /* Time spent in the host interacting with the
+                            * driver, primarily for recording the time
+                            * spent queueing asynchronous operations */
+  hpvm_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
+  hpvm_TimerID_COMPUTE,    /* Time for all program execution other
+                            * than parsing command line arguments,
+                            * I/O, kernel, and copy */
+  hpvm_TimerID_OVERLAP,    /* Time double-counted in asynchronous and
+                            * host activity: automatically filled in,
+                            * not intended for direct usage */
+  // GPU FUNCTION
+  hpvm_TimerID_INIT_CTX,
+  hpvm_TimerID_CLEAR_CTX,
+  hpvm_TimerID_COPY_SCALAR,
+  hpvm_TimerID_COPY_PTR,
+  hpvm_TimerID_MEM_FREE,
+  hpvm_TimerID_READ_OUTPUT,
+  hpvm_TimerID_SETUP,
+  hpvm_TimerID_MEM_TRACK,
+  hpvm_TimerID_MEM_UNTRACK,
+  hpvm_TimerID_MISC,
+  // LAUNCH FUNCTION
+  hpvm_TimerID_PTHREAD_CREATE,
+  hpvm_TimerID_ARG_PACK,
+  hpvm_TimerID_ARG_UNPACK,
+  hpvm_TimerID_COMPUTATION,
+  hpvm_TimerID_OUTPUT_PACK,
+  hpvm_TimerID_OUTPUT_UNPACK,
+
+  hpvm_TimerID_LAST /* Number of timer IDs */
+};
+
+/* Dynamic list of asynchronously tracked times between events */
+struct hpvm_async_time_marker_list {
+  char *label;               // actually just a pointer to a string
+  enum hpvm_TimerID timerID; /* The ID to which the interval beginning
+                              * with this marker should be attributed */
+  void *marker;
+  // cudaEvent_t marker; 		/* The driver event for this marker */
+  struct hpvm_async_time_marker_list *next;
+};
+
+struct hpvm_SubTimer {
+  char *label;
+  struct hpvm_Timer timer;
+  struct hpvm_SubTimer *next;
+};
+
+struct hpvm_SubTimerList {
+  struct hpvm_SubTimer *current;
+  struct hpvm_SubTimer *subtimer_list;
+};
+
+/* A set of timers for recording execution times. */
+struct hpvm_TimerSet {
+  enum hpvm_TimerID current;
+  struct hpvm_async_time_marker_list *async_markers;
+  hpvm_Timestamp async_begin;
+  hpvm_Timestamp wall_begin;
+  struct hpvm_Timer timers[hpvm_TimerID_LAST];
+  struct hpvm_SubTimerList *sub_timer_list[hpvm_TimerID_LAST];
+};
+
+/* Reset all timers in the set. */
+void hpvm_InitializeTimerSet(struct hpvm_TimerSet *timers);
+
+void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label,
+                      enum hpvm_TimerID hpvm_Category);
+
+/* Select which timer the next interval of time should be accounted
+ * to. The selected timer is started and other timers are stopped.
+ * Using hpvm_TimerID_NONE stops all timers. */
+inline void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers,
+                               enum hpvm_TimerID timer);
+
+void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label,
+                           enum hpvm_TimerID category);
+
+/* Print timer values to standard output. */
+void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers);
+
+/* Release timer resources */
+void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers);
+}
+#endif // HPVM_TIMER_HEADER
diff --git a/hpvm/include/SupportVISC/VISCUtils.h b/hpvm/include/SupportHPVM/HPVMUtils.h
similarity index 84%
rename from hpvm/include/SupportVISC/VISCUtils.h
rename to hpvm/include/SupportHPVM/HPVMUtils.h
index 0efd20b5b5eb57943de1feb6d2afa886c6c48a5c..25b9880180f2cb4590f5b5fcbb3f3f2fbe025f8f 100644
--- a/hpvm/include/SupportVISC/VISCUtils.h
+++ b/hpvm/include/SupportHPVM/HPVMUtils.h
@@ -1,5 +1,5 @@
 //
-//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===//
+//===-- HPVMUtils.h - Header file for "HPVM Utility Functions" ------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,12 +8,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef VISC_UTILS_HEADER
-#define VISC_UTILS_HEADER
+#ifndef HPVM_UTILS_HEADER
+#define HPVM_UTILS_HEADER
 
 #include <assert.h>
 
-#include "SupportVISC/VISCHint.h"
+#include "SupportHPVM/HPVMHint.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
@@ -29,31 +29,31 @@
 
 using namespace llvm;
 
-namespace viscUtils {
+namespace hpvmUtils {
 // Helper Functions
 
-static bool isViscCreateNodeIntrinsic(Instruction *I) {
+static bool isHPVMCreateNodeIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
   return (II->getCalledFunction()->getName())
-      .startswith("llvm.visc.createNode");
+      .startswith("llvm.hpvm.createNode");
 }
 
-static bool isViscCreateNodeCall(Instruction *I) {
+static bool isHPVMCreateNodeCall(Instruction *I) {
   if (!isa<CallInst>(I))
     return false;
   CallInst *CI = cast<CallInst>(I);
   return (CI->getCalledValue()->stripPointerCasts()->getName())
-      .startswith("__visc__createNode");
+      .startswith("__hpvm__createNode");
 }
 
-static bool isViscLaunchCall(Instruction *I) {
+static bool isHPVMLaunchCall(Instruction *I) {
   if (!isa<CallInst>(I))
     return false;
   CallInst *CI = cast<CallInst>(I);
   return (CI->getCalledValue()->stripPointerCasts()->getName())
-      .startswith("__visc__launch");
+      .startswith("__hpvm__launch");
 }
 // Creates a new createNode intrinsic, similar to II but with different
 // associated function F instead
@@ -69,22 +69,22 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F,
 
   ArrayRef<Value *> CreateNodeArgs;
   switch (II->getIntrinsicID()) {
-  case Intrinsic::visc_createNode: {
+  case Intrinsic::hpvm_createNode: {
     CreateNodeArgs = ArrayRef<Value *>(Fp);
     break;
   }
-  case Intrinsic::visc_createNode1D: {
+  case Intrinsic::hpvm_createNode1D: {
     Value *CreateNode1DArgs[] = {Fp, II->getArgOperand(1)};
     CreateNodeArgs = ArrayRef<Value *>(CreateNode1DArgs, 2);
     break;
   }
-  case Intrinsic::visc_createNode2D: {
+  case Intrinsic::hpvm_createNode2D: {
     Value *CreateNode2DArgs[] = {Fp, II->getArgOperand(1),
                                  II->getArgOperand(2)};
     CreateNodeArgs = ArrayRef<Value *>(CreateNode2DArgs, 3);
     break;
   }
-  case Intrinsic::visc_createNode3D: {
+  case Intrinsic::hpvm_createNode3D: {
     Value *CreateNode3DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2),
                                  II->getArgOperand(3)};
     CreateNodeArgs = ArrayRef<Value *>(CreateNode3DArgs, 4);
@@ -101,7 +101,7 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F,
   return CreateNodeII;
 }
 
-// Fix VISC hints for this function
+// Fix HPVM hints for this function
 void fixHintMetadata(Module &M, Function *F, Function *G) {
   Metadata *MD_F = ValueAsMetadata::getIfExists(F);
   MDTuple *MDT_F =
@@ -119,9 +119,9 @@ void fixHintMetadata(Module &M, Function *F, Function *G) {
     }
   };
 
-  FixHint("visc_hint_gpu");
-  FixHint("visc_hint_cpu");
-  FixHint("visc_hint_cpu_gpu");
+  FixHint("hpvm_hint_gpu");
+  FixHint("hpvm_hint_cpu");
+  FixHint("hpvm_hint_cpu_gpu");
 }
 
 // Assuming that the changed function is a node function, it is only used as a
@@ -138,7 +138,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
          ++i) {
       Instruction *I = &*i; // Grab pointer to Instruction
 
-      if (isViscCreateNodeIntrinsic(I)) {
+      if (isHPVMCreateNodeIntrinsic(I)) {
         IntrinsicInst *II = cast<IntrinsicInst>(I);
         // The found createNode is not associated with the changed function
         if (II->getArgOperand(0) != F)
@@ -150,7 +150,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
             createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II);
         II->replaceAllUsesWith(CreateNodeII);
         toBeErased.push_back(II);
-      } else if (isViscCreateNodeCall(I)) {
+      } else if (isHPVMCreateNodeCall(I)) {
         CallInst *CI = cast<CallInst>(I);
         // The found createNode is not associated with the changed function
         if (CI->getArgOperand(1) != F)
@@ -161,7 +161,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
         // Replace use of F with use of G
         CI->setArgOperand(1, G);
         DEBUG(errs() << "Fixed use: " << *CI << "\n");
-      } else if (isViscLaunchCall(I)) {
+      } else if (isHPVMLaunchCall(I)) {
         CallInst *CI = cast<CallInst>(I);
         // The found launch call is not associated with the changed function
         if (CI->getArgOperand(1)->stripPointerCasts() != F)
@@ -370,21 +370,21 @@ Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg,
 //------------------- Helper Functions For Handling Hints -------------------//
 
 // Return true if 1st arg (tag) contains 2nd (target)
-bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
+bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) {
   switch (Tag) {
-  case visc::None:
+  case hpvm::None:
     return false;
-  case visc::CPU_TARGET:
-    if (T == visc::CPU_TARGET)
+  case hpvm::CPU_TARGET:
+    if (T == hpvm::CPU_TARGET)
       return true;
     return false;
-  case visc::GPU_TARGET:
-    if (T == visc::GPU_TARGET)
+  case hpvm::GPU_TARGET:
+    if (T == hpvm::GPU_TARGET)
       return true;
     return false;
-  case visc::CPU_OR_GPU_TARGET:
-    if ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET) ||
-        (T == visc::CPU_OR_GPU_TARGET))
+  case hpvm::CPU_OR_GPU_TARGET:
+    if ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET) ||
+        (T == hpvm::CPU_OR_GPU_TARGET))
       return true;
     return false;
   default:
@@ -392,41 +392,41 @@ bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
   }
 }
 
-bool isSingleTargetTag(visc::Target T) {
-  return ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET));
+bool isSingleTargetTag(hpvm::Target T) {
+  return ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET));
 }
 
 // Add the specified target to the given tag
-visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) {
-  assert(((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)) &&
+hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) {
+  assert(((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET)) &&
          "The target is only allowed to be a single target: CPU, GPU, SPIR, "
          "CUDNN, PROMISE\n");
 
   switch (Tag) {
-  case visc::None:
+  case hpvm::None:
     return T;
-  case visc::CPU_TARGET:
-    if (T == visc::CPU_TARGET)
-      return visc::CPU_TARGET;
-    if (T == visc::GPU_TARGET)
-      return visc::CPU_OR_GPU_TARGET;
+  case hpvm::CPU_TARGET:
+    if (T == hpvm::CPU_TARGET)
+      return hpvm::CPU_TARGET;
+    if (T == hpvm::GPU_TARGET)
+      return hpvm::CPU_OR_GPU_TARGET;
     return T;
-  case visc::GPU_TARGET:
-    if (T == visc::CPU_TARGET)
-      return visc::CPU_OR_GPU_TARGET;
-    if (T == visc::GPU_TARGET)
-      return visc::GPU_TARGET;
+  case hpvm::GPU_TARGET:
+    if (T == hpvm::CPU_TARGET)
+      return hpvm::CPU_OR_GPU_TARGET;
+    if (T == hpvm::GPU_TARGET)
+      return hpvm::GPU_TARGET;
     return T;
-  case visc::CPU_OR_GPU_TARGET:
-    return visc::CPU_OR_GPU_TARGET;
+  case hpvm::CPU_OR_GPU_TARGET:
+    return hpvm::CPU_OR_GPU_TARGET;
   default:
     assert(false && "Unknown Target\n");
   }
   return T;
 }
 
-// This functions add the hint as metadata in visc code
-void addHint(Function *F, visc::Target T) {
+// This function adds the hint as metadata in hpvm code
+void addHint(Function *F, hpvm::Target T) {
   // Get Module
   Module *M = F->getParent();
   DEBUG(errs() << "Set preferred target for " << F->getName() << ": ");
@@ -434,17 +434,17 @@ void addHint(Function *F, visc::Target T) {
   // Based on the hint, get the hint metadata
   NamedMDNode *HintNode;
   switch (T) {
-  case visc::GPU_TARGET:
+  case hpvm::GPU_TARGET:
     DEBUG(errs() << "GPU Target\n");
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu");
     break;
-  case visc::CPU_TARGET:
+  case hpvm::CPU_TARGET:
     DEBUG(errs() << "CPU Target\n");
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu");
     break;
-  case visc::CPU_OR_GPU_TARGET:
+  case hpvm::CPU_OR_GPU_TARGET:
     DEBUG(errs() << "CPU or GPU Target\n");
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu");
     break;
   default:
     llvm_unreachable("Unsupported Target Hint!");
@@ -457,8 +457,8 @@ void addHint(Function *F, visc::Target T) {
   HintNode->addOperand(N);
 }
 
-// This function removes the hint as metadata in visc code
-void removeHint(Function *F, visc::Target T) {
+// This function removes the hint as metadata in hpvm code
+void removeHint(Function *F, hpvm::Target T) {
   // Get Module
   Module *M = F->getParent();
   DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T
@@ -467,14 +467,14 @@ void removeHint(Function *F, visc::Target T) {
   // Based on the hint, get the hint metadata
   NamedMDNode *HintNode;
   switch (T) {
-  case visc::GPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+  case hpvm::GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu");
     break;
-  case visc::CPU_OR_GPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
+  case hpvm::CPU_OR_GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu");
     break;
-  case visc::CPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+  case hpvm::CPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu");
     break;
   default:
     llvm_unreachable("Unsupported Target Hint!");
@@ -501,7 +501,7 @@ void removeHint(Function *F, visc::Target T) {
   }
 }
 
-visc::Target getPreferredTarget(Function *F) {
+hpvm::Target getPreferredTarget(Function *F) {
   DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
   Module *M = F->getParent();
 
@@ -517,16 +517,16 @@ visc::Target getPreferredTarget(Function *F) {
     return false;
   };
 
-  if (FoundPrefTarget("visc_hint_cpu"))
-    return visc::CPU_TARGET;
-  if (FoundPrefTarget("visc_hint_gpu"))
-    return visc::GPU_TARGET;
-  if (FoundPrefTarget("visc_hint_cpu_gpu"))
-    return visc::CPU_OR_GPU_TARGET;
+  if (FoundPrefTarget("hpvm_hint_cpu"))
+    return hpvm::CPU_TARGET;
+  if (FoundPrefTarget("hpvm_hint_gpu"))
+    return hpvm::GPU_TARGET;
+  if (FoundPrefTarget("hpvm_hint_cpu_gpu"))
+    return hpvm::CPU_OR_GPU_TARGET;
 
-  return visc::None;
+  return hpvm::None;
 }
 
-} // namespace viscUtils
+} // namespace hpvmUtils
 
-#endif // VISC_UTILS_HEADER
+#endif // HPVM_UTILS_HEADER
diff --git a/hpvm/include/SupportVISC/VISCTimer.h b/hpvm/include/SupportVISC/VISCTimer.h
deleted file mode 100644
index ce3dc8a5e0f7c77ff06fec5857f223ca4f0e142f..0000000000000000000000000000000000000000
--- a/hpvm/include/SupportVISC/VISCTimer.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef VISC_TIMER_HEADER
-#define VISC_TIMER_HEADER
-
-/************************** Timer Routines ***************************/
-extern "C" {
-
-/* A time or duration. */
-//#if _POSIX_VERSION >= 200112L
-typedef unsigned long long visc_Timestamp; /* time in microseconds */
-//#else
-//# error "Timestamps not implemented"
-//#endif
-
-enum visc_TimerState {
-  visc_Timer_STOPPED,
-  visc_Timer_RUNNING,
-};
-
-struct visc_Timer {
-  enum visc_TimerState state;
-  visc_Timestamp elapsed; /* Amount of time elapsed so far */
-  visc_Timestamp init;    /* Beginning of the current time interval,
-                           * if state is RUNNING.  End of the last
-                           * recorded time interfal otherwise.  */
-};
-
-/* Reset a timer.
- * Use this to initialize a timer or to clear
- * its elapsed time.  The reset timer is stopped.
- */
-void visc_ResetTimer(struct visc_Timer *timer);
-
-/* Start a timer.  The timer is set to RUNNING mode and
- * time elapsed while the timer is running is added to
- * the timer.
- * The timer should not already be running.
- */
-void visc_StartTimer(struct visc_Timer *timer);
-
-/* Stop a timer.
- * This stops adding elapsed time to the timer.
- * The timer should not already be stopped.
- */
-void visc_StopTimer(struct visc_Timer *timer);
-
-/* Get the elapsed time in seconds. */
-double visc_GetElapsedTime(struct visc_Timer *timer);
-
-/* Execution time is assigned to one of these categories. */
-enum visc_TimerID {
-  visc_TimerID_NONE = 0,
-  visc_TimerID_IO,         /* Time spent in input/output */
-  visc_TimerID_KERNEL,     /* Time spent computing on the device,
-                            * recorded asynchronously */
-  visc_TimerID_COPY,       /* Time spent synchronously moving data
-                            * to/from device and allocating/freeing
-                            * memory on the device */
-  visc_TimerID_DRIVER,     /* Time spent in the host interacting with the
-                            * driver, primarily for recording the time
-                            * spent queueing asynchronous operations */
-  visc_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
-  visc_TimerID_COMPUTE,    /* Time for all program execution other
-                            * than parsing command line arguments,
-                            * I/O, kernel, and copy */
-  visc_TimerID_OVERLAP,    /* Time double-counted in asynchronous and
-                            * host activity: automatically filled in,
-                            * not intended for direct usage */
-  // GPU FUNCTION
-  visc_TimerID_INIT_CTX,
-  visc_TimerID_CLEAR_CTX,
-  visc_TimerID_COPY_SCALAR,
-  visc_TimerID_COPY_PTR,
-  visc_TimerID_MEM_FREE,
-  visc_TimerID_READ_OUTPUT,
-  visc_TimerID_SETUP,
-  visc_TimerID_MEM_TRACK,
-  visc_TimerID_MEM_UNTRACK,
-  visc_TimerID_MISC,
-  // LAUNCH FUNCTION
-  visc_TimerID_PTHREAD_CREATE,
-  visc_TimerID_ARG_PACK,
-  visc_TimerID_ARG_UNPACK,
-  visc_TimerID_COMPUTATION,
-  visc_TimerID_OUTPUT_PACK,
-  visc_TimerID_OUTPUT_UNPACK,
-
-  visc_TimerID_LAST /* Number of timer IDs */
-};
-
-/* Dynamic list of asynchronously tracked times between events */
-struct visc_async_time_marker_list {
-  char *label;               // actually just a pointer to a string
-  enum visc_TimerID timerID; /* The ID to which the interval beginning
-                              * with this marker should be attributed */
-  void *marker;
-  // cudaEvent_t marker; 		/* The driver event for this marker */
-  struct visc_async_time_marker_list *next;
-};
-
-struct visc_SubTimer {
-  char *label;
-  struct visc_Timer timer;
-  struct visc_SubTimer *next;
-};
-
-struct visc_SubTimerList {
-  struct visc_SubTimer *current;
-  struct visc_SubTimer *subtimer_list;
-};
-
-/* A set of timers for recording execution times. */
-struct visc_TimerSet {
-  enum visc_TimerID current;
-  struct visc_async_time_marker_list *async_markers;
-  visc_Timestamp async_begin;
-  visc_Timestamp wall_begin;
-  struct visc_Timer timers[visc_TimerID_LAST];
-  struct visc_SubTimerList *sub_timer_list[visc_TimerID_LAST];
-};
-
-/* Reset all timers in the set. */
-void visc_InitializeTimerSet(struct visc_TimerSet *timers);
-
-void visc_AddSubTimer(struct visc_TimerSet *timers, char *label,
-                      enum visc_TimerID visc_Category);
-
-/* Select which timer the next interval of time should be accounted
- * to. The selected timer is started and other timers are stopped.
- * Using visc_TimerID_NONE stops all timers. */
-inline void visc_SwitchToTimer(struct visc_TimerSet *timers,
-                               enum visc_TimerID timer);
-
-void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
-                           enum visc_TimerID category);
-
-/* Print timer values to standard output. */
-void visc_PrintTimerSet(struct visc_TimerSet *timers);
-
-/* Release timer resources */
-void visc_DestroyTimerSet(struct visc_TimerSet *timers);
-}
-#endif // VISC_RT_HEADER
diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index 058419f1dc80a8650e7a3b834090a88099741431..be3e6cae3dae775716fc3e2206879e978febddb0 100644
--- a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -10,8 +10,8 @@
 #define DEBUG_TYPE "buildDFG"
 #include "BuildDFG/BuildDFG.h"
 
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
+#include "SupportHPVM/HPVMHint.h"
+#include "SupportHPVM/HPVMUtils.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/ValueSymbolTable.h"
@@ -35,7 +35,7 @@ bool BuildDFG::runOnModule(Module &M) {
 
     for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
       Instruction *I = &*i; // Grab pointer to Instruction
-      if (isViscLaunchIntrinsic(I)) {
+      if (isHPVMLaunchIntrinsic(I)) {
         DEBUG(errs() << "------------ Found launch site --------------\n");
         II = cast<IntrinsicInst>(I);
 
@@ -43,7 +43,7 @@ bool BuildDFG::runOnModule(Module &M) {
 
         // Intrinsic Instruction has been initialized from this point on.
         Function *F = cast<Function>(II->getOperand(0)->stripPointerCasts());
-        Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F));
+        Root = DFInternalNode::Create(II, F, hpvmUtils::getPreferredTarget(F));
         Roots.push_back(Root);
         BuildGraph(Root, F);
 
@@ -118,37 +118,37 @@ void BuildDFG::removeElementFromHandleToDFEdgeMap(Value *V) {
   HandleToDFEdgeMap.erase(V);
 }
 
-// Returns true if instruction I is a visc launch intrinsic, false otherwise
-bool BuildDFG::isViscLaunchIntrinsic(Instruction *I) {
+// Returns true if instruction I is an HPVM launch intrinsic, false otherwise
+bool BuildDFG::isHPVMLaunchIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).equals("llvm.visc.launch");
+  return (II->getCalledFunction()->getName()).equals("llvm.hpvm.launch");
 }
 
-// Returns true if instruction I is a visc graph intrinsic, false otherwise
-bool BuildDFG::isViscGraphIntrinsic(Instruction *I) {
+// Returns true if instruction I is an HPVM graph intrinsic, false otherwise
+bool BuildDFG::isHPVMGraphIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).startswith("llvm.visc.create") ||
-         (II->getCalledFunction()->getName()).startswith("llvm.visc.bind");
+  return (II->getCalledFunction()->getName()).startswith("llvm.hpvm.create") ||
+         (II->getCalledFunction()->getName()).startswith("llvm.hpvm.bind");
 }
 
-// Returns true if instruction I is a visc query intrinsic, false otherwise
-bool BuildDFG::isViscQueryIntrinsic(Instruction *I) {
+// Returns true if instruction I is an HPVM query intrinsic, false otherwise
+bool BuildDFG::isHPVMQueryIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).startswith("llvm.visc.get");
+  return (II->getCalledFunction()->getName()).startswith("llvm.hpvm.get");
 }
 
-// Returns true if instruction I is a visc intrinsic, false otherwise
-bool BuildDFG::isViscIntrinsic(Instruction *I) {
+// Returns true if instruction I is an HPVM intrinsic, false otherwise
+bool BuildDFG::isHPVMIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).startswith("llvm.visc");
+  return (II->getCalledFunction()->getName()).startswith("llvm.hpvm");
 }
 
 // Two types are "congruent" if they are identical, or if they are both
@@ -163,7 +163,7 @@ bool BuildDFG::isTypeCongruent(Type *L, Type *R) {
   return PL->getAddressSpace() == PR->getAddressSpace();
 }
 
-// Handles all the createNodeXX visc intrinsics.
+// Handles all the createNodeXX hpvm intrinsics.
 void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
   bool isInternalNode = false;
 
@@ -173,7 +173,7 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
   // internal node
   for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
     Instruction *I = &*i; // Grab pointer to Instruction
-    if (isViscGraphIntrinsic(I))
+    if (isHPVMGraphIntrinsic(I))
       isInternalNode = true;
   }
 
@@ -196,14 +196,14 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
     // Create Internal DFNode, add it to the map and recursively build its
     // dataflow graph
     DFInternalNode *childDFNode = DFInternalNode::Create(
-        II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+        II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
     BuildGraph(childDFNode, F);
   } else {
     // Create Leaf DFnode and add it to the map.
     DFLeafNode *childDFNode = DFLeafNode::Create(
-        II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+        II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
   }
@@ -336,11 +336,11 @@ void BuildDFG::handleBindOutput(DFInternalNode *N, IntrinsicInst *II) {
 
 void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) {
   DEBUG(errs() << "FUNCTION: " << F->getName() << "\n");
-  // TODO: Place checks for valid visc functions. For example one of the
-  // check can be that any function that contains visc dataflow graph
+  // TODO: Place checks for valid hpvm functions. For example one of the
+  // check can be that any function that contains hpvm dataflow graph
   // construction intrinsics should not have other llvm IR statements.
 
-  // Iterate over all the instructions of a function and look for visc
+  // Iterate over all the instructions of a function and look for hpvm
   // intrinsics.
   for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
     Instruction *I = &*i; // Grab pointer to Instruction
@@ -349,25 +349,25 @@ void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) {
       DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": "
                    << II->getCalledFunction()->getName() << "\n");
       switch (II->getIntrinsicID()) {
-      case Intrinsic::visc_createNode:
-      case Intrinsic::visc_createNode1D:
-      case Intrinsic::visc_createNode2D:
-      case Intrinsic::visc_createNode3D:
+      case Intrinsic::hpvm_createNode:
+      case Intrinsic::hpvm_createNode1D:
+      case Intrinsic::hpvm_createNode2D:
+      case Intrinsic::hpvm_createNode3D:
         handleCreateNode(N, II);
         break;
-      case Intrinsic::visc_createEdge:
+      case Intrinsic::hpvm_createEdge:
         handleCreateEdge(N, II);
         break;
-      case Intrinsic::visc_bind_input:
+      case Intrinsic::hpvm_bind_input:
         handleBindInput(N, II);
         break;
-      case Intrinsic::visc_bind_output:
+      case Intrinsic::hpvm_bind_output:
         handleBindOutput(N, II);
         break;
 
       // TODO: Reconsider launch within a dataflow graph (recursion?)
-      case Intrinsic::visc_wait:
-      case Intrinsic::visc_launch:
+      case Intrinsic::hpvm_wait:
+      case Intrinsic::hpvm_launch:
         DEBUG(errs()
               << "Error: Launch/wait intrinsic used within a dataflow graph\n\t"
               << *II << "\n");
@@ -375,7 +375,7 @@ void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) {
 
       default:
         DEBUG(
-            errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t"
+            errs() << "Error: Invalid HPVM Intrinsic inside Internal node!\n\t"
                    << *II << "\n");
         break;
       }
diff --git a/hpvm/lib/Transforms/CMakeLists.txt b/hpvm/lib/Transforms/CMakeLists.txt
index 68724684e56648d307df52624e47ed7393bfd3f9..5c9b8b9fe026ea5612caa124535e02d28d619c53 100644
--- a/hpvm/lib/Transforms/CMakeLists.txt
+++ b/hpvm/lib/Transforms/CMakeLists.txt
@@ -2,5 +2,5 @@ add_subdirectory(BuildDFG)
 add_subdirectory(ClearDFG)
 add_subdirectory(DFG2LLVM_NVPTX)
 add_subdirectory(DFG2LLVM_X86)
-add_subdirectory(GenVISC)
+add_subdirectory(GenHPVM)
 add_subdirectory(LocalMem)
diff --git a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index 6dae9e6977d31a0b62a9fa903966ec10810a2f71..c23043e7829a8947a995f7ad97688091c46cf23d 100644
--- a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -18,7 +18,7 @@
 using namespace llvm;
 using namespace builddfg;
 
-// STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted");
+// STATISTIC(IntrinsicCounter, "Counts number of hpvm intrinsics greeted");
 
 namespace {
 
@@ -101,8 +101,8 @@ bool ClearDFG::runOnModule(Module &M) {
   // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
   // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
-  Function *VI = M.getFunction("llvm.visc.init");
-  assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n");
+  Function *VI = M.getFunction("llvm.hpvm.init");
+  assert(VI->hasOneUse() && "More than one use of llvm.hpvm.init\n");
   for (Value::user_iterator ui = VI->user_begin(), ue = VI->user_end();
        ui != ue; ui++) {
     Instruction *I = dyn_cast<Instruction>(*ui);
@@ -111,8 +111,8 @@ bool ClearDFG::runOnModule(Module &M) {
   VI->replaceAllUsesWith(UndefValue::get(VI->getType()));
   VI->eraseFromParent();
 
-  Function *VC = M.getFunction("llvm.visc.cleanup");
-  assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n");
+  Function *VC = M.getFunction("llvm.hpvm.cleanup");
+  assert(VC->hasOneUse() && "More than one use of llvm.hpvm.cleanup\n");
   for (Value::user_iterator ui = VC->user_begin(), ue = VC->user_end();
        ui != ue; ui++) {
     Instruction *I = dyn_cast<Instruction>(*ui);
diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 8a36e3b8af5c031715d1e341f3ac166501c0a5b9..584da07e6e4786c8c1f06c89ff1cd2a8780f0cb2 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -15,40 +15,39 @@
 #define SHARED_ADDRSPACE 3
 
 #define DEBUG_TYPE "DFG2LLVM_NVPTX"
+#include "SupportHPVM/DFG2LLVM.h"
+#include "SupportHPVM/HPVMTimer.h"
+#include "SupportHPVM/HPVMUtils.h"
+#include "llvm-c/Core.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm-c/Core.h"
-#include "SupportVISC/VISCTimer.h"
-#include "SupportVISC/DFG2LLVM.h"
-#include "SupportVISC/VISCUtils.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/ToolOutputFile.h"
 #include "llvm/IR/UseListOrder.h"
-
+#include "llvm/Support/ToolOutputFile.h"
 
 #include <sstream>
 
 using namespace llvm;
 using namespace builddfg;
 using namespace dfg2llvm;
-using namespace viscUtils;
+using namespace hpvmUtils;
 
-// VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers"));
+// HPVM Command line option to use timer or not
+static cl::opt<bool> HPVMTimer_NVPTX("hpvm-timers-ptx",
+                                     cl::desc("Enable hpvm timers"));
 
 namespace {
 // Helper class declarations
@@ -57,94 +56,88 @@ namespace {
 // in bytes. Would have preferred to use tuple but support not yet available
 class OutputPtr {
 public:
-  OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
-    : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
+  OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes)
+      : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
 
-  Value* h_ptr;
-  Value* d_ptr;
-  Value* bytes;
+  Value *h_ptr;
+  Value *d_ptr;
+  Value *bytes;
 };
 
 // Class to maintain important kernel info required for generating runtime
 // calls
 class Kernel {
 public:
-  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
-           std::map<unsigned, unsigned>(),
-         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
-           std::map<unsigned, std::pair<Value*, unsigned> >(),
-         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
-         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
-         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
-    : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
-      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
-      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
-
-    assert(gridDim == globalWGSize.size()
-           && "gridDim should be same as the size of vector globalWGSize");
-    assert(blockDim == localWGSize.size()
-           && "blockDim should be same as the size of vector localWGSize");
+  Kernel(
+      Function *_KF, DFLeafNode *_KLeafNode,
+      std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(),
+      std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap =
+          std::map<unsigned, std::pair<Value *, unsigned>>(),
+      std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
+      unsigned _gridDim = 0,
+      std::vector<Value *> _globalWGSize = std::vector<Value *>(),
+      unsigned _blockDim = 0,
+      std::vector<Value *> _localWGSize = std::vector<Value *>())
+      : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
+        sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap),
+        gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
+        localWGSize(_localWGSize) {
+
+    assert(gridDim == globalWGSize.size() &&
+           "gridDim should be same as the size of vector globalWGSize");
+    assert(blockDim == localWGSize.size() &&
+           "blockDim should be same as the size of vector localWGSize");
   }
 
-  Function* KernelFunction;
-  DFLeafNode* KernelLeafNode;
+  Function *KernelFunction;
+  DFLeafNode *KernelLeafNode;
   std::map<unsigned, unsigned> inArgMap;
   // Map for shared memory arguments
-  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
+  std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap;
   // Fields for (potential) allocation node
-  DFLeafNode* AllocationNode;
-  Function* AllocationFunction;
+  DFLeafNode *AllocationNode;
+  Function *AllocationFunction;
   std::map<unsigned, unsigned> allocInArgMap;
 
   std::vector<unsigned> outArgMap;
   unsigned gridDim;
-  std::vector<Value*> globalWGSize;
+  std::vector<Value *> globalWGSize;
   unsigned blockDim;
-  std::vector<Value*> localWGSize;
+  std::vector<Value *> localWGSize;
   std::vector<int> localDimMap;
 
-  std::map<unsigned, unsigned> &getInArgMap() {
-    return inArgMap;
-  }
-  void setInArgMap(std::map<unsigned, unsigned> map) {
-    inArgMap = map;
-  }
+  std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; }
+  void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; }
 
-  std::map<unsigned, std::pair<Value*, unsigned> > &getSharedInArgMap() {
+  std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() {
     return sharedInArgMap;
   }
-  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
+  void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) {
     sharedInArgMap = map;
   }
 
-  std::vector<unsigned> &getOutArgMap() {
-    return outArgMap;
-  }
-  void setOutArgMap(std::vector<unsigned> map) {
-    outArgMap = map;
-  }
+  std::vector<unsigned> &getOutArgMap() { return outArgMap; }
+  void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; }
 
-  void setLocalWGSize(std::vector<Value*> V) {
-    localWGSize = V;
-  }
+  void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; }
 
-  bool hasLocalWG() const {
-    return blockDim != 0;
-  }
+  bool hasLocalWG() const { return blockDim != 0; }
 };
 
 // Helper function declarations
-static bool canBePromoted(Argument* arg, Function* F);
-static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*,
-                                 ValueToValueMapTy&, Instruction*);
-static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&,
-                              Instruction*, const Twine& WGName = "WGSize");
-static std::string getPTXFilename(const Module&);
-static std::string getFilenameFromModule(const Module& M);
+static bool canBePromoted(Argument *arg, Function *F);
+static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&,
+                                 Kernel *, ValueToValueMapTy &, Instruction *);
+static Value *genWorkGroupPtr(Module &M, std::vector<Value *>,
+                              ValueToValueMapTy &, Instruction *,
+                              const Twine &WGName = "WGSize");
+static std::string getPTXFilename(const Module &);
+static std::string getFilenameFromModule(const Module &M);
 static void changeDataLayout(Module &);
 static void changeTargetTriple(Module &);
 static void findReturnInst(Function *, std::vector<ReturnInst *> &);
-static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
+static void findIntrinsicInst(Function *, Intrinsic::ID,
+                              std::vector<IntrinsicInst *> &);
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
 static std::string getAtomicOpName(Intrinsic::ID);
 
@@ -154,7 +147,6 @@ struct DFG2LLVM_NVPTX : public DFG2LLVM {
   DFG2LLVM_NVPTX() : DFG2LLVM(ID) {}
 
 private:
-
 public:
   bool runOnModule(Module &M);
 };
@@ -163,57 +155,60 @@ public:
 class CGT_NVPTX : public CodeGenTraversal {
 
 private:
-  //Member variables
+  // Member variables
   std::unique_ptr<Module> KernelM;
-  DFNode* KernelLaunchNode = NULL;
-  Kernel* kernel;
-
-  // VISC Runtime API
-  FunctionCallee llvm_visc_ocl_launch;
-  FunctionCallee llvm_visc_ocl_wait;
-  FunctionCallee llvm_visc_ocl_initContext;
-  FunctionCallee llvm_visc_ocl_clearContext;
-  FunctionCallee llvm_visc_ocl_argument_shared;
-  FunctionCallee llvm_visc_ocl_argument_scalar;
-  FunctionCallee llvm_visc_ocl_argument_ptr;
-  FunctionCallee llvm_visc_ocl_output_ptr;
-  FunctionCallee llvm_visc_ocl_free;
-  FunctionCallee llvm_visc_ocl_getOutput;
-  FunctionCallee llvm_visc_ocl_executeNode;
-
-  //Functions
+  DFNode *KernelLaunchNode = NULL;
+  Kernel *kernel;
+
+  // HPVM Runtime API
+  FunctionCallee llvm_hpvm_ocl_launch;
+  FunctionCallee llvm_hpvm_ocl_wait;
+  FunctionCallee llvm_hpvm_ocl_initContext;
+  FunctionCallee llvm_hpvm_ocl_clearContext;
+  FunctionCallee llvm_hpvm_ocl_argument_shared;
+  FunctionCallee llvm_hpvm_ocl_argument_scalar;
+  FunctionCallee llvm_hpvm_ocl_argument_ptr;
+  FunctionCallee llvm_hpvm_ocl_output_ptr;
+  FunctionCallee llvm_hpvm_ocl_free;
+  FunctionCallee llvm_hpvm_ocl_getOutput;
+  FunctionCallee llvm_hpvm_ocl_executeNode;
+
+  // Functions
   std::string getKernelsModuleName(Module &M);
-  void fixValueAddrspace(Value* V, unsigned addrspace);
-  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*);
-  Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i);
-  void addCLMetadata(Function* F);
-  Function* transformFunctionToVoid(Function* F);
-  void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
+  void fixValueAddrspace(Value *V, unsigned addrspace);
+  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *,
+                                                  Function *);
+  Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags,
+                               unsigned i);
+  void addCLMetadata(Function *F);
+  Function *transformFunctionToVoid(Function *F);
+  void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName);
 
   // Virtual Functions
   void init() {
-    VISCTimer = VISCTimer_NVPTX;
+    HPVMTimer = HPVMTimer_NVPTX;
     TargetName = "NVPTX";
   }
   void initRuntimeAPI();
-  void codeGen(DFInternalNode* N);
-  void codeGen(DFLeafNode* N);
+  void codeGen(DFInternalNode *N);
+  void codeGen(DFLeafNode *N);
 
 public:
-
   // Constructor
-  CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
+  CGT_NVPTX(Module &_M, BuildDFG &_DFG)
+      : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
     init();
     initRuntimeAPI();
-    errs() << "Old module pointer: " << &_M << "\n";
-    errs() << "New module pointer: " <<  KernelM.get() << "\n";
+    DEBUG(errs() << "Old module pointer: " << &_M << "\n");
+    DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n");
 
-    // Copying instead of creating new, in order to preserve required info (metadata)
-    // Remove functions, global variables and aliases
-    std::vector<GlobalVariable*> GVVect;
+    // Copying instead of creating new, in order to preserve required info
+    // (metadata). Remove functions, global variables and aliases.
+    std::vector<GlobalVariable *> GVVect;
     for (Module::global_iterator mi = KernelM->global_begin(),
-         me = KernelM->global_end(); (mi != me); ++mi) {
-      GlobalVariable* GV = &*mi;
+                                 me = KernelM->global_end();
+         (mi != me); ++mi) {
+      GlobalVariable *GV = &*mi;
       GVVect.push_back(GV);
     }
     for (auto *GV : GVVect) {
@@ -221,10 +216,10 @@ public:
       GV->eraseFromParent();
     }
 
-    std::vector<Function*> FuncVect;
-    for (Module::iterator mi = KernelM->begin(),
-         me = KernelM->end(); (mi != me); ++mi) {
-      Function* F = &*mi;
+    std::vector<Function *> FuncVect;
+    for (Module::iterator mi = KernelM->begin(), me = KernelM->end();
+         (mi != me); ++mi) {
+      Function *F = &*mi;
       FuncVect.push_back(F);
     }
     for (auto *F : FuncVect) {
@@ -232,10 +227,11 @@ public:
       F->eraseFromParent();
     }
 
-    std::vector<GlobalAlias*> GAVect;
+    std::vector<GlobalAlias *> GAVect;
     for (Module::alias_iterator mi = KernelM->alias_begin(),
-         me = KernelM->alias_end(); (mi != me); ++mi) {
-      GlobalAlias* GA = &*mi;
+                                me = KernelM->alias_end();
+         (mi != me); ++mi) {
+      GlobalAlias *GA = &*mi;
       GAVect.push_back(GA);
     }
     for (auto *GA : GAVect) {
@@ -246,73 +242,69 @@ public:
     changeDataLayout(*KernelM);
     changeTargetTriple(*KernelM);
 
-
     DEBUG(errs() << *KernelM);
-
   }
 
   void writeKernelsModule();
 };
 
-// Initialize the VISC runtime API. This makes it easier to insert these calls
+// Initialize the HPVM runtime API. This makes it easier to insert these calls
 void CGT_NVPTX::initRuntimeAPI() {
 
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
   assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot + "/../build/tools/hpvm/projects/visc-rt/visc-rt.bc";
+  Twine runtimeAPI =
+      llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
 
   runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-  if(runtimeModule == nullptr) {
+  if (runtimeModule == nullptr) {
     DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
     assert(false && "couldn't parse runtime");
-  }
-  else
-    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+  } else
+    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
 
   // Get or insert the global declarations for launch/wait functions
-  DECLARE(llvm_visc_ocl_launch);
-  DECLARE(llvm_visc_ocl_wait);
-  DECLARE(llvm_visc_ocl_initContext);
-  DECLARE(llvm_visc_ocl_clearContext);
-  DECLARE(llvm_visc_ocl_argument_shared);
-  DECLARE(llvm_visc_ocl_argument_scalar);
-  DECLARE(llvm_visc_ocl_argument_ptr);
-  DECLARE(llvm_visc_ocl_output_ptr);
-  DECLARE(llvm_visc_ocl_free);
-  DECLARE(llvm_visc_ocl_getOutput);
-  DECLARE(llvm_visc_ocl_executeNode);
+  DECLARE(llvm_hpvm_ocl_launch);
+  DECLARE(llvm_hpvm_ocl_wait);
+  DECLARE(llvm_hpvm_ocl_initContext);
+  DECLARE(llvm_hpvm_ocl_clearContext);
+  DECLARE(llvm_hpvm_ocl_argument_shared);
+  DECLARE(llvm_hpvm_ocl_argument_scalar);
+  DECLARE(llvm_hpvm_ocl_argument_ptr);
+  DECLARE(llvm_hpvm_ocl_output_ptr);
+  DECLARE(llvm_hpvm_ocl_free);
+  DECLARE(llvm_hpvm_ocl_getOutput);
+  DECLARE(llvm_hpvm_ocl_executeNode);
 
   // Get or insert timerAPI functions as well if you plan to use timers
   initTimerAPI();
 
   // Insert init context in main
   DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n");
-  Function* VI = M.getFunction("llvm.visc.init");
-  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  Function *VI = M.getFunction("llvm.hpvm.init");
+  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
 
   InitCall = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(InitCall);
-  switchToTimer(visc_TimerID_INIT_CTX, InitCall);
-  CallInst::Create(llvm_visc_ocl_initContext,
-                   ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)),
-                   "", InitCall);
-  switchToTimer(visc_TimerID_NONE, InitCall);
+  switchToTimer(hpvm_TimerID_INIT_CTX, InitCall);
+  CallInst::Create(llvm_hpvm_ocl_initContext,
+                   ArrayRef<Value *>(getTargetID(M, hpvm::GPU_TARGET)), "",
+                   InitCall);
+  switchToTimer(hpvm_TimerID_NONE, InitCall);
 
-  // Insert print instruction at visc exit
+  // Insert print instruction at hpvm exit
   DEBUG(errs() << "Gen Code to print NVPTX Timer\n");
-  Function* VC = M.getFunction("llvm.visc.cleanup");
+  Function *VC = M.getFunction("llvm.hpvm.cleanup");
   DEBUG(errs() << *VC << "\n");
-  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once");
+  assert(VC->getNumUses() == 1 && "__hpvm__clear should only be used once");
 
   CleanupCall = cast<Instruction>(*VC->user_begin());
   printTimerSet(CleanupCall);
-
-
 }
 
 // Generate Code to call the kernel
@@ -320,36 +312,37 @@ void CGT_NVPTX::initRuntimeAPI() {
 // used to generate a function to associate with this leaf node. The function
 // is responsible for all the memory allocation/transfer and invoking the
 // kernel call on the device
-void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
+void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
+                                   const Twine &FileName) {
   // Check if clone already exists. If it does, it means we have visited this
   // function before.
-//  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+  //  assert(N->getGenFunc() == NULL && "Code already generated for this node");
 
-  assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL &&
+  assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) == NULL &&
          "Code already generated for this node");
 
   // Useful values
-  Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
-  Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
+  Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
+  Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
 
   // If kernel struct has not been initialized with kernel function, then fail
   assert(K != NULL && "No kernel found!!");
 
   DEBUG(errs() << "Generating kernel call code\n");
 
-  Function* F = N->getFuncPointer();
-
+  Function *F = N->getFuncPointer();
 
   // Create of clone of F with no instructions. Only the type is the same as F
   // without the extra arguments.
-  Function* F_X86;
+  Function *F_X86;
 
   // Clone the function, if we are seeing this function for the first time. We
   // only need a clone in terms of type.
   ValueToValueMapTy VMap;
 
   // Create new function with the same type
-  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+  F_X86 =
+      Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
 
   // Loop over the arguments, copying the names of arguments over.
   Function::arg_iterator dest_iterator = F_X86->arg_begin();
@@ -362,26 +355,25 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 
   // Add a basic block to this empty function
   BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
-  ReturnInst* RI = ReturnInst::Create(M.getContext(),
-                                      UndefValue::get(F_X86->getReturnType()), BB);
+  ReturnInst *RI = ReturnInst::Create(
+      M.getContext(), UndefValue::get(F_X86->getReturnType()), BB);
 
   // FIXME: Adding Index and Dim arguments are probably not required except
   // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
   // have those arguments)
 
   // Add Index and Dim arguments except for the root node
-  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+  if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
     F_X86 = addIdxDimArgs(F_X86);
 
   BB = &*F_X86->begin();
   RI = cast<ReturnInst>(BB->getTerminator());
 
-  //Add the generated function info to DFNode
-//  N->setGenFunc(F_X86, visc::CPU_TARGET);
-  N->addGenFunc(F_X86, visc::GPU_TARGET, true);
-  errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node "
-         << N->getFuncPointer()->getName() << "\n";
-
+  // Add the generated function info to DFNode
+  //  N->setGenFunc(F_X86, hpvm::CPU_TARGET);
+  N->addGenFunc(F_X86, hpvm::GPU_TARGET, true);
+  DEBUG(errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node "
+               << N->getFuncPointer()->getName() << "\n");
 
   // Loop over the arguments, to create the VMap
   dest_iterator = F_X86->arg_begin();
@@ -414,51 +406,53 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
       break;
   }
 
-  assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+  assert(C->isDummyNode() == false &&
+         "Internal Node only contains dummy nodes!");
 
   Function* CF = C->getFuncPointer();
   */
-  Function* KF = K->KernelLeafNode->getFuncPointer();
+  Function *KF = K->KernelLeafNode->getFuncPointer();
   // Initialize context
-  //DEBUG(errs() << "Initializing context" << "\n");
-  //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
+  // DEBUG(errs() << "Initializing context" << "\n");
+  // CallInst::Create(llvm_hpvm_ocl_initContext, None, "", RI);
 
-  DEBUG(errs() << "Initializing commandQ" << "\n");
+  DEBUG(errs() << "Initializing commandQ"
+               << "\n");
   // Initialize command queue
-  switchToTimer(visc_TimerID_SETUP, InitCall);
-  Value* fileStr = getStringPointer(FileName, InitCall, "Filename");
+  switchToTimer(hpvm_TimerID_SETUP, InitCall);
+  Value *fileStr = getStringPointer(FileName, InitCall, "Filename");
   DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
-  DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n");
-  Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName");
-
-  Value* LaunchInstArgs[] = {fileStr, kernelStr};
-
-  DEBUG(errs() << "Inserting launch call" << "\n");
-  CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch,
-                                         ArrayRef<Value*>(LaunchInstArgs, 2),
-                                         "graph"+KF->getName(),
-                                         InitCall);
+  DEBUG(errs() << "Generating code for kernel - "
+               << K->KernelFunction->getName() << "\n");
+  Value *kernelStr =
+      getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName");
+
+  Value *LaunchInstArgs[] = {fileStr, kernelStr};
+
+  DEBUG(errs() << "Inserting launch call"
+               << "\n");
+  CallInst *NVPTX_Ctx = CallInst::Create(llvm_hpvm_ocl_launch,
+                                         ArrayRef<Value *>(LaunchInstArgs, 2),
+                                         "graph" + KF->getName(), InitCall);
   DEBUG(errs() << *NVPTX_Ctx << "\n");
-  GraphIDAddr = new GlobalVariable(M,
-                                   NVPTX_Ctx->getType(),
-                                   false,
+  GraphIDAddr = new GlobalVariable(M, NVPTX_Ctx->getType(), false,
                                    GlobalValue::CommonLinkage,
                                    Constant::getNullValue(NVPTX_Ctx->getType()),
-                                   "graph"+KF->getName()+".addr");
+                                   "graph" + KF->getName() + ".addr");
   DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
-  StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
+  StoreInst *SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
   DEBUG(errs() << *SI << "\n");
-  switchToTimer(visc_TimerID_NONE, InitCall);
-  switchToTimer(visc_TimerID_SETUP, RI);
-  Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI);
+  switchToTimer(hpvm_TimerID_NONE, InitCall);
+  switchToTimer(hpvm_TimerID_SETUP, RI);
+  Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI);
 
-  // Iterate over the required input edges of the node and use the visc-rt API
+  // Iterate over the required input edges of the node and use the hpvm-rt API
   // to set inputs
-  DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
+  DEBUG(errs() << "Iterate over input edges of node and insert hpvm api\n");
   std::vector<OutputPtr> OutputPointers;
-  // Vector to hold the device memory object that need to be cleared before we release
-  // context
-  std::vector<Value*> DevicePointers;
+  // Vector to hold the device memory object that need to be cleared before we
+  // release context
+  std::vector<Value *> DevicePointers;
 
   std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap();
   /*
@@ -470,133 +464,134 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 
   */
 
-  for(auto &InArgMapPair  : kernelInArgMap) {
+  for (auto &InArgMapPair : kernelInArgMap) {
     unsigned i = InArgMapPair.first;
-    Value* inputVal = getArgumentAt(F_X86, InArgMapPair.second);
-    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+    Value *inputVal = getArgumentAt(F_X86, InArgMapPair.second);
+    DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n");
 
     // input value has been obtained.
     // Check if input is a scalar value or a pointer operand
     // For scalar values such as int, float, etc. the size is simply the size of
     // type on target machine, but for pointers, the size of data would be the
     // next integer argument
-    if(inputVal->getType()->isPointerTy()) {
+    if (inputVal->getType()->isPointerTy()) {
 
-      switchToTimer(visc_TimerID_COPY_PTR, RI);
+      switchToTimer(hpvm_TimerID_COPY_PTR, RI);
       // Pointer Input
       // CheckAttribute
-      Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False;
-      Value* isInput = ((hasAttribute(KF, i, Attribute::Out))
-                        && !(hasAttribute(KF, i, Attribute::In)))? False : True;
-
-      Argument* A = getArgumentAt(KF, i);
-      if(isOutput == True) {
+      Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False;
+      Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) &&
+                        !(hasAttribute(KF, i, Attribute::In)))
+                           ? False
+                           : True;
+
+      Argument *A = getArgumentAt(KF, i);
+      if (isOutput == True) {
         DEBUG(errs() << *A << " is an OUTPUT argument\n");
       }
-      if(isInput == True) {
+      if (isInput == True) {
         DEBUG(errs() << *A << " is an INPUT argument\n");
       }
 
-
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
-                             Type::getInt8PtrTy(M.getContext()),
-                             inputVal->getName()+".i8ptr",
-                             RI);
+      Value *inputValI8Ptr = CastInst::CreatePointerCast(
+          inputVal, Type::getInt8PtrTy(M.getContext()),
+          inputVal->getName() + ".i8ptr", RI);
 
       // Assert that the pointer argument size (next argument) is in the map
-      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
-
-      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
-      assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
-             && "Pointer type input must always be followed by size (integer type)");
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               inputSize,
-                               isInput,
-                               isOutput
-                              };
-      Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr,
-                                      ArrayRef<Value*>(setInputArgs, 6), "", RI);
+      assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end());
+
+      Value *inputSize = getArgumentAt(F_X86, kernelInArgMap[i + 1]);
+      assert(
+          inputSize->getType() == Type::getInt64Ty(M.getContext()) &&
+          "Pointer type input must always be followed by size (integer type)");
+      Value *setInputArgs[] = {
+          GraphID,
+          inputValI8Ptr,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
+          inputSize,
+          isInput,
+          isOutput};
+      Value *d_ptr =
+          CallInst::Create(llvm_hpvm_ocl_argument_ptr,
+                           ArrayRef<Value *>(setInputArgs, 6), "", RI);
       DevicePointers.push_back(d_ptr);
       // If this has out attribute, store the returned device pointer in
       // memory to read device memory later
-      if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
-    }
-    else {
-      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+      if (isOutput == True)
+        OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
+    } else {
+      switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
       // Scalar Input
       // Store the scalar value on stack and then pass the pointer to its
       // location
-      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), 0, inputVal->getName()+".ptr", RI);
-      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
-
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
-                             Type::getInt8PtrTy(M.getContext()),
-                             inputVal->getName()+".i8ptr",
-                             RI);
-
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               ConstantExpr::getSizeOf(inputVal->getType())
-                              };
-      CallInst::Create(llvm_visc_ocl_argument_scalar,
-                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      AllocaInst *inputValPtr = new AllocaInst(
+          inputVal->getType(), 0, inputVal->getName() + ".ptr", RI);
+      StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI);
+
+      Value *inputValI8Ptr = CastInst::CreatePointerCast(
+          inputValPtr, Type::getInt8PtrTy(M.getContext()),
+          inputVal->getName() + ".i8ptr", RI);
+
+      Value *setInputArgs[] = {
+          GraphID, inputValI8Ptr,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
+          ConstantExpr::getSizeOf(inputVal->getType())};
+      CallInst::Create(llvm_hpvm_ocl_argument_scalar,
+                       ArrayRef<Value *>(setInputArgs, 4), "", RI);
     }
   }
 
-  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
+  DEBUG(
+      errs() << "Setup shared memory arguments of node and insert hpvm api\n");
 
   // Check to see if all the allocation sizes are constant (determined
   // statically)
   bool constSizes = true;
-  for (auto& e: K->getSharedInArgMap()) {
+  for (auto &e : K->getSharedInArgMap()) {
     constSizes &= isa<Constant>(e.second.first);
   }
 
   // If the sizes are all constant
   if (constSizes) {
-    for (auto& e: K->getSharedInArgMap()) {
+    for (auto &e : K->getSharedInArgMap()) {
       unsigned argNum = e.first;
-      Value* allocSize = e.second.first;
+      Value *allocSize = e.second.first;
 
-      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+      DEBUG(errs() << "\tLocal Memory at " << argNum
+                   << ", size = " << *allocSize << "\n");
 
       if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
         // Shared memory ptr argument - scalar at size position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
 
-        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
+        assert(isa<Constant>(allocSize) &&
+               "Constant shared memory size is expected");
 
-        Value* setInputArgs[] = {GraphID,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 allocSize
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
-      }
-      else {
+        Value *setInputArgs[] = {
+            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            allocSize};
+        CallInst::Create(llvm_hpvm_ocl_argument_shared,
+                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
+      } else {
         // Sharem memory size argument - scalar at address position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0,
-            allocSize->getName()+".sharedMem.ptr", RI);
-        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
-                                Type::getInt8PtrTy(M.getContext()),
-                                allocSize->getName()+".sharedMem.i8ptr",
-                                RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 allocSizeI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 ConstantExpr::getSizeOf(allocSize->getType())
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+        AllocaInst *allocSizePtr =
+            new AllocaInst(allocSize->getType(), 0,
+                           allocSize->getName() + ".sharedMem.ptr", RI);
+        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
+            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
+            allocSize->getName() + ".sharedMem.i8ptr", RI);
+
+        Value *setInputArgs[] = {
+            GraphID, allocSizeI8Ptr,
+            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            ConstantExpr::getSizeOf(allocSize->getType())};
+        CallInst::Create(llvm_hpvm_ocl_argument_scalar,
+                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
       }
     }
   } else {
@@ -617,68 +612,64 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
       ExtractValueInstVec.push_back(EI);
     }
 
-    for (auto& e: K->getSharedInArgMap()) {
+    for (auto &e : K->getSharedInArgMap()) {
       unsigned argNum = e.first;
-      Value* allocSize = ExtractValueInstVec[e.second.second/2];
+      Value *allocSize = ExtractValueInstVec[e.second.second / 2];
 
-      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+      DEBUG(errs() << "\tLocal Memory at " << argNum
+                   << ", size = " << *allocSize << "\n");
 
       if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
         // Shared memory ptr argument - scalar at size position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 allocSize
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
-      }
-      else {
+        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
+
+        Value *setInputArgs[] = {
+            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            allocSize};
+        CallInst::Create(llvm_hpvm_ocl_argument_shared,
+                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
+      } else {
         // Sharem memory size argument - scalar at address position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, 
-            allocSize->getName()+".sharedMem.ptr", RI);
-        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
-                                Type::getInt8PtrTy(M.getContext()),
-                                allocSize->getName()+".sharedMem.i8ptr",
-                                RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 allocSizeI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 ConstantExpr::getSizeOf(allocSize->getType())
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+        AllocaInst *allocSizePtr =
+            new AllocaInst(allocSize->getType(), 0,
+                           allocSize->getName() + ".sharedMem.ptr", RI);
+        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
+            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
+            allocSize->getName() + ".sharedMem.i8ptr", RI);
+
+        Value *setInputArgs[] = {
+            GraphID, allocSizeI8Ptr,
+            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            ConstantExpr::getSizeOf(allocSize->getType())};
+        CallInst::Create(llvm_hpvm_ocl_argument_scalar,
+                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
       }
     }
   }
 
-
-  DEBUG(errs() << "Setup output edges of node and insert visc api\n");
+  DEBUG(errs() << "Setup output edges of node and insert hpvm api\n");
   // Set output if struct is not an empty struct
-  StructType* OutputTy = K->KernelLeafNode->getOutputType();
-  std::vector<Value*> d_Outputs;
-  if(!OutputTy->isEmptyTy()) {
-    switchToTimer(visc_TimerID_COPY_PTR, RI);
+  StructType *OutputTy = K->KernelLeafNode->getOutputType();
+  std::vector<Value *> d_Outputs;
+  if (!OutputTy->isEmptyTy()) {
+    switchToTimer(hpvm_TimerID_COPY_PTR, RI);
     // Not an empty struct
     // Iterate over all elements of the struct and put them in
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
-      Value* setOutputArgs[] = {GraphID,
-                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
-                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
-                               };
-
-      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
-                                            ArrayRef<Value*>(setOutputArgs, 3),
-                                            "d_output."+KF->getName(),
-                                            RI);
+    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
+      unsigned outputIndex = KF->getFunctionType()->getNumParams() + i;
+      Value *setOutputArgs[] = {
+          GraphID,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex),
+          ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+
+      CallInst *d_Output = CallInst::Create(llvm_hpvm_ocl_output_ptr,
+                                            ArrayRef<Value *>(setOutputArgs, 3),
+                                            "d_output." + KF->getName(), RI);
       d_Outputs.push_back(d_Output);
     }
   }
@@ -688,50 +679,41 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   // Allocate size_t[numDims] space on stack. Store the work group sizes and
   // pass it as an argument to ExecNode
 
-  switchToTimer(visc_TimerID_MISC, RI);
+  switchToTimer(hpvm_TimerID_MISC, RI);
   Value *workDim, *LocalWGPtr, *GlobalWGPtr;
   getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
-  switchToTimer(visc_TimerID_KERNEL, RI);
-  Value* ExecNodeArgs[] = {GraphID,
-                           workDim,
-                           LocalWGPtr,
-                           GlobalWGPtr
-                          };
-  CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode,
-                                     ArrayRef<Value*>(ExecNodeArgs, 4),
-                                     "event."+KF->getName(),
-                                     RI);
+  switchToTimer(hpvm_TimerID_KERNEL, RI);
+  Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr};
+  CallInst *Event = CallInst::Create(llvm_hpvm_ocl_executeNode,
+                                     ArrayRef<Value *>(ExecNodeArgs, 4),
+                                     "event." + KF->getName(), RI);
   DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
 
   // Wait for Kernel to Finish
-  CallInst::Create(llvm_visc_ocl_wait,
-                   ArrayRef<Value*>(GraphID),
-                   "",
-                   RI);
+  CallInst::Create(llvm_hpvm_ocl_wait, ArrayRef<Value *>(GraphID), "", RI);
 
-  switchToTimer(visc_TimerID_READ_OUTPUT, RI);
+  switchToTimer(hpvm_TimerID_READ_OUTPUT, RI);
   // Read Output Struct if not empty
-  if(!OutputTy->isEmptyTy()) {
-    std::vector<Value*>h_Outputs;
-    Value* KernelOutput = UndefValue::get(OutputTy);
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      Value* GetOutputArgs[] = {GraphID,
-                                Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                                d_Outputs[i],
-                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
-                               };
-      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
-                                            ArrayRef<Value*>(GetOutputArgs, 4),
-                                            "h_output."+KF->getName()+".addr",
-                                            RI);
+  if (!OutputTy->isEmptyTy()) {
+    std::vector<Value *> h_Outputs;
+    Value *KernelOutput = UndefValue::get(OutputTy);
+    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
+      Value *GetOutputArgs[] = {
+          GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+          d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+      CallInst *h_Output = CallInst::Create(
+          llvm_hpvm_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4),
+          "h_output." + KF->getName() + ".addr", RI);
       // Read each device pointer listed in output struct
       // Load the output struct
-      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
-                     OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
-
-      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
-      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
-                                             KF->getName()+"output", RI);
+      CastInst *BI = BitCastInst::CreatePointerCast(
+          h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr",
+          RI);
+
+      Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement,
+                                             ArrayRef<unsigned>(i),
+                                             KF->getName() + "output", RI);
     }
     OutputMap[K->KernelLeafNode] = KernelOutput;
   }
@@ -746,75 +728,76 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
     DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
     DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
 
-    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
-    CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
+    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr,
+  output.bytes}; CallInst* CI = CallInst::Create(llvm_hpvm_ocl_getOutput,
                                     ArrayRef<Value*>(GetOutputArgs, 4),
                                     "", RI);
   }*/
-  switchToTimer(visc_TimerID_MEM_FREE, RI);
+  switchToTimer(hpvm_TimerID_MEM_FREE, RI);
   // Clear Context and free device memory
-  DEBUG(errs() << "Clearing context" << "\n");
+  DEBUG(errs() << "Clearing context"
+               << "\n");
   // Free Device Memory
-  for(auto d_ptr: DevicePointers) {
-    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI);
+  for (auto d_ptr : DevicePointers) {
+    CallInst::Create(llvm_hpvm_ocl_free, ArrayRef<Value *>(d_ptr), "", RI);
   }
-  switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall);
+  switchToTimer(hpvm_TimerID_CLEAR_CTX, CleanupCall);
   // Clear Context
-  LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall);
-  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall);
-  switchToTimer(visc_TimerID_NONE, CleanupCall);
+  LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall);
+  CallInst::Create(llvm_hpvm_ocl_clearContext, ArrayRef<Value *>(LI), "",
+                   CleanupCall);
+  switchToTimer(hpvm_TimerID_NONE, CleanupCall);
 
-  switchToTimer(visc_TimerID_MISC, RI);
+  switchToTimer(hpvm_TimerID_MISC, RI);
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
   // Generate code for output bindings
   // Get Exit node
-  DFNode* C = N->getChildGraph()->getExit();
+  DFNode *C = N->getChildGraph()->getExit();
   // Get OutputType of this node
-  StructType* OutTy = N->getOutputType();
+  StructType *OutTy = N->getOutputType();
   Value *retVal = UndefValue::get(F_X86->getReturnType());
   // Find the kernel's output arg map, to use instead of the bindings
   std::vector<unsigned> outArgMap = kernel->getOutArgMap();
   // Find all the input edges to exit node
-  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+  for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
     DEBUG(errs() << "Output Edge " << i << "\n");
     // Find the incoming edge at the requested input port
-    DFEdge* E = C->getInDFEdgeAt(i);
+    DFEdge *E = C->getInDFEdgeAt(i);
 
     assert(E && "No Binding for output element!");
     // Find the Source DFNode associated with the incoming edge
-    DFNode* SrcDF = E->getSourceDF();
+    DFNode *SrcDF = E->getSourceDF();
 
-    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
+                 << "\n");
 
     // If Source DFNode is a dummyNode, edge is from parent. Get the
     // argument from argument list of this internal node
-    Value* inputVal;
-    if(SrcDF->isEntryNode()) {
+    Value *inputVal;
+    if (SrcDF->isEntryNode()) {
       inputVal = getArgumentAt(F_X86, i);
-      DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-    }
-    else {
+      DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+    } else {
       // edge is from a internal node
       // Check - code should already be generated for this source dfnode
       // FIXME: Since the 2-level kernel code gen has aspecific structure, we
       // can assume the SrcDF is same as Kernel Leaf node.
       // Use outArgMap to get correct mapping
       SrcDF = K->KernelLeafNode;
-      assert(OutputMap.count(SrcDF)
-             && "Source node call not found. Dependency violation!");
+      assert(OutputMap.count(SrcDF) &&
+             "Source node call not found. Dependency violation!");
 
       // Find Output Value associated with the Source DFNode using OutputMap
-      Value* CI = OutputMap[SrcDF];
+      Value *CI = OutputMap[SrcDF];
 
       // Extract element at source position from this call instruction
       std::vector<unsigned> IndexList;
       // i is the destination of DFEdge E
       // Use the mapping instead of the bindings
-//      IndexList.push_back(E->getSourcePosition());
+      //      IndexList.push_back(E->getSourcePosition());
       IndexList.push_back(outArgMap[i]);
-      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                             "",RI);
+      DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
+      ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
       inputVal = EI;
     }
     std::vector<unsigned> IdxList;
@@ -823,31 +806,33 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   }
 
   DEBUG(errs() << "Extracted all\n");
-  switchToTimer(visc_TimerID_NONE, RI);
+  switchToTimer(hpvm_TimerID_NONE, RI);
   retVal->setName("output");
-  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+  ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal);
   ReplaceInstWithInst(RI, newRI);
 }
 
-
 // Right now, only targeting the one level case. In general, device functions
 // can return values so we don't need to change them
-void CGT_NVPTX::codeGen(DFInternalNode* N) {
-  errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n";
-  if(KernelLaunchNode == NULL)
-    errs () << "No kernel launch node\n";
+void CGT_NVPTX::codeGen(DFInternalNode *N) {
+  DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName()
+               << "\n");
+  if (KernelLaunchNode == NULL)
+    DEBUG(errs() << "No kernel launch node\n");
   else {
-    errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
+    DEBUG(errs() << "KernelLaunchNode: "
+                 << KernelLaunchNode->getFuncPointer()->getName() << "\n");
   }
 
   if (!KernelLaunchNode) {
-    DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
+    DEBUG(errs()
+          << "No code generated (host code for kernel launch complete).\n");
     return;
   }
 
   if (N == KernelLaunchNode) {
     DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
-    //TODO
+    // TODO
 
     // Now the remaining nodes to be visited should be ignored
     KernelLaunchNode = NULL;
@@ -862,7 +847,8 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
     // TODO: Structure assumed: one thread node, one allocation node (at most),
     // TB node
     std::map<unsigned, unsigned> inmapFinal;
-    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
+    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(),
+                                                ie = inmap2.end();
          ib != ie; ++ib) {
       inmapFinal[ib->first] = inmap1[ib->second];
     }
@@ -879,8 +865,9 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
     // 0 ... outmap2.size()-1
     // The limit is the size of outmap2, because this is the number of kernel
     // output arguments for which the mapping matters
-    // For now, it reasonable to assume that all the kernel arguments are returned,
-    // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size()
+    // For now, it is reasonable to assume that all the kernel arguments are
+    // returned, maybe plus some others from other nodes, thus outmap2.size() <=
+    // outmap1.size()
     for (unsigned i = 0; i < outmap2.size(); i++) {
       outmap1[i] = outmap2[outmap1[i]];
     }
@@ -888,15 +875,14 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
 
     // Track the source of local dimlimits for the kernel
     // Dimension limit can either be a constant or an argument of parent
-    // function. Since Internal node would no longer exist, we need to insert the
-    // localWGSize with values from the parent of N.
-    std::vector<Value*> localWGSizeMapped;
+    // function. Since Internal node would no longer exist, we need to insert
+    // the localWGSize with values from the parent of N.
+    std::vector<Value *> localWGSizeMapped;
     for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
       if (isa<Constant>(kernel->localWGSize[i])) {
         // if constant, use as it is
         localWGSizeMapped.push_back(kernel->localWGSize[i]);
-      }
-      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
+      } else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
         // if argument, find the argument location in N. Use InArgMap of N to
         // find the source location in Parent of N. Retrieve the argument from
         // parent to insert in the vector.
@@ -906,46 +892,49 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
         assert(N->getInArgMap().find(argNum) != N->getInArgMap().end());
 
         unsigned parentArgNum = N->getInArgMap()[argNum];
-        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
+        Argument *A =
+            getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
         localWGSizeMapped.push_back(A);
-      }
-      else {
-        assert(false && "LocalWGsize using value which is neither argument nor constant!");
+      } else {
+        assert(
+            false &&
+            "LocalWGsize using value which is neither argument nor constant!");
       }
     }
     // Update localWGSize vector of kernel
     kernel->setLocalWGSize(localWGSizeMapped);
   }
-
 }
 
-void CGT_NVPTX::codeGen(DFLeafNode* N) {
-  errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n";
+void CGT_NVPTX::codeGen(DFLeafNode *N) {
+  DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName()
+               << "\n");
 
   // Skip code generation if it is a dummy node
-  if(N->isDummyNode()) {
+  if (N->isDummyNode()) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
 
   // Skip code generation if it is an allocation node
-  if(N->isAllocationNode()) {
+  if (N->isAllocationNode()) {
     DEBUG(errs() << "Skipping allocation node\n");
     return;
   }
 
   // Generate code only if it has the right hint
-//  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
-//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
-//    return;
-//  }
-  if(!preferredTargetIncludes(N, visc::GPU_TARGET)) {
-    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+  //  if(!checkPreferredTarget(N, hpvm::GPU_TARGET)) {
+  //    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+  //    return;
+  //  }
+  if (!preferredTargetIncludes(N, hpvm::GPU_TARGET)) {
+    DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName()
+                 << "\n");
     return;
   }
 
   // Checking which node is the kernel launch
-  DFNode* PNode = N->getParent();
+  DFNode *PNode = N->getParent();
   int pLevel = PNode->getLevel();
   int pReplFactor = PNode->getNumOfDim();
 
@@ -953,42 +942,40 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // (1) Parent is the top level node i.e., Root of DFG
   //                    OR
   // (2) Parent does not have multiple instances
-  errs() << "pLevel = " << pLevel << "\n";
-  errs() << "pReplFactor = " << pReplFactor << "\n";
+  DEBUG(errs() << "pLevel = " << pLevel << "\n");
+  DEBUG(errs() << "pReplFactor = " << pReplFactor << "\n");
   assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node.");
 
   // Only these options are supported
-  enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy;
-  if(pLevel == 1 || !pReplFactor) {
-    errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n";
+  enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy;
+  if (pLevel == 1 || !pReplFactor) {
+    DEBUG(errs()
+          << "*************** Kernel Gen: 1-Level Hierarchy **************\n");
     SelectedHierarchy = ONE_LEVEL;
     KernelLaunchNode = PNode;
-    kernel = new Kernel(NULL,
-                        N,
-                        N->getInArgMap(),
-                        N->getSharedInArgMap(),
-                        N->getOutArgMap(),
-                        N->getNumOfDim(),
-                        N->getDimLimits());
-  }
-  else {
+    kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(),
+                        N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits());
+  } else {
     // Converting a 2-level DFG to opencl kernel
-    errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n";
-    assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node.");
+    DEBUG(errs()
+          << "*************** Kernel Gen: 2-Level Hierarchy **************\n");
+    assert((pLevel >= 2) &&
+           "Selected node not nested deep enough to be Kernel Node.");
     SelectedHierarchy = TWO_LEVEL;
     KernelLaunchNode = PNode->getParent();
-    assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
+    assert((PNode->getNumOfDim() == N->getNumOfDim()) &&
+           "Dimension number must match");
     // Contains the instructions generating the kernel configuration parameters
-    kernel = new Kernel(NULL,                 // kernel function
-                        N,                    // kernel leaf node
-                        N->getInArgMap(),     // kenel argument mapping
+    kernel = new Kernel(NULL,             // kernel function
+                        N,                // kernel leaf node
+                        N->getInArgMap(), // kernel argument mapping
                         N->getSharedInArgMap(),
-                        N->getOutArgMap(),    // kernel output mapping from the leaf to the interemediate node
-                        PNode->getNumOfDim(), // gridDim
-                        PNode->getDimLimits(),// grid size
-                        N->getNumOfDim(),     // blockDim
-                        N->getDimLimits());   // block size
-
+                        N->getOutArgMap(),     // kernel output mapping from the
+                                               // leaf to the intermediate node
+                        PNode->getNumOfDim(),  // gridDim
+                        PNode->getDimLimits(), // grid size
+                        N->getNumOfDim(),      // blockDim
+                        N->getDimLimits());    // block size
   }
 
   std::vector<Instruction *> IItoRemove;
@@ -1000,58 +987,62 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // Look up if we have visited this function before. If we have, then just
   // get the cloned function pointer from DFNode. Otherwise, create the cloned
   // function and add it to the DFNode GenFunc.
-//  Function *F_nvptx = N->getGenFunc();
-  Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET);
+  //  Function *F_nvptx = N->getGenFunc();
+  Function *F_nvptx = N->getGenFuncForTarget(hpvm::GPU_TARGET);
 
-  assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated");
+  assert(F_nvptx == NULL &&
+         "Error: Visiting a node for which code already generated");
   // Clone the function
   ValueToValueMapTy VMap;
 
-  //F_nvptx->setName(FName+"_nvptx");
+  // F_nvptx->setName(FName+"_nvptx");
 
   Twine FName = F->getName();
   StringRef fStr = FName.getSingleStringRef();
-  Twine newFName = Twine(fStr, "_nvptx"); 
+  Twine newFName = Twine(fStr, "_nvptx");
   F_nvptx = CloneFunction(F, VMap);
   F_nvptx->setName(newFName);
 
-  
   //  errs() << "Old Function Name: " << F->getName() << "\n";
   //  errs() << "New Function Name: " << F_nvptx->getName() << "\n";
 
   F_nvptx->removeFromParent();
 
-
   // Insert the cloned function into the kernels module
   KernelM->getFunctionList().push_back(F_nvptx);
 
-
-  //TODO: Iterate over all the instructions of F_nvptx and identify the
-  //callees and clone them into this module.
+  // TODO: Iterate over all the instructions of F_nvptx and identify the
+  // callees and clone them into this module.
   DEBUG(errs() << *F_nvptx->getType());
   DEBUG(errs() << *F_nvptx);
 
   // Transform  the function to void and remove all target dependent attributes
   // from the function
   F_nvptx = transformFunctionToVoid(F_nvptx);
-  
-  //Add generated function info to DFNode
-//  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
-  N->addGenFunc(F_nvptx, visc::GPU_TARGET, false);
 
-  DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n");
-  F_nvptx->removeAttributes(AttributeList::FunctionIndex, F_nvptx->getAttributes().getFnAttributes());
+  // Add generated function info to DFNode
+  //  N->setGenFunc(F_nvptx, hpvm::GPU_TARGET);
+  N->addGenFunc(F_nvptx, hpvm::GPU_TARGET, false);
+
+  DEBUG(
+      errs()
+      << "Removing all attributes from Kernel Function and adding nounwind\n");
+  F_nvptx->removeAttributes(AttributeList::FunctionIndex,
+                            F_nvptx->getAttributes().getFnAttributes());
   F_nvptx->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
 
-  //FIXME: For now, assume only one allocation node
+  // FIXME: For now, assume only one allocation node
   kernel->AllocationNode = NULL;
 
-  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
+  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(),
+                                       iee = N->indfedge_end();
        ieb != iee; ++ieb) {
     DFNode *SrcDFNode = (*ieb)->getSourceDF();
-    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Found edge from node: "
+                 << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
     DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
-    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
+    DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode()
+                 << "\n");
     if (!SrcDFNode->isDummyNode()) {
       assert(SrcDFNode->isAllocationNode());
       kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
@@ -1065,19 +1056,22 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
   // If no allocation node was found, SharedMemArgs is empty
   if (kernel->AllocationNode) {
+
     ValueToValueMapTy VMap;
-    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
-    //F_alloc->removeFromParent();
+    Function *F_alloc =
+        CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
+    // F_alloc->removeFromParent();
     // Insert the cloned function into the kernels module
-    //M.getFunctionList().push_back(F_alloc);
+    // M.getFunctionList().push_back(F_alloc);
 
-    std::vector<IntrinsicInst *> ViscMallocInstVec;
-    findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
+    std::vector<IntrinsicInst *> HPVMMallocInstVec;
+    findIntrinsicInst(F_alloc, Intrinsic::hpvm_malloc, HPVMMallocInstVec);
 
-    for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
-      IntrinsicInst *II = ViscMallocInstVec[i];
-      assert(II->hasOneUse() && "visc_malloc result is used more than once");
-      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
+    for (unsigned i = 0; i < HPVMMallocInstVec.size(); i++) {
+      IntrinsicInst *II = HPVMMallocInstVec[i];
+      assert(II->hasOneUse() && "hpvm_malloc result is used more than once");
+      II->replaceAllUsesWith(
+          ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
       II->eraseFromParent();
     }
     kernel->AllocationFunction = F_alloc;
@@ -1092,15 +1086,19 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         assert(RetStructTy && "Allocation node does not return a struct type");
         unsigned numFields = RetStructTy->getNumElements();
     */
-    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
-    AllocationNodeProperty* APN =
-      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
-    for (auto& AllocPair: APN->getAllocationList()) {
+    std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap =
+        kernel->getSharedInArgMap();
+    AllocationNodeProperty *APN =
+        (AllocationNodeProperty *)kernel->AllocationNode->getProperty(
+            DFNode::Allocation);
+    for (auto &AllocPair : APN->getAllocationList()) {
       unsigned destPos = AllocPair.first->getDestPosition();
       unsigned srcPos = AllocPair.first->getSourcePosition();
       SharedMemArgs.push_back(destPos);
-      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
-      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+      sharedInMap[destPos] =
+          std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
+      sharedInMap[destPos + 1] =
+          std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
     }
     kernel->setSharedInArgMap(sharedInMap);
   }
@@ -1110,12 +1108,14 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // global address space
   unsigned argIndex = 0;
   std::vector<unsigned> GlobalMemArgs;
-  for(Function::arg_iterator ai = F_nvptx->arg_begin(), ae = F_nvptx->arg_end();
-      ai != ae; ++ai) {
-    if (ai->getType()->isPointerTy()) {    
-      // If the arguement is already chosen for shared memory arguemnt list, skip.
-      // Else put it in Global memory arguement list
-      if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) {
+  for (Function::arg_iterator ai = F_nvptx->arg_begin(),
+                              ae = F_nvptx->arg_end();
+       ai != ae; ++ai) {
+    if (ai->getType()->isPointerTy()) {
+      // If the argument is already chosen for shared memory argument list,
+      // skip. Else put it in Global memory argument list
+      if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) ==
+          0) {
         GlobalMemArgs.push_back(argIndex);
       }
     }
@@ -1129,20 +1129,21 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // Optimization: Gloabl memory arguments, which are not modified and whose
   // loads are not dependent on node id of current node, should be moved to
   // constant memory, subject to size of course
-  std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
+  std::vector<unsigned> ConstantMemArgs =
+      globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
 
   F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE);
 
-// Function to replace call instructions to functions in the kernel
+  // Function to replace call instructions to functions in the kernel
   std::map<Function *, Function *> OrgToClonedFuncMap;
   std::vector<Function *> FuncToBeRemoved;
-  auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) {
-    Function* NewFunc;
+  auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) {
+    Function *NewFunc;
     // Check if the called function has already been cloned before.
     auto It = OrgToClonedFuncMap.find(OrgFunc);
-    if(It == OrgToClonedFuncMap.end()) {
+    if (It == OrgToClonedFuncMap.end()) {
       ValueToValueMapTy VMap;
       NewFunc = CloneFunction(OrgFunc, VMap);
       OrgToClonedFuncMap[OrgFunc] = NewFunc;
@@ -1151,43 +1152,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       NewFunc = (*It).second;
     }
     // Replace the calls to this function
-    std::vector<Value*> args;
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+    std::vector<Value *> args;
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
       args.push_back(CI->getArgOperand(i));
     }
-    CallInst* Inst = CallInst::Create(NewFunc, args,
-        OrgFunc->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
+    CallInst *Inst = CallInst::Create(
+        NewFunc, args,
+        OrgFunc->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
     CI->replaceAllUsesWith(Inst);
     IItoRemove.push_back(CI);
     return NewFunc;
   };
 
-
   // Go through all the instructions
-  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e;
+       ++i) {
     Instruction *I = &(*i);
-    // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+    // Leaf nodes should not contain HPVM graph intrinsics or launch
+    assert(!BuildDFG::isHPVMLaunchIntrinsic(I) &&
+           "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isHPVMGraphIntrinsic(I) &&
+           "HPVM graph intrinsic within a leaf dataflow node!");
 
-    if (BuildDFG::isViscIntrinsic(I)) {
-      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-      IntrinsicInst* ArgII;
-      DFNode* ArgDFNode;
+    if (BuildDFG::isHPVMIntrinsic(I)) {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+      IntrinsicInst *ArgII;
+      DFNode *ArgDFNode;
 
-      /************************ Handle VISC Query intrinsics ************************/
+      /************************ Handle HPVM Query intrinsics
+       * ************************/
 
       switch (II->getIntrinsicID()) {
-      /**************************** llvm.visc.getNode() *****************************/
-      case Intrinsic::visc_getNode: {
+      /**************************** llvm.hpvm.getNode()
+       * *****************************/
+      case Intrinsic::hpvm_getNode: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n");
         // add mapping <intrinsic, this node> to the node-specific map
         Leaf_HandleToDFNodeMap[II] = N;
         IItoRemove.push_back(II);
-      }
-      break;
-      /************************* llvm.visc.getParentNode() **************************/
-      case Intrinsic::visc_getParentNode: {
+      } break;
+      /************************* llvm.hpvm.getParentNode()
+       * **************************/
+      case Intrinsic::hpvm_getParentNode: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n");
         // get the parent node of the arg node
         // get argument node
@@ -1200,10 +1206,10 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /*************************** llvm.visc.getNumDims() ***************************/
-      case Intrinsic::visc_getNumDims: {
+      } break;
+      /*************************** llvm.hpvm.getNumDims()
+       * ***************************/
+      case Intrinsic::hpvm_getNumDims: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n");
         // get node from map
         // get the appropriate field
@@ -1211,47 +1217,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         int numOfDim = ArgDFNode->getNumOfDim();
         DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
-        IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext());
-        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+        IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext());
+        ConstantInt *numOfDimConstant =
+            ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
 
         // Replace the result of the intrinsic with the computed value
         II->replaceAllUsesWith(numOfDimConstant);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /*********************** llvm.visc.getNodeInstanceID() ************************/
-      case Intrinsic::visc_getNodeInstanceID_x:
-      case Intrinsic::visc_getNodeInstanceID_y:
-      case Intrinsic::visc_getNodeInstanceID_z: {
-        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n");
+      } break;
+      /*********************** llvm.hpvm.getNodeInstanceID()
+       * ************************/
+      case Intrinsic::hpvm_getNodeInstanceID_x:
+      case Intrinsic::hpvm_getNodeInstanceID_y:
+      case Intrinsic::hpvm_getNodeInstanceID_z: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n"
+                     << "\t: " << *II << "\n");
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         assert(ArgDFNode && "Arg node is NULL");
         // A leaf node always has a parent
-        DFNode* ParentDFNode = ArgDFNode->getParent();
+        DFNode *ParentDFNode = ArgDFNode->getParent();
         assert(ParentDFNode && "Parent node of a leaf is NULL");
 
         // Get the number associated with the required dimension
         // FIXME: The order is important!
         // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim = II->getIntrinsicID() -
-                       Intrinsic::visc_getNodeInstanceID_x;
+        uint64_t dim =
+            II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x;
         assert((dim < 3) && "Invalid dimension argument");
         DEBUG(errs() << "\t  dimension = " << dim << "\n");
 
         // Argument of the function to be called
-        ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        //ArrayRef<Value *> Args(DimConstant);
+        ConstantInt *DimConstant =
+            ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        // ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
-        Function * OpenCLFunction;
+        Function *OpenCLFunction;
 
-        FunctionType* FT =
-          FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                            Type::getInt32Ty(KernelM->getContext()),
-                            false);
+        FunctionType *FT =
+            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                              Type::getInt32Ty(KernelM->getContext()), false);
         if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
@@ -1260,838 +1267,867 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           // itself
           DEBUG(errs() << "Substitute with get_global_id()\n");
           DEBUG(errs() << *II << "\n");
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
-          //DEBUG(errs() << "Here inside cond 2\n");
+          // DEBUG(errs() << "Here inside cond 2\n");
           // We are asking for this node's id with respect to its parent
           // this is a local id call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)).getCallee());
-          //DEBUG(errs() << "exiting condition 2\n");
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT))
+                  .getCallee());
+          // DEBUG(errs() << "exiting condition 2\n");
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's id with respect to its
           // parent: this is a group id call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT))
+                  .getCallee());
         } else {
-          errs() << N->getFuncPointer()->getName() << "\n";
-          errs() << N->getParent()->getFuncPointer()->getName() << "\n";
-          errs() << *II << "\n";
+          DEBUG(errs() << N->getFuncPointer()->getName() << "\n");
+          DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n");
+          DEBUG(errs() << *II << "\n");
 
           assert(false && "Unable to translate getNodeInstanceID intrinsic");
         }
 
-        //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n");
-        //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n");
-        //DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
-        //DEBUG(errs() << "Argument: " << Args[0] << "\n");
-        //DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
+        // DEBUG(errs() << "Create call instruction, insert it before the intrinsic\n");
+        // DEBUG(errs() << "Function: " << *OpenCLFunction << "\n");
+        // DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
+        // DEBUG(errs() << "Argument: " << Args[0] << "\n");
+        // DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
         // Create call instruction, insert it before the intrinsic and
         // replace the uses of the previous instruction with the new one
-        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
-        //DEBUG(errs() << "Replace uses\n");
+        CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        // DEBUG(errs() << "Replace uses\n");
         II->replaceAllUsesWith(CI);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /********************** llvm.visc.getNumNodeInstances() ***********************/
-      case Intrinsic::visc_getNumNodeInstances_x:
-      case Intrinsic::visc_getNumNodeInstances_y:
-      case Intrinsic::visc_getNumNodeInstances_z: {
+      } break;
+      /********************** llvm.hpvm.getNumNodeInstances()
+       * ***********************/
+      case Intrinsic::hpvm_getNumNodeInstances_x:
+      case Intrinsic::hpvm_getNumNodeInstances_y:
+      case Intrinsic::hpvm_getNumNodeInstances_z: {
         // TODO: think about whether this is the best way to go there are hw
         // specific registers. therefore it is good to have the intrinsic but
         // then, why do we need to keep that info in the graph?  (only for the
         // kernel configuration during the call)
 
-        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n");
+        DEBUG(errs() << F_nvptx->getName()
+                     << "\t: Handling getNumNodeInstances\n");
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         // A leaf node always has a parent
-        DFNode* ParentDFNode = ArgDFNode->getParent();
+        DFNode *ParentDFNode = ArgDFNode->getParent();
         assert(ParentDFNode && "Parent node of a leaf is NULL");
 
         // Get the number associated with the required dimension
         // FIXME: The order is important!
         // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim = II->getIntrinsicID() -
-                       Intrinsic::visc_getNumNodeInstances_x;
+        uint64_t dim =
+            II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x;
         assert((dim < 3) && "Invalid dimension argument");
         DEBUG(errs() << "\t  dimension = " << dim << "\n");
 
         // Argument of the function to be called
-        ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        //ArrayRef<Value *> Args(DimConstant);
+        ConstantInt *DimConstant =
+            ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        // ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
-        Function * OpenCLFunction;
-        FunctionType* FT =
+        Function *OpenCLFunction;
+        FunctionType *FT =
             FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                              Type::getInt32Ty(KernelM->getContext()),
-                              false);
+                              Type::getInt32Ty(KernelM->getContext()), false);
 
         if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
           // launch, so the instances are global_size (gridDim x blockDim)
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's instances
           // this is a local size (block dim) call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's instances
           // this is a (global_size/local_size) (grid dim) call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT))
+                  .getCallee());
         } else {
           assert(false && "Unable to translate getNumNodeInstances intrinsic");
         }
 
         // Create call instruction, insert it before the intrinsic and
         // replace the uses of the previous instruction with the new one
-        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
         II->replaceAllUsesWith(CI);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      case Intrinsic::visc_barrier:
-      {
+      } break;
+      case Intrinsic::hpvm_barrier: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n");
         DEBUG(errs() << "Substitute with barrier()\n");
         DEBUG(errs() << *II << "\n");
-        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()),
-                                             std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())),
-                                             false);
-        Function* OpenCLFunction = cast<Function>
-                                   ((KernelM->getOrInsertFunction(StringRef("barrier"), FT)).getCallee());
-        CallInst* CI = CallInst::Create(OpenCLFunction,
-                                        ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)),
-                                        "", II);
+        FunctionType *FT = FunctionType::get(
+            Type::getVoidTy(KernelM->getContext()),
+            std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())),
+            false);
+        Function *OpenCLFunction = cast<Function>(
+            (KernelM->getOrInsertFunction(StringRef("barrier"), FT))
+                .getCallee());
+        CallInst *CI =
+            CallInst::Create(OpenCLFunction,
+                             ArrayRef<Value *>(ConstantInt::get(
+                                 Type::getInt32Ty(KernelM->getContext()), 1)),
+                             "", II);
         II->replaceAllUsesWith(CI);
         IItoRemove.push_back(II);
+      } break;
+      case Intrinsic::hpvm_atomic_cmpxchg:
+        break;
+      case Intrinsic::hpvm_atomic_add:
+      case Intrinsic::hpvm_atomic_sub:
+      case Intrinsic::hpvm_atomic_xchg:
+      case Intrinsic::hpvm_atomic_min:
+      case Intrinsic::hpvm_atomic_max:
+      case Intrinsic::hpvm_atomic_and:
+      case Intrinsic::hpvm_atomic_or:
+      case Intrinsic::hpvm_atomic_xor:
+        // case Intrinsic::hpvm_atomic_inc:
+        // case Intrinsic::hpvm_atomic_dec:
+        {
+          DEBUG(errs() << *II << "\n");
+          // Only have support for i32 atomic intrinsics
+          assert(II->getType() == Type::getInt32Ty(II->getContext()) &&
+                 "Only support i32 atomic intrinsics for now");
+          // Substitute with atomicrmw instruction
+          assert(II->getNumArgOperands() == 2 &&
+                 "Expecting 2 operands for these atomics");
+          Value *Ptr = II->getArgOperand(0);
+          Value *Val = II->getArgOperand(1);
+          assert(Ptr->getType()->isPointerTy() &&
+                 "First argument of supported atomics is expected to be a "
+                 "pointer");
+          PointerType *PtrTy = cast<PointerType>(Ptr->getType());
+          PointerType *TargetTy =
+              Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace());
+          if (PtrTy != TargetTy) {
+            Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II);
+            PtrTy = TargetTy;
+          }
+
+          std::string name;
+          if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_add)
+            name = "atomic_add";
+          else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_sub)
+            name = "atomic_sub";
+          else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xchg)
+            name = "atomic_xchg";
+          else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_min)
+            name = "atomic_min";
+          else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_max)
+            name = "atomic_max";
+          else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_and)
+            name = "atomic_and";
+          else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_or)
+            name = "atomic_or";
+          else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xor)
+            name = "atomic_xor";
+          Type *paramTypes[] = {PtrTy, Val->getType()};
+          FunctionType *AtomFuncT = FunctionType::get(
+              II->getType(), ArrayRef<Type *>(paramTypes, 2), false);
+          FunctionCallee AtomFunc =
+              KernelM->getOrInsertFunction(name, AtomFuncT);
+
+          Value *Params[] = {Ptr, Val};
+          CallInst *AtomCI = CallInst::Create(
+              AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II);
+          DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
+          II->replaceAllUsesWith(AtomCI);
+          IItoRemove.push_back(II);
+        }
+        break;
+      default:
+        llvm_unreachable("Unknown HPVM Intrinsic!");
+        break;
       }
-      break;
-      case Intrinsic::visc_atomic_add:
-      case Intrinsic::visc_atomic_sub:
-      case Intrinsic::visc_atomic_xchg:
-      case Intrinsic::visc_atomic_min:
-      case Intrinsic::visc_atomic_max:
-      case Intrinsic::visc_atomic_and:
-      case Intrinsic::visc_atomic_or:
-      case Intrinsic::visc_atomic_xor:
-      {
-        DEBUG(errs() << *II << "\n");
-        // Only have support for i32 atomic intrinsics
-        assert(II->getType() == Type::getInt32Ty(II->getContext())
-               && "Only support i32 atomic intrinsics for now");
-        // Substitute with atomicrmw instruction
-        assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics");
-        Value* Ptr = II->getArgOperand(0);
-        Value* Val = II->getArgOperand(1);
-        assert(Ptr->getType()->isPointerTy()
-               && "First argument of supported atomics is expected to be a pointer");
-        PointerType* PtrTy = cast<PointerType>(Ptr->getType());
-        PointerType* TargetTy = Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace());
-        if (PtrTy != TargetTy) {
-          Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II);
-          PtrTy = TargetTy;
+
+    } else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
+      IRBuilder<> Builder(I);
+      Value *Source = MemCpyI->getSource();
+      Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
+      Value *Length = MemCpyI->getOperand(2);
+      DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
+      DEBUG(errs() << "Source: " << *Source << "\n");
+      DEBUG(errs() << "Destination: " << *Destination << "\n");
+      DEBUG(errs() << "Length: " << *Length << "\n");
+
+      size_t memcpy_length;
+      unsigned int memcpy_count;
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Length)) {
+        if (CI->getBitWidth() <= 64) {
+          memcpy_length = CI->getSExtValue();
+          DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
+          Type *Source_Type = Source->getType()->getPointerElementType();
+          DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
+          memcpy_count =
+              memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
+          DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
+          if (GetElementPtrInst *sourceGEPI =
+                  dyn_cast<GetElementPtrInst>(Source)) {
+            if (GetElementPtrInst *destGEPI =
+                    dyn_cast<GetElementPtrInst>(Destination)) {
+              Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
+              Value *DestPtrOperand = destGEPI->getPointerOperand();
+              for (int i = 0; i < memcpy_count; ++i) {
+                Constant *increment;
+                LoadInst *newLoadI;
+                StoreInst *newStoreI;
+                // First, need to increment the correct index for both source
+                // and dest. This involves checking to see how many indices the
+                // GEP has. Assume for now only 1 or 2 are the viable options.
+
+                std::vector<Value *> GEPlIndex;
+                if (sourceGEPI->getNumIndices() == 1) {
+                  Value *Index = sourceGEPI->getOperand(1);
+                  increment = ConstantInt::get(Index->getType(), i, false);
+                  Value *incAdd = Builder.CreateAdd(Index, increment);
+                  DEBUG(errs() << "Add: " << *incAdd << "\n");
+                  GEPlIndex.push_back(incAdd);
+                  Value *newGEPIl = Builder.CreateGEP(
+                      SourcePtrOperand, ArrayRef<Value *>(GEPlIndex));
+                  DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
+                  newLoadI = Builder.CreateLoad(newGEPIl);
+                  DEBUG(errs() << "Load: " << *newLoadI << "\n");
+                } else {
+                  llvm_unreachable("Unhandled case where source GEPI has more "
+                                   "than 1 indices!\n");
+                }
+
+                std::vector<Value *> GEPsIndex;
+                if (destGEPI->getNumIndices() == 1) {
+
+                } else if (destGEPI->getNumIndices() == 2) {
+                  Value *Index0 = destGEPI->getOperand(1);
+                  GEPsIndex.push_back(Index0);
+                  Value *Index1 = destGEPI->getOperand(2);
+                  increment = ConstantInt::get(Index1->getType(), i, false);
+                  Value *incAdd = Builder.CreateAdd(Index1, increment);
+                  DEBUG(errs() << "Add: " << *incAdd << "\n");
+                  GEPsIndex.push_back(incAdd);
+                  Value *newGEPIs = Builder.CreateGEP(
+                      DestPtrOperand, ArrayRef<Value *>(GEPsIndex));
+                  DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
+                  newStoreI = Builder.CreateStore(newLoadI, newGEPIs,
+                                                  MemCpyI->isVolatile());
+                  DEBUG(errs() << "Store: " << *newStoreI << "\n");
+                } else {
+                  llvm_unreachable("Unhandled case where dest GEPI has more "
+                                   "than 2 indices!\n");
+                }
+              }
+              IItoRemove.push_back(sourceGEPI);
+              IItoRemove.push_back(destGEPI);
+              Instruction *destBitcastI =
+                  dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
+              Instruction *sourceBitcastI =
+                  dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
+              IItoRemove.push_back(destBitcastI);
+              IItoRemove.push_back(sourceBitcastI);
+              IItoRemove.push_back(MemCpyI);
+            }
+          }
         }
+      } else {
+        llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
+      }
+      //      llvm_unreachable("HERE!");
+    }
 
-			 std::string name;
-			 if(II->getIntrinsicID() == Intrinsic::visc_atomic_add)
-				 name = "atomic_add";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
-				 name = "atomic_sub";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
-				 name = "atomic_xchg";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min)
-				 name = "atomic_min";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max)
-				 name = "atomic_max";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and)
-				 name = "atomic_and";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or)
-				 name = "atomic_or";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
-				 name = "atomic_xor";
-			 Type* paramTypes[] = {PtrTy, Val->getType()};
-			 FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false);	
-			 FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT);				
-
-			 Value* Params[] = {Ptr, Val};
-			 CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II);
-			 DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
-			 II->replaceAllUsesWith(AtomCI);
-			 IItoRemove.push_back(II);
-			}
-			break;
-			default:
-			llvm_unreachable("Unknown VISC Intrinsic!");
-			break;
-			}
-
-		}
-		else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
-			IRBuilder<> Builder(I);
-			Value *Source = MemCpyI->getSource();
-			Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
-			Value *Length = MemCpyI->getOperand(2);
-			DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
-			DEBUG(errs() << "Source: " << *Source << "\n"); 
-			DEBUG(errs() << "Destination: " << *Destination << "\n"); 
-			DEBUG(errs() << "Length: " << *Length << "\n");
-
-			size_t memcpy_length;
-			unsigned int memcpy_count;
-			if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) {
-				if (CI->getBitWidth() <= 64) {
-					memcpy_length = CI->getSExtValue();
-					DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
-					Type *Source_Type = Source->getType()->getPointerElementType();
-					DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
-					memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
-					DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
-					if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) {
-						if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) {
-							Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
-							Value *DestPtrOperand = destGEPI->getPointerOperand();
-							for(int i = 0; i < memcpy_count; ++i) {
-								Constant *increment;
-								LoadInst *newLoadI;
-								StoreInst *newStoreI;
-								// First, need to increment the correct index for both source and dest 
-								// This invluves checking to see how many indeces the GEP has
-								// Assume for now only 1 or 2 are the viable options.
-
-								std::vector<Value*> GEPlIndex;
-								if (sourceGEPI->getNumIndices() == 1) {
-									Value *Index = sourceGEPI->getOperand(1);      
-									increment = ConstantInt::get(Index->getType(), i, false);
-									Value *incAdd = Builder.CreateAdd(Index, increment);
-									DEBUG(errs() << "Add: " << *incAdd << "\n");
-									GEPlIndex.push_back(incAdd);
-									Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex));
-									DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
-									newLoadI = Builder.CreateLoad(newGEPIl);
-									DEBUG(errs() << "Load: " << *newLoadI << "\n");
-								} else { 
-									llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n");
-								}
-
-
-								std::vector<Value*> GEPsIndex;
-								if (destGEPI->getNumIndices() == 1) {
-
-								} else if (destGEPI->getNumIndices() == 2) {
-									Value *Index0 = destGEPI->getOperand(1);      
-									GEPsIndex.push_back(Index0);
-									Value *Index1 = destGEPI->getOperand(2);      
-									increment = ConstantInt::get(Index1->getType(), i, false);
-									Value *incAdd = Builder.CreateAdd(Index1, increment);
-									DEBUG(errs() << "Add: " << *incAdd << "\n");
-									GEPsIndex.push_back(incAdd);
-									Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex));
-									DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
-									newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile());
-									DEBUG(errs() << "Store: " << *newStoreI << "\n");
-								} else {
-									llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n");
-								}
-							}
-							IItoRemove.push_back(sourceGEPI);
-							IItoRemove.push_back(destGEPI);
-							Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
-							Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
-							IItoRemove.push_back(destBitcastI);
-							IItoRemove.push_back(sourceBitcastI);
-							IItoRemove.push_back(MemCpyI);
-						}
-					}
-
-				}
-			} else {
-				llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
-			}
-			//      llvm_unreachable("HERE!");
-		}
-
-		else if(CallInst* CI = dyn_cast<CallInst>(I)) {
-			DEBUG(errs() << "Found a call: " << *CI << "\n");
-			Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
-			if(calleeF->isDeclaration()) {
-				// Add the declaration to kernel module
-				if (calleeF->getName() == "sqrtf") {
-					calleeF->setName(Twine("sqrt"));
-					DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-					DEBUG(errs() << "CI: " << *CI << "\n");
-				} else if (calleeF->getName() == "rsqrtf") {
-					calleeF->setName(Twine("rsqrt"));
-					DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-					DEBUG(errs() << "CI: " << *CI << "\n");
-				}  
-				DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
-				KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
-			}
-			else {
-				// Check if the called function has already been cloned before.
-				Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
-				// Iterate over the new function to see if it calls any other functions
-				// in the module.
-				for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
-					if(auto *Call = dyn_cast<CallInst>(&*i)) {
-						Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
-						CloneAndReplaceCall(Call, CalledFunc);
-					}
-				}
-			}
-			//TODO: how to handle address space qualifiers in load/store
-		}
-
-	}
-	// search for pattern where float is being casted to int and loaded/stored and change it.	
-	DEBUG(errs() << "finding pattern for replacement!\n");
-	for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
-		bool cont = false;
-		bool keepGEPI = false;
-		bool keepGEPI2= false;
-		Instruction *I = &(*i);
-		GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I);
-
-		if (!GEPI) {
-			// did nod find pattern start, continue
-			continue;
-		}
-		// may have found pattern, check
-		DEBUG(errs() << "GEPI " << *GEPI << "\n");
-		// print whatever we want for debug
-		Value* PtrOp = GEPI->getPointerOperand();
-		Type *SrcTy = GEPI->getSourceElementType();
-		unsigned GEPIaddrspace = GEPI->getAddressSpace();
-
-		if (SrcTy->isArrayTy()) 
-			DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n");
-		else
-			DEBUG(errs() << *SrcTy << " is not an array type!\n");
-		// check that source element type is float
-		if (SrcTy->isArrayTy()) {
-			if (!(SrcTy->getArrayElementType()->isFloatTy())) {
-				DEBUG(errs() << "GEPI type is array but not float!\n");
-				continue;
-			}
-		}
-		else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) {
-			DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
-			// does not fit this pattern - no float GEP instruction
-			continue;
-		}
-		// check that addressspace is 1
-		//	  if (GEPIaddrspace != 1) {
-		//			// does not fit this pattern - addrspace of pointer argument is not global
-		//			continue;
-		//		}
-		if (!(GEPI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			//continue;
-			// Keep GEPI around if it has other uses
-			keepGEPI = true;
-		}
-		DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
-
-		// 1st GEPI it has one use
-		//		assert(GEPI->hasOneUse() && "GEPI has a single use");
-
-		// See if it is a bitcast
-		BitCastInst *BitCastI;
-		for (User * U : GEPI->users()) {
-			if(Instruction *ui = dyn_cast<Instruction> (U)) { 
-				DEBUG(errs() << "--" << *ui << "\n");
-				if (isa<BitCastInst>(ui)) {
-					BitCastI = dyn_cast<BitCastInst>(ui);
-					DEBUG(errs() << "---Found bitcast as only use of GEP\n");
-					break;
-				}
-			}
-			DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
-			cont = true;
-		}
-		//		for (Value::user_iterator ui = GEPI->user_begin(),
-		//				ue = GEPI->user_end(); ui!=ue; ++ui) {
-		//        DEBUG(errs() << "--" << *ui << "\n");
-		//			if (isa<BitCastInst>(*ui)) {
-		//				BitCastI = dyn_cast<BitCastInst>(*ui);
-		//        DEBUG(errs() << "Found bitcast as only use of GEP\n");
-		//			}
-		//		}
-
-		if (cont/*!BitCastI*/) {
-			continue; // not in pattern
-		}
-
-		//    DEBUG(errs() << *BitCastI << "\n");
-		// Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP.
-		Value *Op2 = BitCastI->getOperand(0);
-		DEBUG(errs() << "----" << *Op2 << "\n");
-		//		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
-		//		Type *OpTy = cast<Type>(Op2);
-		Type *OpTy = BitCastI->getDestTy();
-		DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
-		//    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n");
-		if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
-			// maybe right syntax is (Type::getInt32Ty)->getPointerTo()
-			continue; // not in pattern
-		}
-
-		DEBUG(errs() << "----Here!\n");
-		// We are in GEP, bitcast.
-
-		// user_iterator, to find the load.
-
-		if (!(BitCastI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			continue;
-		}
-		DEBUG(errs() << "----Bitcast has one use!\n");
-		// it has one use
-		assert(BitCastI->hasOneUse() && "BitCastI has a single use");
-		LoadInst *LoadI;
-		for (User * U : BitCastI->users()) { 
-			if (Instruction *ui = dyn_cast<Instruction> (U)) {
-				DEBUG(errs() << "-----" << *ui << "\n");
-				if (isa<LoadInst>(ui)) {
-					LoadI = dyn_cast<LoadInst>(ui);
-					DEBUG(errs() << "-----Found load as only use of bitcast\n");
-					break;
-				}
-			}
-			DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
-			cont = true;
-		}
-		//		for (Value::user_iterator ui = BitCastI->user_begin(),
-		//				ue = BitCastI->user_end(); ui!=ue; ++ui) {
-		//			if (isa<LoadInst>(*ui)) {
-		//				LoadI = dyn_cast<LoadInst>(*ui);
-		//        errs() << "Found load as only use of bitcast\n";
-		//			}
-		//		}
-
-		if (cont) {
-			continue; // not in pattern
-		}
-
-		DEBUG("HERE!\n");
-		// check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from
-		assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n");
-
-		// Copy user_iterator, to find the store.
-
-		if (!(LoadI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			continue;
-			// TODO: generalize: one load can have more than one store users
-		}
-
-		// it has one use
-		assert(LoadI->hasOneUse() && "LoadI has a single use");
-		Value::user_iterator ui = LoadI->user_begin();
-		// skipped loop, because is has a single use
-		StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
-		if (!StoreI) {
-			continue; // not in pattern
-		}
-
-		// Also check that the store uses the loaded value as the value operand
-		if (StoreI->getValueOperand() != LoadI) {
-			continue;
-		}
-
-		DEBUG(errs() << "-------Found store instruction\n");
-
-		// Look for its bitcast, which is its pointer operand
-		Value *StPtrOp = StoreI->getPointerOperand();
-		DEBUG(errs() << "-------" << *StPtrOp << "\n");
-		BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
-		DEBUG(errs() << "-------" << *BitCastI2 << "\n");
-		if (!BitCastI2) {
-			continue; //not in pattern
-		}
-
-		DEBUG(errs() << "-------- Found Bit Cast of store!\n" );
-		// found bitcast. Look for the second GEP, its from operand.
-		Value *BCFromOp = BitCastI2->getOperand(0);
-		GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
-		DEBUG(errs() << "---------- " << *GEPI2 << "\n");
-		if (!GEPI2) {
-			continue; //not in pattern
-		}
-
-		if (!(GEPI2->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			//continue;
-			// Keep GEPI around if it has other uses
-			keepGEPI2 = true;
-		}
-		DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); 
-
-		Value *PtrOp2 = GEPI2->getPointerOperand();
-
-		// Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above.
-
-		// Assume we found pattern
-		if (!keepGEPI) {  
-			IItoRemove.push_back(GEPI);
-			DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
-		} else {
-			DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
-		}
-		IItoRemove.push_back(BitCastI);
-		DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
-		IItoRemove.push_back(LoadI);
-		DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
-		IItoRemove.push_back(GEPI2);
-		DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
-		IItoRemove.push_back(BitCastI2);
-		DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
-		if (!keepGEPI2) {
-			IItoRemove.push_back(StoreI);
-			DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
-		} else {
-
-			DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n");
-		}
-
-		std::vector<Value*> GEPlIndex;
-		if (GEPI->hasIndices()) {
-			for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
-				Value *Index = dyn_cast<Value>(&*ii);
-				DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
-				GEPlIndex.push_back(Index);
-			}
-		}
-		//    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
-
-		std::vector<Value*> GEPsIndex;
-		if (GEPI2->hasIndices()) {
-			for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
-				Value *Index = dyn_cast<Value>(&*ii);
-				DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
-				GEPsIndex.push_back(Index);
-			}
-		}
-		//    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
-
-
-
-		//    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
-		GetElementPtrInst* newlGEP =
-			GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()),
-					PtrOp, // operand from 1st GEP
-					ArrayRef<Value*>(GEPlIndex),
-					Twine(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newlGEP << "\n");
-		// insert load before GEPI
-		LoadInst *newLoadI =
-			new LoadInst(Type::getFloatTy(M.getContext()),
-					newlGEP, // new GEP
-					Twine(),
-					LoadI->isVolatile(),
-					LoadI->getAlignment(),
-					LoadI->getOrdering(),
-					LoadI->getSyncScopeID(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newLoadI << "\n");
-		// same for GEP for store, for store operand
-		GetElementPtrInst* newsGEP =
-			GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
-					PtrOp2, // operand from 2nd GEP
-					ArrayRef<Value*>(GEPsIndex),
-					Twine(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newsGEP << "\n");
-		// insert store before GEPI
-		StoreInst *newStoreI =
-			new StoreInst(newLoadI,
-					newsGEP, // new GEP
-					StoreI->isVolatile(),
-					StoreI->getAlignment(),
-					StoreI->getOrdering(),
-					StoreI->getSyncScopeID(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newStoreI << "\n");
-
-	}
-
-	// We need to do this explicitly: DCE pass will not remove them because we
-	// have assumed theworst memory behaviour for these function calls
-	// Traverse the vector backwards, otherwise definitions are deleted while
-	// their subsequent uses are still around
-	for (auto *I : reverse(IItoRemove)) {
-		DEBUG(errs() << "Erasing: " << *I << "\n");
-		I->eraseFromParent();
-	}
-
-	// Removed the cloned functions from the parent module into the new module 
-	for(auto *F : FuncToBeRemoved) {
-		F->removeFromParent(); //TODO: MARIA check
-		KernelM->getFunctionList().push_back(F);
-	}
-
-	addCLMetadata(F_nvptx);
-	kernel->KernelFunction = F_nvptx;
-	errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
-	DEBUG(errs() << *KernelM);
-
-	return;
-}
+    else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      DEBUG(errs() << "Found a call: " << *CI << "\n");
+      Function *calleeF =
+          cast<Function>(CI->getCalledValue()->stripPointerCasts());
+      if (calleeF->isDeclaration()) {
+        // Add the declaration to kernel module
+        if (calleeF->getName() == "sqrtf") {
+          calleeF->setName(Twine("sqrt"));
+          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
+          DEBUG(errs() << "CI: " << *CI << "\n");
+        } else if (calleeF->getName() == "rsqrtf") {
+          calleeF->setName(Twine("rsqrt"));
+          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
+          DEBUG(errs() << "CI: " << *CI << "\n");
+        }
+        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF
+                     << "\n");
+        KernelM->getOrInsertFunction(calleeF->getName(),
+                                     calleeF->getFunctionType());
+      } else {
+        // Check if the called function has already been cloned before.
+        Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
+        // Iterate over the new function to see if it calls any other functions
+        // in the module.
+        for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc);
+             i != e; ++i) {
+          if (auto *Call = dyn_cast<CallInst>(&*i)) {
+            Function *CalledFunc =
+                cast<Function>(Call->getCalledValue()->stripPointerCasts());
+            CloneAndReplaceCall(Call, CalledFunc);
+          }
+        }
+      }
+      // TODO: how to handle address space qualifiers in load/store
+    }
+  }
+  // Search for the pattern where a float is cast to int and loaded/stored,
+  // and change it.
+  DEBUG(errs() << "finding pattern for replacement!\n");
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e;
+       ++i) {
+    bool cont = false;
+    bool keepGEPI = false;
+    bool keepGEPI2 = false;
+    Instruction *I = &(*i);
+    GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
 
-bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
-	errs() << "\nDFG2LLVM_NVPTX PASS\n";
+    if (!GEPI) {
+      // did not find pattern start, continue
+      continue;
+    }
+    // may have found pattern, check
+    DEBUG(errs() << "GEPI " << *GEPI << "\n");
+    // print whatever we want for debug
+    Value *PtrOp = GEPI->getPointerOperand();
+    Type *SrcTy = GEPI->getSourceElementType();
+    unsigned GEPIaddrspace = GEPI->getAddressSpace();
+
+    if (SrcTy->isArrayTy())
+      DEBUG(errs() << *SrcTy << " is an array type! "
+                   << *(SrcTy->getArrayElementType()) << "\n");
+    else
+      DEBUG(errs() << *SrcTy << " is not an array type!\n");
+    // check that source element type is float
+    if (SrcTy->isArrayTy()) {
+      if (!(SrcTy->getArrayElementType()->isFloatTy())) {
+        DEBUG(errs() << "GEPI type is array but not float!\n");
+        continue;
+      }
+    } else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) {
+      DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
+      // does not fit this pattern - no float GEP instruction
+      continue;
+    }
+    // check that addressspace is 1
+    //   if (GEPIaddrspace != 1) {
+    //     // does not fit - addrspace of pointer argument is not global
+    //     continue;
+    //   }
+    if (!(GEPI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      // continue;
+      // Keep GEPI around if it has other uses
+      keepGEPI = true;
+    }
+    DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
+
+    // 1st GEPI it has one use
+    //		assert(GEPI->hasOneUse() && "GEPI has a single use");
+
+    // See if it is a bitcast
+    BitCastInst *BitCastI;
+    for (User *U : GEPI->users()) {
+      if (Instruction *ui = dyn_cast<Instruction>(U)) {
+        DEBUG(errs() << "--" << *ui << "\n");
+        if (isa<BitCastInst>(ui)) {
+          BitCastI = dyn_cast<BitCastInst>(ui);
+          DEBUG(errs() << "---Found bitcast as only use of GEP\n");
+          break;
+        }
+      }
+      DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
+      cont = true;
+    }
+    //		for (Value::user_iterator ui = GEPI->user_begin(),
+    //				ue = GEPI->user_end(); ui!=ue; ++ui) {
+    //        DEBUG(errs() << "--" << *ui << "\n");
+    //			if (isa<BitCastInst>(*ui)) {
+    //				BitCastI = dyn_cast<BitCastInst>(*ui);
+    //        DEBUG(errs() << "Found bitcast as only use of GEP\n");
+    //			}
+    //		}
+
+    if (cont /*!BitCastI*/) {
+      continue; // not in pattern
+    }
+
+    //    DEBUG(errs() << *BitCastI << "\n");
+    // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand
+    // has to be the GEP, since this is a use of the GEP.
+    Value *Op2 = BitCastI->getOperand(0);
+    DEBUG(errs() << "----" << *Op2 << "\n");
+    //		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
+    //		Type *OpTy = cast<Type>(Op2);
+    Type *OpTy = BitCastI->getDestTy();
+    DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
+    //    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) <<
+    //    "\n");
+    if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
+      // maybe right syntax is (Type::getInt32Ty)->getPointerTo()
+      continue; // not in pattern
+    }
+
+    DEBUG(errs() << "----Here!\n");
+    // We are in GEP, bitcast.
+
+    // user_iterator, to find the load.
+
+    if (!(BitCastI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      continue;
+    }
+    DEBUG(errs() << "----Bitcast has one use!\n");
+    // it has one use
+    assert(BitCastI->hasOneUse() && "BitCastI has a single use");
+    LoadInst *LoadI;
+    for (User *U : BitCastI->users()) {
+      if (Instruction *ui = dyn_cast<Instruction>(U)) {
+        DEBUG(errs() << "-----" << *ui << "\n");
+        if (isa<LoadInst>(ui)) {
+          LoadI = dyn_cast<LoadInst>(ui);
+          DEBUG(errs() << "-----Found load as only use of bitcast\n");
+          break;
+        }
+      }
+      DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
+      cont = true;
+    }
+    //		for (Value::user_iterator ui = BitCastI->user_begin(),
+    //				ue = BitCastI->user_end(); ui!=ue; ++ui) {
+    //			if (isa<LoadInst>(*ui)) {
+    //				LoadI = dyn_cast<LoadInst>(*ui);
+    //        errs() << "Found load as only use of bitcast\n";
+    //			}
+    //		}
+
+    if (cont) {
+      continue; // not in pattern
+    }
 
-	// Get the BuildDFG Analysis Results:
-	// - Dataflow graph
-	// - Maps from i8* hansles to DFNode and DFEdge
-	BuildDFG &DFG = getAnalysis<BuildDFG>();
+    DEBUG("HERE!\n");
+    // check that we load from pointer we got from bitcast - assert - the unique
+    // argument must be the use we found it from
+    assert(LoadI->getPointerOperand() == BitCastI &&
+           "Unexpected Load Instruction Operand\n");
 
-	// DFInternalNode *Root = DFG.getRoot();
-	std::vector<DFInternalNode*> Roots = DFG.getRoots();
-	//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-	//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+    // Copy user_iterator, to find the store.
 
-	// Visitor for Code Generation Graph Traversal
-	CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+    if (!(LoadI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      continue;
+      // TODO: generalize: one load can have more than one store users
+    }
+
+    // it has one use
+    assert(LoadI->hasOneUse() && "LoadI has a single use");
+    Value::user_iterator ui = LoadI->user_begin();
+    // skipped loop, because it has a single use
+    StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
+    if (!StoreI) {
+      continue; // not in pattern
+    }
+
+    // Also check that the store uses the loaded value as the value operand
+    if (StoreI->getValueOperand() != LoadI) {
+      continue;
+    }
+
+    DEBUG(errs() << "-------Found store instruction\n");
+
+    // Look for its bitcast, which is its pointer operand
+    Value *StPtrOp = StoreI->getPointerOperand();
+    DEBUG(errs() << "-------" << *StPtrOp << "\n");
+    BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
+    DEBUG(errs() << "-------" << *BitCastI2 << "\n");
+    if (!BitCastI2) {
+      continue; // not in pattern
+    }
+
+    DEBUG(errs() << "-------- Found Bit Cast of store!\n");
+    // found bitcast. Look for the second GEP, its from operand.
+    Value *BCFromOp = BitCastI2->getOperand(0);
+    GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
+    DEBUG(errs() << "---------- " << *GEPI2 << "\n");
+    if (!GEPI2) {
+      continue; // not in pattern
+    }
+
+    if (!(GEPI2->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      // continue;
+      // Keep GEPI around if it has other uses
+      keepGEPI2 = true;
+    }
+    DEBUG(errs() << "---------- Found GEPI of Bitcast!\n");
+
+    Value *PtrOp2 = GEPI2->getPointerOperand();
+
+    // Found GEPI2. TODO: kind of confused as to what checks I need to add
+    // here, let's add them together - all the code for int-float type checks
+    // is already above.
+
+    // Assume we found pattern
+    if (!keepGEPI) {
+      IItoRemove.push_back(GEPI);
+      DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
+    } else {
+      DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
+    }
+    IItoRemove.push_back(BitCastI);
+    DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
+    IItoRemove.push_back(LoadI);
+    DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
+    IItoRemove.push_back(GEPI2);
+    DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
+    IItoRemove.push_back(BitCastI2);
+    DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
+    if (!keepGEPI2) {
+      IItoRemove.push_back(StoreI);
+      DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
+    } else {
+
+      DEBUG(errs() << "Keeping " << *StoreI
+                   << " since it has multiple uses!\n");
+    }
+
+    std::vector<Value *> GEPlIndex;
+    if (GEPI->hasIndices()) {
+      for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
+        Value *Index = dyn_cast<Value>(&*ii);
+        DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
+        GEPlIndex.push_back(Index);
+      }
+    }
+    //    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
+
+    std::vector<Value *> GEPsIndex;
+    if (GEPI2->hasIndices()) {
+      for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
+        Value *Index = dyn_cast<Value>(&*ii);
+        DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
+        GEPsIndex.push_back(Index);
+      }
+    }
+    //    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
+
+    //    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
+    GetElementPtrInst *newlGEP = GetElementPtrInst::Create(
+        GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+        PtrOp,                        // operand from 1st GEP
+        ArrayRef<Value *>(GEPlIndex), Twine(), StoreI);
+    DEBUG(errs() << "Adding: " << *newlGEP << "\n");
+    // insert load before StoreI
+    LoadInst *newLoadI =
+        new LoadInst(Type::getFloatTy(M.getContext()),
+                     newlGEP, // new GEP
+                     Twine(), LoadI->isVolatile(), LoadI->getAlignment(),
+                     LoadI->getOrdering(), LoadI->getSyncScopeID(), StoreI);
+    DEBUG(errs() << "Adding: " << *newLoadI << "\n");
+    // same for GEP for store, for store operand
+    GetElementPtrInst *newsGEP = GetElementPtrInst::Create(
+        GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+        PtrOp2,                        // operand from 2nd GEP
+        ArrayRef<Value *>(GEPsIndex), Twine(), StoreI);
+    DEBUG(errs() << "Adding: " << *newsGEP << "\n");
+    // insert store before StoreI
+    StoreInst *newStoreI =
+        new StoreInst(newLoadI,
+                      newsGEP, // new GEP
+                      StoreI->isVolatile(), StoreI->getAlignment(),
+                      StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI);
+    DEBUG(errs() << "Adding: " << *newStoreI << "\n");
+  }
+
+  // We need to do this explicitly: DCE pass will not remove them because we
+  // have assumed the worst memory behaviour for these function calls
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around
+  for (auto *I : reverse(IItoRemove)) {
+    DEBUG(errs() << "Erasing: " << *I << "\n");
+    I->eraseFromParent();
+  }
+
+  // Move the cloned functions from the parent module into the new module
+  for (auto *F : FuncToBeRemoved) {
+    F->removeFromParent(); // TODO: MARIA check
+    KernelM->getFunctionList().push_back(F);
+  }
+
+  addCLMetadata(F_nvptx);
+  kernel->KernelFunction = F_nvptx;
+  DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName()
+               << "\n");
+  DEBUG(errs() << *KernelM);
 
-	// Iterate over all the DFGs and produce code for each one of them
-	for (auto rootNode: Roots) {
-		// Initiate code generation for root DFNode
-		CGTVisitor->visit(rootNode);
-	}
+  return;
+}
+
+bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
+  DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n");
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode *> Roots = DFG.getRoots();
+  //    BuildDFG::HandleToDFNode &HandleToDFNodeMap =
+  //    DFG.getHandleToDFNodeMap(); BuildDFG::HandleToDFEdge &HandleToDFEdgeMap
+  //    = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode : Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+  }
 
-	CGTVisitor->writeKernelsModule();
+  CGTVisitor->writeKernelsModule();
 
-	//TODO: Edit module epilogue to remove the VISC intrinsic declarations
-	delete CGTVisitor;
+  // TODO: Edit module epilogue to remove the HPVM intrinsic declarations
+  delete CGTVisitor;
 
-	return true;
+  return true;
 }
 
 std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
-	/*SmallString<128> currentDir;
-		llvm::sys::fs::current_path(currentDir);
-		std::string fileName = getFilenameFromModule(M);
-		Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-		return output.str().append(".kernels.ll");*/
-	std::string mid = M.getModuleIdentifier();
-	return mid.append(".kernels.ll");
+  /*SmallString<128> currentDir;
+          llvm::sys::fs::current_path(currentDir);
+          std::string fileName = getFilenameFromModule(M);
+          Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+          return output.str().append(".kernels.ll");*/
+  std::string mid = M.getModuleIdentifier();
+  return mid.append(".kernels.ll");
 }
 
-void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) {
-	assert(isa<PointerType>(V->getType())
-			&& "Value should be of Pointer Type!");
-	PointerType* OldTy = cast<PointerType>(V->getType());
-	PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-	V->mutateType(NewTy);
-	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
-		// Change all uses producing pointer type in same address space to new
-		// addressspace.
-		if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
-			if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-				fixValueAddrspace(*ui, addrspace);
-			}
-		}
-	}
+void CGT_NVPTX::fixValueAddrspace(Value *V, unsigned addrspace) {
+  assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!");
+  PointerType *OldTy = cast<PointerType>(V->getType());
+  PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+  V->mutateType(NewTy);
+  for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
+       ui++) {
+    // Change all uses producing pointer type in same address space to new
+    // addressspace.
+    if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) {
+      if (PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+        fixValueAddrspace(*ui, addrspace);
+      }
+    }
+  }
 }
 
-
-std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) {
-	std::vector<unsigned> ConstantMemArgs;
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Argument* arg = &*ai; 
-		std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
-				GlobalMemArgs->end(), arg->getArgNo());
-		// It has to be a global memory argument to be promotable
-		if(pos == GlobalMemArgs->end())
-			continue;
-
-		// Check if it can/should be promoted
-		if(canBePromoted(arg, F)) {
-			errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n";
-			ConstantMemArgs.push_back(arg->getArgNo());
-			GlobalMemArgs->erase(pos);
-		}
-	}
-	return ConstantMemArgs;
+std::vector<unsigned>
+CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs,
+                                     Function *F) {
+  std::vector<unsigned> ConstantMemArgs;
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Argument *arg = &*ai;
+    std::vector<unsigned>::iterator pos = std::find(
+        GlobalMemArgs->begin(), GlobalMemArgs->end(), arg->getArgNo());
+    // It has to be a global memory argument to be promotable
+    if (pos == GlobalMemArgs->end())
+      continue;
+
+    // Check if it can/should be promoted
+    if (canBePromoted(arg, F)) {
+      DEBUG(errs() << "Promoting << " << arg->getName()
+                   << " to constant memory."
+                   << "\n");
+      ConstantMemArgs.push_back(arg->getArgNo());
+      GlobalMemArgs->erase(pos);
+    }
+  }
+  return ConstantMemArgs;
 }
 
-Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
-	unsigned idx = 0;
-	std::vector<Type*> ArgTypes;
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Argument *arg = &*ai;
-		DEBUG(errs() << *arg << "\n");
-		unsigned argno = arg->getArgNo();
-		if ((idx < Args.size()) && (argno == Args[idx])) {
-			fixValueAddrspace(arg, addrspace);
-			idx++;
-		}
-		ArgTypes.push_back(arg->getType());
-	}
-	FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
-
-	//F->mutateType(PTy);
-	Function* newF = cloneFunction(F, newFT, false);
-	replaceNodeFunctionInIR(*F->getParent(), F, newF);
-
-	DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
-	return newF;
+Function *CGT_NVPTX::changeArgAddrspace(Function *F,
+                                        std::vector<unsigned> &Args,
+                                        unsigned addrspace) {
+  unsigned idx = 0;
+  std::vector<Type *> ArgTypes;
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Argument *arg = &*ai;
+    DEBUG(errs() << *arg << "\n");
+    unsigned argno = arg->getArgNo();
+    if ((idx < Args.size()) && (argno == Args[idx])) {
+      fixValueAddrspace(arg, addrspace);
+      idx++;
+    }
+    ArgTypes.push_back(arg->getType());
+  }
+  FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
+
+  // F->mutateType(PTy);
+  Function *newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+  DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n");
+  return newF;
 }
 
 /* Add metadata to module KernelM, for OpenCL kernels */
 void CGT_NVPTX::addCLMetadata(Function *F) {
 
-	IRBuilder<> Builder(&*F->begin());
+  IRBuilder<> Builder(&*F->begin());
+
+  SmallVector<Metadata *, 8> KernelMD;
+  KernelMD.push_back(ValueAsMetadata::get(F));
+
+  // TODO: There is additional metadata used by kernel files but we skip them as
+  // they are not mandatory. In future they might be useful to enable
+  // optimizations
+
+  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_kernels =
+      KernelM->getOrInsertNamedMetadata("opencl.kernels");
+  MDN_kernels->addOperand(MDKernelNode);
+
+  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+  // TODO: Replace 1 with the number of the kernel.
+  // Add when support for multiple launches is added
+  KernelMD.push_back(ValueAsMetadata::get(
+      ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)));
+  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_annotations =
+      KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+}
 
-	SmallVector<Metadata*,8> KernelMD;
-	KernelMD.push_back(ValueAsMetadata::get(F));
+void CGT_NVPTX::writeKernelsModule() {
 
-	// TODO: There is additional metadata used by kernel files but we skip them as
-	// they are not mandatory. In future they might be useful to enable
-	// optimizations
+  // In addition to deleting all other functions, we also want to spiff it
+  // up a little bit.  Do this now.
+  legacy::PassManager Passes;
 
-	MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
-	NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
-	MDN_kernels->addOperand(MDKernelNode);
+  DEBUG(errs() << "Writing to File --- ");
+  DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n");
+  std::error_code EC;
+  ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
+  if (EC) {
+    DEBUG(errs() << EC.message() << '\n');
+  }
 
-	KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
-	// TODO: Replace 1 with the number of the kernel.
-	// Add when support for multiple launces is added
-	KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
-	MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
-	NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
-	MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+  Passes.add(createPrintModulePass(Out.os()));
 
+  Passes.run(*KernelM);
+
+  // Declare success.
+  Out.keep();
 }
 
-void CGT_NVPTX::writeKernelsModule() {
+Function *CGT_NVPTX::transformFunctionToVoid(Function *F) {
 
-	// In addition to deleting all other functions, we also want to spiff it
-	// up a little bit.  Do this now.
-	legacy::PassManager Passes;
+  DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
+  // FIXME: Maybe do that using the Node?
+  StructType *FRetTy = dyn_cast<StructType>(F->getReturnType());
+  assert(FRetTy && "Return Type must always be a struct");
 
-	errs() << "Writing to File --- ";
-	errs() << getKernelsModuleName(M).c_str() << "\n";
-	std::error_code EC;
-	ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
-	if (EC) {
-		errs() << EC.message() << '\n';
-	}
+  // Keeps return statements, because we will need to replace them
+  std::vector<ReturnInst *> RItoRemove;
+  findReturnInst(F, RItoRemove);
 
-	Passes.add(
-			createPrintModulePass(Out.os()));
+  std::vector<Type *> RetArgTypes;
+  std::vector<Argument *> RetArgs;
+  std::vector<Argument *> Args;
+  // Check for { } return struct, which means that the function returns void
+  if (FRetTy->isEmptyTy()) {
 
-	Passes.run(*KernelM);
+    DEBUG(errs() << "\tFunction output struct is void\n");
+    DEBUG(errs() << "\tNo parameters added\n");
 
-	// Declare success.
-	Out.keep();
-}
+    // Replacing return statements with others returning void
+    for (auto *RI : RItoRemove) {
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+    }
+    DEBUG(errs() << "\tChanged return statements to return void\n");
+  } else {
+    // The struct has return values, thus needs to be converted to parameter
+
+    // Iterate over all element types of return struct and add arguments to the
+    // function
+    for (unsigned i = 0; i < FRetTy->getNumElements(); i++) {
+      Argument *RetArg =
+          new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+      RetArgs.push_back(RetArg);
+      RetArgTypes.push_back(RetArg->getType());
+      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+    }
 
-Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
-
-	DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
-	// FIXME: Maybe do that using the Node?
-	StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
-	assert(FRetTy && "Return Type must always be a struct");
-
-	// Keeps return statements, because we will need to replace them
-	std::vector<ReturnInst *> RItoRemove;
-	findReturnInst(F, RItoRemove);
-
-	std::vector<Type *> RetArgTypes;
-	std::vector<Argument*> RetArgs;
-	std::vector<Argument*> Args;
-	// Check for { } return struct, which means that the function returns void
-	if (FRetTy->isEmptyTy()) {
-
-		DEBUG(errs() << "\tFunction output struct is void\n");
-		DEBUG(errs() << "\tNo parameters added\n");
-
-		// Replacing return statements with others returning void
-		for (auto *RI : RItoRemove) {
-			ReturnInst::Create((F->getContext()), 0, RI);
-			RI->eraseFromParent();
-		}
-		DEBUG(errs() << "\tChanged return statements to return void\n");
-	}
-	else {
-		// The struct has return values, thus needs to be converted to parameter
-
-		// Iterate over all element types of return struct and add arguments to the
-		// function
-		for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
-			Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
-			RetArgs.push_back(RetArg);
-			RetArgTypes.push_back(RetArg->getType());
-			DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
-		}
-
-		DEBUG(errs() << "\tReplacing Return statements\n");
-		// Replace return statements with extractValue and store instructions
-		for (auto *RI : RItoRemove) {
-			Value* RetVal = RI->getReturnValue();
-			for(unsigned i = 0; i < RetArgs.size(); i++) {
-				ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
-						RetArgs[i]->getName()+".val", RI);
-				new StoreInst(EI, RetArgs[i], RI);
-			}
-			// assert(RetVal && "Return value should not be null at this point");
-			// StructType* RetType = cast<StructType>(RetVal->getType());
-			// assert(RetType && "Return type is not a struct");
-
-			ReturnInst::Create((F->getContext()), 0, RI);
-			RI->eraseFromParent();
-
-		}
-	}
-	DEBUG(errs() << "\tReplaced return statements\n");
-
-	// Create the argument type list with the added argument's type
-	std::vector<Type*> ArgTypes;
-	for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		ArgTypes.push_back(ai->getType());
-	}
-	for(auto *RATy: RetArgTypes) {
-		ArgTypes.push_back(RATy);
-	}
-
-	// Creating Args vector to use in cloning!
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Args.push_back(&*ai);
-	}
-	for(auto *ai : RetArgs) {
-		Args.push_back(ai);
-	}
-
-	// Adding new arguments to the function argument list, would not change the
-	// function type. We need to change the type of this function to reflect the
-	// added arguments
-	Type* VoidRetType = Type::getVoidTy(F->getContext());
-	FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-
-	// Change the function type
-	//F->mutateType(PTy);
-	Function* newF = cloneFunction(F, newFT, false, NULL, &Args);
-	replaceNodeFunctionInIR(*F->getParent(), F, newF);
-	//F->eraseFromParent();
-	return newF;
+    DEBUG(errs() << "\tReplacing Return statements\n");
+    // Replace return statements with extractValue and store instructions
+    for (auto *RI : RItoRemove) {
+      Value *RetVal = RI->getReturnValue();
+      for (unsigned i = 0; i < RetArgs.size(); i++) {
+        ExtractValueInst *EI = ExtractValueInst::Create(
+            RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI);
+        new StoreInst(EI, RetArgs[i], RI);
+      }
+      // assert(RetVal && "Return value should not be null at this point");
+      // StructType* RetType = cast<StructType>(RetVal->getType());
+      // assert(RetType && "Return type is not a struct");
+
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+    }
+  }
+  DEBUG(errs() << "\tReplaced return statements\n");
+
+  // Create the argument type list with the added argument's type
+  std::vector<Type *> ArgTypes;
+  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+       ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+  for (auto *RATy : RetArgTypes) {
+    ArgTypes.push_back(RATy);
+  }
+
+  // Creating Args vector to use in cloning!
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Args.push_back(&*ai);
+  }
+  for (auto *ai : RetArgs) {
+    Args.push_back(ai);
+  }
+
+  // Adding new arguments to the function's argument list does not change the
+  // function type. We need to build a new function type that reflects the
+  // added arguments
+  Type *VoidRetType = Type::getVoidTy(F->getContext());
+  FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+  // Change the function type
+  // F->mutateType(PTy);
+  Function *newF = cloneFunction(F, newFT, false, NULL, &Args);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+  // F->eraseFromParent();
+  return newF;
 }
 
 /******************************************************************************
@@ -2102,314 +2138,344 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 // 1. No stores
 // 2. Loads not dependent on getNodeInstanceID itrinsic
 
-static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) {
-	if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
-		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-		return false;
-	}
-	VisitedList->push_back(V);
-	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
-			ui != ue; ++ui) {
-		Instruction* I = dyn_cast<Instruction>(*ui);
-		if(!I) {
-			// if use is not an instruction, then skip it
-			continue;
-		}
-		DEBUG(errs() << "\t" << *I << "\n");
-		if(isa<LoadInst>(I)) {
-			DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
-			DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
-			UseList->push_back(V);
-		}
-		else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
-			// found a store in use chain
-			DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
-			return true;
-		}
-		else if(BuildDFG::isViscIntrinsic(I)) {
-			// If it is an atomic intrinsic, we found a store
-			IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-			assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
-					&& "Only visc atomic intrinsics can have an argument as input");
-			return true;
-		}
-		else {
-			DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
-			if(findLoadStoreUses(I, UseList, VisitedList))
-				return true;
-		}
-	}
-	return false;
+static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList,
+                              std::vector<Value *> *VisitedList) {
+  if (std::find(VisitedList->begin(), VisitedList->end(), V) !=
+      VisitedList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  VisitedList->push_back(V);
+  for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
+       ++ui) {
+    Instruction *I = dyn_cast<Instruction>(*ui);
+    if (!I) {
+      // if use is not an instruction, then skip it
+      continue;
+    }
+    DEBUG(errs() << "\t" << *I << "\n");
+    if (isa<LoadInst>(I)) {
+      DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
+      DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
+      UseList->push_back(V);
+    } else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
+      // found a store in use chain
+      DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
+      return true;
+    } else if (BuildDFG::isHPVMIntrinsic(I)) {
+      // If it is an atomic intrinsic, we found a store
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+      assert(II &&
+             II->getCalledValue()->getName().startswith("llvm.hpvm.atomic") &&
+             "Only hpvm atomic intrinsics can have an argument as input");
+      return true;
+    } else {
+      DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
+      if (findLoadStoreUses(I, UseList, VisitedList))
+        return true;
+    }
+  }
+  return false;
 }
 
-static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) {
-	if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
-		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-		return false;
-	}
-	DependenceList->push_back(V);
-	// If not an instruction, then not dependent on node instance id
-	if(!isa<Instruction>(V) || isa<Constant>(V)) {
-		DEBUG(errs() << "\tStop\n");
-		return false;
-	}
-
-	Instruction* I = cast<Instruction>(V);
-	for(unsigned i = 0; i < I->getNumOperands(); i++) {
-		Value* operand = I->getOperand(i);
-		if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
-			if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
-						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
-						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
-				Value* Node = II->getArgOperand(0);
-				IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
-				assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
-				if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
-					DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
-					return true;
-				}
-			}
-		}
-		if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
-			DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
-			continue;
-		}
-		DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
-		if(isDependentOnNodeInstanceID(operand, DependenceList)) {
-			return true;
-		}
-	}
-	return false;
+static bool isDependentOnNodeInstanceID(Value *V,
+                                        std::vector<Value *> *DependenceList) {
+  if (std::find(DependenceList->begin(), DependenceList->end(), V) !=
+      DependenceList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  DependenceList->push_back(V);
+  // If not an instruction, then not dependent on node instance id
+  if (!isa<Instruction>(V) || isa<Constant>(V)) {
+    DEBUG(errs() << "\tStop\n");
+    return false;
+  }
+
+  Instruction *I = cast<Instruction>(V);
+  for (unsigned i = 0; i < I->getNumOperands(); i++) {
+    Value *operand = I->getOperand(i);
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) {
+      if ((II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_x ||
+           II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_y ||
+           II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_z)) {
+        Value *Node = II->getArgOperand(0);
+        IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node);
+        assert(
+            GN &&
+            "NodeInstanceID operande should be node/parent node intrinsic\n");
+        if (GN->getIntrinsicID() == Intrinsic::hpvm_getNode) {
+          DEBUG(errs() << "\tDependency found on Node instance ID: " << *II
+                       << "\n");
+          return true;
+        }
+      }
+    }
+    if (CmpInst *CI = dyn_cast<CmpInst>(operand)) {
+      DEBUG(errs() << "Found compare instruction: " << *CI
+                   << "\nNot following its dependency list\n");
+      continue;
+    }
+    DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n");
+    if (isDependentOnNodeInstanceID(operand, DependenceList)) {
+      return true;
+    }
+  }
+  return false;
 }
 
 // Function to check if argument arg can be changed to a constant memory pointer
-static bool canBePromoted(Argument* arg, Function* F) {
-	DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
-	std::vector<Value*> UseList;
-	std::vector<Value*> VisitedList;
-	// recursively traverse use chain
-	// if find a store instruction return false, everything fails, cannot be
-	// promoted
-	// if find a load instruction as use, add the GEP instruction to list
-	bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
-	if(foundStore == true)
-		return false;
-	// See that the GEP instructions are not dependent on getNodeInstanceID
-	// intrinsic
-	DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
-	std::vector<Value*>DependenceList;
-	for(auto U: UseList) {
-		if(isDependentOnNodeInstanceID(U, &DependenceList))
-			return false;
-	}
-	DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
-	return true;
+static bool canBePromoted(Argument *arg, Function *F) {
+  DEBUG(errs() << "OPT: Check if Argument " << *arg
+               << " can be changed to constant memory\n");
+  std::vector<Value *> UseList;
+  std::vector<Value *> VisitedList;
+  // Recursively traverse the use chain of the argument.
+  // If a store or atomic update is found in the chain, the argument cannot be
+  // promoted, so give up immediately.
+  // If a load is found, record the value being loaded from in UseList.
+  bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
+  if (foundStore == true)
+    return false;
+  // See that the GEP instructions are not dependent on getNodeInstanceID
+  // intrinsic
+  DEBUG(errs() << foundStore
+               << "\tNo Store Instruction found. Check dependence on node "
+                  "instance ID\n");
+  std::vector<Value *> DependenceList;
+  for (auto U : UseList) {
+    if (isDependentOnNodeInstanceID(U, &DependenceList))
+      return false;
+  }
+  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
+  return true;
 }
 
-
 // Calculate execute node parameters which include, number of diemnsions for
 // dynamic instances of the kernel, local and global work group sizes.
-static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
-		&GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
-
-	// Assign number of dimenstions a constant value
-	workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
-
-	// If local work group size if null
-	if(!kernel->hasLocalWG()) {
-		LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
-	}
-	else {
-		for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-			if(isa<Argument>(kernel->localWGSize[i]))
-				kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
-		}
-		LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
-	}
-
-	for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
-		if(isa<Argument>(kernel->globalWGSize[i]))
-			kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
-	}
-
-	// For OpenCL, global work group size is the total bumber of instances in each
-	// dimension. So, multiply local and global dim limits.
-	std::vector<Value*> globalWGSizeInsts;
-	if(kernel->hasLocalWG()) {
-		for (unsigned i = 0; i < kernel->gridDim; i++) {
-			BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
-			globalWGSizeInsts.push_back(MulInst);
-		}
-	}
-	else {
-		globalWGSizeInsts = kernel->globalWGSize;
-	}
-	GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
-	DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr,
+                                 Value *&GlobalWGPtr, Kernel *kernel,
+                                 ValueToValueMapTy &VMap, Instruction *IB) {
+
+  // Assign the number of dimensions a constant value
+  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+  // If local work group size is null
+  if (!kernel->hasLocalWG()) {
+    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+  } else {
+    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if (isa<Argument>(kernel->localWGSize[i]))
+        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+    }
+    LocalWGPtr =
+        genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+  }
+
+  for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+    if (isa<Argument>(kernel->globalWGSize[i]))
+      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+  }
+
+  // For OpenCL, global work group size is the total number of instances in each
+  // dimension. So, multiply local and global dim limits.
+  std::vector<Value *> globalWGSizeInsts;
+  if (kernel->hasLocalWG()) {
+    for (unsigned i = 0; i < kernel->gridDim; i++) {
+      BinaryOperator *MulInst =
+          BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i],
+                                 kernel->localWGSize[i], "", IB);
+      globalWGSizeInsts.push_back(MulInst);
+    }
+  } else {
+    globalWGSizeInsts = kernel->globalWGSize;
+  }
+  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
 // CodeGen for allocating space for Work Group on stack and returning a pointer
 // to its address
-static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
-	Value* WGPtr;
-	// Get int64_t and or ease of use
-	Type* Int64Ty = Type::getInt64Ty(M.getContext());
-
-	// Work Group type is [#dim x i64]
-	Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
-	// Allocate space of Global work group data on stack and get pointer to
-	// first element.
-	AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
-	WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
-	Value* nextDim = WGPtr;
-	DEBUG(errs() << *WGPtr << "\n");
-
-	// Iterate over the number of dimensions and store the global work group
-	// size in that dimension
-	for(unsigned i=0; i < WGSize.size(); i++) {
-		DEBUG(errs() << *WGSize[i] << "\n");
-		assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-
-		if(WGSize[i]->getType() != Int64Ty) {
-			// If number of dimensions are mentioned in any other integer format,
-			// generate code to extend it to i64. We need to use the mapped value in
-			// the new generated function, hence the use of VMap
-			// FIXME: Why are we changing the kernel WGSize vector here?
-			DEBUG(errs() << "Not i64. Zero extend required.\n");
-			DEBUG(errs() << *WGSize[i] << "\n");
-			CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
-			DEBUG(errs() << "Bitcast done.\n");
-			StoreInst* SI = new StoreInst(CI, nextDim, IB);
-			DEBUG(errs() << "Zero extend done.\n");
-			DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
-		} else {
-			// Store the value representing work group size in ith dimension on
-			// stack
-			StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
-
-			DEBUG(errs() << "\t Work group size: " << *SI << "\n");
-		}
-		if(i+1 < WGSize.size()) {
-			// Move to next dimension
-			GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
-					ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
-					WG->getName()+"."+Twine(i+1),
-					IB);
-			DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
-			nextDim = GEP;
-		}
-	}
-	return WGPtr;
+static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize,
+                              ValueToValueMapTy &VMap, Instruction *IB,
+                              const Twine &WGName) {
+  Value *WGPtr;
+  // Get the i64 type once, for ease of use
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+
+  // Work Group type is [#dim x i64]
+  Type *WGTy = ArrayType::get(Int64Ty, WGSize.size());
+  // Allocate space of Global work group data on stack and get pointer to
+  // first element.
+  AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB);
+  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(),
+                                         WG->getName() + ".0", IB);
+  Value *nextDim = WGPtr;
+  DEBUG(errs() << *WGPtr << "\n");
+
+  // Iterate over the number of dimensions and store the global work group
+  // size in that dimension
+  for (unsigned i = 0; i < WGSize.size(); i++) {
+    DEBUG(errs() << *WGSize[i] << "\n");
+    assert(WGSize[i]->getType()->isIntegerTy() &&
+           "Dimension not an integer type!");
+
+    if (WGSize[i]->getType() != Int64Ty) {
+      // If the size of this dimension has any other integer type, generate
+      // code to cast it to i64. We need to use the mapped value in the newly
+      // generated function, hence the use of VMap
+      // FIXME: Why are we changing the kernel WGSize vector here?
+      DEBUG(errs() << "Not i64. Zero extend required.\n");
+      DEBUG(errs() << *WGSize[i] << "\n");
+      CastInst *CI =
+          BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+      DEBUG(errs() << "Bitcast done.\n");
+      StoreInst *SI = new StoreInst(CI, nextDim, IB);
+      DEBUG(errs() << "Zero extend done.\n");
+      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+    } else {
+      // Store the value representing work group size in ith dimension on
+      // stack
+      StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB);
 
+      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+    }
+    if (i + 1 < WGSize.size()) {
+      // Move to next dimension
+      GetElementPtrInst *GEP = GetElementPtrInst::Create(
+          nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)),
+          WG->getName() + "." + Twine(i + 1), IB);
+      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+      nextDim = GEP;
+    }
+  }
+  return WGPtr;
 }
 
 // Get generated PTX binary name
-static std::string getPTXFilename(const Module& M) {
-	std::string moduleID = M.getModuleIdentifier();
-	moduleID.append(".kernels.cl");
-	return moduleID;
+static std::string getPTXFilename(const Module &M) {
+  std::string moduleID = M.getModuleIdentifier();
+  moduleID.append(".kernels.cl");
+  return moduleID;
 }
 
 // Get the name of the input file from module ID
-static std::string getFilenameFromModule(const Module& M) {
-	std::string moduleID = M.getModuleIdentifier();
-	return moduleID.substr(moduleID.find_last_of("/")+1);
+static std::string getFilenameFromModule(const Module &M) {
+  std::string moduleID = M.getModuleIdentifier();
+  return moduleID.substr(moduleID.find_last_of("/") + 1);
 }
 
 // Changes the data layout of the Module to be compiled with NVPTX backend
 // TODO: Figure out when to call it, probably after duplicating the modules
 static void changeDataLayout(Module &M) {
-	std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
-	std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
+  std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+  std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
 
-	if (TARGET_PTX == 32)
-		M.setDataLayout(StringRef(nvptx32_layoutStr));
-	else if (TARGET_PTX == 64)
-		M.setDataLayout(StringRef(nvptx64_layoutStr));
-	else assert(false && "Invalid PTX target");
+  if (TARGET_PTX == 32)
+    M.setDataLayout(StringRef(nvptx32_layoutStr));
+  else if (TARGET_PTX == 64)
+    M.setDataLayout(StringRef(nvptx64_layoutStr));
+  else
+    assert(false && "Invalid PTX target");
 
-	return;
+  return;
 }
 
 static void changeTargetTriple(Module &M) {
-	std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
-	std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+  std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+  std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
-	if (TARGET_PTX == 32)
-		M.setTargetTriple(StringRef(nvptx32_TargetTriple));
-	else if (TARGET_PTX == 64)
-		M.setTargetTriple(StringRef(nvptx64_TargetTriple));
-	else assert(false && "Invalid PTX target");
+  if (TARGET_PTX == 32)
+    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+  else if (TARGET_PTX == 64)
+    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+  else
+    assert(false && "Invalid PTX target");
 
-	return;
+  return;
 }
 
 // Helper function, populate a vector with all return statements in a function
-static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
-	for (auto &BB : *F) {
-		if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
-			ReturnInstVec.push_back(RI);
-	}	
+static void findReturnInst(Function *F,
+                           std::vector<ReturnInst *> &ReturnInstVec) {
+  for (auto &BB : *F) {
+    if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+      ReturnInstVec.push_back(RI);
+  }
 }
 
-// Helper function, populate a vector with all IntrinsicID intrinsics in a function
-static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
-	for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-		Instruction *I = &(*i);
-		IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-		if (II && II->getIntrinsicID() == IntrinsicID) {
-			IntrinsicInstVec.push_back(II);
-		}
-	}
+// Helper function, populate a vector with all IntrinsicID intrinsics in a
+// function
+static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID,
+                              std::vector<IntrinsicInst *> &IntrinsicInstVec) {
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+    if (II && II->getIntrinsicID() == IntrinsicID) {
+      IntrinsicInstVec.push_back(II);
+    }
+  }
 }
 
-// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op
+// Helper function, returns the atomicrmw op corresponding to the intrinsic
+// atomic op
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
-	switch(ID) {
-		case Intrinsic::visc_atomic_add:
-			return AtomicRMWInst::Add;
-		case Intrinsic::visc_atomic_sub:
-			return AtomicRMWInst::Sub;
-		case Intrinsic::visc_atomic_min:
-			return AtomicRMWInst::Min;
-		case Intrinsic::visc_atomic_max:
-			return AtomicRMWInst::Max;
-		case Intrinsic::visc_atomic_xchg:
-			return AtomicRMWInst::Xchg;
-		case Intrinsic::visc_atomic_and:
-			return AtomicRMWInst::And;
-		case Intrinsic::visc_atomic_or:
-			return AtomicRMWInst::Or;
-		case Intrinsic::visc_atomic_xor:
-			return AtomicRMWInst::Xor;
-		default:
-			llvm_unreachable("Unsupported atomic intrinsic!");
-	};
+  switch (ID) {
+  case Intrinsic::hpvm_atomic_add:
+    return AtomicRMWInst::Add;
+  case Intrinsic::hpvm_atomic_sub:
+    return AtomicRMWInst::Sub;
+  case Intrinsic::hpvm_atomic_min:
+    return AtomicRMWInst::Min;
+  case Intrinsic::hpvm_atomic_umin:
+    return AtomicRMWInst::UMin;
+  case Intrinsic::hpvm_atomic_max:
+    return AtomicRMWInst::Max;
+  case Intrinsic::hpvm_atomic_umax:
+    return AtomicRMWInst::UMax;
+    // case Intrinsic::hpvm_atomic_inc: return AtomicRMWInst::Inc;
+    // case Intrinsic::hpvm_atomic_dec: return AtomicRMWInst::Dec;
+  case Intrinsic::hpvm_atomic_xchg:
+    return AtomicRMWInst::Xchg;
+  case Intrinsic::hpvm_atomic_and:
+    return AtomicRMWInst::And;
+  case Intrinsic::hpvm_atomic_or:
+    return AtomicRMWInst::Or;
+  case Intrinsic::hpvm_atomic_xor:
+    return AtomicRMWInst::Xor;
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  };
 }
 
-
 // Helper funtion, returns the OpenCL function name, corresponding to atomic op
 static std::string getAtomicOpName(Intrinsic::ID ID) {
-	switch(ID) {
-		case Intrinsic::visc_atomic_add:
-			return "atom_add";
-		case Intrinsic::visc_atomic_sub:
-			return "atom_sub";
-		case Intrinsic::visc_atomic_min:
-			return "atom_min";
-		case Intrinsic::visc_atomic_max:
-			return "atom_max";
-		case Intrinsic::visc_atomic_xchg:
-			return "atom_xchg";
-		case Intrinsic::visc_atomic_and:
-			return "atom_and";
-		case Intrinsic::visc_atomic_or:
-			return "atom_or";
-		case Intrinsic::visc_atomic_xor:
-			return "atom_xor";
-		default:
-			llvm_unreachable("Unsupported atomic intrinsic!");
-	};
+  switch (ID) {
+  case Intrinsic::hpvm_atomic_cmpxchg:
+    return "atom_cmpxchg";
+  case Intrinsic::hpvm_atomic_add:
+    return "atom_add";
+  case Intrinsic::hpvm_atomic_sub:
+    return "atom_sub";
+  case Intrinsic::hpvm_atomic_min:
+    return "atom_min";
+  case Intrinsic::hpvm_atomic_max:
+    return "atom_max";
+  case Intrinsic::hpvm_atomic_inc:
+    return "atom_inc";
+  case Intrinsic::hpvm_atomic_dec:
+    return "atom_dec";
+  case Intrinsic::hpvm_atomic_xchg:
+    return "atom_xchg";
+  case Intrinsic::hpvm_atomic_and:
+    return "atom_and";
+  case Intrinsic::hpvm_atomic_or:
+    return "atom_or";
+  case Intrinsic::hpvm_atomic_xor:
+    return "atom_xor";
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  };
 }
 
 } // End of namespace
@@ -2420,4 +2486,3 @@ static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx",
 		false /* does not modify the CFG */,
 		true /* transformation,   *
 					* not just analysis */);
-
diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index a0fa9fcde477018cf00af5c932512ce804105c9d..8152817d9a9dbdb9d0164ba8cb7b9a49ce2f081f 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "DFG2LLVM_X86"
-#include "SupportVISC/DFG2LLVM.h"
+#include "SupportHPVM/DFG2LLVM.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
@@ -25,29 +25,29 @@ using namespace llvm;
 using namespace builddfg;
 using namespace dfg2llvm;
 
-// VISC Command line option to use timer or not
-static cl::opt<bool> VISCTimer_X86("visc-timers-x86",
-                                   cl::desc("Enable visc timers"));
+// HPVM Command line option to use timer or not
+static cl::opt<bool> HPVMTimer_X86("hpvm-timers-x86",
+                                   cl::desc("Enable hpvm timers"));
 // Command line option to enable device abstraction or not
 static cl::opt<bool>
-    DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
-                      cl::desc("Enable visc device abstraction"));
+    DeviceAbstraction("hpvm-eda", cl::init(false), cl::Hidden,
+                      cl::desc("Enable hpvm device abstraction"));
 
 namespace {
 
 // Helper Functions
-static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
+static bool isHPVMCall_llvm_hpvm_policy_getVersion(Instruction *I) {
   if (!isa<CallInst>(I))
     return false;
   CallInst *CI = cast<CallInst>(I);
   return (CI->getCalledValue()->stripPointerCasts()->getName())
-      .equals("llvm_visc_policy_getVersion");
+      .equals("llvm_hpvm_policy_getVersion");
 }
 
-CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
+CallInst *get_llvm_hpvm_policy_getVersion_call(Function *F) {
   for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) {
     Instruction *I = &*ib;
-    if (isVISCCall_llvm_visc_policy_getVersion(I))
+    if (isHPVMCall_llvm_hpvm_policy_getVersion(I))
       return cast<CallInst>(I);
   }
   return NULL;
@@ -74,27 +74,27 @@ private:
   // Member variables
 
   FunctionCallee malloc;
-  // VISC Runtime API
-  FunctionCallee llvm_visc_x86_launch;
-  FunctionCallee llvm_visc_x86_wait;
-  FunctionCallee llvm_visc_x86_argument_ptr;
-
-  FunctionCallee llvm_visc_streamLaunch;
-  FunctionCallee llvm_visc_streamPush;
-  FunctionCallee llvm_visc_streamPop;
-  FunctionCallee llvm_visc_streamWait;
-  FunctionCallee llvm_visc_createBindInBuffer;
-  FunctionCallee llvm_visc_createBindOutBuffer;
-  FunctionCallee llvm_visc_createEdgeBuffer;
-  FunctionCallee llvm_visc_createLastInputBuffer;
-  FunctionCallee llvm_visc_createThread;
-  // Constant* llvm_visc_freeThreads;
-  FunctionCallee llvm_visc_bufferPush;
-  FunctionCallee llvm_visc_bufferPop;
-  FunctionCallee llvm_visc_x86_dstack_push;
-  FunctionCallee llvm_visc_x86_dstack_pop;
-  FunctionCallee llvm_visc_x86_getDimLimit;
-  FunctionCallee llvm_visc_x86_getDimInstance;
+  // HPVM Runtime API
+  FunctionCallee llvm_hpvm_x86_launch;
+  FunctionCallee llvm_hpvm_x86_wait;
+  FunctionCallee llvm_hpvm_x86_argument_ptr;
+
+  FunctionCallee llvm_hpvm_streamLaunch;
+  FunctionCallee llvm_hpvm_streamPush;
+  FunctionCallee llvm_hpvm_streamPop;
+  FunctionCallee llvm_hpvm_streamWait;
+  FunctionCallee llvm_hpvm_createBindInBuffer;
+  FunctionCallee llvm_hpvm_createBindOutBuffer;
+  FunctionCallee llvm_hpvm_createEdgeBuffer;
+  FunctionCallee llvm_hpvm_createLastInputBuffer;
+  FunctionCallee llvm_hpvm_createThread;
+  // Constant* llvm_hpvm_freeThreads;
+  FunctionCallee llvm_hpvm_bufferPush;
+  FunctionCallee llvm_hpvm_bufferPop;
+  FunctionCallee llvm_hpvm_x86_dstack_push;
+  FunctionCallee llvm_hpvm_x86_dstack_pop;
+  FunctionCallee llvm_hpvm_x86_getDimLimit;
+  FunctionCallee llvm_hpvm_x86_getDimInstance;
 
   // Functions
   std::vector<IntrinsicInst *> *getUseList(Value *LI);
@@ -120,7 +120,7 @@ private:
 
   // Virtual Functions
   void init() {
-    VISCTimer = VISCTimer_X86;
+    HPVMTimer = HPVMTimer_X86;
     TargetName = "X86";
   }
   void initRuntimeAPI();
@@ -177,7 +177,7 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   return true;
 }
 
-// Initialize the VISC runtime API. This makes it easier to insert these calls
+// Initialize the HPVM runtime API. This makes it easier to insert these calls
 void CGT_X86::initRuntimeAPI() {
 
   // Load Runtime API Module
@@ -187,51 +187,51 @@ void CGT_X86::initRuntimeAPI() {
   assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll";
+  Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/hpvm-rt/hpvm-rt.ll";
 
   runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
 
   if (runtimeModule == NULL)
     DEBUG(errs() << Err.getMessage());
   else
-    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
 
   // Get or insert the global declarations for launch/wait functions
-  DECLARE(llvm_visc_x86_launch);
+  DECLARE(llvm_hpvm_x86_launch);
   DECLARE(malloc);
-  DECLARE(llvm_visc_x86_wait);
-  DECLARE(llvm_visc_x86_argument_ptr);
-  DECLARE(llvm_visc_streamLaunch);
-  DECLARE(llvm_visc_streamPush);
-  DECLARE(llvm_visc_streamPop);
-  DECLARE(llvm_visc_streamWait);
-  DECLARE(llvm_visc_createBindInBuffer);
-  DECLARE(llvm_visc_createBindOutBuffer);
-  DECLARE(llvm_visc_createEdgeBuffer);
-  DECLARE(llvm_visc_createLastInputBuffer);
-  DECLARE(llvm_visc_createThread);
-  // DECLARE(llvm_visc_freeThreads);
-  DECLARE(llvm_visc_bufferPush);
-  DECLARE(llvm_visc_bufferPop);
-  DECLARE(llvm_visc_x86_dstack_push);
-  DECLARE(llvm_visc_x86_dstack_pop);
-  DECLARE(llvm_visc_x86_getDimLimit);
-  DECLARE(llvm_visc_x86_getDimInstance);
+  DECLARE(llvm_hpvm_x86_wait);
+  DECLARE(llvm_hpvm_x86_argument_ptr);
+  DECLARE(llvm_hpvm_streamLaunch);
+  DECLARE(llvm_hpvm_streamPush);
+  DECLARE(llvm_hpvm_streamPop);
+  DECLARE(llvm_hpvm_streamWait);
+  DECLARE(llvm_hpvm_createBindInBuffer);
+  DECLARE(llvm_hpvm_createBindOutBuffer);
+  DECLARE(llvm_hpvm_createEdgeBuffer);
+  DECLARE(llvm_hpvm_createLastInputBuffer);
+  DECLARE(llvm_hpvm_createThread);
+  // DECLARE(llvm_hpvm_freeThreads);
+  DECLARE(llvm_hpvm_bufferPush);
+  DECLARE(llvm_hpvm_bufferPop);
+  DECLARE(llvm_hpvm_x86_dstack_push);
+  DECLARE(llvm_hpvm_x86_dstack_pop);
+  DECLARE(llvm_hpvm_x86_getDimLimit);
+  DECLARE(llvm_hpvm_x86_getDimInstance);
 
   // Get or insert timerAPI functions as well if you plan to use timers
   initTimerAPI();
 
   // Insert init context in main
-  Function *VI = M.getFunction("llvm.visc.init");
-  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  Function *VI = M.getFunction("llvm.hpvm.init");
+  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
   DEBUG(errs() << "Inserting x86 timer initialization\n");
   Instruction *I = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(I);
-  switchToTimer(visc_TimerID_NONE, I);
+  switchToTimer(hpvm_TimerID_NONE, I);
   // Insert code for initializing the sceduling policy
   FunctionCallee IP = M.getOrInsertFunction(
-      "llvm_visc_policy_init",
-      runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType());
+      "llvm_hpvm_policy_init",
+      runtimeModule->getFunction("llvm_hpvm_policy_init")->getFunctionType());
   CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I);
   DEBUG(errs() << *IPCallInst << "\n");
 
@@ -239,22 +239,22 @@ void CGT_X86::initRuntimeAPI() {
   // device status simulation
   if (DeviceAbstraction) {
     FunctionCallee ID = M.getOrInsertFunction(
-        "llvm_visc_deviceAbstraction_start",
-        runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")
+        "llvm_hpvm_deviceAbstraction_start",
+        runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_start")
             ->getFunctionType());
     CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I);
     DEBUG(errs() << *IDCallInst << "\n");
   }
 
-  // Insert print instruction at visc exit
-  Function *VC = M.getFunction("llvm.visc.cleanup");
-  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
+  // Insert print instruction at hpvm exit
+  Function *VC = M.getFunction("llvm.hpvm.cleanup");
+  assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once");
 
   // Insert code for clearing the sceduling policy
   I = cast<Instruction>(*VC->user_begin());
   IP = M.getOrInsertFunction(
-      "llvm_visc_policy_clear",
-      runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType());
+      "llvm_hpvm_policy_clear",
+      runtimeModule->getFunction("llvm_hpvm_policy_clear")->getFunctionType());
   IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I);
   DEBUG(errs() << *IPCallInst << "\n");
 
@@ -265,8 +265,8 @@ void CGT_X86::initRuntimeAPI() {
   // device status simulation
   if (DeviceAbstraction) {
     FunctionCallee ID = M.getOrInsertFunction(
-        "llvm_visc_deviceAbstraction_end",
-        runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")
+        "llvm_hpvm_deviceAbstraction_end",
+        runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_end")
             ->getFunctionType());
     CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I);
     DEBUG(errs() << *IDCallInst << "\n");
@@ -542,7 +542,7 @@ void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args,
   // Call runtime to create the thread with these arguments
   DEBUG(errs() << "Start Thread for child node: "
                << C->getFuncPointer()->getName() << "\n");
-  // DEBUG(errs() << *llvm_visc_createThread << "\n");
+  // DEBUG(errs() << *llvm_hpvm_createThread << "\n");
   DEBUG(errs() << *graphID->getType() << "\n");
   DEBUG(errs() << *C_Pipeline->getType() << "\n");
   DEBUG(errs() << *Struct->getType() << "\n");
@@ -551,7 +551,7 @@ void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args,
                                                 Struct->getName(), IB);
   Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI};
   CallInst *CreateThread = CallInst::Create(
-      llvm_visc_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB);
+      llvm_hpvm_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB);
 }
 
 Function *CGT_X86::createLaunchFunction(DFInternalNode *N) {
@@ -639,17 +639,17 @@ Function *CGT_X86::createLaunchFunction(DFInternalNode *N) {
             Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition());
         Value *BindInCallArgs[] = {graphID, size, Int_ArgNo};
         CI = CallInst::Create(
-            llvm_visc_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3),
+            llvm_hpvm_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3),
             "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI);
       } else if (Edge->getDestDF()->isExitNode()) {
         // Bind Output Edge
         CI = CallInst::Create(
-            llvm_visc_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2),
+            llvm_hpvm_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2),
             "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI);
       } else {
         // Streaming Edge
         CI = CallInst::Create(
-            llvm_visc_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2),
+            llvm_hpvm_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2),
             Edge->getSourceDF()->getFuncPointer()->getName() + "." +
                 Edge->getDestDF()->getFuncPointer()->getName(),
             RI);
@@ -668,7 +668,7 @@ Function *CGT_X86::createLaunchFunction(DFInternalNode *N) {
     Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
     Value *CallArgs[] = {graphID, size};
     CallInst *CI = CallInst::Create(
-        llvm_visc_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2),
+        llvm_hpvm_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2),
         "BindIn.isLastInput." + child->getFuncPointer()->getName(), RI);
     NodeLastInputMap[child] = CI;
   }
@@ -729,7 +729,7 @@ void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) {
   DEBUG(errs() << "Substitute launch intrinsic\n");
   Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)};
   CallInst *LaunchInst = CallInst::Create(
-      llvm_visc_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2),
+      llvm_hpvm_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2),
       "graph" + Root->getFuncPointer()->getName(), LI);
   // ReplaceInstWithInst(LI, LaunchInst);
 
@@ -742,16 +742,16 @@ void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) {
     CallInst *CI;
     Value *PushArgs[] = {LaunchInst, II->getOperand(1)};
     switch (II->getIntrinsicID()) {
-    case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_streamWait, ArrayRef<Value *>(LaunchInst),
+    case Intrinsic::hpvm_wait:
+      CI = CallInst::Create(llvm_hpvm_streamWait, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
-    case Intrinsic::visc_push:
-      CI = CallInst::Create(llvm_visc_streamPush,
+    case Intrinsic::hpvm_push:
+      CI = CallInst::Create(llvm_hpvm_streamPush,
                             ArrayRef<Value *>(PushArgs, 2), "");
       break;
-    case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_streamPop, ArrayRef<Value *>(LaunchInst),
+    case Intrinsic::hpvm_pop:
+      CI = CallInst::Create(llvm_hpvm_streamPop, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     default:
@@ -771,7 +771,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
   DEBUG(errs() << "Generating Launch Function\n");
   // Get Launch Instruction
   IntrinsicInst *LI = Root->getInstruction();
-  switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
+  switchToTimer(hpvm_TimerID_PTHREAD_CREATE, LI);
   DEBUG(errs() << "Generating Launch Function\n");
 
   /* Now we have all the necessary global declarations necessary to generate the
@@ -802,14 +802,14 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
   ReturnInst *RI =
       ReturnInst::Create(AppFunc->getContext(),
                          Constant::getNullValue(AppFunc->getReturnType()), BB);
-  switchToTimer(visc_TimerID_ARG_UNPACK, RI);
+  switchToTimer(hpvm_TimerID_ARG_UNPACK, RI);
 
   DEBUG(errs() << "Created Empty Launch Function\n");
   // Find the X86 function generated for Root and
   //  Function* RootF_X86 = Root->getGenFunc();
-  Function *RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
+  Function *RootF_X86 = Root->getGenFuncForTarget(hpvm::CPU_TARGET);
   assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
-  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+  assert(Root->hasX86GenFuncForTarget(hpvm::CPU_TARGET) &&
          "Error: Generated Function for Root node with no x86 wrapper\n");
 
   // Generate a call to RootF_X86 with null parameters for now
@@ -837,8 +837,8 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
     CI->setArgOperand(i, elements[i]);
 
   // Add timers around Call to RootF_X86 function
-  switchToTimer(visc_TimerID_COMPUTATION, CI);
-  switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
+  switchToTimer(hpvm_TimerID_COMPUTATION, CI);
+  switchToTimer(hpvm_TimerID_OUTPUT_PACK, RI);
 
   StructType *RootRetTy =
       cast<StructType>(RootF_X86->getFunctionType()->getReturnType());
@@ -888,7 +888,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
     new StoreInst(CI, OutputAddrCast, RI);
   }
 
-  switchToTimer(visc_TimerID_NONE, RI);
+  switchToTimer(hpvm_TimerID_NONE, RI);
 
   DEBUG(errs() << "Application specific function:\n");
   DEBUG(errs() << *AppFunc << "\n");
@@ -896,7 +896,7 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
   // Substitute launch intrinsic main
   Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)};
   CallInst *LaunchInst = CallInst::Create(
-      llvm_visc_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2),
+      llvm_hpvm_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2),
       "graph" + Root->getFuncPointer()->getName(), LI);
   // ReplaceInstWithInst(LI, LaunchInst);
 
@@ -907,16 +907,16 @@ void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
     IntrinsicInst *II = UseList->at(i);
     CallInst *CI;
     switch (II->getIntrinsicID()) {
-    case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_x86_wait, ArrayRef<Value *>(LaunchInst),
+    case Intrinsic::hpvm_wait:
+      CI = CallInst::Create(llvm_hpvm_x86_wait, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
-    case Intrinsic::visc_push:
-      CI = CallInst::Create(llvm_visc_bufferPush, ArrayRef<Value *>(LaunchInst),
+    case Intrinsic::hpvm_push:
+      CI = CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
-    case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_bufferPop, ArrayRef<Value *>(LaunchInst),
+    case Intrinsic::hpvm_pop:
+      CI = CallInst::Create(llvm_hpvm_bufferPop, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     default:
@@ -970,10 +970,10 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86,
   Function *CF = C->getFuncPointer();
 
   //  Function* CF_X86 = C->getGenFunc();
-  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
+  Function *CF_X86 = C->getGenFuncForTarget(hpvm::CPU_TARGET);
   assert(CF_X86 != NULL &&
          "Found leaf node for which code generation has not happened yet!\n");
-  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+  assert(C->hasX86GenFuncForTarget(hpvm::CPU_TARGET) &&
          "The generated function to be called from x86 backend is not an x86 "
          "function\n");
   DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
@@ -1040,7 +1040,7 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86,
       CI->getArgOperand(numArgs - 6 + 2)  // iZ
   };
 
-  CallInst *Push = CallInst::Create(llvm_visc_x86_dstack_push,
+  CallInst *Push = CallInst::Create(llvm_hpvm_x86_dstack_push,
                                     ArrayRef<Value *>(args, 7), "", CI);
   DEBUG(errs() << "Push on stack: " << *Push << "\n");
   // Insert call to runtime to pop the dim limits and instanceID from the depth
@@ -1053,7 +1053,7 @@ void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86,
   assert(NextI->getParent() == CI->getParent() &&
          "Next Instruction should also belong to the same basic block!");
 
-  CallInst *Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
+  CallInst *Pop = CallInst::Create(llvm_hpvm_x86_dstack_pop, None, "", NextI);
   DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
   DEBUG(errs() << *CI->getParent()->getParent());
 }
@@ -1156,7 +1156,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
                   "streaming input edges\n");
   // First read the termination condition variable islastInput
   CallInst *isLastInputPop = CallInst::Create(
-      llvm_visc_bufferPop, ArrayRef<Value *>(isLastInput), "", RI);
+      llvm_hpvm_bufferPop, ArrayRef<Value *>(isLastInput), "", RI);
 
   CastInst *BI = BitCastInst::CreateIntegerCast(
       isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false,
@@ -1173,7 +1173,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
        ++i) {
     if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
       CallInst *bufferIn =
-          CallInst::Create(llvm_visc_bufferPop,
+          CallInst::Create(llvm_hpvm_bufferPop,
                            ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI);
       CastInst *BI;
       if (i->getType()->isPointerTy()) {
@@ -1196,7 +1196,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
   //  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
   //  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
   //                                  C->getGenFunc()->getName()+".output", RI);
-  Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
+  Function *CGenF = C->getGenFuncForTarget(hpvm::CPU_TARGET);
   DEBUG(errs() << "Type: " << *CGenF->getType() << "\n");
   CallInst *CI =
       CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI);
@@ -1222,7 +1222,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
     // Push to Output buffer
     Value *bufferOutArgs[] = {OutputArgs[i], BI};
     CallInst *bufferOut = CallInst::Create(
-        llvm_visc_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI);
+        llvm_hpvm_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI);
   }
 
   // Add loop around the basic block, which exits the loop if isLastInput is
@@ -1236,9 +1236,9 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
   CondBB = CondStartI->getParent();
   BodyBB = CI->getParent();
   Instruction *CntI = NULL;
-  CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF);
+  CallInst *GetPolicyCI = get_llvm_hpvm_policy_getVersion_call(CGenF);
 
-  // If the node function calls the visc runtime call to get policy, we update
+  // If the node function calls the hpvm runtime call to get policy, we update
   // it with the counter information. This means we need to pass an additional
   // argument to the generated function, that is the iteration number, and then
   // use it as an argument to the policy_getVersion call
@@ -1255,14 +1255,14 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
     }
     NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
     FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false);
-    Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false);
+    Function *NewCGenF = hpvmUtils::cloneFunction(CGenF, NewFT, false);
     // At least one (the last) argument exists (we added it)
     Function::arg_iterator ae = NewCGenF->arg_end();
     --ae;
     Argument *CntArg = &*ae;
     CntArg->setName("iteration");
     // Replace the old cpu gen func with this one
-    C->addGenFunc(NewCGenF, visc::CPU_TARGET, true);
+    C->addGenFunc(NewCGenF, hpvm::CPU_TARGET, true);
 
     // Add counter to the actual parameter list, to create the new call
     InputArgs.push_back(CntI);
@@ -1272,7 +1272,7 @@ Function *CGT_X86::createFunctionFilter(DFNode *C) {
 
     // Set second operand of the policy_getVersion call to the last function
     // argument
-    GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF);
+    GetPolicyCI = get_llvm_hpvm_policy_getVersion_call(NewCGenF);
     GetPolicyCI->setArgOperand(1, CntArg);
   }
 
@@ -1292,13 +1292,13 @@ void CGT_X86::codeGen(DFInternalNode *N) {
   // function before and nothing else needs to be done for this leaf node.
   //  if(N->getGenFunc() != NULL)
   //    return;
-  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+  if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) {
     DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
                  << " : skipping it\n");
     return;
   }
 
-  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+  assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
          "Error: Visiting a node for which code already generated\n");
 
   // Sort children in topological order before code generation
@@ -1315,7 +1315,7 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     if (C->isDummyNode())
       continue;
 
-    if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
+    if (!(C->hasX86GenFuncForTarget(hpvm::CPU_TARGET))) {
       errs() << "No CPU x86 version for child node "
              << C->getFuncPointer()->getName()
              << "\n  Skip code gen for parent node "
@@ -1361,8 +1361,8 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     RI = cast<ReturnInst>(BB->getTerminator());
 
     // Add generated function info to DFNode
-    //    N->setGenFunc(F_X86, visc::CPU_TARGET);
-    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+    //    N->setGenFunc(F_X86, hpvm::CPU_TARGET);
+    N->addGenFunc(F_X86, hpvm::CPU_TARGET, true);
 
     // Loop over the arguments, to create the VMap.
     dest_iterator = F_X86->arg_begin();
@@ -1445,13 +1445,13 @@ void CGT_X86::codeGen(DFInternalNode *N) {
   // If not, we see which version exists, check that it is in fact an x86
   // function and save it as the CPU_TARGET function
 
-  // TODO: visc_id per node, so we can use this for id for policies
+  // TODO: hpvm_id per node, so we can use this for id for policies
   // For now, use node function name and change it later
-  Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-  Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+  Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
+  Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
 
-  bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-  bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+  bool CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET);
+  bool GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET);
 
   DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
                << N->getTag() << "\n");
@@ -1460,7 +1460,7 @@ void CGT_X86::codeGen(DFInternalNode *N) {
   DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
   DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
 
-  if (N->getTag() == visc::None) {
+  if (N->getTag() == hpvm::None) {
     // No code is available for this node. This (usually) means that this
     // node is a node that
     // - from the accelerator backends has been mapped to an intermediate
@@ -1469,24 +1469,24 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     // take place
     DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node "
                  << N->getFuncPointer()->getName() << "\n");
-  } else if (viscUtils::isSingleTargetTag(N->getTag())) {
+  } else if (hpvmUtils::isSingleTargetTag(N->getTag())) {
     // There is a single version for this node according to code gen hints.
     // Therefore, we do not need to check the policy, we simply use the
     // available implementation, whichever target it is for.
 
     // Sanity check - to be removed TODO
     switch (N->getTag()) {
-    case visc::CPU_TARGET:
-      assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
-      assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
-      assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
-      assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+    case hpvm::CPU_TARGET:
+      assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) && "");
+      assert(N->hasX86GenFuncForTarget(hpvm::CPU_TARGET) && "");
+      assert(!(N->getGenFuncForTarget(hpvm::GPU_TARGET)) && "");
+      assert(!(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET)) && "");
       break;
-    case visc::GPU_TARGET:
-      assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
-      assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
-      assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
-      assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
+    case hpvm::GPU_TARGET:
+      assert(!(N->getGenFuncForTarget(hpvm::CPU_TARGET)) && "");
+      assert(!(N->hasX86GenFuncForTarget(hpvm::CPU_TARGET)) && "");
+      assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) && "");
+      assert(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET) && "");
       break;
     default:
       assert(false && "Unreachable: we checked that tag was single target!\n");
@@ -1499,8 +1499,8 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     if (DeviceAbstraction) {
       Function *NodeGenFunc = NULL;
       switch (N->getTag()) {
-      case visc::GPU_TARGET:
-        NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
+      case hpvm::GPU_TARGET:
+        NodeGenFunc = N->getGenFuncForTarget(hpvm::GPU_TARGET);
         break;
       default:
         break;
@@ -1512,9 +1512,9 @@ void CGT_X86::codeGen(DFInternalNode *N) {
         BasicBlock *BB = &*NodeGenFunc->begin();
         std::vector<Value *> Args; // TODO: add the device type as argument?
         FunctionCallee RTF = M.getOrInsertFunction(
-            "llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+            "llvm_hpvm_deviceAbstraction_waitOnDeviceStatus",
             runtimeModule
-                ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")
+                ->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus")
                 ->getFunctionType());
         CallInst *RTFInst =
             CallInst::Create(RTF, Args, "", BB->getFirstNonPHI());
@@ -1522,17 +1522,17 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     }
 
     Function *Ftmp = N->getGenFuncForTarget(N->getTag());
-    N->removeGenFuncForTarget(visc::GPU_TARGET);
-    N->setTag(visc::None);
-    N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
-    N->setTag(visc::CPU_TARGET);
+    N->removeGenFuncForTarget(hpvm::GPU_TARGET);
+    N->setTag(hpvm::None);
+    N->addGenFunc(Ftmp, hpvm::CPU_TARGET, true);
+    N->setTag(hpvm::CPU_TARGET);
 
     // Sanity checks - to be removed TODO
-    CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-    GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
+    GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
 
-    CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-    GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET);
+    GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET);
 
     DEBUG(errs() << "After editing\n");
     DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
@@ -1545,11 +1545,11 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     DEBUG(errs() << "Node Name (for policy) : "
                  << N->getFuncPointer()->getName() << "\n");
 
-    Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-    Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+    Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
+    Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
 
-    bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-    bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+    bool CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET);
+    bool GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET);
 
     // These assertions express what we can support with the current runtime.
     // Code generation works the same way even for other target combinations.
@@ -1610,8 +1610,8 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     Args.push_back(
         ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true));
     FunctionCallee RTF = M.getOrInsertFunction(
-        "llvm_visc_policy_getVersion",
-        runtimeModule->getFunction("llvm_visc_policy_getVersion")
+        "llvm_hpvm_policy_getVersion",
+        runtimeModule->getFunction("llvm_hpvm_policy_getVersion")
             ->getFunctionType());
     CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent);
 
@@ -1646,9 +1646,9 @@ void CGT_X86::codeGen(DFInternalNode *N) {
         // call
         std::vector<Value *> Args; // TODO: add the device type as argument?
         FunctionCallee RTF = M.getOrInsertFunction(
-            "llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+            "llvm_hpvm_deviceAbstraction_waitOnDeviceStatus",
             runtimeModule
-                ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")
+                ->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus")
                 ->getFunctionType());
         CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
       }
@@ -1673,8 +1673,8 @@ void CGT_X86::codeGen(DFInternalNode *N) {
     // Prepare arguments and function for call to wait for device runtime call
     //  std::vector<Value *> Args; // TODO: add the device type as argument?
     // FunctionCallee RTF =
-    //  M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
-    // runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType());
+    //  M.getOrInsertFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus",
+    // runtimeModule->getFunction("llvm_hpvm_deviceAbstraction_waitOnDeviceStatus")->getFunctionType());
     // CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
     // }
     // }
@@ -1684,9 +1684,9 @@ void CGT_X86::codeGen(DFInternalNode *N) {
 
     // Now, make the node cpu gen func to be this one
     // Remove all other versions and update the tag
-    N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
-    N->removeGenFuncForTarget(visc::GPU_TARGET);
-    N->setTag(visc::CPU_TARGET);
+    N->addGenFunc(F_wrapper, hpvm::CPU_TARGET, true);
+    N->removeGenFuncForTarget(hpvm::GPU_TARGET);
+    N->setTag(hpvm::CPU_TARGET);
 
     // assert(false && "got to the point where we have to combine\n");
   }
@@ -1715,7 +1715,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
   //  if(N->getGenFunc() != NULL)
   //    return;
 
-  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+  if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) {
     DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
                  << " : skipping it\n");
 
@@ -1723,10 +1723,10 @@ void CGT_X86::codeGen(DFLeafNode *N) {
                  << N->getFuncPointer()->getName() << "\n");
 
     switch (N->getTag()) {
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       // A leaf node should not have an x86 function for GPU
       // by design of DFG2LLVM_NVPTX backend
-      assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+      assert(!(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET)) && "");
       break;
     default:
       break;
@@ -1735,7 +1735,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
     return;
   }
 
-  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+  assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
          "Error: Visiting a node for which code already generated\n");
 
   std::vector<IntrinsicInst *> IItoRemove;
@@ -1759,8 +1759,8 @@ void CGT_X86::codeGen(DFLeafNode *N) {
     F_X86 = addIdxDimArgs(F_X86);
 
   // Add generated function info to DFNode
-  //  N->setGenFunc(F_X86, visc::CPU_TARGET);
-  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+  //  N->setGenFunc(F_X86, hpvm::CPU_TARGET);
+  N->addGenFunc(F_X86, hpvm::CPU_TARGET, true);
 
   // Go through the arguments, and any pointer arguments with in attribute need
   // to have x86_argument_ptr call to get the x86 ptr of the argument
@@ -1768,7 +1768,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
   // Create new BB
   BasicBlock *EntryBB = &*F_X86->begin();
   BasicBlock *BB =
-      BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
+      BasicBlock::Create(M.getContext(), "getHPVMPtrArgs", F_X86, EntryBB);
   BranchInst *Terminator = BranchInst::Create(EntryBB, BB);
   // Insert calls
   for (Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
@@ -1776,7 +1776,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
     if (F_X86->getAttributes().hasAttribute(ai->getArgNo() + 1,
                                             Attribute::In)) {
       assert(ai->getType()->isPointerTy() &&
-             "Only pointer arguments can have visc in/out attributes ");
+             "Only pointer arguments can have hpvm in/out attributes ");
       Function::arg_iterator aiNext = ai;
       ++aiNext;
       Argument *size = &*aiNext;
@@ -1786,7 +1786,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
           &*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr",
           Terminator);
       Value *ArgPtrCallArgs[] = {BI, size};
-      CallInst::Create(llvm_visc_x86_argument_ptr,
+      CallInst::Create(llvm_hpvm_x86_argument_ptr,
                        ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator);
     }
   }
@@ -1796,30 +1796,30 @@ void CGT_X86::codeGen(DFLeafNode *N) {
   for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
     Instruction *I = &(*i);
     DEBUG(errs() << *I << "\n");
-    // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) &&
+    // Leaf nodes should not contain HPVM graph intrinsics or launch
+    assert(!BuildDFG::isHPVMLaunchIntrinsic(I) &&
            "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) &&
-           "VISC graph intrinsic within a leaf dataflow node!");
+    assert(!BuildDFG::isHPVMGraphIntrinsic(I) &&
+           "HPVM graph intrinsic within a leaf dataflow node!");
 
-    if (BuildDFG::isViscQueryIntrinsic(I)) {
+    if (BuildDFG::isHPVMQueryIntrinsic(I)) {
       IntrinsicInst *II = cast<IntrinsicInst>(I);
       IntrinsicInst *ArgII;
       DFNode *ArgDFNode;
 
       /***********************************************************************
-       *                        Handle VISC Query intrinsics                  *
+       *                        Handle HPVM Query intrinsics                  *
        ***********************************************************************/
       switch (II->getIntrinsicID()) {
-      /**************************** llvm.visc.getNode() *******************/
-      case Intrinsic::visc_getNode: {
+      /**************************** llvm.hpvm.getNode() *******************/
+      case Intrinsic::hpvm_getNode: {
         // add mapping <intrinsic, this node> to the node-specific map
         Leaf_HandleToDFNodeMap[II] = N;
         IItoRemove.push_back(II);
         break;
       }
-      /************************* llvm.visc.getParentNode() ****************/
-      case Intrinsic::visc_getParentNode: {
+      /************************* llvm.hpvm.getParentNode() ****************/
+      case Intrinsic::hpvm_getParentNode: {
         // get the parent node of the arg node
         // get argument node
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
@@ -1832,8 +1832,8 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         IItoRemove.push_back(II);
         break;
       }
-      /*************************** llvm.visc.getNumDims() *****************/
-      case Intrinsic::visc_getNumDims: {
+      /*************************** llvm.hpvm.getNumDims() *****************/
+      case Intrinsic::hpvm_getNumDims: {
         // get node from map
         // get the appropriate field
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
@@ -1846,10 +1846,10 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         IItoRemove.push_back(II);
         break;
       }
-      /*********************** llvm.visc.getNodeInstanceID() **************/
-      case Intrinsic::visc_getNodeInstanceID_x:
-      case Intrinsic::visc_getNodeInstanceID_y:
-      case Intrinsic::visc_getNodeInstanceID_z: {
+      /*********************** llvm.hpvm.getNodeInstanceID() **************/
+      case Intrinsic::hpvm_getNodeInstanceID_x:
+      case Intrinsic::hpvm_getNodeInstanceID_y:
+      case Intrinsic::hpvm_getNodeInstanceID_z: {
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
 
@@ -1864,7 +1864,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         // (dim = 1) => y
         // (dim = 2) => z
         int dim =
-            (int)(II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x);
+            (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x);
         assert((dim >= 0) && (dim < 3) &&
                "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic "
                "ID!");
@@ -1894,7 +1894,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
           Value *args[] = {
               ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
               ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
-          CallInst *CI = CallInst::Create(llvm_visc_x86_getDimInstance,
+          CallInst *CI = CallInst::Create(llvm_hpvm_x86_getDimInstance,
                                           ArrayRef<Value *>(args, 2),
                                           "nodeInstanceID", II);
           DEBUG(errs() << *II << " replaced with " << *CI << "\n");
@@ -1903,10 +1903,10 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         }
         break;
       }
-      /********************** llvm.visc.getNumNodeInstances() *************/
-      case Intrinsic::visc_getNumNodeInstances_x:
-      case Intrinsic::visc_getNumNodeInstances_y:
-      case Intrinsic::visc_getNumNodeInstances_z: {
+      /********************** llvm.hpvm.getNumNodeInstances() *************/
+      case Intrinsic::hpvm_getNumNodeInstances_x:
+      case Intrinsic::hpvm_getNumNodeInstances_y:
+      case Intrinsic::hpvm_getNumNodeInstances_z: {
 
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
@@ -1922,7 +1922,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
         // (dim = 1) => y
         // (dim = 2) => z
         int dim =
-            (int)(II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x);
+            (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x);
         assert((dim >= 0) && (dim < 3) &&
                "Invalid dimension for getNumNodeInstances_[xyz]. Check "
                "Intrinsic ID!");
@@ -1952,7 +1952,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
           Value *args[] = {
               ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
               ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
-          CallInst *CI = CallInst::Create(llvm_visc_x86_getDimLimit,
+          CallInst *CI = CallInst::Create(llvm_hpvm_x86_getDimLimit,
                                           ArrayRef<Value *>(args, 2),
                                           "numNodeInstances", II);
           DEBUG(errs() << *II << " replaced with " << *CI << "\n");
@@ -1965,7 +1965,7 @@ void CGT_X86::codeGen(DFLeafNode *N) {
       default:
         DEBUG(errs() << "Found unknown intrinsic with ID = "
                      << II->getIntrinsicID() << "\n");
-        assert(false && "Unknown VISC Intrinsic!");
+        assert(false && "Unknown HPVM Intrinsic!");
         break;
       }
 
diff --git a/hpvm/lib/Transforms/GenVISC/CMakeLists.txt b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
similarity index 74%
rename from hpvm/lib/Transforms/GenVISC/CMakeLists.txt
rename to hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
index ed087f63b4933a33792d7cd773acdf8fab1ac8e3..967766e7058c1ef8bcc1414afb7ff0087e3ce188 100644
--- a/hpvm/lib/Transforms/GenVISC/CMakeLists.txt
+++ b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
@@ -2,9 +2,9 @@ if(WIN32 OR CYGWIN)
   set(LLVM_LINK_COMPONENTS Core Support)
 endif()
 
-add_llvm_library( LLVMGenVISC
+add_llvm_library( LLVMGenHPVM
   MODULE
-  GenVISC.cpp
+  GenHPVM.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..738b39905b885aa42bc861e3a19c3bdf9c65668e
--- /dev/null
+++ b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp
@@ -0,0 +1,894 @@
+//=== GenHPVM.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "genhpvm"
+#include "GenHPVM/GenHPVM.h"
+
+#include "SupportHPVM/HPVMHint.h"
+#include "SupportHPVM/HPVMUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#define TIMER(X) /* Run statement X only when HPVMTimer is enabled */          \
+  do {                                                                         \
+    if (!HPVMTimer)                                                            \
+      break;                                                                   \
+    X;                                                                         \
+  } while (0)
+
+using namespace llvm;
+using namespace hpvmUtils;
+
+// Command-line flag "hpvm-timers-gen": enables the timer code guarded by TIMER
+static cl::opt<bool> HPVMTimer("hpvm-timers-gen",
+                               cl::desc("Enable GenHPVM timer"));
+
+namespace genhpvm {
+
+// Helper Functions (forward declarations of file-local, static helpers)
+
+static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID);
+static Function *transformReturnTypeToStruct(Function *F);
+static Type *getReturnTypeFromReturnInst(Function *F);
+
+// Defines isHPVMCall_<name>(I): true iff I is a call to "__hpvm__<name>"
+#define IS_HPVM_CALL(callName)                                                 \
+  static bool isHPVMCall_##callName(Instruction *I) {                          \
+    CallInst *Call = dyn_cast<CallInst>(I);                                    \
+    if (!Call)                                                                 \
+      return false;                                                            \
+    StringRef Callee = Call->getCalledValue()->stripPointerCasts()->getName(); \
+    return Callee.equals("__hpvm__" #callName);                                \
+  }
+
+static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID,
+                                     std::vector<Instruction *> *Erase) {
+  // Only a call instruction can be rewritten into an intrinsic call
+  assert(isa<CallInst>(I) && "Expecting CallInst");
+  CallInst *CI = cast<CallInst>(I);
+  DEBUG(errs() << "Found call: " << *CI << "\n");
+
+  // Resolve the intrinsic declaration in the module that contains the call
+  Module *OwnerModule = CI->getParent()->getParent()->getParent();
+  Function *IntrinsicFn;
+  std::vector<Type *> OverloadTypes;
+  std::vector<Value *> CallArgs;
+  if (!Intrinsic::isOverloaded(IntrinsicID)) {
+    // Non-overloaded intrinsic: the declaration fixes the parameter types
+    IntrinsicFn = Intrinsic::getDeclaration(OwnerModule, IntrinsicID);
+    FunctionType *FTy = IntrinsicFn->getFunctionType();
+    DEBUG(errs() << *IntrinsicFn << "\n");
+    // Create argument list
+    assert(CI->getNumArgOperands() == FTy->getNumParams() &&
+           "Number of arguments of call do not match with Intrinsic");
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+      Value *V = CI->getArgOperand(i);
+      // Either the type should match or both should be of pointer type
+      assert((V->getType() == FTy->getParamType(i) ||
+              (V->getType()->isPointerTy() &&
+               FTy->getParamType(i)->isPointerTy())) &&
+             "Dummy function call argument does not match with Intrinsic "
+             "argument!");
+      // On a mismatch both are pointers: cast right before the original call
+      Type *ExpectedTy = FTy->getParamType(i);
+      CallArgs.push_back(
+          V->getType() == ExpectedTy
+              ? V
+              : CastInst::CreatePointerCast(V, ExpectedTy, "", CI));
+    }
+  } else {
+    // Overloaded intrinsic: operand types pick the overload, no casts needed
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+      Value *Operand = CI->getArgOperand(i);
+      OverloadTypes.push_back(Operand->getType());
+      CallArgs.push_back(Operand);
+    }
+    IntrinsicFn =
+        Intrinsic::getDeclaration(OwnerModule, IntrinsicID, OverloadTypes);
+    DEBUG(errs() << *IntrinsicFn << "\n");
+  }
+  // Emit the intrinsic call immediately before the original call
+  CallInst *NewCall = CallInst::Create(
+      IntrinsicFn, CallArgs,
+      IntrinsicFn->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
+  DEBUG(errs() << "\tSubstitute with: " << *NewCall << "\n");
+
+  // Redirect users to the new call and queue the old one for erasure if asked
+  CI->replaceAllUsesWith(NewCall);
+  if (Erase)
+    Erase->push_back(CI);
+}
+
+IS_HPVM_CALL(launch) /* Exists but not required */
+IS_HPVM_CALL(edge)   /* Exists but not required */
+IS_HPVM_CALL(createNodeND) // the per-dimension variants below are disabled
+// IS_HPVM_CALL(createNode)
+// IS_HPVM_CALL(createNode1D)
+// IS_HPVM_CALL(createNode2D)
+// IS_HPVM_CALL(createNode3D)
+IS_HPVM_CALL(bindIn)
+IS_HPVM_CALL(bindOut)
+IS_HPVM_CALL(push)
+IS_HPVM_CALL(pop)
+IS_HPVM_CALL(getNode)
+IS_HPVM_CALL(getParentNode)
+IS_HPVM_CALL(barrier)
+IS_HPVM_CALL(malloc)
+IS_HPVM_CALL(return ) // expands to isHPVMCall_return(), matching "__hpvm__return"
+IS_HPVM_CALL(getNodeInstanceID_x)
+IS_HPVM_CALL(getNodeInstanceID_y)
+IS_HPVM_CALL(getNodeInstanceID_z)
+IS_HPVM_CALL(getNumNodeInstances_x)
+IS_HPVM_CALL(getNumNodeInstances_y)
+IS_HPVM_CALL(getNumNodeInstances_z)
+// Atomics: predicates for the __hpvm__atomic_* dummy calls
+IS_HPVM_CALL(atomic_cmpxchg)
+IS_HPVM_CALL(atomic_add)
+IS_HPVM_CALL(atomic_sub)
+IS_HPVM_CALL(atomic_xchg)
+IS_HPVM_CALL(atomic_inc)
+IS_HPVM_CALL(atomic_dec)
+IS_HPVM_CALL(atomic_min)
+IS_HPVM_CALL(atomic_max)
+IS_HPVM_CALL(atomic_umin)
+IS_HPVM_CALL(atomic_umax)
+IS_HPVM_CALL(atomic_and)
+IS_HPVM_CALL(atomic_or)
+IS_HPVM_CALL(atomic_xor)
+// Misc Fn: predicates for the math dummy calls
+IS_HPVM_CALL(floor)
+IS_HPVM_CALL(rsqrt)
+IS_HPVM_CALL(sqrt)
+IS_HPVM_CALL(sin)
+IS_HPVM_CALL(cos)
+
+IS_HPVM_CALL(init)
+IS_HPVM_CALL(cleanup)
+IS_HPVM_CALL(wait)
+IS_HPVM_CALL(trackMemory)
+IS_HPVM_CALL(untrackMemory)
+IS_HPVM_CALL(requestMemory)
+IS_HPVM_CALL(attributes)
+IS_HPVM_CALL(hint)
+
+// Return V's ConstantInt value, zero-extended and then narrowed to unsigned
+static unsigned getNumericValue(Value *V) {
+  assert(
+      isa<ConstantInt>(V) &&
+      "Value indicating the number of arguments should be a constant integer");
+  return static_cast<unsigned>(cast<ConstantInt>(V)->getZExtValue());
+}
+
+// Lower the dummy call __hpvm__return(N, v1, ..., vN): build a packed struct
+// holding v1..vN with insertvalue instructions inserted before CI and return
+// the final insertvalue. The call CI itself is left in place.
+static Value *genCodeForReturn(CallInst *CI) {
+  LLVMContext &Ctx = CI->getContext();
+  assert(isHPVMCall_return(CI) && "__hpvm__return instruction expected!");
+
+  // Operand 0 is the number of returned values, operands 1..N the values
+  assert(CI->getNumArgOperands() > 0 &&
+         "Too few arguments for __hpvm__return call!\n");
+  unsigned numRetVals = getNumericValue(CI->getArgOperand(0));
+  assert(numRetVals > 0 && "__hpvm__return call must return a value!\n");
+  assert(CI->getNumArgOperands() - 1 == numRetVals &&
+         "Wrong number of values for __hpvm__return call!\n");
+  DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n");
+
+  std::vector<Type *> ArgTypes;
+  for (unsigned i = 1; i < CI->getNumArgOperands(); i++)
+    ArgTypes.push_back(CI->getArgOperand(i)->getType());
+  // Build the name as a std::string: a Twine only references its operands, so
+  // a named Twine could outlive the temporary StringRef returned by getName()
+  std::string outTyName =
+      ("struct.out." + CI->getParent()->getParent()->getName()).str();
+  StructType *RetTy = StructType::create(Ctx, ArgTypes, outTyName, true);
+
+  InsertValueInst *IV = InsertValueInst::Create(
+      UndefValue::get(RetTy), CI->getArgOperand(1), 0, "returnStruct", CI);
+  DEBUG(errs() << "Code generation for return:\n" << *IV << "\n");
+
+  for (unsigned i = 2; i < CI->getNumArgOperands(); i++) {
+    IV = InsertValueInst::Create(IV, CI->getArgOperand(i), i - 1, IV->getName(),
+                                 CI);
+    DEBUG(errs() << *IV << "\n");
+  }
+  return IV;
+}
+
+// Process the __hpvm__attributes dummy call CI found in F. Its operands are
+// <numIn>, <in ptr>..., <numOut>, <out ptr>...; every listed function argument
+// of F gets the In or the Out attribute respectively.
+static void handleHPVMAttributes(Function *F, CallInst *CI) {
+  DEBUG(errs() << "Kernel before adding In/Out HPVM attributes:\n"
+               << *F << "\n");
+
+  // Handle one "<count>, <ptr>..." group whose count is operand `offset` and
+  // attach Kind to each listed argument. Returns the operand index that
+  // follows the group.
+  auto markGroup = [&](unsigned offset, Attribute::AttrKind Kind,
+                       const char *What) -> unsigned {
+    assert(CI->getNumArgOperands() > offset &&
+           "Too few arguments for __hpvm__attributes call!");
+    unsigned numPtrs = getNumericValue(CI->getArgOperand(offset));
+    DEBUG(errs() << "\tNum of " << What << " = " << numPtrs << "\n");
+
+    // Check the announced count against the operands actually present before
+    // reading them, so a malformed call fails with a meaningful message
+    assert(CI->getNumArgOperands() - offset - 1 >= numPtrs &&
+           "Too few arguments for __hpvm__attributes call!");
+    for (unsigned i = offset + 1; i < offset + 1 + numPtrs; i++) {
+      Value *V = CI->getArgOperand(i);
+      if (Argument *arg = dyn_cast<Argument>(V)) {
+        // Attribute index 0 is the return value; parameters start at 1
+        F->addAttribute(1 + arg->getArgNo(), Kind);
+      } else {
+        DEBUG(errs() << "Invalid argument to __hpvm__attribute: " << *V
+                     << "\n");
+        llvm_unreachable(
+            "Only pointer arguments can be passed to __hpvm__attributes call");
+      }
+    }
+    return offset + 1 + numPtrs;
+  };
+
+  // The in-pointer group comes first; the out-pointer group follows it
+  unsigned outOffset = markGroup(0, Attribute::In, "in pointers");
+  markGroup(outOffset, Attribute::Out, "out Pointers");
+  DEBUG(errs() << "Kernel after adding In/Out HPVM attributes:\n"
+               << *F << "\n");
+}
+
+// Public Functions of GenHPVM pass
+bool GenHPVM::runOnModule(Module &M) {
+  DEBUG(errs() << "\nGENHPVM PASS\n");
+  this->M = &M;
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
+
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI =
+      llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
+  DEBUG(errs() << llvmSrcRoot << "\n");
+
+  std::unique_ptr<Module> runtimeModule =
+      parseIRFile(runtimeAPI.str(), Err, M.getContext());
+
+  if (runtimeModule == NULL) {
+    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
+    assert(false && "couldn't parse runtime");
+  } else
+    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
+
+  llvm_hpvm_initializeTimerSet = M.getOrInsertFunction(
+      "llvm_hpvm_initializeTimerSet",
+      runtimeModule->getFunction("llvm_hpvm_initializeTimerSet")
+          ->getFunctionType());
+  // DEBUG(errs() << *llvm_hpvm_initializeTimerSet);
+
+  llvm_hpvm_switchToTimer = M.getOrInsertFunction(
+      "llvm_hpvm_switchToTimer",
+      runtimeModule->getFunction("llvm_hpvm_switchToTimer")->getFunctionType());
+  // DEBUG(errs() << *llvm_hpvm_switchToTimer);
+
+  llvm_hpvm_printTimerSet = M.getOrInsertFunction(
+      "llvm_hpvm_printTimerSet",
+      runtimeModule->getFunction("llvm_hpvm_printTimerSet")->getFunctionType());
+  // DEBUG(errs() << *llvm_hpvm_printTimerSet);
+
+  // Insert init context in main
+  DEBUG(errs() << "Locate __hpvm__init()\n");
+  Function *VI = M.getFunction("__hpvm__init");
+  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
+  Instruction *I = cast<Instruction>(*VI->user_begin());
+
+  DEBUG(errs() << "Initialize Timer Set\n");
+  initializeTimerSet(I);
+  switchToTimer(hpvm_TimerID_NONE, I);
+
+  // Insert print instruction at hpvm exit
+  DEBUG(errs() << "Locate __hpvm__cleanup()\n");
+  Function *VC = M.getFunction("__hpvm__cleanup");
+  assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once");
+  I = cast<Instruction>(*VC->user_begin());
+  printTimerSet(I);
+
+  DEBUG(errs() << "-------- Searching for launch sites ----------\n");
+
+  std::vector<Instruction *> toBeErased;
+  std::vector<Function *> functions;
+
+  for (auto &F : M)
+    functions.push_back(&F);
+
+  // Iterate over all functions in the module
+  for (Function *f : functions) {
+    DEBUG(errs() << "Function: " << f->getName() << "\n");
+
+    // List with the required additions in the function's return type
+    std::vector<Type *> FRetTypes;
+
+    enum mutateTypeCause {
+      mtc_None,
+      mtc_BIND,
+      mtc_RETURN,
+      mtc_NUM_CAUSES
+    } bind;
+    bind = mutateTypeCause::mtc_None;
+
+    // Iterate over all the instructions in this function
+    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e; ++i) {
+      Instruction *I = &*i; // Grab pointer to Instruction
+      // If not a call instruction, move to next instruction
+      if (!isa<CallInst>(I))
+        continue;
+
+      CallInst *CI = cast<CallInst>(I);
+      LLVMContext &Ctx = CI->getContext();
+
+      if (isHPVMCall_init(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_init, &toBeErased);
+      }
+      if (isHPVMCall_cleanup(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_cleanup, &toBeErased);
+      }
+      if (isHPVMCall_wait(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_wait, &toBeErased);
+      }
+      if (isHPVMCall_trackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_trackMemory, &toBeErased);
+      }
+      if (isHPVMCall_untrackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_untrackMemory, &toBeErased);
+      }
+      if (isHPVMCall_requestMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_requestMemory, &toBeErased);
+      }
+      if (isHPVMCall_hint(I)) {
+        assert(isa<ConstantInt>(CI->getArgOperand(0)) &&
+               "Argument to hint must be constant integer!");
+        ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0));
+
+        hpvm::Target t = (hpvm::Target)hint->getZExtValue();
+        addHint(CI->getParent()->getParent(), t);
+        DEBUG(errs() << "Found hpvm hint call: " << *CI << "\n");
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_launch(I)) {
+        Function *LaunchF =
+            Intrinsic::getDeclaration(&M, Intrinsic::hpvm_launch);
+        DEBUG(errs() << *LaunchF << "\n");
+        // Get i8* cast to function pointer
+        Function *graphFunc = cast<Function>(CI->getArgOperand(1));
+        graphFunc = transformReturnTypeToStruct(graphFunc);
+        Constant *F =
+            ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+        assert(
+            F &&
+            "Function invoked by HPVM launch has to be define and constant.");
+
+        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(0));
+        assert(Op && "HPVM launch's streaming argument is a constant value.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+
+        auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType());
+        assert(ArgTy && "HPVM launch argument should be pointer type.");
+        Value *Arg = CI->getArgOperand(2);
+        if (!ArgTy->getElementType()->isIntegerTy(8))
+          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2),
+                                               Type::getInt8PtrTy(Ctx), "", CI);
+        Value *LaunchArgs[] = {F, Arg, isStreaming};
+        CallInst *LaunchInst = CallInst::Create(
+            LaunchF, ArrayRef<Value *>(LaunchArgs, 3), "graphID", CI);
+        DEBUG(errs() << "Found hpvm launch call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
+        CI->replaceAllUsesWith(LaunchInst);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_push(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_push, &toBeErased);
+      }
+      if (isHPVMCall_pop(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_pop, &toBeErased);
+      }
+      if (isHPVMCall_createNodeND(I)) {
+        assert(CI->getNumArgOperands() > 0 &&
+               "Too few arguments for __hpvm__createNodeND call");
+        unsigned numDims = getNumericValue(CI->getArgOperand(0));
+        // We need as many dimension arguments as there are dimensions
+        assert(CI->getNumArgOperands() - 2 == numDims &&
+               "Too few arguments for __hpvm_createNodeND call!\n");
+
+        Function *CreateNodeF;
+        switch (numDims) {
+        case 0:
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode);
+          break;
+        case 1:
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode1D);
+          break;
+        case 2:
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode2D);
+          break;
+        case 3:
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode3D);
+          break;
+        default:
+          llvm_unreachable("Unsupported number of dimensions\n");
+          break;
+        }
+        DEBUG(errs() << *CreateNodeF << "\n");
+        DEBUG(errs() << *I << "\n");
+        DEBUG(errs() << "in " << I->getParent()->getParent()->getName()
+                     << "\n");
+
+        // Get i8* cast to function pointer
+        Function *graphFunc = cast<Function>(CI->getArgOperand(1));
+        graphFunc = transformReturnTypeToStruct(graphFunc);
+        Constant *F =
+            ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+
+        CallInst *CreateNodeInst;
+        switch (numDims) {
+        case 0:
+          CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F),
+                                            graphFunc->getName() + ".node", CI);
+          break;
+        case 1: {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)};
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2),
+              graphFunc->getName() + ".node", CI);
+        } break;
+        case 2: {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 3, expected to be i64\n");
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
+                                     CI->getArgOperand(3)};
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3),
+              graphFunc->getName() + ".node", CI);
+        } break;
+        case 3: {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 3, expected to be i64\n");
+          assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 4, expected to be i64\n");
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
+                                     CI->getArgOperand(3),
+                                     CI->getArgOperand(4)};
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4),
+              graphFunc->getName() + ".node", CI);
+        } break;
+        default:
+          llvm_unreachable(
+              "Impossible path: number of dimensions is 0, 1, 2, 3\n");
+          break;
+        }
+
+        DEBUG(errs() << "Found hpvm createNode call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n");
+        CI->replaceAllUsesWith(CreateNodeInst);
+        toBeErased.push_back(CI);
+      }
+
+      if (isHPVMCall_edge(I)) {
+        Function *EdgeF =
+            Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createEdge);
+        DEBUG(errs() << *EdgeF << "\n");
+        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(5));
+        ConstantInt *EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2));
+        assert(Op && EdgeTypeOp &&
+               "Arguments of CreateEdge are not constant integers.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *isAllToAll = EdgeTypeOp->isZero() ? ConstantInt::getFalse(Ctx)
+                                                 : ConstantInt::getTrue(Ctx);
+        Value *EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                             isAllToAll,           CI->getArgOperand(3),
+                             CI->getArgOperand(4), isStreaming};
+        CallInst *EdgeInst = CallInst::Create(
+            EdgeF, ArrayRef<Value *>(EdgeArgs, 6), "output", CI);
+        DEBUG(errs() << "Found hpvm edge call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n");
+        CI->replaceAllUsesWith(EdgeInst);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_bindIn(I)) {
+        Function *BindInF =
+            Intrinsic::getDeclaration(&M, Intrinsic::hpvm_bind_input);
+        DEBUG(errs() << *BindInF << "\n");
+        // Check if this is a streaming bind or not
+        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3));
+        assert(Op && "Streaming argument for bind in intrinsic should be a "
+                     "constant integer.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), isStreaming};
+        CallInst *BindInInst =
+            CallInst::Create(BindInF, ArrayRef<Value *>(BindInArgs, 4), "", CI);
+        DEBUG(errs() << "Found hpvm bindIn call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n");
+        CI->replaceAllUsesWith(BindInInst);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_bindOut(I)) {
+        Function *BindOutF =
+            Intrinsic::getDeclaration(&M, Intrinsic::hpvm_bind_output);
+        DEBUG(errs() << *BindOutF << "\n");
+        // Check if this is a streaming bind or not
+        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3));
+        assert(Op && "Streaming argument for bind out intrinsic should be a "
+                     "constant integer.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                                CI->getArgOperand(2), isStreaming};
+        CallInst *BindOutInst = CallInst::Create(
+            BindOutF, ArrayRef<Value *>(BindOutArgs, 4), "", CI);
+        DEBUG(errs() << "Found hpvm bindOut call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n");
+
+        DEBUG(errs() << "Fixing the return type of the function\n");
+        // FIXME: What if the child node function has not been visited yet,
+        // i.e., its return type has not been fixed?
+        Function *F = I->getParent()->getParent();
+        DEBUG(errs() << F->getName() << "\n";);
+        IntrinsicInst *NodeIntrinsic =
+            cast<IntrinsicInst>(CI->getArgOperand(0));
+        assert(NodeIntrinsic &&
+               "Instruction value in bind out is not a create node intrinsic.");
+        DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
+        assert(
+            (NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode1D ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode2D ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode3D) &&
+            "Instruction value in bind out is not a create node intrinsic.");
+        Function *ChildF = cast<Function>(
+            NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
+        DEBUG(errs() << ChildF->getName() << "\n";);
+        int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
+        int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
+        StructType *ChildReturnTy = cast<StructType>(ChildF->getReturnType());
+
+        Type *ReturnType = F->getReturnType();
+        DEBUG(errs() << *ReturnType << "\n";);
+        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) &&
+               "Return type should either be a struct or void type!");
+
+        FRetTypes.insert(FRetTypes.begin() + destpos,
+                         ChildReturnTy->getElementType(srcpos));
+        assert(((bind == mutateTypeCause::mtc_BIND) ||
+                (bind == mutateTypeCause::mtc_None)) &&
+               "Both bind_out and hpvm_return detected");
+        bind = mutateTypeCause::mtc_BIND;
+
+        CI->replaceAllUsesWith(BindOutInst);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_attributes(I)) {
+        Function *F = CI->getParent()->getParent();
+        handleHPVMAttributes(F, CI);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_getNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNode, &toBeErased);
+      }
+      if (isHPVMCall_getParentNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getParentNode, &toBeErased);
+      }
+      if (isHPVMCall_barrier(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_barrier, &toBeErased);
+      }
+      if (isHPVMCall_malloc(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_malloc, &toBeErased);
+      }
+      if (isHPVMCall_return(I)) {
+        DEBUG(errs() << "Function before hpvm return processing\n"
+                     << *I->getParent()->getParent() << "\n");
+        // The operands to this call are the values to be returned by the node
+        Value *ReturnVal = genCodeForReturn(CI);
+        DEBUG(errs() << *ReturnVal << "\n");
+        Type *ReturnType = ReturnVal->getType();
+        assert(isa<StructType>(ReturnType) &&
+               "Return type should be a struct type!");
+
+        assert(((bind == mutateTypeCause::mtc_RETURN) ||
+                (bind == mutateTypeCause::mtc_None)) &&
+               "Both bind_out and hpvm_return detected");
+
+        if (bind == mutateTypeCause::mtc_None) {
+          // If this is None, this is the first __hpvm__return
+          // instruction we have come upon. Place the return type of the
+          // function in the return type vector
+          bind = mutateTypeCause::mtc_RETURN;
+          StructType *ReturnStructTy = cast<StructType>(ReturnType);
+          for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++)
+            FRetTypes.push_back(ReturnStructTy->getElementType(i));
+        } else { // bind == mutateTypeCause::mtc_RETURN
+          // This is not the first __hpvm__return
+          // instruction we have come upon.
+          // Check that the return types are the same
+          assert((ReturnType == FRetTypes[0]) &&
+                 "Multiple returns with mismatching types");
+        }
+
+        ReturnInst *RetInst = ReturnInst::Create(Ctx, ReturnVal);
+        DEBUG(errs() << "Found hpvm return call: " << *CI << "\n");
+        Instruction *oldReturn = CI->getParent()->getTerminator();
+        assert(isa<ReturnInst>(oldReturn) &&
+               "Expecting a return to be the terminator of this BB!");
+        DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
+        DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
+        // CI->replaceAllUsesWith(RetInst);
+        toBeErased.push_back(CI);
+        ReplaceInstWithInst(oldReturn, RetInst);
+        DEBUG(errs() << "Function after hpvm return processing\n"
+                     << *I->getParent()->getParent() << "\n");
+      }
+
+      if (isHPVMCall_getNodeInstanceID_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_x,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNodeInstanceID_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_y,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNodeInstanceID_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_z,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNumNodeInstances_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_x,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNumNodeInstances_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_y,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNumNodeInstances_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_z,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_atomic_add(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_add, &toBeErased);
+      }
+      if (isHPVMCall_atomic_sub(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_sub, &toBeErased);
+      }
+      if (isHPVMCall_atomic_xchg(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_xchg, &toBeErased);
+      }
+      if (isHPVMCall_atomic_min(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_min, &toBeErased);
+      }
+      if (isHPVMCall_atomic_max(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_max, &toBeErased);
+      }
+      if (isHPVMCall_atomic_and(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_and, &toBeErased);
+      }
+      if (isHPVMCall_atomic_or(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_or, &toBeErased);
+      }
+      if (isHPVMCall_atomic_xor(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_xor, &toBeErased);
+      }
+      if (isHPVMCall_sin(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased);
+      }
+      if (isHPVMCall_cos(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased);
+      }
+    }
+
+    // Erase the __hpvm__node calls
+    DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
+    for (auto I : toBeErased) {
+      DEBUG(errs() << *I << "\n");
+    }
+    while (!toBeErased.empty()) {
+      Instruction *I = toBeErased.back();
+      DEBUG(errs() << "\tErasing " << *I << "\n");
+      I->eraseFromParent();
+      toBeErased.pop_back();
+    }
+
+    if (bind == mutateTypeCause::mtc_BIND ||
+        bind == mutateTypeCause::mtc_RETURN) {
+      DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
+      // Argument type list.
+      std::vector<Type *> FArgTypes;
+      for (Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
+           ai != ae; ++ai) {
+        FArgTypes.push_back(ai->getType());
+      }
+
+      // Find new return type of function
+      Type *NewReturnTy;
+      if (bind == mutateTypeCause::mtc_BIND) {
+
+        std::vector<Type *> TyList;
+        for (unsigned i = 0; i < FRetTypes.size(); i++)
+          TyList.push_back(FRetTypes[i]);
+
+        NewReturnTy =
+            StructType::create(f->getContext(), TyList,
+                               Twine("struct.out." + f->getName()).str(), true);
+      } else {
+        NewReturnTy = getReturnTypeFromReturnInst(f);
+        assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
+      }
+
+      FunctionType *FTy =
+          FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
+
+      // Change the function type
+      Function *newF = cloneFunction(f, FTy, false);
+      DEBUG(errs() << *newF << "\n");
+
+      if (bind == mutateTypeCause::mtc_BIND) {
+        // This is certainly an internal node, and hence just one BB with one
+        // return terminator instruction. Change return statement
+        ReturnInst *RI =
+            cast<ReturnInst>(newF->getEntryBlock().getTerminator());
+        ReturnInst *newRI = ReturnInst::Create(newF->getContext(),
+                                               UndefValue::get(NewReturnTy));
+        ReplaceInstWithInst(RI, newRI);
+      }
+      if (bind == mutateTypeCause::mtc_RETURN) {
+        // Nothing
+      }
+      replaceNodeFunctionInIR(*f->getParent(), f, newF);
+      DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
+    }
+  }
+  return false; // TODO: What does returning "false" mean?
+}
+
+// Emit an internal, constant global named Name that holds S with a trailing
+// NUL, and return a GEP (indices 0, 0) into it, inserted before IB.
+Value *GenHPVM::getStringPointer(const Twine &S, Instruction *IB,
+                                 const Twine &Name) {
+  LLVMContext &Ctx = M->getContext();
+  Constant *StrInit =
+      ConstantDataArray::getString(Ctx, S.str(), /*AddNull=*/true);
+  GlobalVariable *StrGV =
+      new GlobalVariable(*M, StrInit->getType(), /*isConstant=*/true,
+                         GlobalValue::InternalLinkage, StrInit, Name);
+  Value *Idx0 = ConstantInt::get(Type::getInt64Ty(Ctx), 0);
+  Value *Indices[] = {Idx0, Idx0};
+  return GetElementPtrInst::Create(
+      nullptr, StrGV, ArrayRef<Value *>(Indices, 2), Name + "Ptr", IB);
+}
+
+// Creates the TimerSet global and, before InsertBefore, a call to
+// llvm_hpvm_initializeTimerSet whose result is stored into that global.
+void GenHPVM::initializeTimerSet(Instruction *InsertBefore) {
+  Value *TimerSetAddr = nullptr; // only assigned under TIMER
+  StoreInst *SI = nullptr;       // only assigned under TIMER
+  TIMER(TimerSet = new GlobalVariable(
+            *M, Type::getInt8PtrTy(M->getContext()), false,
+            GlobalValue::CommonLinkage,
+            Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
+            "hpvmTimerSet_GenHPVM"));
+  // Debug prints are nested in TIMER so they never dereference an unset value.
+  TIMER(DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet
+                     << "\n"));
+  TIMER(TimerSetAddr = CallInst::Create(llvm_hpvm_initializeTimerSet, None, "",
+                                        InsertBefore));
+  TIMER(DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n"));
+  TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
+  TIMER(DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n"));
+}
+
+void GenHPVM::switchToTimer(enum hpvm_TimerID timer,
+                            Instruction *InsertBefore) {
+  Value *Operands[] = {TimerSet, getTimerID(*M, timer)}; // timer set, i32 ID
+  ArrayRef<Value *> Args(Operands, 2);
+  TIMER(CallInst::Create(llvm_hpvm_switchToTimer, Args, "", InsertBefore));
+}
+
+void GenHPVM::printTimerSet(Instruction *InsertBefore) {
+  Value *TimerName = nullptr; // stays null if TIMER skips the assignment
+  TIMER(TimerName = getStringPointer("GenHPVM_Timer", InsertBefore));
+  Value *printArgs[] = {TimerSet, TimerName}; // timer-set global, name string
+  TIMER(CallInst::Create(llvm_hpvm_printTimerSet,
+                         ArrayRef<Value *>(printArgs, 2), "", InsertBefore));
+}
+
+static inline ConstantInt *getTimerID(Module &M, enum hpvm_TimerID timer) {
+  return ConstantInt::get(IntegerType::get(M.getContext(), 32), timer); // i32
+}
+
+// Gives a void-returning function an empty packed struct return type instead.
+// A function that already returns a struct is handed back untouched; otherwise
+// a clone with the new type is passed to replaceNodeFunctionInIR and returned.
+static Function *transformReturnTypeToStruct(Function *F) {
+  DEBUG(errs() << "Transforming return type of function to Struct: "
+               << F->getName() << "\n");
+
+  Type *OldRetTy = F->getReturnType();
+  if (isa<StructType>(OldRetTy)) {
+    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": "
+                 << *OldRetTy << "\n");
+    return F;
+  }
+  // Only the void -> struct case is handled.
+  assert(OldRetTy->isVoidTy() &&
+         "Unhandled case - Only void return type handled\n");
+
+  // The parameter types are carried over unchanged.
+  std::vector<Type *> ParamTys;
+  for (const Argument &Arg : F->args())
+    ParamTys.push_back(Arg.getType());
+
+  LLVMContext &Ctx = F->getContext();
+  StructType *EmptyTy = StructType::create(Ctx, None, "emptyStruct", true);
+  FunctionType *NewFnTy = FunctionType::get(EmptyTy, ParamTys, F->isVarArg());
+
+  SmallVector<ReturnInst *, 8> ClonedRets;
+  Function *Clone = cloneFunction(F, NewFnTy, false, &ClonedRets);
+  // Rewrite every collected `ret void` as `ret %EmptyTy undef`.
+  for (ReturnInst *OldRet : ClonedRets) {
+    DEBUG(errs() << "Found return inst: " << *OldRet << "\n");
+    ReplaceInstWithInst(OldRet,
+                        ReturnInst::Create(Ctx, UndefValue::get(EmptyTy)));
+  }
+
+  replaceNodeFunctionInIR(*F->getParent(), F, Clone);
+  return Clone;
+}
+
+static Type *getReturnTypeFromReturnInst(Function *F) {
+  for (BasicBlock &BB : *F) // first value-carrying `ret` decides the type
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+      if (Value *RetVal = RI->getReturnValue()) { // null for `ret void`
+        DEBUG(errs() << "Return type value: " << *RetVal->getType() << "\n");
+        return RetVal->getType();
+      }
+  llvm_unreachable("Function has no return instruction carrying a value");
+}
+
+char genhpvm::GenHPVM::ID = 0; // pass ID storage; initial value is 0
+static RegisterPass<genhpvm::GenHPVM>
+    X("genhpvm", // command-line argument that selects the pass
+      "Pass to generate HPVM IR from LLVM IR (with dummy function calls)",
+      false, false); // RegisterPass's CFGOnly and is_analysis flags: both off
+
+} // End of namespace genhpvm
diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.exports b/hpvm/lib/Transforms/GenHPVM/GenHPVM.exports
similarity index 100%
rename from hpvm/lib/Transforms/GenVISC/GenVISC.exports
rename to hpvm/lib/Transforms/GenHPVM/GenHPVM.exports
diff --git a/hpvm/lib/Transforms/GenVISC/LLVMBuild.txt b/hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt
similarity index 88%
rename from hpvm/lib/Transforms/GenVISC/LLVMBuild.txt
rename to hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt
index 9266b2c5972984a179beba227946964182761239..94ef73ac07ca5c1ff23a05e404b0ea1f751ef36c 100644
--- a/hpvm/lib/Transforms/GenVISC/LLVMBuild.txt
+++ b/hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Transforms/GenVISC/LLVMBuild.txt -------------------*- Conf -*--===;
+;===- ./lib/Transforms/GenHPVM/LLVMBuild.txt -------------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
@@ -17,5 +17,5 @@
 
 [component_0]
 type = Library
-name = GenVISC
+name = GenHPVM
 parent = Transforms
diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
deleted file mode 100644
index cc505415396b4a0441d5a5bfe0cf58adc945b9f8..0000000000000000000000000000000000000000
--- a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ /dev/null
@@ -1,866 +0,0 @@
-//=== GenVISC.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "genvisc"
-#include "GenVISC/GenVISC.h"
-
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "SupportVISC/VISCUtils.h"
-
-
-#define TIMER(X) do { if (VISCTimer) { X; } } while (0)
-
-using namespace llvm;
-using namespace viscUtils;
-
-
-// VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer"));
-
-namespace genvisc {
-
-// Helper Functions
-
-static inline ConstantInt* getTimerID(Module&, enum visc_TimerID);
-static Function* transformReturnTypeToStruct(Function* F);
-static Type* getReturnTypeFromReturnInst(Function* F);
-
-// Check if the dummy function call is a __visc__node call
-#define IS_VISC_CALL(callName) \
-  static bool isVISCCall_##callName(Instruction* I) { \
-    if(!isa<CallInst>(I)) \
-      return false; \
-    CallInst* CI = cast<CallInst>(I); \
-    return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \
-  }
-
-static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) {
-  // Check if the instruction is Call Instruction
-  assert(isa<CallInst>(I) && "Expecting CallInst");
-  CallInst* CI = cast<CallInst>(I);
-  DEBUG(errs() << "Found call: " << *CI << "\n");
-
-  // Find the correct intrinsic call
-  Module* M = CI->getParent()->getParent()->getParent();
-  Function* F;
-  std::vector<Type*> ArgTypes;
-  std::vector<Value*> args;
-  if(Intrinsic::isOverloaded(IntrinsicID)) {
-    // This is an overloaded intrinsic. The types must exactly match. Get the
-    // argument types
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
-      ArgTypes.push_back(CI->getArgOperand(i)->getType());
-      args.push_back(CI->getArgOperand(i));
-    }
-    F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes);
-    DEBUG(errs() << *F << "\n");
-  }
-  else { // Non-overloaded intrinsic
-    F = Intrinsic::getDeclaration(M, IntrinsicID);
-    FunctionType* FTy = F->getFunctionType();
-    DEBUG(errs() << *F << "\n");
-
-    // Create argument list
-    assert(CI->getNumArgOperands() == FTy->getNumParams()
-        && "Number of arguments of call do not match with Intrinsic");
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
-      Value* V = CI->getArgOperand(i);
-      // Either the type should match or both should be of pointer type
-      assert((V->getType() == FTy->getParamType(i) ||
-          (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
-          && "Dummy function call argument does not match with Intrinsic argument!");
-      // If the types do not match, then both must be pointer type and pointer
-      // cast needs to be performed
-      if(V->getType() != FTy->getParamType(i)) {
-        V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
-      }
-      args.push_back(V);
-    }
-  }
-  // Insert call instruction
-  CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
-
-  DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
-
-  CI->replaceAllUsesWith(Inst);
-  // If the previous instruction needs to be erased, insert it in the vector
-  // Erased
-  if(Erase != NULL)
-    Erase->push_back(CI);
-}
-
-IS_VISC_CALL(launch) /* Exists but not required */
-IS_VISC_CALL(edge) /* Exists but not required */
-IS_VISC_CALL(createNodeND)
-//IS_VISC_CALL(createNode)
-//IS_VISC_CALL(createNode1D)
-//IS_VISC_CALL(createNode2D)
-//IS_VISC_CALL(createNode3D)
-IS_VISC_CALL(bindIn)
-IS_VISC_CALL(bindOut)
-IS_VISC_CALL(push)
-IS_VISC_CALL(pop)
-IS_VISC_CALL(getNode)
-IS_VISC_CALL(getParentNode)
-IS_VISC_CALL(barrier)
-IS_VISC_CALL(malloc)
-IS_VISC_CALL(return)
-IS_VISC_CALL(getNodeInstanceID_x)
-IS_VISC_CALL(getNodeInstanceID_y)
-IS_VISC_CALL(getNodeInstanceID_z)
-IS_VISC_CALL(getNumNodeInstances_x)
-IS_VISC_CALL(getNumNodeInstances_y)
-IS_VISC_CALL(getNumNodeInstances_z)
-// Atomics
-IS_VISC_CALL(atomic_cmpxchg)
-IS_VISC_CALL(atomic_add)
-IS_VISC_CALL(atomic_sub)
-IS_VISC_CALL(atomic_xchg)
-IS_VISC_CALL(atomic_inc)
-IS_VISC_CALL(atomic_dec)
-IS_VISC_CALL(atomic_min)
-IS_VISC_CALL(atomic_max)
-IS_VISC_CALL(atomic_umin)
-IS_VISC_CALL(atomic_umax)
-IS_VISC_CALL(atomic_and)
-IS_VISC_CALL(atomic_or)
-IS_VISC_CALL(atomic_xor)
-// Misc Fn
-IS_VISC_CALL(floor)
-IS_VISC_CALL(rsqrt)
-IS_VISC_CALL(sqrt)
-IS_VISC_CALL(sin)
-IS_VISC_CALL(cos)
-
-
-IS_VISC_CALL(init)
-IS_VISC_CALL(cleanup)
-IS_VISC_CALL(wait)
-IS_VISC_CALL(trackMemory)
-IS_VISC_CALL(untrackMemory)
-IS_VISC_CALL(requestMemory)
-IS_VISC_CALL(attributes)
-IS_VISC_CALL(hint)
-
-// Return the constant integer represented by value V
-static unsigned getNumericValue(Value* V) {
-  assert(isa<ConstantInt>(V)
-         && "Value indicating the number of arguments should be a constant integer");
-  return cast<ConstantInt>(V)->getZExtValue();
-}
-
-// Take the __visc__return instruction and generate code for combining the
-// values being returned into a struct and returning it.
-// The first operand is the number of returned values
-static Value* genCodeForReturn(CallInst* CI) {
-  LLVMContext& Ctx = CI->getContext();
-  assert(isVISCCall_return(CI)
-      && "__visc__return instruction expected!");
-
-  // Parse the dummy function call here
-  assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n");
-  unsigned numRetVals = getNumericValue(CI->getArgOperand(0));
-
-  assert(CI->getNumArgOperands()-1 == numRetVals &&
-         "Too few arguments for __visc_return call!\n");
-  DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n");
-
-  std::vector<Type*> ArgTypes;
-  for(unsigned i=1; i < CI->getNumArgOperands(); i++) {
-    ArgTypes.push_back(CI->getArgOperand(i)->getType());
-  }
-  Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName();
-  StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true);
-
-  InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy),
-                                                CI->getArgOperand(1),
-                                                0,
-                                                "returnStruct",
-                                                CI);
-  DEBUG(errs() << "Code generation for return:\n");
-  DEBUG(errs() << *IV << "\n");
-
-  for(unsigned i=2; i < CI->getNumArgOperands(); i++) {
-    IV = InsertValueInst::Create(IV,
-                                 CI->getArgOperand(i),
-                                 i-1,
-                                 IV->getName(),
-                                 CI);
-    DEBUG(errs() << *IV << "\n");
-  }
-  
-  return IV;
-}
-
-// Analyse the attribute call for this function. Add the in and out
-// attributes to pointer parameters.
-static void handleVISCAttributes(Function* F, CallInst* CI) {
-  DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n");
-  // Parse the dummy function call here
-  unsigned offset = 0;
-  // Find number of In pointers
-  assert(CI->getNumArgOperands() > offset
-         && "Too few arguments for __visc__attributes call!");
-  unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset));
-  DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n");
-
-  for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) {
-    Value* V = CI->getArgOperand(i);
-    if(Argument* arg = dyn_cast<Argument>(V)) {
-      F->addAttribute(1+arg->getArgNo(), Attribute::In);
-    }
-    else {
-      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
-      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
-    }
-  }
-  // Find number of Out Pointers
-  offset += 1 + numInPtrs;
-  assert(CI->getNumArgOperands() > offset
-         && "Too few arguments for __visc__attributes call!");
-  unsigned numOutPtrs = getNumericValue(CI->getOperand(offset));
-  DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n");
-  for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) {
-    Value* V = CI->getArgOperand(i);
-    if(Argument* arg = dyn_cast<Argument>(V)) {
-      F->addAttribute(1+arg->getArgNo(), Attribute::Out);
-    }
-    else {
-      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
-      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
-    }
-  }
-  DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n");
-}
-
-// Public Functions of GenVISC pass
-bool GenVISC::runOnModule(Module &M) {
-  errs() << "\nGENVISC PASS\n";
-  this->M = &M;
-
-  // Load Runtime API Module
-  SMDiagnostic Err;
-
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
-  assert(LLVM_SRC_ROOT != NULL &&
-         "Define LLVM_SRC_ROOT environment variable!");
-
-  Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot + "/../build/tools/hpvm/projects/visc-rt/visc-rt.bc";
-  errs() << llvmSrcRoot << "\n";
-
-  std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-
-  if(runtimeModule == NULL) {
-    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
-    assert(false && "couldn't parse runtime");
-  }
-  else
-    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
-
-  llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet",
-                                 runtimeModule->getFunction("llvm_visc_initializeTimerSet")->getFunctionType());
-  //DEBUG(errs() << *llvm_visc_initializeTimerSet);
-
-  llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer",
-                            runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType());
- // DEBUG(errs() << *llvm_visc_switchToTimer);
-
-  llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet",
-                            runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType());
-  //DEBUG(errs() << *llvm_visc_printTimerSet);
-
-  // Insert init context in main
-  DEBUG(errs() << "Locate __visc__init()\n");
-  Function* VI = M.getFunction("__visc__init");
-  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
-  Instruction* I = cast<Instruction>(*VI->user_begin());
-
-  DEBUG(errs() << "Initialize Timer Set\n");
-  initializeTimerSet(I);
-  switchToTimer(visc_TimerID_NONE, I);
-
-  // Insert print instruction at visc exit
-  DEBUG(errs() << "Locate __visc__cleanup()\n");
-  Function* VC = M.getFunction("__visc__cleanup");
-  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
-  I = cast<Instruction>(*VC->user_begin());
-  printTimerSet(I);
-
-
-  DEBUG(errs() << "-------- Searching for launch sites ----------\n");
-
-  std::vector<Instruction*> toBeErased;
-  std::vector<Function*> functions;
-
-  for (auto &F : M) 
-    functions.push_back(&F);
-
-  // Iterate over all functions in the module
-  for (Function *f : functions) {
-    DEBUG(errs() << "Function: " << f->getName() << "\n");
-
-    // List with the required additions in the function's return type
-    std::vector<Type*> FRetTypes;
-
-    enum mutateTypeCause {
-      mtc_None,
-      mtc_BIND,
-      mtc_RETURN,
-      mtc_NUM_CAUSES
-    } bind;
-    bind = mutateTypeCause::mtc_None;
-
-    // Iterate over all the instructions in this function
-    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) {
-      Instruction* I = &*i; // Grab pointer to Instruction
-      // If not a call instruction, move to next instruction
-      if(!isa<CallInst>(I))
-        continue;
-
-      CallInst* CI = cast<CallInst>(I);
-      LLVMContext& Ctx = CI->getContext();
-
-      if(isVISCCall_init(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased);
-      }
-      if(isVISCCall_cleanup(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased);
-      }
-      if(isVISCCall_wait(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased);
-      }
-      if(isVISCCall_trackMemory(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased);
-      }
-      if(isVISCCall_untrackMemory(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased);
-      }
-      if(isVISCCall_requestMemory(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased);
-      }
-      if(isVISCCall_hint(I)) {
-        assert(isa<ConstantInt>(CI->getArgOperand(0))
-               && "Argument to hint must be constant integer!");
-        ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0));
-
-        visc::Target t = (visc::Target) hint->getZExtValue();
-        addHint(CI->getParent()->getParent(), t);
-        DEBUG(errs() << "Found visc hint call: " << *CI << "\n");
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_launch(I)) {
-        Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
-        DEBUG(errs() << *LaunchF << "\n");
-        // Get i8* cast to function pointer
-        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
-        graphFunc = transformReturnTypeToStruct(graphFunc);
-        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
-	assert(F && "Function invoked by VISC launch has to be define and constant.");
-
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0));
-	assert(Op && "VISC launch's streaming argument is a constant value.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        
-        auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType());
-        assert(ArgTy && "VISC launch argument should be pointer type.");
-        Value *Arg = CI->getArgOperand(2);
-        if(!ArgTy->getElementType()->isIntegerTy(8))
-          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), Type::getInt8PtrTy(Ctx), "", CI);
-        Value* LaunchArgs[] = {F, Arg, isStreaming};
-        CallInst* LaunchInst = CallInst::Create(LaunchF,
-                                                ArrayRef<Value*>(LaunchArgs, 3),
-                                                "graphID", CI);
-        DEBUG(errs() << "Found visc launch call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
-        CI->replaceAllUsesWith(LaunchInst);
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_push(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased);
-      }
-      if(isVISCCall_pop(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased);
-      }
-      if(isVISCCall_createNodeND(I)) {
-        assert(CI->getNumArgOperands() > 0 &&
-               "Too few arguments for __visc__createNodeND call");
-        unsigned numDims = getNumericValue(CI->getArgOperand(0));
-        // We need as meny dimension argments are there are dimensions
-        assert(CI->getNumArgOperands()-2 == numDims &&
-              "Too few arguments for __visc_createNodeND call!\n");
-
-        Function* CreateNodeF;
-        switch (numDims) {
-        case 0:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
-          break;
-        case 1:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
-          break;
-        case 2:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
-          break;
-        case 3:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
-          break;
-        default:
-          llvm_unreachable("Unsupported number of dimensions\n");
-          break;
-        }
-        DEBUG(errs() << *CreateNodeF << "\n");
-        DEBUG(errs() << *I << "\n");
-        DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n");
-
-        // Get i8* cast to function pointer
-        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
-        graphFunc = transformReturnTypeToStruct(graphFunc);
-        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
-
-        CallInst* CreateNodeInst;
-        switch (numDims) {
-        case 0:
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(F),
-                                            graphFunc->getName()+".node", CI);
-          break;
-        case 1:
-          {
-          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 2, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 2),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
-        case 2:
-          {
-          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 2, expected to be i64\n");
-          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 3, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F,
-                                     CI->getArgOperand(2),
-                                     CI->getArgOperand(3)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 3),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
-        case 3:
-          {
-          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 2, expected to be i64\n");
-          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 3, expected to be i64\n");
-          assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 4, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F,
-                                     CI->getArgOperand(2),
-                                     CI->getArgOperand(3),
-                                     CI->getArgOperand(4)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 4),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
-        default:
-          llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n");
-          break;
-        }
-
-        DEBUG(errs() << "Found visc createNode call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n");
-        CI->replaceAllUsesWith(CreateNodeInst);
-        toBeErased.push_back(CI);
-      }
-
-      if(isVISCCall_edge(I)) {
-        Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
-        DEBUG(errs() << *EdgeF << "\n");
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5));
-        ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2));
-	assert(Op && EdgeTypeOp && "Arguments of CreateEdge are not constant integers.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx)
-                                                : ConstantInt::getTrue(Ctx);
-        Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                             isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4),
-                             isStreaming
-                            };
-        CallInst* EdgeInst = CallInst::Create(EdgeF,
-                                              ArrayRef<Value*>(EdgeArgs, 6),
-                                              "output", CI);
-        DEBUG(errs() << "Found visc edge call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n");
-        CI->replaceAllUsesWith(EdgeInst);
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_bindIn(I)) {
-        Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
-        DEBUG(errs() << *BindInF << "\n");
-        // Check if this is a streaming bind or not
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
-	assert(Op && "Streaming argument for bind in intrinsic should be a constant integer.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                               CI->getArgOperand(2), isStreaming
-                              };
-        CallInst* BindInInst = CallInst::Create(BindInF,
-                                                ArrayRef<Value*>(BindInArgs, 4),
-                                                "", CI);
-        DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n");
-        CI->replaceAllUsesWith(BindInInst);
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_bindOut(I)) {
-        Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
-        DEBUG(errs() << *BindOutF << "\n");
-        // Check if this is a streaming bind or not
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
-	assert(Op && "Streaming argument for bind out intrinsic should be a constant integer.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                                CI->getArgOperand(2), isStreaming
-                               };
-        CallInst* BindOutInst = CallInst::Create(BindOutF,
-                                ArrayRef<Value*>(BindOutArgs, 4),
-                                "", CI);
-        DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n");
-
-        DEBUG(errs() << "Fixing the return type of the function\n");
-        // FIXME: What if the child node function has not been visited already.
-        // i.e., it's return type has not been fixed.
-        Function* F = I->getParent()->getParent();
-        DEBUG(errs() << F->getName() << "\n";);
-        IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0));
-	assert(NodeIntrinsic && "Instruction value in bind out is not a create node intrinsic.");
-        DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
-	assert((NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) &&
-		"Instruction value in bind out is not a create node intrinsic.");
-        Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
-        DEBUG(errs() << ChildF->getName() << "\n";);
-        int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
-        int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
-        StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType());
-
-        Type* ReturnType = F->getReturnType();
-        DEBUG(errs() << *ReturnType << "\n";);
-        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType))
-            && "Return type should either be a struct or void type!");
-
-        FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos));
-        assert(((bind == mutateTypeCause::mtc_BIND) ||
-                (bind == mutateTypeCause::mtc_None)) &&
-                "Both bind_out and visc_return detected");
-        bind = mutateTypeCause::mtc_BIND;
-
-        CI->replaceAllUsesWith(BindOutInst);
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_attributes(I)) {
-        Function* F = CI->getParent()->getParent();
-        handleVISCAttributes(F, CI);
-        toBeErased.push_back(CI);
-      }
-      if (isVISCCall_getNode(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNode, &toBeErased);
-      }
-      if (isVISCCall_getParentNode(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getParentNode, &toBeErased);
-      }
-      if (isVISCCall_barrier(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_barrier, &toBeErased);
-      }
-      if (isVISCCall_malloc(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased);
-      }
-      if (isVISCCall_return(I)) {
-        DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n");
-        // The operands to this call are the values to be returned by the node
-        Value* ReturnVal = genCodeForReturn(CI);
-        DEBUG(errs() << *ReturnVal << "\n");
-        Type* ReturnType = ReturnVal->getType();
-        assert(isa<StructType>(ReturnType)
-               && "Return type should be a struct type!");
-
-        assert(((bind == mutateTypeCause::mtc_RETURN) ||
-                (bind == mutateTypeCause::mtc_None)) &&
-                "Both bind_out and visc_return detected");
-
-        if (bind == mutateTypeCause::mtc_None) {
-          // If this is None, this is the first __visc__return
-          // instruction we have come upon. Place the return type of the
-          // function in the return type vector
-          bind = mutateTypeCause::mtc_RETURN;
-          StructType* ReturnStructTy = cast<StructType>(ReturnType);
-          for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++)
-            FRetTypes.push_back(ReturnStructTy->getElementType(i));
-        } else { // bind == mutateTypeCause::mtc_RETURN
-          // This is not the first __visc__return
-          // instruction we have come upon. 
-          // Check that the return types are the same
-          assert((ReturnType == FRetTypes[0])
-                 && "Multiple returns with mismatching types");
-        }
-
-        ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal);
-        DEBUG(errs() << "Found visc return call: " << *CI << "\n");
-        Instruction* oldReturn = CI->getParent()->getTerminator();
-        assert(isa<ReturnInst>(oldReturn)
-                && "Expecting a return to be the terminator of this BB!");
-        DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
-        DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
-        //CI->replaceAllUsesWith(RetInst);
-        toBeErased.push_back(CI);
-        ReplaceInstWithInst(oldReturn, RetInst);
-        DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n");
-      }
-
-      if (isVISCCall_getNodeInstanceID_x(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased);
-      }
-      if (isVISCCall_getNodeInstanceID_y(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased);
-      }
-      if (isVISCCall_getNodeInstanceID_z(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased);
-      }
-      if (isVISCCall_getNumNodeInstances_x(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased);
-      }
-      if (isVISCCall_getNumNodeInstances_y(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased);
-      }
-      if (isVISCCall_getNumNodeInstances_z(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased);
-      }
-      if (isVISCCall_atomic_add(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased);
-      }
-      if (isVISCCall_atomic_sub(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_sub, &toBeErased);
-      }
-      if (isVISCCall_atomic_xchg(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xchg, &toBeErased);
-      }
-      if (isVISCCall_atomic_min(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_min, &toBeErased);
-      }
-      if (isVISCCall_atomic_max(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_max, &toBeErased);
-      }
-      if (isVISCCall_atomic_and(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_and, &toBeErased);
-      }
-      if (isVISCCall_atomic_or(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_or, &toBeErased);
-      }
-      if (isVISCCall_atomic_xor(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased);
-      }
-      if (isVISCCall_sin(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased);
-      }
-      if (isVISCCall_cos(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased);
-      }
-    }
-
-    // Erase the __visc__node calls
-    DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
-    for(auto I: toBeErased) {
-      DEBUG(errs() << *I << "\n");
-    }
-    while(!toBeErased.empty()) {
-      Instruction* I = toBeErased.back(); 
-      DEBUG(errs() << "\tErasing " << *I << "\n");
-      I->eraseFromParent();
-      toBeErased.pop_back(); 
-    }
-
-    if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) {
-        DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
-        // Argument type list.
-        std::vector<Type*> FArgTypes;
-        for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
-            ai != ae; ++ai) {
-          FArgTypes.push_back(ai->getType());
-        }
-
-        // Find new return type of function
-        Type* NewReturnTy;
-        if(bind == mutateTypeCause::mtc_BIND) {
-
-          std::vector<Type*> TyList;
-          for (unsigned i = 0; i < FRetTypes.size(); i++)
-            TyList.push_back(FRetTypes[i]);
-
-          NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true);
-        }
-        else {
-          NewReturnTy = getReturnTypeFromReturnInst(f);
-          assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
-        }
-
-        FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
-
-        // Change the function type
-        Function* newF = cloneFunction(f, FTy, false);
-        DEBUG(errs() << *newF << "\n");
-
-        if (bind == mutateTypeCause::mtc_BIND) {
-          // This is certainly an internal node, and hence just one BB with one
-          // return terminator instruction. Change return statement
-          ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator());
-          ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy));
-          ReplaceInstWithInst(RI, newRI);        
-        }
-        if (bind == mutateTypeCause::mtc_RETURN) {
-          // Nothing
-        }
-        replaceNodeFunctionInIR(*f->getParent(), f, newF);
-        DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
-    }
-
-
-  }
-  return false; //TODO: What does returning "false" mean?
-}
-
-// Generate Code for declaring a constant string [L x i8] and return a pointer
-// to the start of it.
-Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
-  Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true);
-  Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true,
-                                      GlobalValue::InternalLinkage, SConstant, Name);
-  Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
-  Value* GEPArgs[] = {Zero, Zero};
-  GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal,
-                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
-  return SPtr;
-}
-
-void GenVISC::initializeTimerSet(Instruction* InsertBefore) {
-  Value* TimerSetAddr;
-  StoreInst* SI;
-  TIMER(TimerSet = new GlobalVariable(*M,
-                                      Type::getInt8PtrTy(M->getContext()),
-                                      false,
-                                      GlobalValue::CommonLinkage,
-                                      Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
-                                      "viscTimerSet_GenVISC"));
-  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n");
-  //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n");
-
-  TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
-                                        None,
-                                        "",
-                                        InsertBefore));
-  DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n");
-  TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
-  DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n");
-}
-
-void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) {
-  Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)};
-  TIMER(CallInst::Create(llvm_visc_switchToTimer,
-                         ArrayRef<Value*>(switchArgs, 2),
-                         "",
-                         InsertBefore));
-}
-
-void GenVISC::printTimerSet(Instruction* InsertBefore) {
-  Value* TimerName;
-  TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore));
-  Value* printArgs[] = {TimerSet, TimerName};
-  TIMER(CallInst::Create(llvm_visc_printTimerSet,
-                         ArrayRef<Value*>(printArgs, 2),
-                         "",
-                         InsertBefore));
-}
-
-static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
-  return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
-}
-
-static Function* transformReturnTypeToStruct(Function* F) {
-  // Currently only works for void return types
-  DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
-
-  if (isa<StructType>(F->getReturnType())) {
-    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n");
-    return F;
-  }
-
-  assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n");
-
-  // Create the argument type list with added argument types
-  std::vector<Type*> ArgTypes;
-  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    ArgTypes.push_back(ai->getType());
-  }
-  
-  StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true);
-  FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg());
-  
-  SmallVector<ReturnInst*, 8> Returns;
-  Function* newF = cloneFunction(F, FTy, false, &Returns);
-  // Replace ret void instruction with ret %RetTy undef
-  for(auto &RI: Returns) {
-    DEBUG(errs() << "Found return inst: "<< *RI << "\n");
-    ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy));
-    ReplaceInstWithInst(RI, newRI);
-  }
-
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-  return newF;
-}
-
-static Type* getReturnTypeFromReturnInst(Function* F) {
-  for(BasicBlock &BB: *F) {
-    if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-      DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() << "\n");
-      return RI->getReturnValue()->getType();
-    }
-  }
-}
-
-
-char genvisc::GenVISC::ID = 0;
-static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false);
-
-} // End of namespace genvisc
-
-
diff --git a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
index 7bd66b62c6c8cda589fe3e6c1e3711893aceaffb..fc33ebee71123d89c5f931901dd213c82a401941 100644
--- a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
+++ b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "LocalMem"
-#include "SupportVISC/DFG2LLVM.h"
+#include "SupportHPVM/DFG2LLVM.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
@@ -134,7 +134,7 @@ void AT_OCL::codeGen(DFLeafNode *N) {
 // Return pointer to property if this leaf node matches the conditions for being
 // an allocation node. Conditions
 // 1. No incoming memory pointer. No in/out attribute on a pointer argument
-// 2. Uses visc malloc intrinsic to allocate memory
+// 2. Uses hpvm malloc intrinsic to allocate memory
 // 3. Sends it out
 // 2. (TODO:) Whether the allocated pointer escapes the parent node
 AllocationNodeProperty *isAllocationNode(DFLeafNode *N) {
@@ -148,18 +148,18 @@ AllocationNodeProperty *isAllocationNode(DFLeafNode *N) {
 
   Function *F = N->getFuncPointer();
 
-  // Allocation node must use visc malloc intrinsic
-  bool usesVISCMalloc = false;
+  // Allocation node must use hpvm malloc intrinsic
+  bool usesHPVMMalloc = false;
   for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) {
     Instruction *I = &*i;
     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      if (II->getIntrinsicID() == Intrinsic::visc_malloc) {
-        usesVISCMalloc = true;
+      if (II->getIntrinsicID() == Intrinsic::hpvm_malloc) {
+        usesHPVMMalloc = true;
         break;
       }
     }
   }
-  if (!usesVISCMalloc)
+  if (!usesHPVMMalloc)
     return NULL;
 
   // TODO: Check if allocated pointer leaves parent node
@@ -197,20 +197,20 @@ AllocationNodeProperty *isAllocationNode(DFLeafNode *N) {
     assert(OutValues[i]->getType()->isPointerTy() &&
            "Expected outgoing edge to be of pointer type");
     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(OutValues[i])) {
-      if (II->getIntrinsicID() == Intrinsic::visc_malloc) {
+      if (II->getIntrinsicID() == Intrinsic::hpvm_malloc) {
         // Sanity check: Size passed to malloc intrinsic is same as the value
         // going into the next outgoing edge
-        DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n");
+        DEBUG(errs() << "HPVM malloc size: " << *II->getArgOperand(0) << "\n");
         DEBUG(errs() << "Out edge value: " << *OutValues[i + 1] << "\n");
         assert(II->getArgOperand(0) == OutValues[i + 1] &&
-               "Sanity Check Failed: VISC Malloc size argument != next "
+               "Sanity Check Failed: HPVM Malloc size argument != next "
                "outgoing edge");
         ANP->insertAllocation(N->getOutDFEdgeAt(i), II->getArgOperand(0));
         i = i + 2;
         continue;
       }
     }
-    llvm_unreachable("Expecting visc malloc intrinsic instruction!");
+    llvm_unreachable("Expecting hpvm malloc intrinsic instruction!");
   }
   return ANP;
 }
diff --git a/hpvm/llvm_installer/llvm_installer.sh b/hpvm/llvm_installer/llvm_installer.sh
index d7fcda4ac4de8c129e47cfce65264097e040d228..e072d042b79a1a3caf8003794a89b5cee2dca67a 100755
--- a/hpvm/llvm_installer/llvm_installer.sh
+++ b/hpvm/llvm_installer/llvm_installer.sh
@@ -179,10 +179,10 @@ echo make -j$NUM_THREADS
 make -j$NUM_THREADS
 #make install
 
-#echo Building HPVM runtime
-#HPVM_RT_DIR=$HPVM_DIR/projects/visc-rt
-#cd $HPVM_RT_DIR
-#make
+# echo Building HPVM runtime
+# HPVM_RT_DIR=$HPVM_DIR/projects/hpvm-rt
+# cd $HPVM_RT_DIR
+# make
 
 #cp -r $CURRENT_DIR/projects $HPVM_DIR/
 #make -j$NUM_THREADS
diff --git a/hpvm/llvm_patches/apply_patch.sh b/hpvm/llvm_patches/apply_patch.sh
index ea86575207a4aa7b4ca138b604f7423943924b22..289e5c11e319aa16262952d2d079f986c2e987b8 100644
--- a/hpvm/llvm_patches/apply_patch.sh
+++ b/hpvm/llvm_patches/apply_patch.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 ### File Copies
-cp include/IR/IntrinsicsVISC.td  ${LLVM_SRC_ROOT}/include/llvm/IR/IntrinsicsVISC.td
+cp include/IR/IntrinsicsHPVM.td  ${LLVM_SRC_ROOT}/include/llvm/IR/IntrinsicsHPVM.td
 
 
 ## Header File Patches
diff --git a/hpvm/llvm_patches/include/IR/Attributes.td b/hpvm/llvm_patches/include/IR/Attributes.td
index b644cdb30bbd590a8b8c238bfde15e4b451e8ea3..c6ff8ef3c6c962f5444d718ff5a7e16ce392a522 100644
--- a/hpvm/llvm_patches/include/IR/Attributes.td
+++ b/hpvm/llvm_patches/include/IR/Attributes.td
@@ -151,7 +151,7 @@ def ShadowCallStack : EnumAttr<"shadowcallstack">;
 /// Sign extended before/after call.
 def SExt : EnumAttr<"signext">;
 
-/// VISC Attributes
+/// HPVM Attributes
 /// Pointer to read only memory
 def In : EnumAttr<"in">;
 
diff --git a/hpvm/llvm_patches/include/IR/Intrinsics.td b/hpvm/llvm_patches/include/IR/Intrinsics.td
index 2f79964a2e381c6d4ec22a5bc3c80a9d411f9fb0..2e3f34eb1a8408371a0b516089dd970adfe9223c 100644
--- a/hpvm/llvm_patches/include/IR/Intrinsics.td
+++ b/hpvm/llvm_patches/include/IR/Intrinsics.td
@@ -1249,4 +1249,4 @@ include "llvm/IR/IntrinsicsBPF.td"
 include "llvm/IR/IntrinsicsSystemZ.td"
 include "llvm/IR/IntrinsicsWebAssembly.td"
 include "llvm/IR/IntrinsicsRISCV.td"
-include "llvm/IR/IntrinsicsVISC.td"
+include "llvm/IR/IntrinsicsHPVM.td"
diff --git a/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td b/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td
new file mode 100644
index 0000000000000000000000000000000000000000..410e9c8d3345e67df9614e0d518e5e596a4368e1
--- /dev/null
+++ b/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td
@@ -0,0 +1,208 @@
+//===- IntrinsicsHPVM.td - Defines HPVM intrinsics ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the HPVM-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "hpvm" in {
+  /* All intrinsics start with "llvm.hpvm."
+   * As we do not want the compiler to mess with these intrinsics, we assume
+   * worst memory behavior for all these intrinsics.
+   */
+
+  /* Initialization intrinsic -
+   * void llvm.hpvm.init();
+   */
+  def int_hpvm_init : Intrinsic<[], [], []>;
+
+  /* Launch intrinsic - with streaming argument
+   * i8* llvm.hpvm.launch(i8*, ArgList*, i1);
+   */
+  def int_hpvm_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
+                                  llvm_ptr_ty, llvm_i1_ty], []>;
+
+  /* Push intrinsic - push data on streaming pipeline
+   * void llvm.hpvm.push(i8*, ArgList*);
+   */
+  def int_hpvm_push : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>;
+
+  /* Pop intrinsic - pop data from streaming pipeline
+   * i8* llvm.hpvm.pop(i8*);
+   */
+  def int_hpvm_pop : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
+
+  /* Cleanup intrinsic -
+   * void llvm.hpvm.cleanup();
+   */
+  def int_hpvm_cleanup : Intrinsic<[], [], []>;
+
+  /* Wait intrinsic -
+   * void llvm.hpvm.wait(graphID*);
+   */
+  def int_hpvm_wait : Intrinsic<[], [llvm_ptr_ty], []>;
+
+  /* Track memory intrinsic -
+   * void llvm.hpvm.trackMemory(i8*, i64);
+   */
+  def int_hpvm_trackMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
+
+  /* Untrack memory intrinsic -
+   * void llvm.hpvm.untrackMemory(i8*);
+   */
+  def int_hpvm_untrackMemory : Intrinsic<[], [llvm_ptr_ty], []>;
+
+  /* Request memory intrinsic -
+   * void llvm.hpvm.requestMemory(i8*, i64);
+   */
+  def int_hpvm_requestMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
+
+  /* Create Node intrinsic -
+   * i8* llvm.hpvm.createNode(function*);
+   */
+  def int_hpvm_createNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
+
+  /* Create Node 1D array intrinsic -
+   * i8* llvm.hpvm.createNode1D(function*, i64);
+   */
+  def int_hpvm_createNode1D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
+                                        llvm_i64_ty], []>;
+
+  /* Create Node 2D array intrinsic -
+   * i8* llvm.hpvm.createNode2D(function*, i64, i64);
+   */
+  def int_hpvm_createNode2D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
+                                        llvm_i64_ty, llvm_i64_ty], []>;
+
+  /* Create Node 3D array intrinsic -
+   * i8* llvm.hpvm.createNode3D(function*, i64, i64, i64);
+   */
+  def int_hpvm_createNode3D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
+                                        llvm_i64_ty, llvm_i64_ty, llvm_i64_ty],
+                                        []>;
+
+  /* Create dataflow edge intrinsic -
+   * i8* llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1);
+   */
+  def int_hpvm_createEdge : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty,
+                                      llvm_i1_ty, llvm_i32_ty, llvm_i32_ty,
+                                      llvm_i1_ty],
+                                      []>;
+
+  /* Create bind input intrinsic -
+   * void llvm.hpvm.bind.input(i8*, i32, i32, i1);
+   */
+  def int_hpvm_bind_input : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty,
+                                      llvm_i32_ty, llvm_i1_ty], []>;
+
+  /* Create bind output intrinsic -
+   * void llvm.hpvm.bind.output(i8*, i32, i32, i1);
+   */
+  def int_hpvm_bind_output : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty,
+                                       llvm_i32_ty, llvm_i1_ty], []>;
+
+  /* Find associated dataflow node intrinsic -
+   * i8* llvm.hpvm.getNode();
+   */
+  def int_hpvm_getNode : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
+
+  /* Find parent dataflow node intrinsic -
+   * i8* llvm.hpvm.getParentNode(i8*);
+   */
+  def int_hpvm_getParentNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>;
+
+  /* Find the number of dimensions of a dataflow node intrinsic -
+   * i32 llvm.hpvm.getNumDims(i8*);
+   */
+  def int_hpvm_getNumDims : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>;
+
+  /* Find the unique identifier of a dataflow node (with respect to its parent
+   * node) in the specified dimension intrinsic -
+   */
+
+  /* i64 llvm.hpvm.getNodeInstanceID.[xyz](i8*);
+   */
+  def int_hpvm_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                               [IntrNoMem]>;
+
+  def int_hpvm_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                               [IntrNoMem]>;
+
+  def int_hpvm_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                               [IntrNoMem]>;
+
+  /* Find the number of instances of a dataflow node in the specified dimension
+   * intrinsic -
+   */
+
+  /* i64 llvm.hpvm.getNumNodeInstances.[xyz](i8*);
+   */
+  def int_hpvm_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                                 [IntrNoMem]>;
+
+  def int_hpvm_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                                 [IntrNoMem]>;
+
+  def int_hpvm_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                                 [IntrNoMem]>;
+
+  /* Local Barrier
+   * void llvm.hpvm.barrier();
+   */
+  def int_hpvm_barrier : Intrinsic<[], [], []>;
+
+  /* Memory allocation inside the graph
+   * i8* llvm.hpvm.malloc(i64);
+   */
+  def int_hpvm_malloc : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], []>;
+
+  /* Find the vector length supported by target architecture
+   * intrinsic -
+   * i32 llvm.hpvm.getVectorLength();
+   */
+  def int_hpvm_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>;
+
+  /* ============ Atomic intrinsics ============= */
+  // Atomic arithmetic operations
+
+  /* i32 llvm.hpvm.atomic.add(i32*, i32)*/
+  def int_hpvm_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.sub(i32*, i32)*/
+  def int_hpvm_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.xchg(i32*, i32)*/
+  def int_hpvm_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.min(i32*, i32)*/
+  def int_hpvm_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.max(i32*, i32)*/
+  def int_hpvm_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  // Atomic bitwise operations
+
+  /* i32 llvm.hpvm.atomic.and(i32*, i32)*/
+  def int_hpvm_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.or(i32*, i32)*/
+  def int_hpvm_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.xor(i32*, i32)*/
+  def int_hpvm_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+}
diff --git a/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td b/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td
deleted file mode 100644
index d5330175d86c9576394c9363a4ba30fd651f19e8..0000000000000000000000000000000000000000
--- a/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td
+++ /dev/null
@@ -1,208 +0,0 @@
-//===- IntrinsicsVISC.td - Defines VISC intrinsics ---------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines all of the VISC-specific intrinsics.
-//
-//===----------------------------------------------------------------------===//
-
-let TargetPrefix = "visc" in {
-  /* All intrinsics start with "llvm.visc."
-   * As we do not want the compiler to mess with these intrinsics, we assume
-   * worst memory behavior for all these intrinsics.
-   */
-
-  /* Initialization intrinsic -
-   * i8* llvm.visc.setup(function*);
-   */
-  def int_visc_init : Intrinsic<[], [], []>;
-
-  /* Launch intrinsic - with streaming argument
-   * i8* llvm.visc.launch(i8*, ArgList*, i1);
-   */
-  def int_visc_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
-                                  llvm_ptr_ty, llvm_i1_ty], []>;
-
-  /* Push intrinsic - push data on streaming pipeline
-   * void llvm.visc.push(i8*, ArgList*);
-   */
-  def int_visc_push : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>;
-
-  /* Pop intrinsic - pop data from streaming pipeline
-   * i8* llvm.visc.pop(i8*);
-   */
-  def int_visc_pop : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
-
-  /* Cleanup intrinsic -
-   * void llvm.visc.cleanup(i8*);
-   */
-  def int_visc_cleanup : Intrinsic<[], [], []>;
-
-  /* Wait intrinsic -
-   * void llvm.visc.wait(graphID*);
-   */
-  def int_visc_wait : Intrinsic<[], [llvm_ptr_ty], []>;
-
-  /* Track memory intrinsic -
-   * void llvm.visc.trackMemory(i8*, i64);
-   */
-  def int_visc_trackMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
-
-  /* Track memory intrinsic -
-   * void llvm.visc.untrackMemory(i8*);
-   */
-  def int_visc_untrackMemory : Intrinsic<[], [llvm_ptr_ty], []>;
-
-  /* Request memory intrinsic -
-   * void llvm.visc.requestMemory(i8*, i64);
-   */
-  def int_visc_requestMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
-
-  /* Create Node intrinsic -
-   * i8* llvm.visc.createNode(function*);
-   */
-  def int_visc_createNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
-
-  /* Create Node 1D array intrinsic -
-   * i8* llvm.visc.createNode1D(function*, i64);
-   */
-  def int_visc_createNode1D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
-                                        llvm_i64_ty], []>;
-
-  /* Create Node 2D array intrinsic -
-   * i8* llvm.visc.createNode2D(function*, i64, i64);
-   */
-  def int_visc_createNode2D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
-                                        llvm_i64_ty, llvm_i64_ty], []>;
-
-  /* Create Node 3D array intrinsic -
-   * i8* llvm.visc.createNode2D(function*, i64, i64, i64);
-   */
-  def int_visc_createNode3D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
-                                        llvm_i64_ty, llvm_i64_ty, llvm_i64_ty],
-                                        []>;
-
-  /* Create dataflow edge intrinsic -
-   * i8* llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1);
-   */
-  def int_visc_createEdge : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty,
-                                      llvm_i1_ty, llvm_i32_ty, llvm_i32_ty,
-                                      llvm_i1_ty],
-                                      []>;
-
-  /* Create bind input intrinsic -
-   * void llvm.visc.bind.input(i8*, i32, i32);
-   */
-  def int_visc_bind_input : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty,
-                                      llvm_i32_ty, llvm_i1_ty], []>;
-
-  /* Create bind output intrinsic -
-   * void llvm.visc.bind.output(i8*, i32, i32);
-   */
-  def int_visc_bind_output : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty,
-                                       llvm_i32_ty, llvm_i1_ty], []>;
-
-  /* Find associated dataflow node intrinsic -
-   * i8* llvm.visc.getNode();
-   */
-  def int_visc_getNode : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
-
-  /* Find parent dataflow node intrinsic -
-   * i8* llvm.visc.getParentNode(i8*);
-   */
-  def int_visc_getParentNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>;
-
-  /* Find the number of dimensions of a dataflow node intrinsic -
-   * i32 llvm.visc.getNumDims(i8*);
-   */
-  def int_visc_getNumDims : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>;
-
-  /* Find the unique indentifier of a dataflow node (with respect to his parent
-   * node) in the specified dimension intrinsic -
-   */
-
-  /* i64 llvm.visc.getNodeInstanceID.[xyz](i8*);
-   */
-  def int_visc_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                               [IntrNoMem]>;
-
-  def int_visc_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                               [IntrNoMem]>;
-
-  def int_visc_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                               [IntrNoMem]>;
-
-  /* Find the number of instances of a dataflow node in the specified dimension
-   * intrinsic -
-   */
-
-  /* i64 llvm.visc.getNumNodeInstances.[xyz](i8*);
-   */
-  def int_visc_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                                 [IntrNoMem]>;
-
-  def int_visc_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                                 [IntrNoMem]>;
-
-  def int_visc_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                                 [IntrNoMem]>;
-
-  /* Local Barrier
-   * void llvm.visc.barrier();
-   */
-  def int_visc_barrier : Intrinsic<[], [], []>;
-
-  /* Memory allocation inside the graph
-   * i8* llvm.visc.malloc();
-   */
-  def int_visc_malloc : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], []>;
-
-  /* Find the vector length supported by target architecture
-   * intrinsic -
-   * i32 llvm.visc.getVectorLength();
-   */
-  def int_visc_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>;
-
-  /* ============ Atomic intrinsics ============= */
-  // Atomic arithmetic operations
-
-  /* i32 llvm.visc.atomic.add(i32*, i32)*/
-  def int_visc_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.sub(i32*, i32)*/
-  def int_visc_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.xchg(i32*, i32)*/
-  def int_visc_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.min(i32*, i32)*/
-  def int_visc_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.maxi32*, i32)*/
-  def int_visc_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  // Atomic bitwise operations
-
-  /* i32 llvm.visc.atomic.and(i32*, i32)*/
-  def int_visc_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.or(i32*, i32)*/
-  def int_visc_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.xor(i32*, i32)*/
-  def int_visc_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-}
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
index a924405a2cac85ccd2e5e903a1ee1abb52774566..2c54392f8020ac7334117f1343214d085dbd6b84 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
+++ b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
@@ -855,7 +855,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(bit);
   KEYWORD(varFlags);
 
-  // VISC parameter attributes
+  // HPVM parameter attributes
   KEYWORD(in);
   KEYWORD(out);
   KEYWORD(inout);
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
index f5ce44e2a920405f7e3790fcb1d9eb7fba28d636..7446ff1e32dd79a18fd678446af56e6d193468ad 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
+++ b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
@@ -1470,7 +1470,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
     case lltok::kw_swiftself:
     case lltok::kw_immarg:
 
-    // VISC Parameter only attributes
+    // HPVM Parameter only attributes
     case lltok::kw_in:
     case lltok::kw_out:
     case lltok::kw_inout:
@@ -1808,7 +1808,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
       B.addAttribute(Attribute::ImmArg);
       break;
 
-    // VISC parameter attributes
+    // HPVM parameter attributes
     case lltok::kw_in:
       B.addAttribute(Attribute::In);
       break;
@@ -1927,7 +1927,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
     case lltok::kw_swiftself:
     case lltok::kw_immarg:
 
-    // VISC Parameter only attributes
+    // HPVM Parameter only attributes
     case lltok::kw_in:
     case lltok::kw_out:
     case lltok::kw_inout:
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLToken.h b/hpvm/llvm_patches/lib/AsmParser/LLToken.h
index 7f9816965b2a21ae3d23873ca789a22481b575fa..cb0479b41c3b9e68d9697cd9d8adce4c80fa5c25 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLToken.h
+++ b/hpvm/llvm_patches/lib/AsmParser/LLToken.h
@@ -351,7 +351,7 @@ enum Kind {
   kw_insertvalue,
   kw_blockaddress,
 
-  // VISC parameter attributes
+  // HPVM parameter attributes
   kw_in,
   kw_out,
   kw_inout,
diff --git a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
index 7eb289d5872713ef826174b1e691c6440d4dd43e..a1e64472850911013250976312a8dd7d8b879c98 100644
--- a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1395,7 +1395,7 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
   case Attribute::NoFree:
     return 1ULL << 63;
 
-    // VISC Attributes
+    // HPVM Attributes
   case Attribute::In:
     return 3ULL << 0;
   case Attribute::Out:
diff --git a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
index 55e7415efbea2b37d85f20b1d123ce9a80efe67e..fd671c397583fad6ec8a9998635705417f59eed1 100644
--- a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -773,7 +773,7 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
   case Attribute::SanitizeMemTag:
     return bitc::ATTR_KIND_SANITIZE_MEMTAG;
 
-  // VISC Attributes
+  // HPVM Attributes
   case Attribute::In:
     return bitc::ATTR_KIND_IN;
   case Attribute::Out:
diff --git a/hpvm/llvm_patches/lib/IR/Attributes.cpp b/hpvm/llvm_patches/lib/IR/Attributes.cpp
index 3cc95b3102fdf6c7062fffe1f9486cfa094bba9b..29c47a9e1107524278dcc57c188b320821ba7d86 100644
--- a/hpvm/llvm_patches/lib/IR/Attributes.cpp
+++ b/hpvm/llvm_patches/lib/IR/Attributes.cpp
@@ -404,7 +404,7 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
   if (hasAttribute(Attribute::ImmArg))
     return "immarg";
 
-  // VISC attributes for arguments
+  // HPVM attributes for arguments
   if (hasAttribute(Attribute::In))
     return "in";
   if (hasAttribute(Attribute::Out))
diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt b/hpvm/projects/hpvm-rt/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..be7f69c4bfa7623c093bd5e913af1de3dbcf951c
--- /dev/null
+++ b/hpvm/projects/hpvm-rt/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_definitions(-DNUM_CORES=8)
+
+SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang)
+SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++)
+
+add_llvm_library(hpvm-rt.ll hpvm-rt.cpp
+
+  DEPENDS
+  clang
+  llvm-dis
+  )
+
+
+target_compile_options(hpvm-rt.ll PUBLIC -flto )
+target_compile_options(hpvm-rt.ll PUBLIC -std=c++11)
+
+add_custom_target(hpvm-rt.cpp.o ALL
+  COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libhpvm-rt.ll.a
+  COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc
+  COMMAND  ${CMAKE_BINARY_DIR}/bin/llvm-dis  ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc)
+
+add_dependencies(hpvm-rt.cpp.o   hpvm-rt.ll)
diff --git a/hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt b/hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt
similarity index 100%
rename from hpvm/projects/visc-rt/deviceStatusSwitchIntervals.txt
rename to hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt
diff --git a/hpvm/projects/visc-rt/device_abstraction.h b/hpvm/projects/hpvm-rt/device_abstraction.h
similarity index 96%
rename from hpvm/projects/visc-rt/device_abstraction.h
rename to hpvm/projects/hpvm-rt/device_abstraction.h
index 7e77d100deb6b23b6ed9ca994796cd1cb108b0d4..4948502ce8ae47cbb7e37c1372fcd81813486e15 100644
--- a/hpvm/projects/visc-rt/device_abstraction.h
+++ b/hpvm/projects/hpvm-rt/device_abstraction.h
@@ -27,7 +27,7 @@ void initializeDeviceStatusIntervals() {
   unsigned sz = 0;
   unsigned tmp = 0;
 
-  const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/"
+  const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/hpvm-rt/"
                    "deviceStatusSwitchIntervals.txt";
   std::ifstream infile;
   infile.open(fn);
diff --git a/hpvm/projects/visc-rt/visc-rt.cpp b/hpvm/projects/hpvm-rt/hpvm-rt.cpp
similarity index 82%
rename from hpvm/projects/visc-rt/visc-rt.cpp
rename to hpvm/projects/hpvm-rt/hpvm-rt.cpp
index 53d3b516f22b59857b1a17aecba32a6b723998f0..ec2534cf43f43f971140de741c7a04f79613e982 100644
--- a/hpvm/projects/visc-rt/visc-rt.cpp
+++ b/hpvm/projects/hpvm-rt/hpvm-rt.cpp
@@ -13,7 +13,7 @@
 #if _POSIX_VERSION >= 200112L
 #include <sys/time.h>
 #endif
-#include "visc-rt.h"
+#include "hpvm-rt.h"
 
 #ifndef DEBUG_BUILD
 #define DEBUG(s)                                                               \
@@ -59,7 +59,7 @@ vector<DFGDepth> DStack;
 pthread_mutex_t ocl_mtx;
 
 #define NUM_TESTS 1
-visc_TimerSet kernel_timer;
+hpvm_TimerSet kernel_timer;
 
 static inline void checkErr(cl_int err, cl_int success, const char *name) {
   if (err != success) {
@@ -70,7 +70,7 @@ static inline void checkErr(cl_int err, cl_int success, const char *name) {
 }
 
 /************************* Policies *************************************/
-void llvm_visc_policy_init() {
+void llvm_hpvm_policy_init() {
   cout << "Initializing policy object ...\n";
   //  policy = new NodePolicy();
   //  policy = new IterationPolicy();
@@ -80,19 +80,19 @@ void llvm_visc_policy_init() {
   cout << "DONE: Initializing policy object.\n";
 }
 
-void llvm_visc_policy_clear() {
+void llvm_hpvm_policy_clear() {
   if (policy)
     free(policy);
 }
 
-int llvm_visc_policy_getVersion(const char *name, int64_t i) {
+int llvm_hpvm_policy_getVersion(const char *name, int64_t i) {
   return policy->getVersion(name, i);
 }
 
 /******************** Device Abstraction ********************************/
 std::thread deviceStatusThread;
 
-void llvm_visc_deviceAbstraction_start() {
+void llvm_hpvm_deviceAbstraction_start() {
   cout << "Starting device status simulation ...\n";
   // Initialize vector with points where ti switch device status
   initializeDeviceStatusIntervals();
@@ -102,7 +102,7 @@ void llvm_visc_deviceAbstraction_start() {
   return;
 }
 
-void llvm_visc_deviceAbstraction_end() {
+void llvm_hpvm_deviceAbstraction_end() {
   cout << "Ending device status simulation thread ...\n";
   // Set the variable that allows the thread to know that execution has ended
   executionEnd = true;
@@ -112,7 +112,7 @@ void llvm_visc_deviceAbstraction_end() {
   return;
 }
 
-void llvm_visc_deviceAbstraction_waitOnDeviceStatus() {
+void llvm_hpvm_deviceAbstraction_waitOnDeviceStatus() {
   while (!deviceStatus) {
   };
   return;
@@ -120,7 +120,7 @@ void llvm_visc_deviceAbstraction_waitOnDeviceStatus() {
 
 /************************* Depth Stack Routines ***************************/
 
-void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
+void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
                                uint64_t limitY, uint64_t iY, uint64_t limitZ,
                                uint64_t iZ) {
   DEBUG(cout << "Pushing node information on stack:\n");
@@ -134,7 +134,7 @@ void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_x86_dstack_pop() {
+void llvm_hpvm_x86_dstack_pop() {
   DEBUG(cout << "Popping from depth stack\n");
   pthread_mutex_lock(&ocl_mtx);
   DStack.pop_back();
@@ -142,7 +142,7 @@ void llvm_visc_x86_dstack_pop() {
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
+uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim) {
   DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level
              << flush << "\n");
   pthread_mutex_lock(&ocl_mtx);
@@ -154,7 +154,7 @@ uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
   return result;
 }
 
-uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
+uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim) {
   DEBUG(cout << "Request instance id for dim " << dim << " of ancestor "
              << level << flush << "\n");
   pthread_mutex_lock(&ocl_mtx);
@@ -168,7 +168,7 @@ uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
 
 /********************** Memory Tracking Routines **************************/
 
-void llvm_visc_track_mem(void *ptr, size_t size) {
+void llvm_hpvm_track_mem(void *ptr, size_t size) {
   DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE != NULL) {
@@ -180,7 +180,7 @@ void llvm_visc_track_mem(void *ptr, size_t size) {
   DEBUG(MTracker.print());
 }
 
-void llvm_visc_untrack_mem(void *ptr) {
+void llvm_hpvm_untrack_mem(void *ptr) {
   DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE == NULL) {
@@ -195,7 +195,7 @@ void llvm_visc_untrack_mem(void *ptr) {
   DEBUG(MTracker.print());
 }
 
-static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
+static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size,
                                        DFNodeContext_OCL *Context, bool isInput,
                                        bool isOutput) {
   pthread_mutex_lock(&ocl_mtx);
@@ -233,7 +233,7 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
   else
     clFlags = CL_MEM_READ_ONLY;
 
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY);
   // pthread_mutex_lock(&ocl_mtx);
   cl_mem d_input =
       clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
@@ -249,7 +249,7 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
     checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device");
   }
 
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
   DEBUG(cout << " done\n");
   MTE->update(MemTrackerEntry::DEVICE, (void *)d_input, Context);
   DEBUG(cout << "Updated Table\n");
@@ -258,11 +258,11 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
   return d_input;
 }
 
-void *llvm_visc_x86_argument_ptr(void *ptr, size_t size) {
-  return llvm_visc_request_mem(ptr, size);
+void *llvm_hpvm_x86_argument_ptr(void *ptr, size_t size) {
+  return llvm_hpvm_request_mem(ptr, size);
 }
 
-void *llvm_visc_request_mem(void *ptr, size_t size) {
+void *llvm_hpvm_request_mem(void *ptr, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
@@ -283,13 +283,13 @@ void *llvm_visc_request_mem(void *ptr, size_t size) {
   DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush
              << "\n");
   DEBUG(cout << "\tCopying ...");
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY);
   // pthread_mutex_lock(&ocl_mtx);
   cl_int errcode = clEnqueueReadBuffer(
       ((DFNodeContext_OCL *)MTE->getContext())->clCommandQue,
       (cl_mem)MTE->getAddress(), CL_TRUE, 0, size, ptr, 0, NULL, NULL);
   // pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
   DEBUG(cout << " done\n");
   checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output");
   DEBUG(cout << "Free mem object on device\n");
@@ -303,25 +303,25 @@ void *llvm_visc_request_mem(void *ptr, size_t size) {
 
 /*************************** Timer Routines **********************************/
 
-static int is_async(enum visc_TimerID timer) {
-  return (timer == visc_TimerID_KERNEL) || (timer == visc_TimerID_COPY_ASYNC);
+static int is_async(enum hpvm_TimerID timer) {
+  return (timer == hpvm_TimerID_KERNEL) || (timer == hpvm_TimerID_COPY_ASYNC);
 }
 
-static int is_blocking(enum visc_TimerID timer) {
-  return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE);
+static int is_blocking(enum hpvm_TimerID timer) {
+  return (timer == hpvm_TimerID_COPY) || (timer == hpvm_TimerID_NONE);
 }
 
-#define INVALID_TIMERID visc_TimerID_LAST
+#define INVALID_TIMERID hpvm_TimerID_LAST
 
-static int asyncs_outstanding(struct visc_TimerSet *timers) {
+static int asyncs_outstanding(struct hpvm_TimerSet *timers) {
   return (timers->async_markers != NULL) &&
          (timers->async_markers->timerID != INVALID_TIMERID);
 }
 
-static struct visc_async_time_marker_list *
-get_last_async(struct visc_TimerSet *timers) {
+static struct hpvm_async_time_marker_list *
+get_last_async(struct hpvm_TimerSet *timers) {
   /* Find the last event recorded thus far */
-  struct visc_async_time_marker_list *last_event = timers->async_markers;
+  struct hpvm_async_time_marker_list *last_event = timers->async_markers;
   if (last_event != NULL && last_event->timerID != INVALID_TIMERID) {
     while (last_event->next != NULL &&
            last_event->next->timerID != INVALID_TIMERID)
@@ -331,17 +331,17 @@ get_last_async(struct visc_TimerSet *timers) {
     return NULL;
 }
 
-static void insert_marker(struct visc_TimerSet *tset, enum visc_TimerID timer) {
+static void insert_marker(struct hpvm_TimerSet *tset, enum hpvm_TimerID timer) {
   cl_int ciErrNum = CL_SUCCESS;
-  struct visc_async_time_marker_list **new_event = &(tset->async_markers);
+  struct hpvm_async_time_marker_list **new_event = &(tset->async_markers);
 
   while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
     new_event = &((*new_event)->next);
   }
 
   if (*new_event == NULL) {
-    *new_event = (struct visc_async_time_marker_list *)malloc(
-        sizeof(struct visc_async_time_marker_list));
+    *new_event = (struct hpvm_async_time_marker_list *)malloc(
+        sizeof(struct hpvm_async_time_marker_list));
     (*new_event)->marker = calloc(1, sizeof(cl_event));
     /*
     // I don't think this is needed at all. I believe clEnqueueMarker 'creates'
@@ -372,18 +372,18 @@ Event Status!\n");
   }
 }
 
-static void insert_submarker(struct visc_TimerSet *tset, char *label,
-                             enum visc_TimerID timer) {
+static void insert_submarker(struct hpvm_TimerSet *tset, char *label,
+                             enum hpvm_TimerID timer) {
   cl_int ciErrNum = CL_SUCCESS;
-  struct visc_async_time_marker_list **new_event = &(tset->async_markers);
+  struct hpvm_async_time_marker_list **new_event = &(tset->async_markers);
 
   while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
     new_event = &((*new_event)->next);
   }
 
   if (*new_event == NULL) {
-    *new_event = (struct visc_async_time_marker_list *)malloc(
-        sizeof(struct visc_async_time_marker_list));
+    *new_event = (struct hpvm_async_time_marker_list *)malloc(
+        sizeof(struct hpvm_async_time_marker_list));
     (*new_event)->marker = calloc(1, sizeof(cl_event));
     /*
 #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
@@ -414,10 +414,10 @@ Event Status!\n");
 }
 
 /* Assumes that all recorded events have completed */
-static visc_Timestamp record_async_times(struct visc_TimerSet *tset) {
-  struct visc_async_time_marker_list *next_interval = NULL;
-  struct visc_async_time_marker_list *last_marker = get_last_async(tset);
-  visc_Timestamp total_async_time = 0;
+static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) {
+  struct hpvm_async_time_marker_list *next_interval = NULL;
+  struct hpvm_async_time_marker_list *last_marker = get_last_async(tset);
+  hpvm_Timestamp total_async_time = 0;
 
   for (next_interval = tset->async_markers; next_interval != last_marker;
        next_interval = next_interval->next) {
@@ -439,11 +439,11 @@ static visc_Timestamp record_async_times(struct visc_TimerSet *tset) {
               ciErrNum);
     }
 
-    visc_Timestamp interval =
-        (visc_Timestamp)(((double)(command_end - command_start)));
+    hpvm_Timestamp interval =
+        (hpvm_Timestamp)(((double)(command_end - command_start)));
     tset->timers[next_interval->timerID].elapsed += interval;
     if (next_interval->label != NULL) {
-      struct visc_SubTimer *subtimer =
+      struct hpvm_SubTimer *subtimer =
           tset->sub_timer_list[next_interval->timerID]->subtimer_list;
       while (subtimer != NULL) {
         if (strcmp(subtimer->label, next_interval->label) == 0) {
@@ -463,8 +463,8 @@ static visc_Timestamp record_async_times(struct visc_TimerSet *tset) {
   return total_async_time;
 }
 
-static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start,
-                            visc_Timestamp end) {
+static void accumulate_time(hpvm_Timestamp *accum, hpvm_Timestamp start,
+                            hpvm_Timestamp end) {
 #if _POSIX_VERSION >= 200112L
   *accum += end - start;
 #else
@@ -473,33 +473,33 @@ static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start,
 }
 
 #if _POSIX_VERSION >= 200112L
-static visc_Timestamp get_time() {
+static hpvm_Timestamp get_time() {
   struct timespec tv;
   clock_gettime(CLOCK_MONOTONIC, &tv);
-  return (visc_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec);
+  return (hpvm_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec);
 }
 #else
 #error "no supported time libraries are available on this platform"
 #endif
 
-void visc_ResetTimer(struct visc_Timer *timer) {
-  timer->state = visc_Timer_STOPPED;
+void hpvm_ResetTimer(struct hpvm_Timer *timer) {
+  timer->state = hpvm_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   timer->elapsed = 0;
 #else
-#error "visc_ResetTimer: not implemented for this system"
+#error "hpvm_ResetTimer: not implemented for this system"
 #endif
 }
 
-void visc_StartTimer(struct visc_Timer *timer) {
-  if (timer->state != visc_Timer_STOPPED) {
+void hpvm_StartTimer(struct hpvm_Timer *timer) {
+  if (timer->state != hpvm_Timer_STOPPED) {
     // FIXME: Removing warning statement to avoid printing this error
     // fputs("Ignoring attempt to start a running timer\n", stderr);
     return;
   }
 
-  timer->state = visc_Timer_RUNNING;
+  timer->state = hpvm_Timer_RUNNING;
 
 #if _POSIX_VERSION >= 200112L
   {
@@ -508,19 +508,19 @@ void visc_StartTimer(struct visc_Timer *timer) {
     timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-#error "visc_StartTimer: not implemented for this system"
+#error "hpvm_StartTimer: not implemented for this system"
 #endif
 }
 
-void visc_StartTimerAndSubTimer(struct visc_Timer *timer,
-                                struct visc_Timer *subtimer) {
+void hpvm_StartTimerAndSubTimer(struct hpvm_Timer *timer,
+                                struct hpvm_Timer *subtimer) {
 
   unsigned int numNotStopped = 0x3; // 11
-  if (timer->state != visc_Timer_STOPPED) {
+  if (timer->state != hpvm_Timer_STOPPED) {
     fputs("Warning: Timer was not stopped\n", stderr);
     numNotStopped &= 0x1; // Zero out 2^1
   }
-  if (subtimer->state != visc_Timer_STOPPED) {
+  if (subtimer->state != hpvm_Timer_STOPPED) {
     fputs("Warning: Subtimer was not stopped\n", stderr);
     numNotStopped &= 0x2; // Zero out 2^0
   }
@@ -529,8 +529,8 @@ void visc_StartTimerAndSubTimer(struct visc_Timer *timer,
     return;
   }
 
-  timer->state = visc_Timer_RUNNING;
-  subtimer->state = visc_Timer_RUNNING;
+  timer->state = hpvm_Timer_RUNNING;
+  subtimer->state = hpvm_Timer_RUNNING;
 
 #if _POSIX_VERSION >= 200112L
   {
@@ -546,19 +546,19 @@ void visc_StartTimerAndSubTimer(struct visc_Timer *timer,
     }
   }
 #else
-#error "visc_StartTimer: not implemented for this system"
+#error "hpvm_StartTimer: not implemented for this system"
 #endif
 }
 
-void visc_StopTimer(struct visc_Timer *timer) {
-  visc_Timestamp fini;
+void hpvm_StopTimer(struct hpvm_Timer *timer) {
+  hpvm_Timestamp fini;
 
-  if (timer->state != visc_Timer_RUNNING) {
+  if (timer->state != hpvm_Timer_RUNNING) {
     // fputs("Ignoring attempt to stop a stopped timer\n", stderr);
     return;
   }
 
-  timer->state = visc_Timer_STOPPED;
+  timer->state = hpvm_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   {
@@ -567,24 +567,24 @@ void visc_StopTimer(struct visc_Timer *timer) {
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-#error "visc_StopTimer: not implemented for this system"
+#error "hpvm_StopTimer: not implemented for this system"
 #endif
 
   accumulate_time(&timer->elapsed, timer->init, fini);
   timer->init = fini;
 }
 
-void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
-                               struct visc_Timer *subtimer) {
+void hpvm_StopTimerAndSubTimer(struct hpvm_Timer *timer,
+                               struct hpvm_Timer *subtimer) {
 
-  visc_Timestamp fini;
+  hpvm_Timestamp fini;
 
   unsigned int numNotRunning = 0x3; // 11
-  if (timer->state != visc_Timer_RUNNING) {
+  if (timer->state != hpvm_Timer_RUNNING) {
     fputs("Warning: Timer was not running\n", stderr);
     numNotRunning &= 0x1; // Zero out 2^1
   }
-  if (subtimer->state != visc_Timer_RUNNING) {
+  if (subtimer->state != hpvm_Timer_RUNNING) {
     fputs("Warning: Subtimer was not running\n", stderr);
     numNotRunning &= 0x2; // Zero out 2^0
   }
@@ -593,8 +593,8 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
     return;
   }
 
-  timer->state = visc_Timer_STOPPED;
-  subtimer->state = visc_Timer_STOPPED;
+  timer->state = hpvm_Timer_STOPPED;
+  subtimer->state = hpvm_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   {
@@ -603,7 +603,7 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-#error "visc_StopTimer: not implemented for this system"
+#error "hpvm_StopTimer: not implemented for this system"
 #endif
 
   if (numNotRunning & 0x2) {
@@ -618,59 +618,59 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
 }
 
 /* Get the elapsed time in seconds. */
-double visc_GetElapsedTime(struct visc_Timer *timer) {
+double hpvm_GetElapsedTime(struct hpvm_Timer *timer) {
   double ret;
 
-  if (timer->state != visc_Timer_STOPPED) {
+  if (timer->state != hpvm_Timer_STOPPED) {
     fputs("Elapsed time from a running timer is inaccurate\n", stderr);
   }
 
 #if _POSIX_VERSION >= 200112L
   ret = timer->elapsed / 1e9;
 #else
-#error "visc_GetElapsedTime: not implemented for this system"
+#error "hpvm_GetElapsedTime: not implemented for this system"
 #endif
   return ret;
 }
 
-void visc_InitializeTimerSet(struct visc_TimerSet *timers) {
+void hpvm_InitializeTimerSet(struct hpvm_TimerSet *timers) {
   int n;
 
   timers->wall_begin = get_time();
-  timers->current = visc_TimerID_NONE;
+  timers->current = hpvm_TimerID_NONE;
 
   timers->async_markers = NULL;
 
-  for (n = 0; n < visc_TimerID_LAST; n++) {
-    visc_ResetTimer(&timers->timers[n]);
+  for (n = 0; n < hpvm_TimerID_LAST; n++) {
+    hpvm_ResetTimer(&timers->timers[n]);
     timers->sub_timer_list[n] = NULL;
   }
 }
 
-void visc_AddSubTimer(struct visc_TimerSet *timers, char *label,
-                      enum visc_TimerID visc_Category) {
+void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label,
+                      enum hpvm_TimerID hpvm_Category) {
 
-  struct visc_SubTimer *subtimer =
-      (struct visc_SubTimer *)malloc(sizeof(struct visc_SubTimer));
+  struct hpvm_SubTimer *subtimer =
+      (struct hpvm_SubTimer *)malloc(sizeof(struct hpvm_SubTimer));
 
   int len = strlen(label);
 
   subtimer->label = (char *)malloc(sizeof(char) * (len + 1));
   sprintf(subtimer->label, "%s", label);
 
-  visc_ResetTimer(&subtimer->timer);
+  hpvm_ResetTimer(&subtimer->timer);
   subtimer->next = NULL;
 
-  struct visc_SubTimerList *subtimerlist =
-      timers->sub_timer_list[visc_Category];
+  struct hpvm_SubTimerList *subtimerlist =
+      timers->sub_timer_list[hpvm_Category];
   if (subtimerlist == NULL) {
     subtimerlist =
-        (struct visc_SubTimerList *)calloc(1, sizeof(struct visc_SubTimerList));
+        (struct hpvm_SubTimerList *)calloc(1, sizeof(struct hpvm_SubTimerList));
     subtimerlist->subtimer_list = subtimer;
-    timers->sub_timer_list[visc_Category] = subtimerlist;
+    timers->sub_timer_list[hpvm_Category] = subtimerlist;
   } else {
     // Append to list
-    struct visc_SubTimer *element = subtimerlist->subtimer_list;
+    struct hpvm_SubTimer *element = subtimerlist->subtimer_list;
     while (element->next != NULL) {
       element = element->next;
     }
@@ -678,37 +678,37 @@ void visc_AddSubTimer(struct visc_TimerSet *timers, char *label,
   }
 }
 
-void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
+void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) {
   // cerr << "Switch to timer: " << timer << flush << "\n";
   /* Stop the currently running timer */
-  if (timers->current != visc_TimerID_NONE) {
-    struct visc_SubTimerList *subtimerlist =
+  if (timers->current != hpvm_TimerID_NONE) {
+    struct hpvm_SubTimerList *subtimerlist =
         timers->sub_timer_list[timers->current];
-    struct visc_SubTimer *currSubTimer =
+    struct hpvm_SubTimer *currSubTimer =
         (subtimerlist != NULL) ? subtimerlist->current : NULL;
 
     if (!is_async(timers->current)) {
       if (timers->current != timer) {
         if (currSubTimer != NULL) {
-          visc_StopTimerAndSubTimer(&timers->timers[timers->current],
+          hpvm_StopTimerAndSubTimer(&timers->timers[timers->current],
                                     &currSubTimer->timer);
         } else {
-          visc_StopTimer(&timers->timers[timers->current]);
+          hpvm_StopTimer(&timers->timers[timers->current]);
         }
       } else {
         if (currSubTimer != NULL) {
-          visc_StopTimer(&currSubTimer->timer);
+          hpvm_StopTimer(&currSubTimer->timer);
         }
       }
     } else {
       insert_marker(timers, timer);
       if (!is_async(timer)) { // if switching to async too, keep driver going
-        visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
+        hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]);
       }
     }
   }
 
-  visc_Timestamp currentTime = get_time();
+  hpvm_Timestamp currentTime = get_time();
 
   /* The only cases we check for asynchronous task completion is
    * when an overlapping CPU operation completes, or the next
@@ -716,7 +716,7 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
   if (asyncs_outstanding(timers) &&
       (!is_async(timers->current) || is_blocking(timer))) {
 
-    struct visc_async_time_marker_list *last_event = get_last_async(timers);
+    struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
     /* CL_COMPLETE if completed */
 
     cl_int ciErrNum = CL_SUCCESS;
@@ -736,7 +736,7 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
 
       // timer to switch to is COPY or NONE
       if (async_done != CL_COMPLETE) {
-        accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
+        accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed),
                         timers->async_begin, currentTime);
       }
 
@@ -746,14 +746,14 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
         fprintf(stderr, "Error Waiting for Events!\n");
       }
 
-      visc_Timestamp total_async_time = record_async_times(timers);
+      hpvm_Timestamp total_async_time = record_async_times(timers);
 
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
       if (async_done == CL_COMPLETE) {
         // fprintf(stderr, "Async_done: total_async_type = %lld\n",
         // total_async_time);
-        timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
+        timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time;
       }
 
     } else
@@ -763,15 +763,15 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
         if (async_done == CL_COMPLETE) {
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      timers->timers[visc_TimerID_OVERLAP].elapsed +=
+      timers->timers[hpvm_TimerID_OVERLAP].elapsed +=
           record_async_times(timers);
     }
   }
 
   /* Start the new timer */
-  if (timer != visc_TimerID_NONE) {
+  if (timer != hpvm_TimerID_NONE) {
     if (!is_async(timer)) {
-      visc_StartTimer(&timers->timers[timer]);
+      hpvm_StartTimer(&timers->timers[timer]);
     } else {
       // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
       if (!asyncs_outstanding(timers)) {
@@ -785,48 +785,48 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
          * so we can rename that marker as the beginning of this async
          * operation */
 
-        struct visc_async_time_marker_list *last_event = get_last_async(timers);
+        struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
         last_event->label = NULL;
         last_event->timerID = timer;
       }
       if (!is_async(timers->current)) {
-        visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
+        hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]);
       }
     }
   }
   timers->current = timer;
 }
 
-void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
-                           enum visc_TimerID category) {
-  struct visc_SubTimerList *subtimerlist =
+void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label,
+                           enum hpvm_TimerID category) {
+  struct hpvm_SubTimerList *subtimerlist =
       timers->sub_timer_list[timers->current];
-  struct visc_SubTimer *curr =
+  struct hpvm_SubTimer *curr =
       (subtimerlist != NULL) ? subtimerlist->current : NULL;
 
-  if (timers->current != visc_TimerID_NONE) {
+  if (timers->current != hpvm_TimerID_NONE) {
     if (!is_async(timers->current)) {
       if (timers->current != category) {
         if (curr != NULL) {
-          visc_StopTimerAndSubTimer(&timers->timers[timers->current],
+          hpvm_StopTimerAndSubTimer(&timers->timers[timers->current],
                                     &curr->timer);
         } else {
-          visc_StopTimer(&timers->timers[timers->current]);
+          hpvm_StopTimer(&timers->timers[timers->current]);
         }
       } else {
         if (curr != NULL) {
-          visc_StopTimer(&curr->timer);
+          hpvm_StopTimer(&curr->timer);
         }
       }
     } else {
       insert_submarker(timers, label, category);
       if (!is_async(category)) { // if switching to async too, keep driver going
-        visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
+        hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]);
       }
     }
   }
 
-  visc_Timestamp currentTime = get_time();
+  hpvm_Timestamp currentTime = get_time();
 
   /* The only cases we check for asynchronous task completion is
    * when an overlapping CPU operation completes, or the next
@@ -834,7 +834,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
   if (asyncs_outstanding(timers) &&
       (!is_async(timers->current) || is_blocking(category))) {
 
-    struct visc_async_time_marker_list *last_event = get_last_async(timers);
+    struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
     /* CL_COMPLETE if completed */
 
     cl_int ciErrNum = CL_SUCCESS;
@@ -858,7 +858,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
       // because everything is being stopped to wait for synchronization it
       // seems that the extra sync wall time isn't being recorded anywhere
       if (async_done != CL_COMPLETE)
-        accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
+        accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed),
                         timers->async_begin, currentTime);
 
       /* Wait on async operation completion */
@@ -866,7 +866,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
       if (ciErrNum != CL_SUCCESS) {
         fprintf(stderr, "Error Waiting for Events!\n");
       }
-      visc_Timestamp total_async_time = record_async_times(timers);
+      hpvm_Timestamp total_async_time = record_async_times(timers);
 
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
@@ -874,7 +874,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
       // into OVERLAP the immediately preceding EventSynchronize theoretically
       // didn't have any effect since it was already completed.
       if (async_done == CL_COMPLETE /*cudaSuccess*/)
-        timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
+        timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time;
 
     } else
         /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
@@ -883,14 +883,14 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
         if (async_done == CL_COMPLETE /*cudaSuccess*/) {
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      timers->timers[visc_TimerID_OVERLAP].elapsed +=
+      timers->timers[hpvm_TimerID_OVERLAP].elapsed +=
           record_async_times(timers);
     }
     // else, this isn't blocking, so just check the next time around
   }
 
   subtimerlist = timers->sub_timer_list[category];
-  struct visc_SubTimer *subtimer = NULL;
+  struct hpvm_SubTimer *subtimer = NULL;
 
   if (label != NULL) {
     subtimer = subtimerlist->subtimer_list;
@@ -904,18 +904,18 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
   }
 
   /* Start the new timer */
-  if (category != visc_TimerID_NONE) {
+  if (category != hpvm_TimerID_NONE) {
     if (!is_async(category)) {
       if (subtimerlist != NULL) {
         subtimerlist->current = subtimer;
       }
 
       if (category != timers->current && subtimer != NULL) {
-        visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
+        hpvm_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
       } else if (subtimer != NULL) {
-        visc_StartTimer(&subtimer->timer);
+        hpvm_StartTimer(&subtimer->timer);
       } else {
-        visc_StartTimer(&timers->timers[category]);
+        hpvm_StartTimer(&timers->timers[category]);
       }
     } else {
       if (subtimerlist != NULL) {
@@ -933,7 +933,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
          * so we can rename that marker as the beginning of this async
          * operation */
 
-        struct visc_async_time_marker_list *last_event = get_last_async(timers);
+        struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
         last_event->timerID = category;
         last_event->label = label;
       } // else, marker for switchToThis was already inserted
@@ -941,7 +941,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
       // toSwitchto is already asynchronous, but if current/prev state is async
       // too, then DRIVER is already running
       if (!is_async(timers->current)) {
-        visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
+        hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]);
       }
     }
   }
@@ -949,11 +949,11 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
   timers->current = category;
 }
 
-void visc_PrintTimerSet(struct visc_TimerSet *timers) {
-  visc_Timestamp wall_end = get_time();
+void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) {
+  hpvm_Timestamp wall_end = get_time();
 
-  struct visc_Timer *t = timers->timers;
-  struct visc_SubTimer *sub = NULL;
+  struct hpvm_Timer *t = timers->timers;
+  struct hpvm_SubTimer *sub = NULL;
 
   int maxSubLength;
 
@@ -970,13 +970,13 @@ void visc_PrintTimerSet(struct visc_TimerSet *timers) {
   const int maxCategoryLength = 20;
 
   int i;
-  for (i = 1; i < visc_TimerID_LAST;
+  for (i = 1; i < hpvm_TimerID_LAST;
        ++i) { // exclude NONE and OVRELAP from this format
-    if (visc_GetElapsedTime(&t[i]) != 0 || true) {
+    if (hpvm_GetElapsedTime(&t[i]) != 0 || true) {
 
       // Print Category Timer
       printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1],
-             visc_GetElapsedTime(&t[i]));
+             hpvm_GetElapsedTime(&t[i]));
 
       if (timers->sub_timer_list[i] != NULL) {
         sub = timers->sub_timer_list[i]->subtimer_list;
@@ -999,24 +999,24 @@ void visc_PrintTimerSet(struct visc_TimerSet *timers) {
         // Print SubTimers
         while (sub != NULL) {
           printf(" -%-*s: %.9f\n", maxSubLength, sub->label,
-                 visc_GetElapsedTime(&sub->timer));
+                 hpvm_GetElapsedTime(&sub->timer));
           sub = sub->next;
         }
       }
     }
   }
 
-  if (visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0)
+  if (hpvm_GetElapsedTime(&t[hpvm_TimerID_OVERLAP]) != 0)
     printf("CPU/Kernel Overlap: %.9f\n",
-           visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]));
+           hpvm_GetElapsedTime(&t[hpvm_TimerID_OVERLAP]));
 
   float walltime = (wall_end - timers->wall_begin) / 1e9;
   printf("Timer Wall Time: %.9f\n", walltime);
 }
 
-void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
+void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) {
   /* clean up all of the async event markers */
-  struct visc_async_time_marker_list *event = timers->async_markers;
+  struct hpvm_async_time_marker_list *event = timers->async_markers;
   while (event != NULL) {
 
     cl_int ciErrNum = CL_SUCCESS;
@@ -1031,7 +1031,7 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
     }
 
     free((event)->marker);
-    struct visc_async_time_marker_list *next = ((event)->next);
+    struct hpvm_async_time_marker_list *next = ((event)->next);
 
     free(event);
 
@@ -1040,10 +1040,10 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
   }
 
   int i = 0;
-  for (i = 0; i < visc_TimerID_LAST; ++i) {
+  for (i = 0; i < hpvm_TimerID_LAST; ++i) {
     if (timers->sub_timer_list[i] != NULL) {
-      struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
-      struct visc_SubTimer *prev = NULL;
+      struct hpvm_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
+      struct hpvm_SubTimer *prev = NULL;
       while (subtimer != NULL) {
         free(subtimer->label);
         prev = subtimer;
@@ -1059,7 +1059,7 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
 #define BUFFER_SIZE 1
 
 // Launch API for a streaming dataflow graph
-void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) {
+void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) {
   DFNodeContext_X86 *Context =
       (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86));
 
@@ -1081,7 +1081,7 @@ void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) {
 }
 
 // Push API for a streaming dataflow graph
-void llvm_visc_streamPush(void *graphID, void *args) {
+void llvm_hpvm_streamPush(void *graphID, void *args) {
   DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args
              << flush << "\n");
   DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
@@ -1094,17 +1094,17 @@ void llvm_visc_streamPush(void *graphID, void *args) {
       if (Ctx->BindInSourcePort->at(j) == i) {
         // Push to all bind buffers connected to parent node at this port
         // DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
-        llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element);
+        llvm_hpvm_bufferPush(Ctx->BindInputBuffers->at(j), element);
       }
     }
   }
   // Push 0 in isLastInput buffers of all child nodes
   for (CircularBuffer<uint64_t> *buffer : *(Ctx->isLastInputBuffers))
-    llvm_visc_bufferPush(buffer, 0);
+    llvm_hpvm_bufferPush(buffer, 0);
 }
 
 // Pop API for a streaming dataflow graph
-void *llvm_visc_streamPop(void *graphID) {
+void *llvm_hpvm_streamPop(void *graphID) {
   DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n");
   DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   unsigned totalBytes = 0;
@@ -1113,7 +1113,7 @@ void *llvm_visc_streamPop(void *graphID) {
   void *output = malloc(totalBytes);
   unsigned offset = 0;
   for (unsigned i = 0; i < Ctx->BindOutputBuffers->size(); i++) {
-    uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i));
+    uint64_t element = llvm_hpvm_bufferPop(Ctx->BindOutputBuffers->at(i));
     // DEBUG(cout << "\tPopped Value " << element << " from buffer\n");
     memcpy((char *)output + offset, &element, Ctx->BindOutSizes->at(i));
     offset += Ctx->BindOutSizes->at(i);
@@ -1122,24 +1122,24 @@ void *llvm_visc_streamPop(void *graphID) {
 }
 
 // Wait API for a streaming dataflow graph
-void llvm_visc_streamWait(void *graphID) {
+void llvm_hpvm_streamWait(void *graphID) {
   DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n");
   DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   // Push garbage to all other input buffers
   for (unsigned i = 0; i < Ctx->BindInputBuffers->size(); i++) {
     uint64_t element = 0;
     // DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
-    llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element);
+    llvm_hpvm_bufferPush(Ctx->BindInputBuffers->at(i), element);
   }
   // Push 1 in isLastInput buffers of all child nodes
   for (unsigned i = 0; i < Ctx->isLastInputBuffers->size(); i++)
-    llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1);
+    llvm_hpvm_bufferPush(Ctx->isLastInputBuffers->at(i), 1);
 
-  llvm_visc_freeThreads(graphID);
+  llvm_hpvm_freeThreads(graphID);
 }
 
 // Create a buffer and return the bufferID
-void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size,
+void *llvm_hpvm_createBindInBuffer(void *graphID, uint64_t size,
                                    unsigned inArgPort) {
   DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID
              << ", Size: " << size << flush << "\n");
@@ -1154,7 +1154,7 @@ void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size,
   return bufferID;
 }
 
-void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) {
+void *llvm_hpvm_createBindOutBuffer(void *graphID, uint64_t size) {
   DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID
              << ", Size: " << size << flush << "\n");
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
@@ -1166,7 +1166,7 @@ void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) {
   Context->BindOutSizes->push_back(size);
   return bufferID;
 }
-void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) {
+void *llvm_hpvm_createEdgeBuffer(void *graphID, uint64_t size) {
   DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size
              << flush << "\n");
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
@@ -1179,7 +1179,7 @@ void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) {
   return bufferID;
 }
 
-void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) {
+void *llvm_hpvm_createLastInputBuffer(void *graphID, uint64_t size) {
   DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID
              << ", Size: " << size << flush << "\n");
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
@@ -1192,7 +1192,7 @@ void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) {
 }
 
 // Free buffers
-void llvm_visc_freeBuffers(void *graphID) {
+void llvm_hpvm_freeBuffers(void *graphID) {
   DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n");
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
   for (CircularBuffer<uint64_t> *bufferID : *(Context->BindInputBuffers))
@@ -1206,19 +1206,19 @@ void llvm_visc_freeBuffers(void *graphID) {
 }
 
 // Pop an element from the buffer
-uint64_t llvm_visc_bufferPop(void *bufferID) {
+uint64_t llvm_hpvm_bufferPop(void *bufferID) {
   CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID;
   return buffer->pop();
 }
 
 // Push an element into the buffer
-void llvm_visc_bufferPush(void *bufferID, uint64_t element) {
+void llvm_hpvm_bufferPush(void *bufferID, uint64_t element) {
   CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID;
   buffer->push(element);
 }
 
 // Create a thread
-void llvm_visc_createThread(void *graphID, void *(*Func)(void *),
+void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *),
                             void *arguments) {
   DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func
              << ", Args: " << arguments << flush << "\n");
@@ -1232,7 +1232,7 @@ void llvm_visc_createThread(void *graphID, void *(*Func)(void *),
 }
 
 // Wait for thread to finish
-void llvm_visc_freeThreads(void *graphID) {
+void llvm_hpvm_freeThreads(void *graphID) {
   DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n");
   DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   for (pthread_t thread : *(Ctx->threads))
@@ -1241,7 +1241,7 @@ void llvm_visc_freeThreads(void *graphID) {
 
 /************************ OPENCL & PTHREAD API ********************************/
 
-void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) {
+void *llvm_hpvm_x86_launch(void *(*rootFunc)(void *), void *arguments) {
   DFNodeContext_X86 *Context =
       (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86));
   // int err;
@@ -1252,7 +1252,7 @@ void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) {
   return Context;
 }
 
-void llvm_visc_x86_wait(void *graphID) {
+void llvm_hpvm_x86_wait(void *graphID) {
   DEBUG(cout << "Waiting for pthread to finish ...\n");
   // DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
   // pthread_join(Context->threadID, NULL);
@@ -1260,9 +1260,9 @@ void llvm_visc_x86_wait(void *graphID) {
   DEBUG(cout << "\t... pthread Done!\n");
 }
 
-void *llvm_visc_ocl_initContext(enum visc::Target T) {
+void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) {
   pthread_mutex_lock(&ocl_mtx);
-  DEBUG(std::string Target = T == visc::GPU_TARGET ? "GPU" : "SPIR");
+  DEBUG(std::string Target = T == hpvm::GPU_TARGET ? "GPU" : "SPIR");
   DEBUG(cout << "Initializing Context for " << Target << " device\n");
   cl_uint numPlatforms;
   cl_int errcode;
@@ -1299,10 +1299,10 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) {
   // assert(numPlatforms >= 2 && "Expecting two OpenCL platforms");
   // Choose second one which is X86 AVX
   cl_context_properties properties[] = {
-      CL_CONTEXT_PLATFORM, (long)platforms[T == visc::GPU_TARGET ? 0 : 1], 0};
+      CL_CONTEXT_PLATFORM, (long)platforms[T == hpvm::GPU_TARGET ? 0 : 1], 0};
   globalOCLContext = clCreateContextFromType(
       properties,
-      T == visc::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL,
+      T == hpvm::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL,
       NULL, &errcode);
   // get the list of OCL devices associated with context
   size_t dataBytes;
@@ -1314,7 +1314,7 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) {
   errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes,
                               clDevices, NULL);
   checkErr(errcode, CL_SUCCESS, "Failure to get context info");
-  if (false && T == visc::SPIR_TARGET) {
+  if (false && T == hpvm::SPIR_TARGET) {
     cl_device_partition_property props[4];
     props[0] = CL_DEVICE_PARTITION_BY_COUNTS;
     props[1] = NUM_CORES;
@@ -1340,13 +1340,13 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) {
   checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
 
   DEBUG(cout << "Initialize Kernel Timer\n");
-  visc_InitializeTimerSet(&kernel_timer);
+  hpvm_InitializeTimerSet(&kernel_timer);
 
   pthread_mutex_unlock(&ocl_mtx);
   return globalOCLContext;
 }
 
-void llvm_visc_ocl_clearContext(void *graphID) {
+void llvm_hpvm_ocl_clearContext(void *graphID) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Clear Context\n");
   DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
@@ -1359,12 +1359,12 @@ void llvm_visc_ocl_clearContext(void *graphID) {
   // DEBUG(cout << "Released context at: " << globalOCLContext);
   free(Context);
   DEBUG(cout << "Done with OCL kernel\n");
-  cout << "Printing VISC Timer: KernelTimer\n";
-  visc_PrintTimerSet(&kernel_timer);
+  cout << "Printing HPVM Timer: KernelTimer\n";
+  hpvm_PrintTimerSet(&kernel_timer);
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
+void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Shared Memory Input:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
@@ -1379,7 +1379,7 @@ void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_ocl_argument_scalar(void *graphID, void *input, int arg_index,
+void llvm_hpvm_ocl_argument_scalar(void *graphID, void *input, int arg_index,
                                    size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Scalar Input:");
@@ -1395,7 +1395,7 @@ void llvm_visc_ocl_argument_scalar(void *graphID, void *input, int arg_index,
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index,
+void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index,
                                  size_t size, bool isInput, bool isOutput) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Pointer Input:");
@@ -1409,7 +1409,7 @@ void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index,
 
   pthread_mutex_unlock(&ocl_mtx);
   // Check with runtime the location of this memory
-  cl_mem d_input = (cl_mem)llvm_visc_ocl_request_mem(input, size, Context,
+  cl_mem d_input = (cl_mem)llvm_hpvm_ocl_request_mem(input, size, Context,
                                                      isInput, isOutput);
 
   pthread_mutex_lock(&ocl_mtx);
@@ -1424,7 +1424,7 @@ void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index,
   return d_input;
 }
 
-void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
+void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set device memory for Output Struct:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
@@ -1446,13 +1446,13 @@ void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
   return d_output;
 }
 
-void llvm_visc_ocl_free(void *ptr) {
+void llvm_hpvm_ocl_free(void *ptr) {
   // DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n");
   // cl_mem d_ptr = (cl_mem) ptr;
   // clReleaseMemObject(d_ptr);
 }
 
-void *llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output,
+void *llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output,
                               size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Get Output:\n");
@@ -1471,7 +1471,7 @@ void *llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output,
   return h_output;
 }
 
-void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim,
+void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim,
                                 const size_t *localWorkSize,
                                 const size_t *globalWorkSize) {
   pthread_mutex_lock(&ocl_mtx);
@@ -1517,7 +1517,7 @@ void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim,
   // pthread_mutex_lock(&ocl_mtx);
   clFinish(Context->clCommandQue);
   // pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COMPUTATION);
   // for(int i=0 ;i < NUM_TESTS; i++) {
   // cout << "Iteration = " << i << flush << "\n";
   // pthread_mutex_lock(&ocl_mtx);
@@ -1530,7 +1530,7 @@ void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim,
   // pthread_mutex_lock(&ocl_mtx);
   clFinish(Context->clCommandQue);
   // pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
 
   pthread_mutex_unlock(&ocl_mtx);
   return event;
@@ -1579,7 +1579,7 @@ static char *LoadProgSource(const char *Filename, size_t *szFinalLength) {
   return cSourceString;
 }
 
-void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) {
+void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Launch OCL Kernel\n");
   // Initialize OpenCL
@@ -1649,7 +1649,7 @@ void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) {
   return Context;
 }
 
-void llvm_visc_ocl_wait(void *graphID) {
+void llvm_hpvm_ocl_wait(void *graphID) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Wait\n");
   DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
@@ -1659,27 +1659,27 @@ void llvm_visc_ocl_wait(void *graphID) {
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID timer) {
+void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID timer) {
   // cout << "Switching to timer " << timer << flush << "\n";
   pthread_mutex_lock(&ocl_mtx);
-  // visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer);
+  // hpvm_SwitchToTimer((hpvm_TimerSet*)(*timerSet), timer);
   pthread_mutex_unlock(&ocl_mtx);
 }
-void llvm_visc_printTimerSet(void **timerSet, char *timerName) {
+void llvm_hpvm_printTimerSet(void **timerSet, char *timerName) {
   pthread_mutex_lock(&ocl_mtx);
-  cout << "Printing VISC Timer: ";
+  cout << "Printing HPVM Timer: ";
   if (timerName != NULL)
     cout << timerName << flush << "\n";
   else
     cout << "Anonymous\n";
-  visc_PrintTimerSet((visc_TimerSet *)(*timerSet));
+  hpvm_PrintTimerSet((hpvm_TimerSet *)(*timerSet));
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void *llvm_visc_initializeTimerSet() {
+void *llvm_hpvm_initializeTimerSet() {
   pthread_mutex_lock(&ocl_mtx);
-  visc_TimerSet *TS = (visc_TimerSet *)malloc(sizeof(visc_TimerSet));
-  visc_InitializeTimerSet(TS);
+  hpvm_TimerSet *TS = (hpvm_TimerSet *)malloc(sizeof(hpvm_TimerSet));
+  hpvm_InitializeTimerSet(TS);
   pthread_mutex_unlock(&ocl_mtx);
   return TS;
 }
diff --git a/hpvm/projects/visc-rt/visc-rt.h b/hpvm/projects/hpvm-rt/hpvm-rt.h
similarity index 72%
rename from hpvm/projects/visc-rt/visc-rt.h
rename to hpvm/projects/hpvm-rt/hpvm-rt.h
index 3ad315768bf90584a68c1d620ac68936e62a17f0..2b6dafba96d27e4a05c040c77565fbb62ea0e68f 100644
--- a/hpvm/projects/visc-rt/visc-rt.h
+++ b/hpvm/projects/hpvm-rt/hpvm-rt.h
@@ -2,8 +2,8 @@
  *
  * (c) 2010 The Board of Trustees of the University of Illinois.
  */
-#ifndef VISC_RT_HEADER
-#define VISC_RT_HEADER
+#ifndef HPVM_RT_HEADER
+#define HPVM_RT_HEADER
 
 #include <ctime>
 #include <iostream>
@@ -13,8 +13,8 @@
 #include <vector>
 //#include <condition_variable>
 
-#include "../../include/SupportVISC/VISCHint.h"
-#include "../../include/SupportVISC/VISCTimer.h"
+#include "../../include/SupportHPVM/HPVMHint.h"
+#include "../../include/SupportHPVM/HPVMTimer.h"
 #include "device_abstraction.h"
 #include "policy.h"
 
@@ -31,14 +31,14 @@ extern "C" {
 
 /************************* Policies *************************************/
 
-void llvm_visc_policy_init();
-void llvm_visc_policy_clear();
-int llvm_visc_policy_getVersion(const char *, int64_t);
+void llvm_hpvm_policy_init();
+void llvm_hpvm_policy_clear();
+int llvm_hpvm_policy_getVersion(const char *, int64_t);
 
 /******************** Device Abstraction ********************************/
-void llvm_visc_deviceAbstraction_start();
-void llvm_visc_deviceAbstraction_end();
-void llvm_visc_deviceAbstraction_waitOnDeviceStatus();
+void llvm_hpvm_deviceAbstraction_start();
+void llvm_hpvm_deviceAbstraction_end();
+void llvm_hpvm_deviceAbstraction_waitOnDeviceStatus();
 
 /********************* DFG Depth Stack **********************************/
 class DFGDepth {
@@ -77,12 +77,12 @@ public:
   unsigned getNumDim() const { return numDim; }
 };
 
-void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0,
+void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0,
                                uint64_t limitY = 0, uint64_t iY = 0,
                                uint64_t limitZ = 0, uint64_t iZ = 0);
-void llvm_visc_x86_dstack_pop();
-uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim);
-uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim);
+void llvm_hpvm_x86_dstack_pop();
+uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim);
+uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim);
 
 /********************* Memory Tracker **********************************/
 class MemTrackerEntry {
@@ -156,32 +156,32 @@ public:
   }
 };
 
-void llvm_visc_track_mem(void *, size_t);
-void llvm_visc_untrack_mem(void *);
-void *llvm_visc_request_mem(void *, size_t);
+void llvm_hpvm_track_mem(void *, size_t);
+void llvm_hpvm_untrack_mem(void *);
+void *llvm_hpvm_request_mem(void *, size_t);
 
 /*********************** OPENCL & PTHREAD API **************************/
-void *llvm_visc_x86_launch(void *(void *), void *);
-void llvm_visc_x86_wait(void *);
-void *llvm_visc_ocl_initContext(enum visc::Target);
-
-void *llvm_visc_x86_argument_ptr(void *, size_t);
-
-void llvm_visc_ocl_clearContext(void *);
-void llvm_visc_ocl_argument_shared(void *, int, size_t);
-void llvm_visc_ocl_argument_scalar(void *, void *, int, size_t);
-void *llvm_visc_ocl_argument_ptr(void *, void *, int, size_t, bool, bool);
-void *llvm_visc_ocl_output_ptr(void *, int, size_t);
-void llvm_visc_ocl_free(void *);
-void *llvm_visc_ocl_getOutput(void *, void *, void *, size_t);
-void *llvm_visc_ocl_executeNode(void *, unsigned, const size_t *,
+void *llvm_hpvm_x86_launch(void *(void *), void *);
+void llvm_hpvm_x86_wait(void *);
+void *llvm_hpvm_ocl_initContext(enum hpvm::Target);
+
+void *llvm_hpvm_x86_argument_ptr(void *, size_t);
+
+void llvm_hpvm_ocl_clearContext(void *);
+void llvm_hpvm_ocl_argument_shared(void *, int, size_t);
+void llvm_hpvm_ocl_argument_scalar(void *, void *, int, size_t);
+void *llvm_hpvm_ocl_argument_ptr(void *, void *, int, size_t, bool, bool);
+void *llvm_hpvm_ocl_output_ptr(void *, int, size_t);
+void llvm_hpvm_ocl_free(void *);
+void *llvm_hpvm_ocl_getOutput(void *, void *, void *, size_t);
+void *llvm_hpvm_ocl_executeNode(void *, unsigned, const size_t *,
                                 const size_t *);
-void *llvm_visc_ocl_launch(const char *, const char *);
-void llvm_visc_ocl_wait(void *);
+void *llvm_hpvm_ocl_launch(const char *, const char *);
+void llvm_hpvm_ocl_wait(void *);
 
-void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID);
-void llvm_visc_printTimerSet(void **timerSet, char *timerName = NULL);
-void *llvm_visc_initializeTimerSet();
+void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID);
+void llvm_hpvm_printTimerSet(void **timerSet, char *timerName = NULL);
+void *llvm_hpvm_initializeTimerSet();
 }
 
 /*************************** Pipeline API ******************************/
@@ -262,30 +262,30 @@ template <class ElementType> ElementType CircularBuffer<ElementType>::pop() {
 
 extern "C" {
 // Functions to push and pop values from pipeline buffers
-uint64_t llvm_visc_bufferPop(void *);
-void llvm_visc_bufferPush(void *, uint64_t);
+uint64_t llvm_hpvm_bufferPop(void *);
+void llvm_hpvm_bufferPush(void *, uint64_t);
 
 // Functions to create and destroy buffers
-void *llvm_visc_createBindInBuffer(void *, uint64_t, unsigned);
-void *llvm_visc_createBindOutBuffer(void *, uint64_t);
-void *llvm_visc_createEdgeBuffer(void *, uint64_t);
-void *llvm_visc_createLastInputBuffer(void *, uint64_t);
+void *llvm_hpvm_createBindInBuffer(void *, uint64_t, unsigned);
+void *llvm_hpvm_createBindOutBuffer(void *, uint64_t);
+void *llvm_hpvm_createEdgeBuffer(void *, uint64_t);
+void *llvm_hpvm_createLastInputBuffer(void *, uint64_t);
 
-void llvm_visc_freeBuffers(void *);
+void llvm_hpvm_freeBuffers(void *);
 
 // Functions to create and destroy threads
-void llvm_visc_createThread(void *graphID, void *(*Func)(void *), void *);
-void llvm_visc_freeThreads(void *);
+void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *), void *);
+void llvm_hpvm_freeThreads(void *);
 
 // Launch API for a streaming graph.
 // Arguments:
 // (1) Launch Function: void* (void*, void*)
 // (2) Push Function:   void (void*, std::vector<uint64_t>**, unsgined)
 // (3) Pop Function:    void* (std::vector<uint64_t>**, unsigned)
-void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *);
-void llvm_visc_streamPush(void *graphID, void *args);
-void *llvm_visc_streamPop(void *graphID);
-void llvm_visc_streamWait(void *graphID);
+void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), void *);
+void llvm_hpvm_streamPush(void *graphID, void *args);
+void *llvm_hpvm_streamPop(void *graphID);
+void llvm_hpvm_streamWait(void *graphID);
 }
 
-#endif // VISC_RT_HEADER
+#endif // HPVM_RT_HEADER
diff --git a/hpvm/projects/visc-rt/makefile b/hpvm/projects/hpvm-rt/makefile
similarity index 97%
rename from hpvm/projects/visc-rt/makefile
rename to hpvm/projects/hpvm-rt/makefile
index adcc6323356d2537eca6ed653cad6d17a1d1ef0e..927e26e254a2b2f980fed8efd8858935e9f3cbdf 100644
--- a/hpvm/projects/visc-rt/makefile
+++ b/hpvm/projects/hpvm-rt/makefile
@@ -9,7 +9,7 @@ ifeq ($(NUM_CORES),)
 endif
 
 CPP_FLAGS = -I$(LLVM_SRC_ROOT)/include -I$(LLVM_BUILD_ROOT)/include -I$(CUDA_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
-TARGET:=visc-rt
+TARGET:=hpvm-rt
 
 LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang
 LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++
diff --git a/hpvm/projects/visc-rt/policy.h b/hpvm/projects/hpvm-rt/policy.h
similarity index 100%
rename from hpvm/projects/visc-rt/policy.h
rename to hpvm/projects/hpvm-rt/policy.h
diff --git a/hpvm/projects/visc-rt/CMakeLists.txt b/hpvm/projects/visc-rt/CMakeLists.txt
deleted file mode 100644
index 5b9449bf2d00ac7a03c085cc1418a95e032d01b7..0000000000000000000000000000000000000000
--- a/hpvm/projects/visc-rt/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-add_definitions(-DNUM_CORES=8)
-
-SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang)
-SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++)
-
-add_llvm_library(visc-rt.ll visc-rt.cpp
-
-  DEPENDS
-  clang
-  llvm-dis
-  )
-
-
-target_compile_options(visc-rt.ll PUBLIC -flto )
-target_compile_options(visc-rt.ll PUBLIC -std=c++11)
-
-add_custom_target(visc-rt.cpp.o ALL
-  COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libvisc-rt.ll.a
-  COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.bc
-  COMMAND  ${CMAKE_BINARY_DIR}/bin/llvm-dis  ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.bc)
-
-add_dependencies(visc-rt.cpp.o   visc-rt.ll)
diff --git a/hpvm/test/CTestSuite/Makefile b/hpvm/test/CTestSuite/Makefile
index 226a83287d743360d9cd64a7c57e864871829b0b..1169e4e896a861975ac0562ebff8b208828bbf89 100644
--- a/hpvm/test/CTestSuite/Makefile
+++ b/hpvm/test/CTestSuite/Makefile
@@ -9,7 +9,7 @@ LLVM_CC:=$(LLVM_INSTALL)/bin/clang
 LLVM_OPT:=$(LLVM_INSTALL)/bin/opt
 BUILD_DIR:=build
 
-all: $(BUILD_DIR) $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll)
+all: $(BUILD_DIR) $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -17,10 +17,10 @@ $(BUILD_DIR):
 $(HOST:%=$(BUILD_DIR)/%.ll):$(BUILD_DIR)/%.ll:%.c
 	$(LLVM_CC) -S -emit-llvm $< -O3 -o $@
 
-$(HOST:%=$(BUILD_DIR)/%.visc.ll):$(BUILD_DIR)/%.visc.ll:$(BUILD_DIR)/%.ll
-	$(LLVM_OPT) -load $(LLVM_SRC_ROOT)/Release+Asserts/lib/LLVMGenVISC.so -genvisc -globaldce $< -S -o $@
+$(HOST:%=$(BUILD_DIR)/%.hpvm.ll):$(BUILD_DIR)/%.hpvm.ll:$(BUILD_DIR)/%.ll
+	$(LLVM_OPT) -load $(LLVM_SRC_ROOT)/Release+Asserts/lib/LLVMGenHPVM.so -genhpvm -globaldce $< -S -o $@
 	@cat RUN.script $@ > $@.tmp
 	@mv $@.tmp $@
 
 clean :
-	rm -f $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll.kernels.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll.nvptx.s)  $(BUILD_DIR)/DataflowGraph.dot*
+	rm -f $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll.kernels.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll.nvptx.s)  $(BUILD_DIR)/DataflowGraph.dot*
diff --git a/hpvm/test/CTestSuite/RUN.script b/hpvm/test/CTestSuite/RUN.script
index 10bf667818824719af2e041fc6b2dc3e449d9158..23fa1694ebf4b7448c731327b96b949c0509b62e 100644
--- a/hpvm/test/CTestSuite/RUN.script
+++ b/hpvm/test/CTestSuite/RUN.script
@@ -1,6 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
 ; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
 ; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll %llvm_src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin
diff --git a/hpvm/test/CTestSuite/gemm.c b/hpvm/test/CTestSuite/gemm.c
index d0a69ba25c27fb65ea549023deed2dfb0197b882..eb0a3c5e9204d9621c4a15ae7f07ef5158ac1d07 100644
--- a/hpvm/test/CTestSuite/gemm.c
+++ b/hpvm/test/CTestSuite/gemm.c
@@ -54,14 +54,14 @@ __attribute__((noinline)) int checkResults(float *A, float *B, float *C) {
   return 1; // Success
 }
 
-// Dummy visc node execution call
-// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned),
+// Dummy hpvm node execution call
+// void __hpvm__node(void kernel (float*, float*, float*, unsigned, unsigned),
 // int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void*
 // outputs);
 
 void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) {
 
-  __visc__attributes(2, A, B, 1, C);
+  __hpvm__attributes(2, A, B, 1, C);
   // printf("Entered function\n");
   int tx = get_local_id(0); // 2D Global Thread ID x
   int ty = get_local_id(1); // 2D Global Thread ID y
@@ -130,10 +130,10 @@ int main(int argc, char **argv) {
 
   // Compute using OpenCL
   // matrixMul(h_A, h_B, h_C, WA, WB);
-  //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
-  unsigned graphMM = __visc__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B,
+  //__hpvm__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
+  unsigned graphMM = __hpvm__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B,
                                   bytes_B, h_C, bytes_C, WA, WB, 0);
-  __visc__wait(graphMM);
+  __hpvm__wait(graphMM);
   if (checkResults(h_A, h_B, h_C))
     printf("\nPass!\n");
   else
diff --git a/hpvm/test/CTestSuite/gemm_2.c b/hpvm/test/CTestSuite/gemm_2.c
index bd7ab27fc0160275442d23faf507851b7c2369f7..df4555936316703cfccd4048f2ade4e28592e53a 100644
--- a/hpvm/test/CTestSuite/gemm_2.c
+++ b/hpvm/test/CTestSuite/gemm_2.c
@@ -54,13 +54,13 @@ __attribute__((noinline)) int checkResults(float *A, float *B, float *C) {
   return 1; // Success
 }
 
-// Dummy visc node execution call
-// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned),
+// Dummy hpvm node execution call
+// void __hpvm__node(void kernel (float*, float*, float*, unsigned, unsigned),
 // int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void*
 // outputs);
 
 void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) {
-  __visc__attributes(2, A, B, 1, C);
+  __hpvm__attributes(2, A, B, 1, C);
 
   // printf("Entered function\n");
   int tx = get_global_id(0); // 2D Global Thread ID x
@@ -130,11 +130,11 @@ int main(int argc, char **argv) {
 
   // Compute using OpenCL
   // matrixMul(h_A, h_B, h_C, WA, WB);
-  //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
+  //__hpvm__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
   unsigned graphMM =
-      __visc__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A,
+      __hpvm__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A,
                    h_B, bytes_B, h_C, bytes_C, WA, WB, 0);
-  __visc__wait(graphMM);
+  __hpvm__wait(graphMM);
   if (checkResults(h_A, h_B, h_C))
     printf("\nPass!\n");
   else
diff --git a/hpvm/test/hpvm-cava/.gitignore b/hpvm/test/hpvm-cava/.gitignore
index 2fc1b235647962ac761edda7dfbda4499cbcd4f0..f08b880bf9b4b8171e9fb878bea3a6d266a1f9c0 100644
--- a/hpvm/test/hpvm-cava/.gitignore
+++ b/hpvm/test/hpvm-cava/.gitignore
@@ -1,5 +1,5 @@
 build/
-cava-visc
+cava-hpvm
 Makefile.config
 
 example-face/*.bin
diff --git a/hpvm/test/hpvm-cava/Makefile b/hpvm/test/hpvm-cava/Makefile
index 62219a1cb0a92d1ca0d5bc661645b4c8251a24b8..7530477f3d73ef7b641f3d4b39fda4f50b201d0f 100644
--- a/hpvm/test/hpvm-cava/Makefile
+++ b/hpvm/test/hpvm-cava/Makefile
@@ -26,21 +26,21 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
 
 INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR)
-INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
+INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include
 ifneq ($(CONFUSE_ROOT),)
   INCLUDES += -I$(CONFUSE_ROOT)/include
   LFLAGS += -L$(CONFUSE_ROOT)/lib
 endif
 
-EXE = cava-visc-$(VERSION)-$(TARGET)
+EXE = cava-hpvm-$(VERSION)-$(TARGET)
 
 LFLAGS += -pthread
 
 ## BEGIN HPVM MAKEFILE
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS= load_cam_model.ll cam_pipe_utility.ll dma_interface.ll utility.ll
 OBJS_SRC=src/cam_pipe.c src/pipe_stages.c src/load_cam_model.c src/cam_pipe_utility.c src/dma_interface.c src/utility.c
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP = $(EXE)
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3
@@ -52,23 +52,23 @@ OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
 LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
 
-VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt
+HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt
 
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
+HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll
 
 
-TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
 
 ifeq ($(TARGET),seq)
   DEVICE = CPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86
 else
   DEVICE = GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
 endif
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 
 CFLAGS += -DDEVICE=$(DEVICE)
 CXXFLAGS += -DDEVICE=$(DEVICE)
@@ -79,7 +79,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
 .PRECIOUS: $(BUILD_DIR)/%.ll
 
 OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
 KERNEL = $(TEST_OBJS).kernels.ll
 
 ifeq ($(TARGET),seq)
@@ -107,14 +107,14 @@ $(KERNEL_OCL) : $(KERNEL)
 $(EXE) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
 
-$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
+$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
+$(HPVM_RT_LIB) : $(HPVM_RT_PATH)/hpvm-rt.cpp
 	make -C $(LLVM_LIB_PATH)
 
-$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
-	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS)
+	$(OPT) -debug $(HPVM_OPTFLAGS) -S $< -o $(HOST)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -125,7 +125,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c
 $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c
 	$(CC) $(CFLAGS) -emit-llvm -S -o $@ $<
 
-$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
-	$(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@
+$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll
+	$(OPT) -debug-only=genhpvm $(TESTGEN_OPTFLAGS) $< -S -o $@
 
 ## END HPVM MAKEFILE
diff --git a/hpvm/test/hpvm-cava/Makefile.config.example b/hpvm/test/hpvm-cava/Makefile.config.example
index 269f0b7df273c958f0cd20a0f935716a329e00ae..8cbe04af784fa1e030ed0bce07176b081980649d 100644
--- a/hpvm/test/hpvm-cava/Makefile.config.example
+++ b/hpvm/test/hpvm-cava/Makefile.config.example
@@ -4,20 +4,20 @@ OPENCL_PATH=/opt/intelFPGA_pro/18.0/hld/host/linux64
 OPENCL_LIB_PATH=$(OPENCL_PATH)/lib
 
 # NOTE: You may need to configure this based on your root path.
-VISC_SRC_ROOT=$(LLVM_SRC_ROOT)
+HPVM_SRC_ROOT=$(LLVM_SRC_ROOT)
 
-VISC_BUILD_DIR =$(VISC_SRC_ROOT)/build
-CC = $(VISC_BUILD_DIR)/bin/clang
-PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
+HPVM_BUILD_DIR =$(HPVM_SRC_ROOT)/build
+CC = $(HPVM_BUILD_DIR)/bin/clang
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include
 
-CXX = $(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
+CXX = $(HPVM_BUILD_DIR)/bin/clang++
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include
 
-LINKER = $(VISC_BUILD_DIR)/bin/clang++
+LINKER = $(HPVM_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lrt -lOpenCL -L$(OPENCL_LIB_PATH)
 
-LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib
-LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin
+LLVM_LIB_PATH = $(HPVM_BUILD_DIR)/lib
+LLVM_BIN_PATH = $(HPVM_BUILD_DIR)/bin
 
 OPT = $(LLVM_BIN_PATH)/opt
 LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link
diff --git a/hpvm/test/hpvm-cava/README.md b/hpvm/test/hpvm-cava/README.md
index 890b629d172a2f53bf77d6d52bda27637c71afeb..1106c4781b285c47d59548d47e5cd03f09063b28 100644
--- a/hpvm/test/hpvm-cava/README.md
+++ b/hpvm/test/hpvm-cava/README.md
@@ -12,7 +12,7 @@ See the original camera/vision pipeline repo (repo: `yaoyuannnn/cava`) for detai
 After building HPVM, the following steps are required to build and run the camera pipeline:
 
 1. Build with `make TARGET=seq` for CPU and `make TARGET=gpu` for gpu.
-2. Run with `./cava-visc-<Target> example-tulip-small/raw_tulip-small.bin example-tulip-small/tulip-small`. 
+2. Run with `./cava-hpvm-<Target> example-tulip-small/raw_tulip-small.bin example-tulip-small/tulip-small`.
     * `<Target>` can be either `seq` or `gpu` depending on what target is used to build.
     * This processes the raw image `example-tulip-small/raw_tulip-small.bin`. Note that raw images are different from bitmaps, so you might need to obtain them using special software.
     * This generates: `tulip-small.bin` and `tulip-small-<stage>.bin` where `<stage>` represents the stage of the pipeline.
diff --git a/hpvm/test/hpvm-cava/src/main.c b/hpvm/test/hpvm-cava/src/main.c
index e43bbb4f25c4c97c9907ebae37251c854860c3b5..4188c9e86045de9d3d6d2688b0ebc48dc3152004 100644
--- a/hpvm/test/hpvm-cava/src/main.c
+++ b/hpvm/test/hpvm-cava/src/main.c
@@ -1,136 +1,154 @@
+#include "utility.h"
 #include <argp.h>
+#include <assert.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <assert.h>
 #include <string.h>
-#include <math.h>
-#include "utility.h"
 
 #include "cam_pipe_utility.h"
-#include "pipe_stages.h"
 #include "load_cam_model.h"
+#include "pipe_stages.h"
 
-#include "visc.h"
+#include "hpvm.h"
 
 int NUM_TEST_CASES;
 int NUM_CLASSES;
 int INPUT_DIM;
 int NUM_WORKER_THREADS;
 
+// Type of struct holding the return value from the last node.
+struct RetStruct {
+  size_t bytesRet;
+};
+
 // Type of struct that is used to pass arguments to the HPVM dataflow graph
 // using the hpvm launch operation
 typedef struct __attribute__((__packed__)) {
-    uint8_t *input; size_t bytes_input;
-    uint8_t *result; size_t bytes_result;
-    float *input_scaled; size_t bytes_input_scaled; 
-    float *result_scaled; size_t bytes_result_scaled;
-    float *demosaic_out; size_t bytes_demosaic_out;
-    float *denoise_out; size_t bytes_denoise_out;
-    float *transform_out; size_t bytes_transform_out;
-    float *gamut_out;size_t bytes_gamut_out;
-    float *TsTw; size_t bytes_TsTw;
-    float *ctrl_pts; size_t bytes_ctrl_pts;
-    float *weights; size_t bytes_weights;
-    float*coefs; size_t bytes_coefs;
-    float *l2_dist; size_t bytes_l2_dist;
-    float *tone_map; size_t bytes_tone_map;
-    size_t row_size; size_t col_size;
-} 
-RootIn;
+  uint8_t *input;
+  size_t bytes_input;
+  uint8_t *result;
+  size_t bytes_result;
+  float *input_scaled;
+  size_t bytes_input_scaled;
+  float *result_scaled;
+  size_t bytes_result_scaled;
+  float *demosaic_out;
+  size_t bytes_demosaic_out;
+  float *denoise_out;
+  size_t bytes_denoise_out;
+  float *transform_out;
+  size_t bytes_transform_out;
+  float *gamut_out;
+  size_t bytes_gamut_out;
+  float *TsTw;
+  size_t bytes_TsTw;
+  float *ctrl_pts;
+  size_t bytes_ctrl_pts;
+  float *weights;
+  size_t bytes_weights;
+  float *coefs;
+  size_t bytes_coefs;
+  float *l2_dist;
+  size_t bytes_l2_dist;
+  float *tone_map;
+  size_t bytes_tone_map;
+  int row_size;
+  int col_size;
+  struct RetStruct ret; // Instance of RetStruct holding the return value.
+} RootIn;
 
 typedef enum _argnum {
-    RAW_IMAGE_BIN,
-    OUTPUT_IMAGE_BIN,
-    NUM_REQUIRED_ARGS,
-    DATA_FILE = NUM_REQUIRED_ARGS,
-    NUM_ARGS,
+  RAW_IMAGE_BIN,
+  OUTPUT_IMAGE_BIN,
+  NUM_REQUIRED_ARGS,
+  DATA_FILE = NUM_REQUIRED_ARGS,
+  NUM_ARGS,
 } argnum;
 
 typedef struct _arguments {
-    char* args[NUM_ARGS];
-    int num_inputs;
-    int num_threads;
+  char *args[NUM_ARGS];
+  int num_inputs;
+  int num_threads;
 } arguments;
 
 static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n";
 static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary";
 static struct argp_option options[] = {
-    { "num-inputs", 'n', "N", 0, "Number of input images" }, { 0 },
-    { "data-file", 'f', "F", 0,
-      "File to read data and weights from (if data-init-mode == READ_FILE or "
-      "save-params is true). *.txt files are decoded as text files, while "
-      "*.bin files are decoded as binary files." },
+    {"num-inputs", 'n', "N", 0, "Number of input images"},
+    {0},
+    {"data-file", 'f', "F", 0,
+     "File to read data and weights from (if data-init-mode == READ_FILE or "
+     "save-params is true). *.txt files are decoded as text files, while "
+     "*.bin files are decoded as binary files."},
 };
 
-static error_t parse_opt(int key, char* arg, struct argp_state* state) {
-    arguments* args = (arguments*)(state->input);
-    switch (key) {
-        case 'n': {
-            args->num_inputs = strtol(arg, NULL, 10);
-            break;
-        }
-        case 'f': {
-            args->args[DATA_FILE] = arg;
-            break;
-        }
-        case 't': {
-            args->num_threads = strtol(arg, NULL, 10);
-            break;
-        }
-        case ARGP_KEY_ARG: {
-            if (state->arg_num >= NUM_REQUIRED_ARGS)
-                argp_usage(state);
-            args->args[state->arg_num] = arg;
-            break;
-        }
-        case ARGP_KEY_END: {
-            if (state->arg_num < NUM_REQUIRED_ARGS) {
-                fprintf(stderr,
-                        "Not enough arguments! Got %d, require %d.\n",
-                        state->arg_num,
-                        NUM_REQUIRED_ARGS);
-                argp_usage(state);
-            }
-            break;
-        }
-        default:
-            return ARGP_ERR_UNKNOWN;
+static error_t parse_opt(int key, char *arg, struct argp_state *state) {
+  arguments *args = (arguments *)(state->input);
+  switch (key) {
+  case 'n': {
+    args->num_inputs = strtol(arg, NULL, 10);
+    break;
+  }
+  case 'f': {
+    args->args[DATA_FILE] = arg;
+    break;
+  }
+  case 't': {
+    args->num_threads = strtol(arg, NULL, 10);
+    break;
+  }
+  case ARGP_KEY_ARG: {
+    if (state->arg_num >= NUM_REQUIRED_ARGS)
+      argp_usage(state);
+    args->args[state->arg_num] = arg;
+    break;
+  }
+  case ARGP_KEY_END: {
+    if (state->arg_num < NUM_REQUIRED_ARGS) {
+      fprintf(stderr, "Not enough arguments! Got %d, require %d.\n",
+              state->arg_num, NUM_REQUIRED_ARGS);
+      argp_usage(state);
     }
-    return 0;
+    break;
+  }
+  default:
+    return ARGP_ERR_UNKNOWN;
+  }
+  return 0;
 }
 
-void set_default_args(arguments* args) {
-    args->num_inputs = 1;
-    args->num_threads = 0;
-    for (int i = 0; i < NUM_ARGS; i++) {
-        args->args[i] = NULL;
-    }
+void set_default_args(arguments *args) {
+  args->num_inputs = 1;
+  args->num_threads = 0;
+  for (int i = 0; i < NUM_ARGS; i++) {
+    args->args[i] = NULL;
+  }
 }
 
-static struct argp parser = { options, parse_opt, args_doc, prog_doc };
+static struct argp parser = {options, parse_opt, args_doc, prog_doc};
 
 // Helper function for printing intermediate results
-void descale_cpu(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
-  
+void descale_cpu(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
+
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
       for (int col = 0; col < col_size; col++) {
-        int index = (chan*row_size + row) * col_size + col;
+        int index = (chan * row_size + row) * col_size + col;
         output[index] = min(max(input[index] * 255, 0), 255);
       }
 }
 
 static void sort(float arr[], int n) {
-    int i, j;
-    for (i = 0; i < n - 1; i++)
-        for (j = 0; j < n - i - 1; j++)
-            if (arr[j] > arr[j + 1]) {
-                float temp = arr[j];
-                arr[j] = arr[j + 1];
-                arr[j + 1] = temp;
-            }
+  int i, j;
+  for (i = 0; i < n - 1; i++)
+    for (j = 0; j < n - i - 1; j++)
+      if (arr[j] > arr[j + 1]) {
+        float temp = arr[j];
+        arr[j] = arr[j + 1];
+        arr[j + 1] = temp;
+      }
 }
 
 /**************************************************************/
@@ -140,256 +158,259 @@ static void sort(float arr[], int n) {
 // In this benchmark, no use of HPVM query intrinsics in the leaf node functions
 
 // Leaf HPVM node function for scale
-void scale_fxp(uint8_t *input, size_t bytes_input, 
-               float *output, size_t bytes_output,
-               size_t row_size, size_t col_size) {
+void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
+               size_t bytes_output, size_t row_size, size_t col_size) {
 
-  //Specifies compilation target for current node
-  __visc__hint(CPU_TARGET);
+  // Specifies compilation target for current node
+  __hpvm__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
-  __visc__attributes(2, input, output, 1, output);
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+  __hpvm__attributes(2, input, output, 1, output);
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++){
-        int index = (chan*row_size + row) * col_size + col;
-        output[index] = input[index] * 1.0 / 255;
-      }
-  __visc__return(1, bytes_output);
+    // row is the instance ID; was: for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      output[index] = input[index] * 1.0 / 255;
+    }
+  __hpvm__return(1, bytes_output);
 }
 
 // Leaf HPVM node function for descale
-void descale_fxp(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, output, 1, output);
-  
+void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, output, 1, output);
+
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
       for (int col = 0; col < col_size; col++) {
-        int index = (chan*row_size + row) * col_size + col;
+        int index = (chan * row_size + row) * col_size + col;
         output[index] = min(max(input[index] * 255, 0), 255);
       }
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 // Leaf HPVM node function for demosaicing
-void demosaic_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  size_t row_size, size_t col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, result, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
-//  for (int row = 1; row < row_size - 1; row++)
-    for (int col = 1; col < col_size - 1; col++) {
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        if (row % 2 == 0 && col % 2 == 0) {
-            // Green pixel
-            // Getting the R values
-            float R1 = input[index_0 - 1];
-            float R2 = input[index_0 + 1];
-            // Getting the B values
-            float B1 = input[index_2 - col_size];
-            float B2 = input[index_2 + col_size];
-            // R
-            result[index_0] = (R1 + R2) / 2;
-            // G
-            result[index_1] = input[index_1] * 2;
-            // B
-            result[index_2] = (B1 + B2) / 2;
-        } else if (row % 2 == 0 && col % 2 == 1) {
-            // Red pixel
-            // Getting the G values
-            float G1 = input[index_1 - col_size];
-            float G2 = input[index_1 + col_size];
-            float G3 = input[index_1 - 1];
-            float G4 = input[index_1 + 1];
-            // Getting the B values
-            float B1 = input[index_2 - col_size - 1];
-            float B2 = input[index_2 - col_size + 1];
-            float B3 = input[index_2 + col_size - 1];
-            float B4 = input[index_2 + col_size + 1];
-            // R
-            result[index_0] = input[index_0];
-            // G
-            result[index_1] = (G1 + G2 + G3 + G4) / 2;
-            // B (center pixel)
-            result[index_2] = (B1 + B2 + B3 + B4) / 4;
-        } else if (row % 2 == 1 && col % 2 == 0) {
-            // Blue pixel
-            // Getting the R values
-            float R1 = input[index_0 - col_size - 1];
-            float R2 = input[index_0 + col_size - 1];
-            float R3 = input[index_0 - col_size + 1];
-            float R4 = input[index_0 + col_size + 1];
-            // Getting the G values
-            float G1 = input[index_1 - col_size];
-            float G2 = input[index_1 + col_size];
-            float G3 = input[index_1 - 1];
-            float G4 = input[index_1 + 1];
-            // R
-            result[index_0] = (R1 + R2 + R3 + R4) / 4;
-            // G
-            result[index_1] = (G1 + G2 + G3 + G4) / 2;
-            // B
-            result[index_2] = input[index_2];
-        } else {
-            // Bottom Green pixel
-            // Getting the R values
-            float R1 = input[index_0 - col_size];
-            float R2 = input[index_0 + col_size];
-            // Getting the B values
-            float B1 = input[index_2 - 1];
-            float B2 = input[index_2 + 1];
-            // R
-            result[index_0] = (R1 + R2) / 2;
-            // G
-            result[index_1] = input[index_1] * 2;
-            // B
-            result[index_2] = (B1 + B2) / 2;
-        }
-      }
-  __visc__return(1, bytes_result);
+void demosaic_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, size_t row_size, size_t col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, result, 1, result);
+
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
+  // row is the instance ID; was: for (int row = 1; row < row_size - 1; row++)
+  for (int col = 1; col < col_size - 1; col++) {
+    int index_0 = (0 * row_size + row) * col_size + col;
+    int index_1 = (1 * row_size + row) * col_size + col;
+    int index_2 = (2 * row_size + row) * col_size + col;
+    if (row % 2 == 0 && col % 2 == 0) {
+      // Green pixel
+      // Getting the R values
+      float R1 = input[index_0 - 1];
+      float R2 = input[index_0 + 1];
+      // Getting the B values
+      float B1 = input[index_2 - col_size];
+      float B2 = input[index_2 + col_size];
+      // R
+      result[index_0] = (R1 + R2) / 2;
+      // G
+      result[index_1] = input[index_1] * 2;
+      // B
+      result[index_2] = (B1 + B2) / 2;
+    } else if (row % 2 == 0 && col % 2 == 1) {
+      // Red pixel
+      // Getting the G values
+      float G1 = input[index_1 - col_size];
+      float G2 = input[index_1 + col_size];
+      float G3 = input[index_1 - 1];
+      float G4 = input[index_1 + 1];
+      // Getting the B values
+      float B1 = input[index_2 - col_size - 1];
+      float B2 = input[index_2 - col_size + 1];
+      float B3 = input[index_2 + col_size - 1];
+      float B4 = input[index_2 + col_size + 1];
+      // R
+      result[index_0] = input[index_0];
+      // G
+      result[index_1] = (G1 + G2 + G3 + G4) / 2;
+      // B (center pixel)
+      result[index_2] = (B1 + B2 + B3 + B4) / 4;
+    } else if (row % 2 == 1 && col % 2 == 0) {
+      // Blue pixel
+      // Getting the R values
+      float R1 = input[index_0 - col_size - 1];
+      float R2 = input[index_0 + col_size - 1];
+      float R3 = input[index_0 - col_size + 1];
+      float R4 = input[index_0 + col_size + 1];
+      // Getting the G values
+      float G1 = input[index_1 - col_size];
+      float G2 = input[index_1 + col_size];
+      float G3 = input[index_1 - 1];
+      float G4 = input[index_1 + 1];
+      // R
+      result[index_0] = (R1 + R2 + R3 + R4) / 4;
+      // G
+      result[index_1] = (G1 + G2 + G3 + G4) / 2;
+      // B
+      result[index_2] = input[index_2];
+    } else {
+      // Bottom Green pixel
+      // Getting the R values
+      float R1 = input[index_0 - col_size];
+      float R2 = input[index_0 + col_size];
+      // Getting the B values
+      float B1 = input[index_2 - 1];
+      float B2 = input[index_2 + 1];
+      // R
+      result[index_0] = (R1 + R2) / 2;
+      // G
+      result[index_1] = input[index_1] * 2;
+      // B
+      result[index_2] = (B1 + B2) / 2;
+    }
+  }
+  __hpvm__return(1, bytes_result);
 }
 
 // Leaf HPVM node function for denoise
-void denoise_fxp(float *input, size_t bytes_input, 
-                 float *result, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+void denoise_fxp(float *input, size_t bytes_input, float *result,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
+
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++)
-        if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
-          float filter[9];
-          for (int i = -1; i < 2; i++)
-            for (int j = -1; j < 2; j++) {
-              int index = ((i+row) - row + 1) * 3 + (j+col) - col + 1;
-              filter[index] = input[(chan * row_size + (i + row)) * col_size + (j + col)];
-            }
-          sort(filter, 9);
-          result[(chan * row_size + row) * col_size + col] = filter[4];
-        } else {
-      result[(chan * row_size + row) * col_size + col] = input[(chan * row_size + row) * col_size + col];
-        }
-  __visc__return(1, bytes_result);
+    // row is the instance ID; was: for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++)
+      if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
+        float filter[9];
+        for (int i = -1; i < 2; i++)
+          for (int j = -1; j < 2; j++) {
+            int index = ((i + row) - row + 1) * 3 + (j + col) - col + 1;
+            filter[index] =
+                input[(chan * row_size + (i + row)) * col_size + (j + col)];
+          }
+        sort(filter, 9);
+        result[(chan * row_size + row) * col_size + col] = filter[4];
+      } else {
+        result[(chan * row_size + row) * col_size + col] =
+            input[(chan * row_size + row) * col_size + col];
+      }
+  __hpvm__return(1, bytes_result);
 }
 
 // Leaf HPVM node function, for color map and white balance transform
-void transform_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                    size_t row_size, size_t col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(3, input, result, TsTw_tran, 1, result);
+
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++) {
-        int index = (chan * row_size + row) * col_size + col;
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        int index_2d_0 = 0 * CHAN_SIZE + chan;
-        int index_2d_1 = 1 * CHAN_SIZE + chan;
-        int index_2d_2 = 2 * CHAN_SIZE + chan;
-        result[index] =
-            max(input[index_0] * TsTw_tran[index_2d_0] +
-                input[index_1] * TsTw_tran[index_2d_1] +
-                input[index_2] * TsTw_tran[index_2d_2],
-                0);
-      }
-  __visc__return(1, bytes_result);
+    // row is the instance ID; was: for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      int index_0 = (0 * row_size + row) * col_size + col;
+      int index_1 = (1 * row_size + row) * col_size + col;
+      int index_2 = (2 * row_size + row) * col_size + col;
+      int index_2d_0 = 0 * CHAN_SIZE + chan;
+      int index_2d_1 = 1 * CHAN_SIZE + chan;
+      int index_2d_2 = 2 * CHAN_SIZE + chan;
+      result[index] = max(input[index_0] * TsTw_tran[index_2d_0] +
+                              input[index_1] * TsTw_tran[index_2d_1] +
+                              input[index_2] * TsTw_tran[index_2d_2],
+                          0);
+    }
+  __hpvm__return(1, bytes_result);
 }
 
 // Leaf HPVM node function, for gamut mapping
-void gamut_map_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights,
-                   float *coefs, size_t bytes_coefs,
-                   float *l2_dist, size_t bytes_l2_dist,
+void gamut_map_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights, float *coefs,
+                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                    size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist);
-
- // First, get the L2 norm from every pixel to the control points,
- // Then, sum it and weight it. Finally, add the bias.
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
-//  for (int row = 0; row < row_size; row++)
-    for (int col = 0; col < col_size; col++) {
-      float chan_val_0 = 0.0;
-      float chan_val_1 = 0.0;
-      float chan_val_2 = 0.0;
-      for (int cp = 0; cp < 3702; cp++) {
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); 
-        float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
-        float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
-        float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
-        float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); 
-        float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
-        float val = val1 * val2 + val3 * val4 + val5 * val6;
-        float sqrt_val = sqrt(val);
-        chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
-        chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
-        chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
-      }
-        chan_val_0 += coefs[0 * CHAN_SIZE + 0] + 
-                    coefs[1 * CHAN_SIZE + 0] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 0] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
-        chan_val_1 += coefs[0 * CHAN_SIZE + 1] + 
-                    coefs[1 * CHAN_SIZE + 1] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 1] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
-        chan_val_2 += coefs[0 * CHAN_SIZE + 2] + 
-                    coefs[1 * CHAN_SIZE + 2] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 2] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
-        result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
-        result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
-        result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2,
+                     result, l2_dist);
+
+  // First, get the L2 norm from every pixel to the control points,
+  // then weight each distance and sum the results. Finally, add the bias.
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
+  // row is the instance ID; was: for (int row = 0; row < row_size; row++)
+  for (int col = 0; col < col_size; col++) {
+    float chan_val_0 = 0.0;
+    float chan_val_1 = 0.0;
+    float chan_val_2 = 0.0;
+    for (int cp = 0; cp < 3702; cp++) {
+      int index_0 = (0 * row_size + row) * col_size + col;
+      int index_1 = (1 * row_size + row) * col_size + col;
+      int index_2 = (2 * row_size + row) * col_size + col;
+      float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
+      float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
+      float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
+      float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
+      float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
+      float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
+      float val = val1 * val2 + val3 * val4 + val5 * val6;
+      float sqrt_val = sqrt(val);
+      chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
+      chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
+      chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
     }
-  __visc__return(1, bytes_result);
+    chan_val_0 +=
+        coefs[0 * CHAN_SIZE + 0] +
+        coefs[1 * CHAN_SIZE + 0] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 0] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
+    chan_val_1 +=
+        coefs[0 * CHAN_SIZE + 1] +
+        coefs[1 * CHAN_SIZE + 1] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 1] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
+    chan_val_2 +=
+        coefs[0 * CHAN_SIZE + 2] +
+        coefs[1 * CHAN_SIZE + 2] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 2] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
+    result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
+    result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
+    result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
+  }
+  __hpvm__return(1, bytes_result);
 }
 
 // HPVM leaf node function, for tone mapping
-void tone_map_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                   size_t row_size, size_t col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(3, input, result, tone_map, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(3, input, result, tone_map, 1, result);
+
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++) {
-        int index = (chan * row_size + row) * col_size + col;
-        uint8_t x = input[index] * 255;
-        result[index] = tone_map[x * CHAN_SIZE + chan];
-      }
-  __visc__return(1, bytes_result);
+    // row is the instance ID; was: for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      uint8_t x = input[index] * 255;
+      result[index] = tone_map[x * CHAN_SIZE + chan];
+    }
+  __hpvm__return(1, bytes_result);
 }
 
 /********************************************************************/
@@ -400,185 +421,184 @@ void tone_map_fxp(float *input, size_t bytes_input,
 // requirement for the FPGA backend . The CPU backend also supports this,
 // so it does not cause a portability issue.
 
-void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
+void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result,
+                       size_t bytes_result, size_t row_size, size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
 
   // Create an 1D (specified by 1st argument) HPVM node with 1 dynamic
   // instance (last argument) associated with node function scale_fxp
-  void *ScaleNode = __visc__createNodeND(1, scale_fxp, row_size);
+  void *ScaleNode = __hpvm__createNodeND(1, scale_fxp, row_size);
 
   // Binds inputs of current node with specified node
   // - destination node
   // - argument position in argument list of function of source node
   // - argument position in argument list of function of destination node
   // - streaming (1) or non-streaming (0)
-  __visc__bindIn(ScaleNode, 0, 0, 0); // bind input
-  __visc__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(ScaleNode, 2, 2, 0); // bind result
-  __visc__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(ScaleNode, 4, 4, 0); // bind row_size
-  __visc__bindIn(ScaleNode, 5, 5, 0); // bind col_size
+  __hpvm__bindIn(ScaleNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(ScaleNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(ScaleNode, 4, 4, 0); // bind row_size
+  __hpvm__bindIn(ScaleNode, 5, 5, 0); // bind col_size
 
   // Similar to bindIn, but for the output. Output of a node is a struct, and
   // we consider the fields in increasing ordering.
-  __visc__bindOut(ScaleNode, 0, 0, 0);
+  __hpvm__bindOut(ScaleNode, 0, 0, 0);
 }
 
-void descale_fxp_wrapper(float *input, size_t bytes_input, 
-                       uint8_t *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
-  void *DescaleNode = __visc__createNodeND(1, descale_fxp, row_size);
-  __visc__bindIn(DescaleNode, 0, 0, 0); // bind input
-  __visc__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(DescaleNode, 2, 2, 0); // bind result
-  __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size
-  __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size
-  
-  __visc__bindOut(DescaleNode, 0, 0, 0);
+void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result,
+                         size_t bytes_result, size_t row_size,
+                         size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
+  void *DescaleNode = __hpvm__createNodeND(1, descale_fxp, row_size);
+  __hpvm__bindIn(DescaleNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(DescaleNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(DescaleNode, 4, 4, 0); // bind row_size
+  __hpvm__bindIn(DescaleNode, 5, 5, 0); // bind col_size
+
+  __hpvm__bindOut(DescaleNode, 0, 0, 0);
 }
 
-void demosaic_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
-  void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, row_size);
-  __visc__bindIn(DemosaicNode, 0, 0, 0); // bind input
-  __visc__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(DemosaicNode, 2, 2, 0); // bind result
-  __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(DemosaicNode, 4, 4, 0); // bind row_size
-  __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size
-  
-  __visc__bindOut(DemosaicNode, 0, 0, 0);
+void demosaic_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                          size_t bytes_result, size_t row_size,
+                          size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
+  void *DemosaicNode = __hpvm__createNodeND(1, demosaic_fxp, row_size);
+  __hpvm__bindIn(DemosaicNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(DemosaicNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(DemosaicNode, 4, 4, 0); // bind row_size
+  __hpvm__bindIn(DemosaicNode, 5, 5, 0); // bind col_size
+
+  __hpvm__bindOut(DemosaicNode, 0, 0, 0);
 }
 
-void denoise_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
-  void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, row_size);
-  __visc__bindIn(DenoiseNode, 0, 0, 0); // bind input
-  __visc__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(DenoiseNode, 2, 2, 0); // bind result
-  __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size
-  __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size
-  
-  __visc__bindOut(DenoiseNode, 0, 0, 0);
+void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                         size_t bytes_result, size_t row_size,
+                         size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
+  void *DenoiseNode = __hpvm__createNodeND(1, denoise_fxp, row_size);
+  __hpvm__bindIn(DenoiseNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(DenoiseNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(DenoiseNode, 4, 4, 0); // bind row_size
+  __hpvm__bindIn(DenoiseNode, 5, 5, 0); // bind col_size
+
+  __hpvm__bindOut(DenoiseNode, 0, 0, 0);
 }
 
-void transform_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *TsTw_tran, size_t bytes_TsTw,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  void *TransformNode = __visc__createNodeND(1, transform_fxp, row_size);
-  __visc__bindIn(TransformNode, 0, 0, 0); // bind input
-  __visc__bindIn(TransformNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(TransformNode, 2, 2, 0); // bind result
-  __visc__bindIn(TransformNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(TransformNode, 4, 4, 0); // bind tstw
-  __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw
-  __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size
-  __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size
-  
-  __visc__bindOut(TransformNode, 0, 0, 0);
+void transform_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                           size_t bytes_result, float *TsTw_tran,
+                           size_t bytes_TsTw, size_t row_size,
+                           size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(3, input, result, TsTw_tran, 1, result);
+  void *TransformNode = __hpvm__createNodeND(1, transform_fxp, row_size);
+  __hpvm__bindIn(TransformNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(TransformNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(TransformNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(TransformNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(TransformNode, 4, 4, 0); // bind tstw
+  __hpvm__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw
+  __hpvm__bindIn(TransformNode, 6, 6, 0); // bind row_size
+  __hpvm__bindIn(TransformNode, 7, 7, 0); // bind col_size
+
+  __hpvm__bindOut(TransformNode, 0, 0, 0);
 }
 
-void gamut_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *ctrl_pts, size_t bytes_ctrl_pts,
-                       float *weights, size_t bytes_weights,
-                       float *coefs, size_t bytes_coefs,
-                       float *l2_dist, size_t bytes_l2_dist,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
-  void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, row_size);
-  __visc__bindIn(GamutNode, 0, 0, 0); // bind input
-  __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(GamutNode, 2, 2, 0); // bind result
-  __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts
-  __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts
-  __visc__bindIn(GamutNode, 6, 6, 0); // bind weights
-  __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights
-  __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs
-  __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs
-  __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist
-  __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist
-  __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size
-  __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size
-  
-  __visc__bindOut(GamutNode, 0, 0, 0);
+void gamut_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                       size_t bytes_result, float *ctrl_pts,
+                       size_t bytes_ctrl_pts, float *weights,
+                       size_t bytes_weights, float *coefs, size_t bytes_coefs,
+                       float *l2_dist, size_t bytes_l2_dist, size_t row_size,
+                       size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
+                     result);
+  void *GamutNode = __hpvm__createNodeND(1, gamut_map_fxp, row_size);
+  __hpvm__bindIn(GamutNode, 0, 0, 0);   // bind input
+  __hpvm__bindIn(GamutNode, 1, 1, 0);   // bind bytes_input
+  __hpvm__bindIn(GamutNode, 2, 2, 0);   // bind result
+  __hpvm__bindIn(GamutNode, 3, 3, 0);   // bind bytes_result
+  __hpvm__bindIn(GamutNode, 4, 4, 0);   // bind ctrl_pts
+  __hpvm__bindIn(GamutNode, 5, 5, 0);   // bind bytes_ctrl_pts
+  __hpvm__bindIn(GamutNode, 6, 6, 0);   // bind weights
+  __hpvm__bindIn(GamutNode, 7, 7, 0);   // bind bytes_weights
+  __hpvm__bindIn(GamutNode, 8, 8, 0);   // bind coefs
+  __hpvm__bindIn(GamutNode, 9, 9, 0);   // bind bytes_coefs
+  __hpvm__bindIn(GamutNode, 10, 10, 0); // bind l2_dist
+  __hpvm__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist
+  __hpvm__bindIn(GamutNode, 12, 12, 0); // bind row_size
+  __hpvm__bindIn(GamutNode, 13, 13, 0); // bind col_size
+
+  __hpvm__bindOut(GamutNode, 0, 0, 0);
 }
-void tone_map_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *tone_map, size_t bytes_tone_map,
-                       size_t row_size, size_t col_size) {
-
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(3, input, result, tone_map, 1, result);
-  void *ToneMapNode = __visc__createNodeND(1, tone_map_fxp, row_size);
-  __visc__bindIn(ToneMapNode, 0, 0, 0); // bind input
-  __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result
-  __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map 
-  __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map
-  __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size
-  __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size
-  
-  __visc__bindOut(ToneMapNode, 0, 0, 0);
+void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                          size_t bytes_result, float *tone_map,
+                          size_t bytes_tone_map, size_t row_size,
+                          size_t col_size) {
+
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(3, input, result, tone_map, 1, result);
+  void *ToneMapNode = __hpvm__createNodeND(1, tone_map_fxp, row_size);
+  __hpvm__bindIn(ToneMapNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(ToneMapNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map
+  __hpvm__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map
+  __hpvm__bindIn(ToneMapNode, 6, 6, 0); // bind row_size
+  __hpvm__bindIn(ToneMapNode, 7, 7, 0); // bind col_size
+
+  __hpvm__bindOut(ToneMapNode, 0, 0, 0);
 }
 
-
 /*** ROOT Node - Top Level of the Graph Hierarchy ***/
-void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input, 
-                 /*2*/ uint8_t *result,        /*3*/ size_t bytes_result,
-                 /*4*/ float *input_scaled,    /*5*/ size_t bytes_input_scaled,
-                 /*6*/ float *result_scaled,   /*7*/ size_t bytes_result_scaled,
-                 /*8*/ float *demosaic_out,    /*9*/ size_t bytes_demosaic_out,
-                 /*10*/ float *denoise_out,    /*11*/ size_t bytes_denoise_out,
-                 /*12*/ float *transform_out,  /*13*/ size_t bytes_transform_out,
-                 /*14*/ float *gamut_out,      /*15*/ size_t bytes_gamut_out,
-                 /*16*/ float *TsTw,           /*17*/ size_t bytes_TsTw,
-                 /*18*/ float *ctrl_pts,       /*19*/ size_t bytes_ctrl_pts,
-                 /*20*/ float *weights,        /*21*/ size_t bytes_weights,
-                 /*22*/ float*coefs,           /*23*/ size_t bytes_coefs,
-                 /*24*/ float *l2_dist,        /*25*/ size_t bytes_l2_dist,
-                 /*26*/ float *tone_map,       /*27*/ size_t bytes_tone_map,
-                 /*28*/ size_t row_size,          /*29*/ size_t col_size) {
-
-  //Specifies compilation target for current node
-    __visc__hint(CPU_TARGET);
+void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input,
+                 /*2*/ uint8_t *result, /*3*/ size_t bytes_result,
+                 /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled,
+                 /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled,
+                 /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out,
+                 /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out,
+                 /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out,
+                 /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out,
+                 /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw,
+                 /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts,
+                 /*20*/ float *weights, /*21*/ size_t bytes_weights,
+                 /*22*/ float *coefs, /*23*/ size_t bytes_coefs,
+                 /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist,
+                 /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map,
+                 /*28*/ size_t row_size, /*29*/ size_t col_size) {
+
+  // Specifies compilation target for current node
+  __hpvm__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
-    __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, 
-                       transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, 
-                       5, result, demosaic_out, denoise_out, transform_out, gamut_out);
+  __hpvm__attributes(14, input, result, input_scaled, result_scaled,
+                     demosaic_out, denoise_out, transform_out, gamut_out, TsTw,
+                     ctrl_pts, weights, coefs, tone_map, l2_dist, 5, result,
+                     demosaic_out, denoise_out, transform_out, gamut_out);
 
   // Create an 0D (specified by 1st argument) HPVM node - so a single node
   // associated with node function ---_fxp_wrapper
-    void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper);
-    void* DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper);
-    void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper);
-    void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper);
-    void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper);
-    void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper);
-    void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper);
-    
+  void *ScNode = __hpvm__createNodeND(0, scale_fxp_wrapper);
+  void *DmNode = __hpvm__createNodeND(0, demosaic_fxp_wrapper);
+  void *DnNode = __hpvm__createNodeND(0, denoise_fxp_wrapper);
+  void *TrNode = __hpvm__createNodeND(0, transform_fxp_wrapper);
+  void *GmNode = __hpvm__createNodeND(0, gamut_fxp_wrapper);
+  void *TnNode = __hpvm__createNodeND(0, tone_map_fxp_wrapper);
+  void *DsNode = __hpvm__createNodeND(0, descale_fxp_wrapper);
+
   // BindIn binds inputs of current node with specified node
   // - destination node
   // - argument position in argument list of function of source node
@@ -592,268 +612,283 @@ void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input,
   // - destination position (in argument list of destination node)
   // - streaming (1) or non-streaming (0)
 
-    // scale_fxp inputs
-    __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input
-    __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
-    __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result
-    __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result
-    __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
-    __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
-
-    // demosaic_fxp inputs
-    __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
-    __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // SCNode:bytes_result -> DmNode:bytes_input
-    __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result
-    __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result
-    __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size 
-    __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
-
-    // denoise_fxp inputs
-    __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
-    __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DMNode:bytes_result -> DnNode:bytes_input
-    __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
-    __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
-    __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size 
-    __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
-    
-    // transform_fxp inputs
-    __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
-    __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input
-    __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
-    __visc__bindIn(TrNode, 13, 3, 0); // bytes_result_scaled -> TrNode:bytes_result
-    __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann
-    __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
-    __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size 
-    __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
-    
-    // gamut_fxp inputs
-    __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
-    __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input
-    __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
-    __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
-    __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
-    __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
-    __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
-    __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
-    __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
-    __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
-    __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
-    __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
-    __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size 
-    __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
-    
-    // tone_map_fxp inputs
-    __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
-    __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input
-    __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
-    __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
-    __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
-    __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
-    __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size 
-    __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
-
-    // descale_fxp inputs
-    __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
-    __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input
-    __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result
-    __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result
-    __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
-    __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
+  // scale_fxp inputs
+  __hpvm__bindIn(ScNode, 0, 0, 0);  // input -> ScNode:input
+  __hpvm__bindIn(ScNode, 1, 1, 0);  // bytes_input -> ScNode:bytes_input
+  __hpvm__bindIn(ScNode, 4, 2, 0);  // input_scaled -> ScNode:result
+  __hpvm__bindIn(ScNode, 5, 3, 0);  // bytes_input_scaled -> ScNode:bytes_result
+  __hpvm__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
+  __hpvm__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
+
+  // demosaic_fxp inputs
+  __hpvm__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
+  __hpvm__edge(ScNode, DmNode, 1, 0, 1,
+               0);                  // ScNode:bytes_result -> DmNode:bytes_input
+  __hpvm__bindIn(DmNode, 8, 2, 0);  // demosaic_out -> DmNode:result
+  __hpvm__bindIn(DmNode, 9, 3, 0);  // bytes_demosaic_out -> DmNode:bytes_result
+  __hpvm__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size
+  __hpvm__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
+
+  // denoise_fxp inputs
+  __hpvm__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
+  __hpvm__edge(DmNode, DnNode, 1, 0, 1,
+               0);                  // DmNode:bytes_result -> DnNode:bytes_input
+  __hpvm__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
+  __hpvm__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
+  __hpvm__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size
+  __hpvm__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
+
+  // transform_fxp inputs
+  __hpvm__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
+  __hpvm__edge(DnNode, TrNode, 1, 0, 1,
+               0);                  // DnNode:bytes_result -> TrNode:bytes_input
+  __hpvm__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
+  __hpvm__bindIn(TrNode, 13, 3,
+                 0); // bytes_transform_out -> TrNode:bytes_result
+  __hpvm__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_tran
+  __hpvm__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
+  __hpvm__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size
+  __hpvm__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
+
+  // gamut_fxp inputs
+  __hpvm__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
+  __hpvm__edge(TrNode, GmNode, 1, 0, 1,
+               0);                  // TrNode:bytes_result -> GmNode:bytes_input
+  __hpvm__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
+  __hpvm__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
+  __hpvm__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
+  __hpvm__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
+  __hpvm__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
+  __hpvm__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
+  __hpvm__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
+  __hpvm__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
+  __hpvm__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
+  __hpvm__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
+  __hpvm__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size
+  __hpvm__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
+
+  // tone_map_fxp inputs
+  __hpvm__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
+  __hpvm__edge(GmNode, TnNode, 1, 0, 1,
+               0);                 // GmNode:bytes_result -> TnNode:bytes_input
+  __hpvm__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
+  __hpvm__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
+  __hpvm__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
+  __hpvm__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
+  __hpvm__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size
+  __hpvm__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
+
+  // descale_fxp inputs
+  __hpvm__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
+  __hpvm__edge(TnNode, DsNode, 1, 0, 1,
+               0);                  // TnNode:bytes_result -> DsNode:bytes_input
+  __hpvm__bindIn(DsNode, 2, 2, 0);  // result -> DsNode:result
+  __hpvm__bindIn(DsNode, 3, 3, 0);  // bytes_result -> DsNode:bytes_result
+  __hpvm__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
+  __hpvm__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
 
   // Similar to bindIn, but for the output. Output of a node is a struct, and
   // we consider the fields in increasing ordering.
-    __visc__bindOut(DsNode, 0, 0, 0);
-    
+  __hpvm__bindOut(DsNode, 0, 0, 0);
 }
 
-int main(int argc, char* argv[]) {
-    // Parse the arguments.
-    arguments args;
-    set_default_args(&args);
-    argp_parse(&parser, argc, argv, 0, 0, &args);
-
-    // Read a raw image.
-    // NOTE: We deliberately perform this file I/O outside of the kernel.
-    printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
-    size_t row_size, col_size;
-    uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
-
-    printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
-
-    // Allocate a buffer for storing the output image data.
-    // (This is currently the same size as the input image data.)
-    size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
-    size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
-    uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image);
-
-    __visc__init();
-
-    ///////////////////////////////////////////////////////////////
-    // Camera Model Parameters
-    ///////////////////////////////////////////////////////////////
-    // Path to the camera model to be used
-//    char cam_model_path[100];
-//    char cam_model_path = "cam_models/NikonD7000/";
-    // White balance index (select white balance from transform file)
-    // The first white balance in the file has a wb_index of 1
-    // For more information on model format see the readme
-    int wb_index = 6;
-
-    // Number of control points
-    int num_ctrl_pts = 3702;
-    uint8_t *input, *result;
-    float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out;
-    float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
-
-    TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
-    float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
-    free(TsTw);
-    TsTw = trans;
-    ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
-    weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
-    coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
-    tone_map = get_tone_map("cam_models/NikonD7000/");
-    
-    input_scaled = (float*) malloc_aligned(bytes_fimage);
-    result_scaled = (float*) malloc_aligned(bytes_fimage);
-    demosaic_out = (float*) malloc_aligned(bytes_fimage);
-    denoise_out = (float*) malloc_aligned(bytes_fimage);
-    transform_out  = (float*) malloc_aligned(bytes_fimage);
-    gamut_out = (float*) malloc_aligned(bytes_fimage);
-    l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);    
-    
-    // This is host_input in cam_pipe()
-    input = (uint8_t*) malloc_aligned(bytes_image);
-    convert_hwc_to_chw(image_in, row_size, col_size, &input);
-    
-    // This is host_result in cam_pipe()
-    result = (uint8_t*) malloc_aligned(bytes_image);
-
-    // Allocate struct to pass DFG inputs
-    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
-
-    // Set up HPVM DFG inputs in the rootArgs struct.
-    rootArgs->input = input;
-    rootArgs->bytes_input = bytes_image;
-    
-    rootArgs->result = result;
-    rootArgs->bytes_result = bytes_image;
-    
-    rootArgs->input_scaled = input_scaled;
-    rootArgs->bytes_input_scaled = bytes_fimage;
-    
-    rootArgs->result_scaled = result_scaled;
-    rootArgs->bytes_result_scaled = bytes_fimage;
-    
-    rootArgs->demosaic_out = demosaic_out;
-    rootArgs->bytes_demosaic_out = bytes_fimage;
-    
-    rootArgs->denoise_out = denoise_out;
-    rootArgs->bytes_denoise_out = bytes_fimage;
-    
-    rootArgs->transform_out = transform_out;
-    rootArgs->bytes_transform_out = bytes_fimage;
-
-    rootArgs->gamut_out = gamut_out;
-    rootArgs->bytes_gamut_out = bytes_fimage;
-
-    rootArgs->TsTw = TsTw;
-    rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->ctrl_pts = ctrl_pts;
-    rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->weights = weights;
-    rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->coefs = coefs;
-    rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->tone_map = tone_map;
-    rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->l2_dist = l2_dist;
-    rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
-    
-    rootArgs->row_size = row_size;
-    rootArgs->col_size = col_size;
-
-    // Memory tracking is required for pointer arguments.
-    // Nodes can be scheduled on different targets, and 
-    // dataflow edge implementation needs to request data.
-    // The pair (pointer, size) is inserted in memory tracker using this call
-    llvm_visc_track_mem(input, bytes_image);
-    llvm_visc_track_mem(result, bytes_image);
-    llvm_visc_track_mem(input_scaled, bytes_fimage);
-    llvm_visc_track_mem(result_scaled, bytes_fimage);
-    llvm_visc_track_mem(demosaic_out, bytes_fimage);
-    llvm_visc_track_mem(denoise_out, bytes_fimage);
-    llvm_visc_track_mem(transform_out, bytes_fimage);
-    llvm_visc_track_mem(gamut_out, bytes_fimage);
-    llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); 
-    llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float));
-    llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
-    
-    printf("\n\nLaunching CAVA pipeline!\n");
-
-    void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs);
-    __visc__wait(camPipeDFG);
-
-    printf("\n\nPipeline execution completed!\n");
-    printf("\n\nRequesting memory!\n");
-
-    // Request data from graph.    
-    llvm_visc_request_mem(result, bytes_image);
-    llvm_visc_request_mem(demosaic_out, bytes_fimage);
-    llvm_visc_request_mem(denoise_out, bytes_fimage);
-    llvm_visc_request_mem(transform_out, bytes_fimage);
-    llvm_visc_request_mem(gamut_out, bytes_fimage);
-    printf("\n\nDone requesting memory!\n");
-
-
-    uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-  uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    
-  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size);
-    
-    convert_chw_to_hwc(result, row_size, col_size, &image_out);
-   convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
-    convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic);
-    convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise);
-    convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform);
-
-    
-    // Remove tracked pointers.
-    llvm_visc_untrack_mem(input);
-    llvm_visc_untrack_mem(result);
-    llvm_visc_untrack_mem(input_scaled);
-    llvm_visc_untrack_mem(result_scaled);
-    llvm_visc_untrack_mem(demosaic_out);
-    llvm_visc_untrack_mem(denoise_out);
-    llvm_visc_untrack_mem(transform_out);
-    llvm_visc_untrack_mem(gamut_out);
-    
-    llvm_visc_untrack_mem(TsTw); 
-    llvm_visc_untrack_mem(ctrl_pts);
-    llvm_visc_untrack_mem(weights);
-    llvm_visc_untrack_mem(coefs);
-    llvm_visc_untrack_mem(tone_map);
-    llvm_visc_untrack_mem(l2_dist);
-
-    // Output the image.
-    // NOTE: We deliberately perform this file I/O outside of the kernel.
+int main(int argc, char *argv[]) {
+  // Parse the arguments.
+  arguments args;
+  set_default_args(&args);
+  argp_parse(&parser, argc, argv, 0, 0, &args);
+
+  // Read a raw image.
+  // NOTE: We deliberately perform this file I/O outside of the kernel.
+  printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
+  size_t row_size, col_size;
+  uint8_t *image_in =
+      read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
+
+  printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
+
+  // Allocate buffers for the output image and intermediate stage outputs.
+  // (Each is currently the same size as the input image data.)
+  size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
+  size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
+  uint8_t *image_out = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_gamut = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_demosaic = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_denoise = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_transform = (uint8_t *)malloc_aligned(bytes_image);
+
+  __hpvm__init();
+
+  ///////////////////////////////////////////////////////////////
+  // Camera Model Parameters
+  ///////////////////////////////////////////////////////////////
+  // Camera model path (currently hard-coded in the get_*() calls below):
+  //    char cam_model_path[100];
+  //    char cam_model_path = "cam_models/NikonD7000/";
+  // White balance index (selects the white balance from the transform file).
+  // The first white balance in the file has a wb_index of 1.
+  // For more information on the model format, see the README.
+  int wb_index = 6;
+
+  // Number of control points
+  int num_ctrl_pts = 3702;
+  uint8_t *input, *result;
+  float *input_scaled, *result_scaled, *demosaic_out, *denoise_out,
+      *transform_out, *gamut_out;
+  float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
+
+  TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
+  float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
+  free(TsTw);
+  TsTw = trans;
+  ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
+  weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
+  coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
+  tone_map = get_tone_map("cam_models/NikonD7000/");
+
+  input_scaled = (float *)malloc_aligned(bytes_fimage);
+  result_scaled = (float *)malloc_aligned(bytes_fimage);
+  demosaic_out = (float *)malloc_aligned(bytes_fimage);
+  denoise_out = (float *)malloc_aligned(bytes_fimage);
+  transform_out = (float *)malloc_aligned(bytes_fimage);
+  gamut_out = (float *)malloc_aligned(bytes_fimage);
+  l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts);
+
+  // This is host_input in cam_pipe()
+  input = (uint8_t *)malloc_aligned(bytes_image);
+  convert_hwc_to_chw(image_in, row_size, col_size, &input);
+
+  // This is host_result in cam_pipe()
+  result = (uint8_t *)malloc_aligned(bytes_image);
+
+  // Allocate struct to pass DFG inputs
+  RootIn *rootArgs = (RootIn *)malloc(sizeof(RootIn));
+
+  // Set up HPVM DFG inputs in the rootArgs struct.
+  rootArgs->input = input;
+  rootArgs->bytes_input = bytes_image;
+
+  rootArgs->result = result;
+  rootArgs->bytes_result = bytes_image;
+
+  rootArgs->input_scaled = input_scaled;
+  rootArgs->bytes_input_scaled = bytes_fimage;
+
+  rootArgs->result_scaled = result_scaled;
+  rootArgs->bytes_result_scaled = bytes_fimage;
+
+  rootArgs->demosaic_out = demosaic_out;
+  rootArgs->bytes_demosaic_out = bytes_fimage;
+
+  rootArgs->denoise_out = denoise_out;
+  rootArgs->bytes_denoise_out = bytes_fimage;
+
+  rootArgs->transform_out = transform_out;
+  rootArgs->bytes_transform_out = bytes_fimage;
+
+  rootArgs->gamut_out = gamut_out;
+  rootArgs->bytes_gamut_out = bytes_fimage;
+
+  rootArgs->TsTw = TsTw;
+  rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
+
+  rootArgs->ctrl_pts = ctrl_pts;
+  rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+
+  rootArgs->weights = weights;
+  rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+
+  rootArgs->coefs = coefs;
+  rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
+
+  rootArgs->tone_map = tone_map;
+  rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
+
+  rootArgs->l2_dist = l2_dist;
+  rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
+
+  rootArgs->row_size = row_size;
+  rootArgs->col_size = col_size;
+
+  // Memory tracking is required for pointer arguments: nodes can be
+  // scheduled on different targets, so the dataflow edge implementation
+  // needs to be able to request the data. Each call below inserts a
+  // (pointer, size) pair into the memory tracker.
+  llvm_hpvm_track_mem(input, bytes_image);
+  llvm_hpvm_track_mem(result, bytes_image);
+  llvm_hpvm_track_mem(input_scaled, bytes_fimage);
+  llvm_hpvm_track_mem(result_scaled, bytes_fimage);
+  llvm_hpvm_track_mem(demosaic_out, bytes_fimage);
+  llvm_hpvm_track_mem(denoise_out, bytes_fimage);
+  llvm_hpvm_track_mem(transform_out, bytes_fimage);
+  llvm_hpvm_track_mem(gamut_out, bytes_fimage);
+  llvm_hpvm_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(coefs, 4 * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
+
+  printf("\n\nLaunching CAVA pipeline!\n");
+
+  void *camPipeDFG = __hpvm__launch(0, CamPipeRoot, (void *)rootArgs);
+  __hpvm__wait(camPipeDFG);
+
+  printf("\n\nPipeline execution completed!\n");
+  printf("Pipeline final stage returned %lu; should be %lu\n",
+         rootArgs->ret.bytesRet, bytes_image);
+  printf("\n\nRequesting memory!\n");
+
+  // Request data from graph.
+  llvm_hpvm_request_mem(result, bytes_image);
+  llvm_hpvm_request_mem(demosaic_out, bytes_fimage);
+  llvm_hpvm_request_mem(denoise_out, bytes_fimage);
+  llvm_hpvm_request_mem(transform_out, bytes_fimage);
+  llvm_hpvm_request_mem(gamut_out, bytes_fimage);
+  printf("\n\nDone requesting memory!\n");
+
+  uint8_t *gamut_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *demosaic_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *transform_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *denoise_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+
+  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image,
+              row_size, col_size);
+
+  convert_chw_to_hwc(result, row_size, col_size, &image_out);
+  convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
+  convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size,
+                     &image_out_demosaic);
+  convert_chw_to_hwc(denoise_out_descaled, row_size, col_size,
+                     &image_out_denoise);
+  convert_chw_to_hwc(transform_out_descaled, row_size, col_size,
+                     &image_out_transform);
+
+  // Remove tracked pointers.
+  llvm_hpvm_untrack_mem(input);
+  llvm_hpvm_untrack_mem(result);
+  llvm_hpvm_untrack_mem(input_scaled);
+  llvm_hpvm_untrack_mem(result_scaled);
+  llvm_hpvm_untrack_mem(demosaic_out);
+  llvm_hpvm_untrack_mem(denoise_out);
+  llvm_hpvm_untrack_mem(transform_out);
+  llvm_hpvm_untrack_mem(gamut_out);
+
+  llvm_hpvm_untrack_mem(TsTw);
+  llvm_hpvm_untrack_mem(ctrl_pts);
+  llvm_hpvm_untrack_mem(weights);
+  llvm_hpvm_untrack_mem(coefs);
+  llvm_hpvm_untrack_mem(tone_map);
+  llvm_hpvm_untrack_mem(l2_dist);
+
+  // Output the image.
+  // NOTE: We deliberately perform this file I/O outside of the kernel.
   char str[50], base_str[50];
   strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]);
   strcpy(str, base_str);
@@ -877,8 +912,7 @@ int main(int argc, char* argv[]) {
   printf("Writing output image to %s\n", str);
   write_image_to_binary(str, image_out_transform, row_size, col_size);
 
-    __visc__cleanup();
+  __hpvm__cleanup();
 
-    return 0;
+  return 0;
 }
-
diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.c b/hpvm/test/hpvm-cava/src/pipe_stages.c
index 2ebedec936915b5e7f11881c5001c84b6db26474..05bb06697fa8df130aa0d0d324f9bc39bc575fb2 100644
--- a/hpvm/test/hpvm-cava/src/pipe_stages.c
+++ b/hpvm/test/hpvm-cava/src/pipe_stages.c
@@ -1,172 +1,169 @@
-#include <stdio.h>
-#include <math.h>
 #include "pipe_stages.h"
 #include "cam_pipe_utility.h"
+#include <math.h>
+#include <stdio.h>
+
+// void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) {
+void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
+               size_t bytes_output, int row_size, int col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, output, 1, output);
 
-//void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) {
-void scale_fxp(uint8_t *input, size_t bytes_input, 
-               float *output, size_t bytes_output,
-               int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, output, 1, output);
-  
   ARRAY_3D(uint8_t, _input, input, row_size, col_size);
   ARRAY_3D(float, _output, output, row_size, col_size);
-  sl_chan:
+sl_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    sl_row:
+  sl_row:
     for (int row = 0; row < row_size; row++)
-      sl_col:
+    sl_col:
       for (int col = 0; col < col_size; col++)
         _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255;
 
-  __visc__return(1, bytes_output);
+  __hpvm__return(1, bytes_output);
 }
 
-//void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) {
-void descale_fxp(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, output, 1, output);
-  
+// void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) {
+void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, int row_size, int col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, output, 1, output);
+
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(uint8_t, _output, output, row_size, col_size);
-  dsl_chan:
+dsl_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    dsl_row:
+  dsl_row:
     for (int row = 0; row < row_size; row++)
-      dsl_col:
+    dsl_col:
       for (int col = 0; col < col_size; col++)
-        _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255);
+        _output[chan][row][col] =
+            min(max(_input[chan][row][col] * 255, 0), 255);
 
-  __visc__return(1, bytes_output);
+  __hpvm__return(1, bytes_output);
 }
 
 // Demosaicing stage
 // G R
 // B G
-//void demosaic_fxp(float *input, int row_size, int col_size, float *result) {
-void demosaic_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, result, 1, result);
-  
+// void demosaic_fxp(float *input, int row_size, int col_size, float *result) {
+void demosaic_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, int row_size, int col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, result, 1, result);
+
   printf("Demosaicing.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
 
-  dm_row:
+dm_row:
   for (int row = 1; row < row_size - 1; row++)
-    dm_col:
+  dm_col:
     for (int col = 1; col < col_size - 1; col++)
-        if (row % 2 == 0 && col % 2 == 0) {
-            // Green pixel
-            // Getting the R values
-            float R1 = _input[0][row][col - 1];
-            float R2 = _input[0][row][col + 1];
-            // Getting the B values
-            float B1 = _input[2][row - 1][col];
-            float B2 = _input[2][row + 1][col];
-            // R
-            _result[0][row][col] = (R1 + R2) / 2;
-            // G
-            _result[1][row][col] = _input[1][row][col] * 2;
-            // B
-            _result[2][row][col] = (B1 + B2) / 2;
-        } else if (row % 2 == 0 && col % 2 == 1) {
-            // Red pixel
-            // Getting the G values
-            float G1 = _input[1][row - 1][col];
-            float G2 = _input[1][row + 1][col];
-            float G3 = _input[1][row][col - 1];
-            float G4 = _input[1][row][col + 1];
-            // Getting the B values
-            float B1 = _input[2][row - 1][col - 1];
-            float B2 = _input[2][row - 1][col + 1];
-            float B3 = _input[2][row + 1][col - 1];
-            float B4 = _input[2][row + 1][col + 1];
-            // R
-            _result[0][row][col] = _input[0][row][col];
-            // G
-            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
-            // B (center pixel)
-            _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
-        } else if (row % 2 == 1 && col % 2 == 0) {
-            // Blue pixel
-            // Getting the R values
-            float R1 = _input[0][row - 1][col - 1];
-            float R2 = _input[0][row + 1][col - 1];
-            float R3 = _input[0][row - 1][col + 1];
-            float R4 = _input[0][row + 1][col + 1];
-            // Getting the G values
-            float G1 = _input[1][row - 1][col];
-            float G2 = _input[1][row + 1][col];
-            float G3 = _input[1][row][col - 1];
-            float G4 = _input[1][row][col + 1];
-            // R
-            _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
-            // G
-            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
-            // B
-            _result[2][row][col] = _input[2][row][col];
-        } else {
-            // Bottom Green pixel
-            // Getting the R values
-            float R1 = _input[0][row - 1][col];
-            float R2 = _input[0][row + 1][col];
-            // Getting the B values
-            float B1 = _input[2][row][col - 1];
-            float B2 = _input[2][row][col + 1];
-            // R
-            _result[0][row][col] = (R1 + R2) / 2;
-            // G
-            _result[1][row][col] = _input[1][row][col] * 2;
-            // B
-            _result[2][row][col] = (B1 + B2) / 2;
-        }
+      if (row % 2 == 0 && col % 2 == 0) {
+        // Green pixel
+        // Getting the R values
+        float R1 = _input[0][row][col - 1];
+        float R2 = _input[0][row][col + 1];
+        // Getting the B values
+        float B1 = _input[2][row - 1][col];
+        float B2 = _input[2][row + 1][col];
+        // R
+        _result[0][row][col] = (R1 + R2) / 2;
+        // G
+        _result[1][row][col] = _input[1][row][col] * 2;
+        // B
+        _result[2][row][col] = (B1 + B2) / 2;
+      } else if (row % 2 == 0 && col % 2 == 1) {
+        // Red pixel
+        // Getting the G values
+        float G1 = _input[1][row - 1][col];
+        float G2 = _input[1][row + 1][col];
+        float G3 = _input[1][row][col - 1];
+        float G4 = _input[1][row][col + 1];
+        // Getting the B values
+        float B1 = _input[2][row - 1][col - 1];
+        float B2 = _input[2][row - 1][col + 1];
+        float B3 = _input[2][row + 1][col - 1];
+        float B4 = _input[2][row + 1][col + 1];
+        // R
+        _result[0][row][col] = _input[0][row][col];
+        // G
+        _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+        // B (center pixel)
+        _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
+      } else if (row % 2 == 1 && col % 2 == 0) {
+        // Blue pixel
+        // Getting the R values
+        float R1 = _input[0][row - 1][col - 1];
+        float R2 = _input[0][row + 1][col - 1];
+        float R3 = _input[0][row - 1][col + 1];
+        float R4 = _input[0][row + 1][col + 1];
+        // Getting the G values
+        float G1 = _input[1][row - 1][col];
+        float G2 = _input[1][row + 1][col];
+        float G3 = _input[1][row][col - 1];
+        float G4 = _input[1][row][col + 1];
+        // R
+        _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
+        // G
+        _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+        // B
+        _result[2][row][col] = _input[2][row][col];
+      } else {
+        // Bottom Green pixel
+        // Getting the R values
+        float R1 = _input[0][row - 1][col];
+        float R2 = _input[0][row + 1][col];
+        // Getting the B values
+        float B1 = _input[2][row][col - 1];
+        float B2 = _input[2][row][col + 1];
+        // R
+        _result[0][row][col] = (R1 + R2) / 2;
+        // G
+        _result[1][row][col] = _input[1][row][col] * 2;
+        // B
+        _result[2][row][col] = (B1 + B2) / 2;
+      }
 
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 static void sort(float arr[], int n) {
-    int i, j;
-    dn_sort_i:
-    for (i = 0; i < n - 1; i++)
-        dn_sort_j:
-        for (j = 0; j < n - i - 1; j++)
-            if (arr[j] > arr[j + 1]) {
-                float temp = arr[j];
-                arr[j] = arr[j + 1];
-                arr[j + 1] = temp;
-            }
+  int i, j;
+dn_sort_i:
+  for (i = 0; i < n - 1; i++)
+  dn_sort_j:
+    for (j = 0; j < n - i - 1; j++)
+      if (arr[j] > arr[j + 1]) {
+        float temp = arr[j];
+        arr[j] = arr[j + 1];
+        arr[j + 1] = temp;
+      }
 }
 
 // Simple denoise
-//void denoise_fxp(float *input, int row_size, int col_size, float *result) {
-void denoise_fxp(float *input, size_t bytes_input, 
-                 float *result, size_t bytes_result,
-                 int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, result, 1, result);
-  
+// void denoise_fxp(float *input, int row_size, int col_size, float *result) {
+void denoise_fxp(float *input, size_t bytes_input, float *result,
+                 size_t bytes_result, int row_size, int col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, result, 1, result);
+
   printf("Denoising.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
 
-  dn_chan:
+dn_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    dn_row:
+  dn_row:
     for (int row = 0; row < row_size; row++)
-      dn_col:
+    dn_col:
       for (int col = 0; col < col_size; col++)
         if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
           float filter[9];
-          dn_slide_row:
-          for (int i = row-1; i < row+2; i++)
-            dn_slide_col:
-            for (int j = col-1; j < col+2; j++) {
+        dn_slide_row:
+          for (int i = row - 1; i < row + 2; i++)
+          dn_slide_col:
+            for (int j = col - 1; j < col + 2; j++) {
               int index = (i - row + 1) * 3 + j - col + 1;
               filter[index] = _input[chan][i][j];
             }
@@ -175,53 +172,52 @@ void denoise_fxp(float *input, size_t bytes_input,
         } else {
           _result[chan][row][col] = _input[chan][row][col];
         }
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 // Color map and white balance transform
-//void transform_fxp(float *input, int row_size, int col_size, float *result,
+// void transform_fxp(float *input, int row_size, int col_size, float *result,
 //                   float *TsTw_tran) {
-void transform_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                    int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(3, input, result, TsTw_tran, 1, result);
+
   printf("Color mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
   ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3);
 
-  tr_chan:
+tr_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    tr_row:
+  tr_row:
     for (int row = 0; row < row_size; row++)
-      tr_col:
+    tr_col:
       for (int col = 0; col < col_size; col++)
         _result[chan][row][col] =
             max(_input[0][row][col] * _TsTw_tran[0][chan] +
                     _input[1][row][col] * _TsTw_tran[1][chan] +
                     _input[2][row][col] * _TsTw_tran[2][chan],
                 0);
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 //
 // Weighted radial basis function for gamut mapping
 //
-//void gamut_map_fxp(float *input, int row_size, int col_size, float *result,
-//                   float *ctrl_pts, float *weights, float *coefs, float *l2_dist) {
-void gamut_map_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights,
-                   float *coefs, size_t bytes_coefs,
-                   float *l2_dist, size_t bytes_l2_dist,
+// void gamut_map_fxp(float *input, int row_size, int col_size, float *result,
+//                   float *ctrl_pts, float *weights, float *coefs, float
+//                   *l2_dist) {
+void gamut_map_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights, float *coefs,
+                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                    int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
-  
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
+                     result);
+
   printf("Gamut mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
@@ -229,26 +225,25 @@ void gamut_map_fxp(float *input, size_t bytes_input,
   ARRAY_2D(float, _weights, weights, 3);
   ARRAY_2D(float, _coefs, coefs, 3);
 
-  // First, get the L2 norm from every pixel to the control points,
-  // Then, sum it and weight it. Finally, add the bias.
-  gm_rbf_row:
+// For each pixel, first get the L2 distance to every control point.
+// Then take the weighted sum of those distances. Finally, add the bias.
+gm_rbf_row:
   for (int row = 0; row < row_size; row++)
-    gm_rbf_col:
+  gm_rbf_col:
     for (int col = 0; col < col_size; col++) {
-      gm_rbf_cp0:
+    gm_rbf_cp0:
       for (int cp = 0; cp < num_ctrl_pts; cp++) {
-        l2_dist[cp] =
-            sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
-                     (_input[0][row][col] - _ctrl_pts[cp][0]) +
-                 (_input[1][row][col] - _ctrl_pts[cp][1]) *
-                     (_input[1][row][col] - _ctrl_pts[cp][1]) +
-                 (_input[2][row][col] - _ctrl_pts[cp][2]) *
-                     (_input[2][row][col] - _ctrl_pts[cp][2]));
+        l2_dist[cp] = sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
+                               (_input[0][row][col] - _ctrl_pts[cp][0]) +
+                           (_input[1][row][col] - _ctrl_pts[cp][1]) *
+                               (_input[1][row][col] - _ctrl_pts[cp][1]) +
+                           (_input[2][row][col] - _ctrl_pts[cp][2]) *
+                               (_input[2][row][col] - _ctrl_pts[cp][2]));
       }
-      gm_rbf_chan:
+    gm_rbf_chan:
       for (int chan = 0; chan < CHAN_SIZE; chan++) {
         float chan_val = 0.0;
-        gm_rbf_cp1:
+      gm_rbf_cp1:
         for (int cp = 0; cp < num_ctrl_pts; cp++) {
           chan_val += l2_dist[cp] * _weights[cp][chan];
         }
@@ -259,32 +254,31 @@ void gamut_map_fxp(float *input, size_t bytes_input,
         _result[chan][row][col] = max(chan_val, 0);
       }
     }
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 // Tone mapping
-//void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
+// void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
 //                  float *result) {
-void tone_map_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                   int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(3, input, result, tone_map, 1, result);
-  
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(3, input, result, tone_map, 1, result);
+
   printf("Tone mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
   ARRAY_2D(float, _tone_map, tone_map, 3);
 
-  tm_chan:
+tm_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    tm_row:
+  tm_row:
     for (int row = 0; row < row_size; row++)
-      tm_col:
+    tm_col:
       for (int col = 0; col < col_size; col++) {
         uint8_t x = _input[chan][row][col] * 255;
         _result[chan][row][col] = _tone_map[x][chan];
       }
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
diff --git a/hpvm/test/include/hpvm.h b/hpvm/test/include/hpvm.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e31c98946f00e32d84933fe4bfd443e65cb92a9
--- /dev/null
+++ b/hpvm/test/include/hpvm.h
@@ -0,0 +1,73 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef DEVICE
+#define DEVICE GPU_TARGET
+#endif
+
+#include "../../include/SupportHPVM/HPVMHint.h"
+
+#ifndef __cplusplus
+#define noexcept
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+void __hpvm__hint(hpvm::Target) noexcept;
+#else
+void __hpvm__hint(enum Target) noexcept;
+#endif
+
+void *__hpvm__createNodeND(unsigned, ...) noexcept;
+void __hpvm__return(unsigned, ...) noexcept;
+
+void __hpvm__attributes(unsigned, ...) noexcept;
+void __hpvm__init() noexcept;
+void __hpvm__cleanup() noexcept;
+
+void __hpvm__bindIn(void *, unsigned, unsigned, unsigned) noexcept;
+void __hpvm__bindOut(void *, unsigned, unsigned, unsigned) noexcept;
+void *__hpvm__edge(void *, void *, unsigned, unsigned, unsigned,
+                   unsigned) noexcept;
+
+void __hpvm__push(void *, void *) noexcept;
+void *__hpvm__pop(void *) noexcept;
+void *__hpvm__launch(unsigned, ...) noexcept;
+void __hpvm__wait(void *) noexcept;
+
+void *__hpvm__getNode() noexcept;
+void *__hpvm__getParentNode(void *) noexcept;
+void __hpvm__barrier() noexcept;
+void *__hpvm__malloc(long) noexcept;
+long __hpvm__getNodeInstanceID_x(void *) noexcept;
+long __hpvm__getNodeInstanceID_y(void *) noexcept;
+long __hpvm__getNodeInstanceID_z(void *) noexcept;
+long __hpvm__getNumNodeInstances_x(void *) noexcept;
+long __hpvm__getNumNodeInstances_y(void *) noexcept;
+long __hpvm__getNumNodeInstances_z(void *) noexcept;
+
+// Atomic
+// signed int
+int __hpvm__atomic_add(int *, int) noexcept;
+int __hpvm__atomic_sub(int *, int) noexcept;
+int __hpvm__atomic_xchg(int *, int) noexcept;
+int __hpvm__atomic_inc(int *) noexcept;
+int __hpvm__atomic_dec(int *) noexcept;
+int __hpvm__atomic_min(int *, int) noexcept;
+int __hpvm__atomic_max(int *, int) noexcept;
+int __hpvm__atomic_and(int *, int) noexcept;
+int __hpvm__atomic_or(int *, int) noexcept;
+int __hpvm__atomic_xor(int *, int) noexcept;
+
+void llvm_hpvm_track_mem(void *, size_t) noexcept;
+void llvm_hpvm_untrack_mem(void *) noexcept;
+void llvm_hpvm_request_mem(void *, size_t) noexcept;
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/hpvm/test/include/visc.h b/hpvm/test/include/visc.h
deleted file mode 100644
index 18b29500261362be66ea23feecf9a5f85ac68005..0000000000000000000000000000000000000000
--- a/hpvm/test/include/visc.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef DEVICE
-#define DEVICE GPU_TARGET
-#endif
-
-#include "../../include/SupportVISC/VISCHint.h"
-
-#ifndef __cplusplus
-#define noexcept
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-void __visc__hint(visc::Target) noexcept;
-#else
-void __visc__hint(enum Target) noexcept;
-#endif
-
-void *__visc__createNodeND(unsigned, ...) noexcept;
-void __visc__return(unsigned, ...) noexcept;
-
-void __visc__attributes(unsigned, ...) noexcept;
-void __visc__init() noexcept;
-void __visc__cleanup() noexcept;
-
-void __visc__bindIn(void *, unsigned, unsigned, unsigned) noexcept;
-void __visc__bindOut(void *, unsigned, unsigned, unsigned) noexcept;
-void *__visc__edge(void *, void *, unsigned, unsigned, unsigned,
-                   unsigned) noexcept;
-
-void __visc__push(void *, void *) noexcept;
-void *__visc__pop(void *) noexcept;
-void *__visc__launch(unsigned, ...) noexcept;
-void __visc__wait(void *) noexcept;
-
-void *__visc__getNode() noexcept;
-void *__visc__getParentNode(void *) noexcept;
-void __visc__barrier() noexcept;
-void *__visc__malloc(long) noexcept;
-long __visc__getNodeInstanceID_x(void *) noexcept;
-long __visc__getNodeInstanceID_y(void *) noexcept;
-long __visc__getNodeInstanceID_z(void *) noexcept;
-long __visc__getNumNodeInstances_x(void *) noexcept;
-long __visc__getNumNodeInstances_y(void *) noexcept;
-long __visc__getNumNodeInstances_z(void *) noexcept;
-
-// Atomic
-// signed int
-int __visc__atomic_add(int *, int) noexcept;
-int __visc__atomic_sub(int *, int) noexcept;
-int __visc__atomic_xchg(int *, int) noexcept;
-int __visc__atomic_inc(int *) noexcept;
-int __visc__atomic_dec(int *) noexcept;
-int __visc__atomic_min(int *, int) noexcept;
-int __visc__atomic_max(int *, int) noexcept;
-int __visc__atomic_and(int *, int) noexcept;
-int __visc__atomic_or(int *, int) noexcept;
-int __visc__atomic_xor(int *, int) noexcept;
-
-void llvm_visc_track_mem(void *, size_t) noexcept;
-void llvm_visc_untrack_mem(void *) noexcept;
-void llvm_visc_request_mem(void *, size_t) noexcept;
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/RUN.parboil.script b/hpvm/test/parboil/RUN.parboil.script
index 7f8c01ede7bacdccc546f2a68935eb91db64afd6..5cedcf480dbcd357599710acdf27c274ec7c4ccf 100644
--- a/hpvm/test/parboil/RUN.parboil.script
+++ b/hpvm/test/parboil/RUN.parboil.script
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
 ; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
 ; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll parboil.ll  -S -o %t.linked.ll
+; RUN: llvm-link %t.ll %llvm_src/projects/hpvm-rt/hpvm-rt.ll parboil.ll  -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
diff --git a/hpvm/test/parboil/benchmarks/bfs/Makefile b/hpvm/test/parboil/benchmarks/bfs/Makefile
index cc6db678298c4c66312248cc4f7a2df0bd134d3f..e40a8484a3c7b40919b07fd7c30ab512c01741d8 100644
--- a/hpvm/test/parboil/benchmarks/bfs/Makefile
+++ b/hpvm/test/parboil/benchmarks/bfs/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = bfs
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile
similarity index 81%
rename from hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile
index a459707110b6f281e8b1c8fc1cf21f888dffe95e..27cde148f75502914d12a77448d358cbea2f17ab 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/config.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/bfs/src/visc/config.h
rename to hpvm/test/parboil/benchmarks/bfs/src/hpvm/config.h
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp
similarity index 70%
rename from hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp
rename to hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp
index 9491218e5e93d39fc1bda4fac3c14770ee48645b..0fa9a60df8e631f4684c58c26fdafc498a06295b 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/hpvm/main.cpp
@@ -26,11 +26,11 @@
 */
 #include "config.h"
 #include "parboil.h"
+#include <hpvm.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <visc.h>
 
 /**********
 Define colors for BFS
@@ -113,11 +113,11 @@ void packData(RootIn *args, int *q1, size_t bytesq1, int *q2, size_t bytesq2,
 
 void Allocation(long block) {
   // Memory shared between threadblocks
-  void *local_q_tail = __visc__malloc(sizeof(int));
-  void *local_q = __visc__malloc(LOCAL_MEM_SIZE * sizeof(int));
-  void *shift = __visc__malloc(sizeof(int));
+  void *local_q_tail = __hpvm__malloc(sizeof(int));
+  void *local_q = __hpvm__malloc(LOCAL_MEM_SIZE * sizeof(int));
+  void *shift = __hpvm__malloc(sizeof(int));
 
-  __visc__return(6, local_q_tail, sizeof(int), local_q,
+  __hpvm__return(6, local_q_tail, sizeof(int), local_q,
                  LOCAL_MEM_SIZE * sizeof(int), shift, sizeof(int));
 }
 
@@ -133,21 +133,21 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2,
              int *local_q_tail, size_t byteslocal_q_tail, int *local_q,
              size_t byteslocal_q, int *shift, size_t bytesshift) {
 
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
                      4, q2, g_color, g_cost, tail);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int dimx = __visc__getNumNodeInstances_x(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int dimx = __hpvm__getNumNodeInstances_x(thisNode);
 
   if (lx == 0) {
     *local_q_tail = 0; // initialize the tail of w-queue
   }
 
-  __visc__barrier();
+  __hpvm__barrier();
 
   // first, propagate and add the new frontier elements into w-queues
   // int tid = get_group_id(0)*MAX_THREADS_PER_BLOCK + get_local_id(0);
@@ -170,16 +170,16 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2,
       int cost = cur_edge.y;
       cost += cur_cost;
 
-      int orig_cost = __visc__atomic_min(&g_cost[id], cost);
+      int orig_cost = __hpvm__atomic_min(&g_cost[id], cost);
       if (orig_cost > cost) { // the node should be visited
         if (g_color[id] > UP_LIMIT) {
-          int old_color = __visc__atomic_xchg(&g_color[id], gray_shade);
+          int old_color = __hpvm__atomic_xchg(&g_color[id], gray_shade);
           // this guarantees that only one thread will push this node
           // into a queue
           if (old_color != gray_shade) {
             // atomic operation guarantees the correctness
             // even if multiple warps are executing simultaneously
-            int index = __visc__atomic_add(local_q_tail, 1);
+            int index = __hpvm__atomic_add(local_q_tail, 1);
             local_q[index] = id;
           }
         }
@@ -187,16 +187,16 @@ void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2,
     }
   }
 
-  __visc__barrier();
+  __hpvm__barrier();
 
   if (lx == 0) {
     int tot_sum = *local_q_tail;
     // the offset or "shift" of the block-level queue within the grid-level
     // queue is determined by atomic operation
-    *shift = __visc__atomic_add(tail, tot_sum);
+    *shift = __hpvm__atomic_add(tail, tot_sum);
   }
 
-  __visc__barrier();
+  __hpvm__barrier();
 
   // shift within a w-queue
   int local_shift = lx;
@@ -220,41 +220,41 @@ void BlockingBFS(int *q1, size_t bytesq1, int *q2, size_t bytesq2,
                  // ideally be placed in local memory
                  int *local_q_tail, size_t byteslocal_q_tail, int *local_q,
                  size_t byteslocal_q, int *shift, size_t bytesshift) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
                      4, q2, g_color, g_cost, tail);
 
-  void *AllocationNode = __visc__createNodeND(0, Allocation);
-  void *BFSLeafNode = __visc__createNodeND(1, BFSLeaf, block);
+  void *AllocationNode = __hpvm__createNodeND(0, Allocation);
+  void *BFSLeafNode = __hpvm__createNodeND(1, BFSLeaf, block);
 
   // Bind edges
-  __visc__bindIn(AllocationNode, 17, 0, 0); // Bind block
-  __visc__bindIn(BFSLeafNode, 0, 0, 0);     // Bind q1
-  __visc__bindIn(BFSLeafNode, 1, 1, 0);     // Bind bytes_q1
-  __visc__bindIn(BFSLeafNode, 2, 2, 0);     // Bind q2
-  __visc__bindIn(BFSLeafNode, 3, 3, 0);     // Bind bytes_q2
-  __visc__bindIn(BFSLeafNode, 4, 4, 0);     // Bind graph_nodes
-  __visc__bindIn(BFSLeafNode, 5, 5, 0);     // Bind bytes_graph_nodes
-  __visc__bindIn(BFSLeafNode, 6, 6, 0);     // Bind graph_edges
-  __visc__bindIn(BFSLeafNode, 7, 7, 0);     // Bind bytes_graph_edges
-  __visc__bindIn(BFSLeafNode, 8, 8, 0);     // Bind color
-  __visc__bindIn(BFSLeafNode, 9, 9, 0);     // Bind bytes_color
-  __visc__bindIn(BFSLeafNode, 10, 10, 0);   // Bind cost
-  __visc__bindIn(BFSLeafNode, 11, 11, 0);   // Bind bytes_cost
-  __visc__bindIn(BFSLeafNode, 12, 12, 0);   // Bind tail
-  __visc__bindIn(BFSLeafNode, 13, 13, 0);   // Bind bytes_tail
-  __visc__bindIn(BFSLeafNode, 14, 14, 0);   // Bind no_of_nodes
-  __visc__bindIn(BFSLeafNode, 15, 15, 0);   // Bind gray_shade
-  __visc__bindIn(BFSLeafNode, 16, 16, 0);   // Bind k
+  __hpvm__bindIn(AllocationNode, 17, 0, 0); // Bind block
+  __hpvm__bindIn(BFSLeafNode, 0, 0, 0);     // Bind q1
+  __hpvm__bindIn(BFSLeafNode, 1, 1, 0);     // Bind bytes_q1
+  __hpvm__bindIn(BFSLeafNode, 2, 2, 0);     // Bind q2
+  __hpvm__bindIn(BFSLeafNode, 3, 3, 0);     // Bind bytes_q2
+  __hpvm__bindIn(BFSLeafNode, 4, 4, 0);     // Bind graph_nodes
+  __hpvm__bindIn(BFSLeafNode, 5, 5, 0);     // Bind bytes_graph_nodes
+  __hpvm__bindIn(BFSLeafNode, 6, 6, 0);     // Bind graph_edges
+  __hpvm__bindIn(BFSLeafNode, 7, 7, 0);     // Bind bytes_graph_edges
+  __hpvm__bindIn(BFSLeafNode, 8, 8, 0);     // Bind color
+  __hpvm__bindIn(BFSLeafNode, 9, 9, 0);     // Bind bytes_color
+  __hpvm__bindIn(BFSLeafNode, 10, 10, 0);   // Bind cost
+  __hpvm__bindIn(BFSLeafNode, 11, 11, 0);   // Bind bytes_cost
+  __hpvm__bindIn(BFSLeafNode, 12, 12, 0);   // Bind tail
+  __hpvm__bindIn(BFSLeafNode, 13, 13, 0);   // Bind bytes_tail
+  __hpvm__bindIn(BFSLeafNode, 14, 14, 0);   // Bind no_of_nodes
+  __hpvm__bindIn(BFSLeafNode, 15, 15, 0);   // Bind gray_shade
+  __hpvm__bindIn(BFSLeafNode, 16, 16, 0);   // Bind k
 
   // Create Edges between AllocationNode and BFSLeafNodeNode
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 1, 18,
+  __hpvm__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail
+  __hpvm__edge(AllocationNode, BFSLeafNode, 1, 1, 18,
                0); // Edge bytes_local_q_tail
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift
+  __hpvm__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q
+  __hpvm__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q
+  __hpvm__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift
+  __hpvm__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift
 }
 
 // VoidRetTy
@@ -264,30 +264,30 @@ void BFS_Root(int *q1, size_t bytesq1, int *q2, size_t bytesq2,
               int *g_color, size_t bytesg_color, int *g_cost,
               size_t bytesg_cost, int *tail, size_t bytestail, int no_of_nodes,
               int gray_shade, int k, long block, long grid) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
                      4, q2, g_color, g_cost, tail);
-  void *BlockingBFSNode = __visc__createNodeND(1, BlockingBFS, grid);
+  void *BlockingBFSNode = __hpvm__createNodeND(1, BlockingBFS, grid);
 
   // Bind edges
-  __visc__bindIn(BlockingBFSNode, 0, 0, 0);   // Bind q1
-  __visc__bindIn(BlockingBFSNode, 1, 1, 0);   // Bind bytes_q1
-  __visc__bindIn(BlockingBFSNode, 2, 2, 0);   // Bind q2
-  __visc__bindIn(BlockingBFSNode, 3, 3, 0);   // Bind bytes_q2
-  __visc__bindIn(BlockingBFSNode, 4, 4, 0);   // Bind graph_nodes
-  __visc__bindIn(BlockingBFSNode, 5, 5, 0);   // Bind bytes_graph_nodes
-  __visc__bindIn(BlockingBFSNode, 6, 6, 0);   // Bind graph_edges
-  __visc__bindIn(BlockingBFSNode, 7, 7, 0);   // Bind bytes_graph_edges
-  __visc__bindIn(BlockingBFSNode, 8, 8, 0);   // Bind color
-  __visc__bindIn(BlockingBFSNode, 9, 9, 0);   // Bind bytes_color
-  __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost
-  __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost
-  __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail
-  __visc__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail
-  __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes
-  __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade
-  __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k
-  __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block
+  __hpvm__bindIn(BlockingBFSNode, 0, 0, 0);   // Bind q1
+  __hpvm__bindIn(BlockingBFSNode, 1, 1, 0);   // Bind bytes_q1
+  __hpvm__bindIn(BlockingBFSNode, 2, 2, 0);   // Bind q2
+  __hpvm__bindIn(BlockingBFSNode, 3, 3, 0);   // Bind bytes_q2
+  __hpvm__bindIn(BlockingBFSNode, 4, 4, 0);   // Bind graph_nodes
+  __hpvm__bindIn(BlockingBFSNode, 5, 5, 0);   // Bind bytes_graph_nodes
+  __hpvm__bindIn(BlockingBFSNode, 6, 6, 0);   // Bind graph_edges
+  __hpvm__bindIn(BlockingBFSNode, 7, 7, 0);   // Bind bytes_graph_edges
+  __hpvm__bindIn(BlockingBFSNode, 8, 8, 0);   // Bind color
+  __hpvm__bindIn(BlockingBFSNode, 9, 9, 0);   // Bind bytes_color
+  __hpvm__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost
+  __hpvm__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost
+  __hpvm__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail
+  __hpvm__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail
+  __hpvm__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes
+  __hpvm__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade
+  __hpvm__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k
+  __hpvm__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block
 }
 
 void BFS_Wrapper(int *q1, size_t bytesq1,                               // 0, 1
@@ -300,31 +300,31 @@ void BFS_Wrapper(int *q1, size_t bytesq1,                               // 0, 1
                  int no_of_nodes, int gray_shade, // 14, 15
                  int k, long block, long grid     // 16 - 18
 ) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
                      4, q2, g_color, g_cost, tail);
-  void *BlockingBFSNode = __visc__createNodeND(0, BFS_Root);
+  void *BlockingBFSNode = __hpvm__createNodeND(0, BFS_Root);
 
   // Bind edges
-  __visc__bindIn(BlockingBFSNode, 0, 0, 0);   // Bind q1
-  __visc__bindIn(BlockingBFSNode, 1, 1, 0);   // Bind bytes_q1
-  __visc__bindIn(BlockingBFSNode, 2, 2, 0);   // Bind q2
-  __visc__bindIn(BlockingBFSNode, 3, 3, 0);   // Bind bytes_q2
-  __visc__bindIn(BlockingBFSNode, 4, 4, 0);   // Bind graph_nodes
-  __visc__bindIn(BlockingBFSNode, 5, 5, 0);   // Bind bytes_graph_nodes
-  __visc__bindIn(BlockingBFSNode, 6, 6, 0);   // Bind graph_edges
-  __visc__bindIn(BlockingBFSNode, 7, 7, 0);   // Bind bytes_graph_edges
-  __visc__bindIn(BlockingBFSNode, 8, 8, 0);   // Bind color
-  __visc__bindIn(BlockingBFSNode, 9, 9, 0);   // Bind bytes_color
-  __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost
-  __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost
-  __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail
-  __visc__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail
-  __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes
-  __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade
-  __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k
-  __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block
-  __visc__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid
+  __hpvm__bindIn(BlockingBFSNode, 0, 0, 0);   // Bind q1
+  __hpvm__bindIn(BlockingBFSNode, 1, 1, 0);   // Bind bytes_q1
+  __hpvm__bindIn(BlockingBFSNode, 2, 2, 0);   // Bind q2
+  __hpvm__bindIn(BlockingBFSNode, 3, 3, 0);   // Bind bytes_q2
+  __hpvm__bindIn(BlockingBFSNode, 4, 4, 0);   // Bind graph_nodes
+  __hpvm__bindIn(BlockingBFSNode, 5, 5, 0);   // Bind bytes_graph_nodes
+  __hpvm__bindIn(BlockingBFSNode, 6, 6, 0);   // Bind graph_edges
+  __hpvm__bindIn(BlockingBFSNode, 7, 7, 0);   // Bind bytes_graph_edges
+  __hpvm__bindIn(BlockingBFSNode, 8, 8, 0);   // Bind color
+  __hpvm__bindIn(BlockingBFSNode, 9, 9, 0);   // Bind bytes_color
+  __hpvm__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost
+  __hpvm__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost
+  __hpvm__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail
+  __hpvm__bindIn(BlockingBFSNode, 13, 13, 0); // Bind bytes_tail
+  __hpvm__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes
+  __hpvm__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade
+  __hpvm__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k
+  __hpvm__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block
+  __hpvm__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid
 }
 
 FILE *fp;
@@ -415,7 +415,7 @@ int main(int argc, char **argv) {
     fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // allocate mem for the result on host side
@@ -433,15 +433,15 @@ int main(int argc, char **argv) {
   int *q2 = (int *)malloc(sizeof(int) * num_of_nodes);
   int *tail = (int *)malloc(sizeof(int));
 
-  llvm_visc_track_mem(graph_nodes, bytes_graph_nodes);
-  llvm_visc_track_mem(graph_edges, bytes_graph_edges);
-  llvm_visc_track_mem(cost, bytes_cost);
-  llvm_visc_track_mem(color, bytes_cost);
+  llvm_hpvm_track_mem(graph_nodes, bytes_graph_nodes);
+  llvm_hpvm_track_mem(graph_edges, bytes_graph_edges);
+  llvm_hpvm_track_mem(cost, bytes_cost);
+  llvm_hpvm_track_mem(color, bytes_cost);
   // Allocating stuff on host side, but these can also be allocated in the graph
-  llvm_visc_track_mem(q1, bytes_cost);
-  llvm_visc_track_mem(q2, bytes_cost);
+  llvm_hpvm_track_mem(q1, bytes_cost);
+  llvm_hpvm_track_mem(q2, bytes_cost);
   // Scalar variable read/written by both graph and host.
-  llvm_visc_track_mem(tail, sizeof(int));
+  llvm_hpvm_track_mem(tail, sizeof(int));
 
   int num_of_blocks;
   int num_of_threads_per_block;
@@ -466,9 +466,9 @@ int main(int argc, char **argv) {
            graph_edges, bytes_graph_edges, color, bytes_cost, cost, bytes_cost,
            tail, sizeof(int), num_of_nodes, gray, k, block, grid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   do {
-    llvm_visc_request_mem(tail, sizeof(int));
+    llvm_hpvm_request_mem(tail, sizeof(int));
     num_t = *tail;
     // printf("tail for iteration %d = %d\n",k, num_t);
     *tail = 0;
@@ -493,7 +493,7 @@ int main(int argc, char **argv) {
     } else {
       args->gray_shade = GRAY1;
     }
-    // void* bfsDFG = __visc__node(BFS_kernel, 2, 1, block, grid, 17,
+    // void* bfsDFG = __hpvm__node(BFS_kernel, 2, 1, block, grid, 17,
     // q1, bytes_cost,
     // q2, bytes_cost,
     // graph_nodes, bytes_graph_nodes,
@@ -505,8 +505,8 @@ int main(int argc, char **argv) {
     // gray,
     // k,
     // 0);
-    void *bfsDFG = __visc__launch(0, BFS_Wrapper, (void *)args);
-    __visc__wait(bfsDFG);
+    void *bfsDFG = __hpvm__launch(0, BFS_Wrapper, (void *)args);
+    __hpvm__wait(bfsDFG);
     // Swap q1 and q2
     // Swap q1 and q2
     int *temp = args->q1;
@@ -518,22 +518,22 @@ int main(int argc, char **argv) {
   // copy result from device to host
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
-  llvm_visc_request_mem(cost, bytes_cost);
-  llvm_visc_request_mem(color, bytes_cost);
+  llvm_hpvm_request_mem(cost, bytes_cost);
+  llvm_hpvm_request_mem(color, bytes_cost);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-  llvm_visc_untrack_mem(graph_nodes);
-  llvm_visc_untrack_mem(graph_edges);
-  llvm_visc_untrack_mem(cost);
-  llvm_visc_untrack_mem(color);
-  llvm_visc_untrack_mem(q1);
-  llvm_visc_untrack_mem(q2);
-  llvm_visc_untrack_mem(tail);
+  llvm_hpvm_untrack_mem(graph_nodes);
+  llvm_hpvm_untrack_mem(graph_edges);
+  llvm_hpvm_untrack_mem(cost);
+  llvm_hpvm_untrack_mem(color);
+  llvm_hpvm_untrack_mem(q1);
+  llvm_hpvm_untrack_mem(q2);
+  llvm_hpvm_untrack_mem(tail);
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   // Store the result into a file
   // FIXME: color is not even printed. Why are we reading it back??
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll
index 9abdb29a3c9cb7f4dc641d278fd8e1e001433c44..aca5667b70e9f612d833f06a8482be0a312173cc 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/kernel-spir64.ll
@@ -1,4 +1,4 @@
-; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/bfs/src/opencl_base/kernel.cl'
+; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/bfs/src/opencl_base/kernel.cl'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
 
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp
index 9b8b502688abb01934b337bc7fb178b32fda4633..8e0d34c4b8e070958d47e517bec3dedbfd9c6403 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp
@@ -237,7 +237,7 @@ int main(int argc, char **argv) {
                                         NULL, NULL));
 
   printf("Starting GPU kernel\n");
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   int num_of_blocks;
   int num_of_threads_per_block;
 
@@ -272,7 +272,7 @@ int main(int argc, char **argv) {
     OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0,
                                           sizeof(int), &zero, 0, NULL, NULL));
 
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+    pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
     if (num_t == 0) { // frontier is empty
       break;
     }
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp
index 3f9bc775574f597bdcf69c6999553c3c37bd352d..cfd0bf870a91988f5b0f67ffb3be2143e3b6e964 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp
@@ -428,7 +428,7 @@ int main(int argc, char **argv) {
         OCL_ERRCK_RETVAL(clSetKernelArg(
             BFS_kernel_S, 14, MAX_THREADS_PER_BLOCK * sizeof(int), NULL));
         OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 15, sizeof(int), NULL));
-        pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+        pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
         OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_S, 1,
                                                 0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
@@ -458,7 +458,7 @@ int main(int argc, char **argv) {
         OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL));
         OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL));
 
-        pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+        pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
         OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1,
                                                 0, grid, block, 0, 0, 0));
@@ -490,7 +490,7 @@ int main(int argc, char **argv) {
         OCL_ERRCK_RETVAL(
             clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL));
         OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL));
-        pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+        pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
         OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1,
                                                 0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
@@ -542,7 +542,7 @@ int main(int argc, char **argv) {
         OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 16, sizeof(int), NULL));
         OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL));
         OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL));
-        pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+        pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
         OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1,
                                                 0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
@@ -572,7 +572,7 @@ int main(int argc, char **argv) {
             clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL));
         OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL));
 
-        pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+        pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
         OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1,
                                                 0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
diff --git a/hpvm/test/parboil/benchmarks/cutcp/Makefile b/hpvm/test/parboil/benchmarks/cutcp/Makefile
index 5e56793360aa479f604883f63b41a3ab8bb0cc58..e8edc6e7314b4b41d4712d6e4433ffc321f3f082 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/Makefile
+++ b/hpvm/test/parboil/benchmarks/cutcp/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = cutcp
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile
similarity index 85%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile
index d4c650a17e4261cd14a564f38bea3e9009b92dd3..43a175b947140200bc9415ccd421c198349ba32a 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=excl.ll cutcpu.ll cutoff6overlap.ll output.ll readatom.ll ocl.ll
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/atom.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/atom.h
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutcpu.c
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutcpu.c
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff.h
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff6overlap.c
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/cutoff6overlap.c
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/excl.c
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/excl.c
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/kernel.cl
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/macros.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/macros.h
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp
similarity index 82%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp
index caf99a5b37daaa28af83cd058c138af1270feff9..0a36196619a5013108c9bf3656ab2ce90fcfc710 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/main.cpp
@@ -16,7 +16,7 @@
 #include "cutoff.h"
 #include "macros.h"
 #include "output.h"
-#include <visc.h>
+#include <hpvm.h>
 
 #define ERRTOL 1e-4f
 
@@ -54,11 +54,11 @@ extern float rsqrt(float x);
 void Allocation(long block) {
   // Memory shared between threadblocks
   size_t bytes_AtomBinCache = sizeof(float) * BIN_CACHE_MAXLEN * BIN_DEPTH * 4;
-  void *AtomBinCache = __visc__malloc(bytes_AtomBinCache);
+  void *AtomBinCache = __hpvm__malloc(bytes_AtomBinCache);
 
   size_t bytes_myBinIndex = sizeof(xyz);
-  void *myBinIndex = __visc__malloc(bytes_myBinIndex);
-  __visc__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex,
+  void *myBinIndex = __hpvm__malloc(bytes_myBinIndex);
+  __hpvm__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex,
                  bytes_myBinIndex);
 }
 
@@ -76,21 +76,21 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr,
                // local memory args
                float *AtomBinCache, size_t bytes_AtomBinCache, int *myBinIndex,
                size_t bytes_myBinIndex) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
                      regionZeroAddr);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int ly = __visc__getNodeInstanceID_y(thisNode);
-  int lz = __visc__getNodeInstanceID_z(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gy = __visc__getNodeInstanceID_y(parentNode);
-  int dimx = __visc__getNumNodeInstances_x(thisNode);
-  int dimy = __visc__getNumNodeInstances_y(thisNode);
-  int gdimx = __visc__getNumNodeInstances_x(parentNode);
-  int gdimy = __visc__getNumNodeInstances_y(parentNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int ly = __hpvm__getNodeInstanceID_y(thisNode);
+  int lz = __hpvm__getNodeInstanceID_z(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gy = __hpvm__getNodeInstanceID_y(parentNode);
+  int dimx = __hpvm__getNumNodeInstances_x(thisNode);
+  int dimy = __hpvm__getNumNodeInstances_y(thisNode);
+  int gdimx = __hpvm__getNumNodeInstances_x(parentNode);
+  int gdimy = __hpvm__getNumNodeInstances_y(parentNode);
 
   float *binZeroAddr = binBaseAddr + 4 * offset;
 
@@ -168,7 +168,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr,
       AtomBinCache[binIndex + tidmask + 16] = p_global[tidmask + 16];
     }
 
-    __visc__barrier();
+    __hpvm__barrier();
     /* no warp divergence */
     if (totalbins + BIN_CACHE_MAXLEN > *NbrListLen) {
       numbins = *NbrListLen - totalbins;
@@ -196,7 +196,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr,
         if (r2 < cutoff2) {
           float s = (1.f - r2 * inv_cutoff2);
           energy0 += aq * rsqrt(r2) * s * s;
-          // energy0 += aq * (1.0/__visc__sqrt(r2)) * s * s;
+          // energy0 += aq * (1.0/__hpvm__sqrt(r2)) * s * s;
         }
 #else
         energy0 += (r2 < cutoff2);
@@ -208,7 +208,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr,
         if (r2 < cutoff2) {
           float s = (1.f - r2 * inv_cutoff2);
           energy1 += aq * rsqrt(r2) * s * s;
-          // energy1 += aq * (1.0/__visc__sqrt(r2)) * s * s;
+          // energy1 += aq * (1.0/__hpvm__sqrt(r2)) * s * s;
         }
 #else
         energy1 += (r2 < cutoff2);
@@ -219,7 +219,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr,
         if (r2 < cutoff2) {
           float s = (1.f - r2 * inv_cutoff2);
           energy2 += aq * rsqrt(r2) * s * s;
-          // energy2 += aq * (1.0/__visc__sqrt(r2)) * s * s;
+          // energy2 += aq * (1.0/__hpvm__sqrt(r2)) * s * s;
         }
 #else
         energy2 += (r2 < cutoff2);
@@ -237,7 +237,7 @@ void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr,
 #endif
       } /* end loop over atoms in bin */
     }   /* end loop over cached atom bins */
-    __visc__barrier();
+    __hpvm__barrier();
   } /* end loop over neighbor list */
 
   /* store into global memory */
@@ -260,38 +260,38 @@ void BlockingCUTCP(int binDim_x, int binDim_y, float4 *binBaseAddr,
                    size_t bytes_NbrList, long blockx, long blocky,
                    long blockz) {
 
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
                      regionZeroAddr);
 
-  void *AllocationNode = __visc__createNodeND(0, Allocation);
+  void *AllocationNode = __hpvm__createNodeND(0, Allocation);
   void *CUTCPLeafNode =
-      __visc__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz);
+      __hpvm__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz);
 
   // Bind Inputs
-  __visc__bindIn(AllocationNode, 15, 0, 0); // Bind blockx
-  __visc__bindIn(CUTCPLeafNode, 0, 0, 0);   // Bind binDim_x
-  __visc__bindIn(CUTCPLeafNode, 1, 1, 0);   // Bind binDim_y
-  __visc__bindIn(CUTCPLeafNode, 2, 2, 0);   // Bind binBaseAddr
-  __visc__bindIn(CUTCPLeafNode, 3, 3, 0);   // Bind bytes_binBaseAddr
-  __visc__bindIn(CUTCPLeafNode, 4, 4, 0);   // Bind offset
-  __visc__bindIn(CUTCPLeafNode, 5, 5, 0);   // Bind h
-  __visc__bindIn(CUTCPLeafNode, 6, 6, 0);   // Bind cutoff2
-  __visc__bindIn(CUTCPLeafNode, 7, 7, 0);   // Bind inv_cutoff2
-  __visc__bindIn(CUTCPLeafNode, 8, 8, 0);   // Bind regionZeroAddr
-  __visc__bindIn(CUTCPLeafNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
-  __visc__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex
-  __visc__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen
-  __visc__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen
-  __visc__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList
-  __visc__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList
+  __hpvm__bindIn(AllocationNode, 15, 0, 0); // Bind blockx
+  __hpvm__bindIn(CUTCPLeafNode, 0, 0, 0);   // Bind binDim_x
+  __hpvm__bindIn(CUTCPLeafNode, 1, 1, 0);   // Bind binDim_y
+  __hpvm__bindIn(CUTCPLeafNode, 2, 2, 0);   // Bind binBaseAddr
+  __hpvm__bindIn(CUTCPLeafNode, 3, 3, 0);   // Bind bytes_binBaseAddr
+  __hpvm__bindIn(CUTCPLeafNode, 4, 4, 0);   // Bind offset
+  __hpvm__bindIn(CUTCPLeafNode, 5, 5, 0);   // Bind h
+  __hpvm__bindIn(CUTCPLeafNode, 6, 6, 0);   // Bind cutoff2
+  __hpvm__bindIn(CUTCPLeafNode, 7, 7, 0);   // Bind inv_cutoff2
+  __hpvm__bindIn(CUTCPLeafNode, 8, 8, 0);   // Bind regionZeroAddr
+  __hpvm__bindIn(CUTCPLeafNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
+  __hpvm__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex
+  __hpvm__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen
+  __hpvm__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen
+  __hpvm__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList
+  __hpvm__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList
 
   // Create Edges
-  __visc__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache
-  __visc__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16,
+  __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache
+  __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16,
                0); // Edge bytes_AtomBinCache
-  __visc__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex
-  __visc__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18,
+  __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex
+  __hpvm__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18,
                0); // Edge bytes_myBinIndex
 }
 
@@ -370,32 +370,32 @@ void CUTCPRoot(int binDim_x, int binDim_y, float4 *binBaseAddr,
                int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList,
                size_t bytes_NbrList, long blockx, long blocky, long blockz,
                long gridx, long gridy, long gridz) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
                      regionZeroAddr);
 
   void *BlockingCUTCPNode =
-      __visc__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz);
+      __hpvm__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz);
 
   // Bind Inputs
-  __visc__bindIn(BlockingCUTCPNode, 0, 0, 0);   // Bind binDim_x
-  __visc__bindIn(BlockingCUTCPNode, 1, 1, 0);   // Bind binDim_y
-  __visc__bindIn(BlockingCUTCPNode, 2, 2, 0);   // Bind binBaseAddr
-  __visc__bindIn(BlockingCUTCPNode, 3, 3, 0);   // Bind bytes_binBaseAddr
-  __visc__bindIn(BlockingCUTCPNode, 4, 4, 0);   // Bind offset
-  __visc__bindIn(BlockingCUTCPNode, 5, 5, 0);   // Bind h
-  __visc__bindIn(BlockingCUTCPNode, 6, 6, 0);   // Bind cutoff2
-  __visc__bindIn(BlockingCUTCPNode, 7, 7, 0);   // Bind inv_cutoff2
-  __visc__bindIn(BlockingCUTCPNode, 8, 8, 0);   // Bind regionZeroAddr
-  __visc__bindIn(BlockingCUTCPNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
-  __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex
-  __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen
-  __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen
-  __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList
-  __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList
-  __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx
-  __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky
-  __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz
+  __hpvm__bindIn(BlockingCUTCPNode, 0, 0, 0);   // Bind binDim_x
+  __hpvm__bindIn(BlockingCUTCPNode, 1, 1, 0);   // Bind binDim_y
+  __hpvm__bindIn(BlockingCUTCPNode, 2, 2, 0);   // Bind binBaseAddr
+  __hpvm__bindIn(BlockingCUTCPNode, 3, 3, 0);   // Bind bytes_binBaseAddr
+  __hpvm__bindIn(BlockingCUTCPNode, 4, 4, 0);   // Bind offset
+  __hpvm__bindIn(BlockingCUTCPNode, 5, 5, 0);   // Bind h
+  __hpvm__bindIn(BlockingCUTCPNode, 6, 6, 0);   // Bind cutoff2
+  __hpvm__bindIn(BlockingCUTCPNode, 7, 7, 0);   // Bind inv_cutoff2
+  __hpvm__bindIn(BlockingCUTCPNode, 8, 8, 0);   // Bind regionZeroAddr
+  __hpvm__bindIn(BlockingCUTCPNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
+  __hpvm__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex
+  __hpvm__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen
+  __hpvm__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen
+  __hpvm__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList
+  __hpvm__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList
+  __hpvm__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx
+  __hpvm__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky
+  __hpvm__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz
 }
 
 void CUTCPWrapper(int binDim_x, int binDim_y, float4 *binBaseAddr,
@@ -410,34 +410,34 @@ void CUTCPWrapper(int binDim_x, int binDim_y, float4 *binBaseAddr,
                   int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList,
                   size_t bytes_NbrList, long blockx, long blocky, long blockz,
                   long gridx, long gridy, long gridz) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
                      regionZeroAddr);
 
-  void *BlockingCUTCPNode = __visc__createNodeND(0, CUTCPRoot);
+  void *BlockingCUTCPNode = __hpvm__createNodeND(0, CUTCPRoot);
 
   // Bind Inputs
-  __visc__bindIn(BlockingCUTCPNode, 0, 0, 0);   // Bind binDim_x
-  __visc__bindIn(BlockingCUTCPNode, 1, 1, 0);   // Bind binDim_y
-  __visc__bindIn(BlockingCUTCPNode, 2, 2, 0);   // Bind binBaseAddr
-  __visc__bindIn(BlockingCUTCPNode, 3, 3, 0);   // Bind bytes_binBaseAddr
-  __visc__bindIn(BlockingCUTCPNode, 4, 4, 0);   // Bind offset
-  __visc__bindIn(BlockingCUTCPNode, 5, 5, 0);   // Bind h
-  __visc__bindIn(BlockingCUTCPNode, 6, 6, 0);   // Bind cutoff2
-  __visc__bindIn(BlockingCUTCPNode, 7, 7, 0);   // Bind inv_cutoff2
-  __visc__bindIn(BlockingCUTCPNode, 8, 8, 0);   // Bind regionZeroAddr
-  __visc__bindIn(BlockingCUTCPNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
-  __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex
-  __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen
-  __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen
-  __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList
-  __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList
-  __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx
-  __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky
-  __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz
-  __visc__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx
-  __visc__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy
-  __visc__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz
+  __hpvm__bindIn(BlockingCUTCPNode, 0, 0, 0);   // Bind binDim_x
+  __hpvm__bindIn(BlockingCUTCPNode, 1, 1, 0);   // Bind binDim_y
+  __hpvm__bindIn(BlockingCUTCPNode, 2, 2, 0);   // Bind binBaseAddr
+  __hpvm__bindIn(BlockingCUTCPNode, 3, 3, 0);   // Bind bytes_binBaseAddr
+  __hpvm__bindIn(BlockingCUTCPNode, 4, 4, 0);   // Bind offset
+  __hpvm__bindIn(BlockingCUTCPNode, 5, 5, 0);   // Bind h
+  __hpvm__bindIn(BlockingCUTCPNode, 6, 6, 0);   // Bind cutoff2
+  __hpvm__bindIn(BlockingCUTCPNode, 7, 7, 0);   // Bind inv_cutoff2
+  __hpvm__bindIn(BlockingCUTCPNode, 8, 8, 0);   // Bind regionZeroAddr
+  __hpvm__bindIn(BlockingCUTCPNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
+  __hpvm__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex
+  __hpvm__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen
+  __hpvm__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen
+  __hpvm__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList
+  __hpvm__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList
+  __hpvm__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx
+  __hpvm__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky
+  __hpvm__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz
+  __hpvm__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx
+  __hpvm__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy
+  __hpvm__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz
 }
 
 // ==================== Host Code ==============================
@@ -546,7 +546,7 @@ int main(int argc, char *argv[]) {
   }
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -586,7 +586,7 @@ int main(int argc, char *argv[]) {
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   /* Print output */
   // pb_SwitchToTimer(&timers, pb_TimerID_IO);
@@ -873,11 +873,11 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("\n");
   }
 
-  // Track visc data
-  llvm_visc_track_mem(regionZeroAddr, bytes_regionZeroAddr);
-  llvm_visc_track_mem(binBaseAddr, bytes_binBaseAddr);
-  llvm_visc_track_mem(nbrlistlen, sizeof(int));
-  llvm_visc_track_mem(nbrlist, bytes_nbrlist);
+  // Track hpvm data
+  llvm_hpvm_track_mem(regionZeroAddr, bytes_regionZeroAddr);
+  llvm_hpvm_track_mem(binBaseAddr, bytes_binBaseAddr);
+  llvm_hpvm_track_mem(nbrlistlen, sizeof(int));
+  llvm_hpvm_track_mem(nbrlist, bytes_nbrlist);
 
   /* setup OpenCL kernel parameters */
   blockDim[0] = 8;
@@ -914,7 +914,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
            gridDim[1], gridDim[2]);
 
   /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
-  pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION);
   void *CUTCP_DFG;
   if (verbose)
     printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
@@ -926,9 +926,9 @@ int gpu_compute_cutoff_potential_lattice6overlap(
 
     args->zRegionIndex = zRegionIndex;
 
-    CUTCP_DFG = __visc__launch(0, CUTCPWrapper, (void *)args);
-    __visc__wait(CUTCP_DFG);
-    // llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t));
+    CUTCP_DFG = __hpvm__launch(0, CUTCPWrapper, (void *)args);
+    __hpvm__wait(CUTCP_DFG);
+    // llvm_hpvm_request_mem(regionZeroAddr, lnall*sizeof(ener_t));
   }
 
   /*
@@ -941,14 +941,14 @@ int gpu_compute_cutoff_potential_lattice6overlap(
       printf("computing extra atoms on CPU\n");
     }
 
-    pb_SwitchToTimer(timers, visc_TimerID_MISC);
+    pb_SwitchToTimer(timers, hpvm_TimerID_MISC);
 
     if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
       fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
                       "for extra atoms\n");
       return -1;
     }
-    pb_SwitchToTimer(timers, visc_TimerID_MISC);
+    pb_SwitchToTimer(timers, hpvm_TimerID_MISC);
     printf("\n");
   }
   if (verbose)
@@ -957,7 +957,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   /* copy result regions from OpenCL device */
   pb_SwitchToTimer(timers, pb_TimerID_COPY);
 
-  llvm_visc_request_mem(regionZeroAddr, lnall * sizeof(ener_t));
+  llvm_hpvm_request_mem(regionZeroAddr, lnall * sizeof(ener_t));
 
   /*
    * transpose on CPU, updating, producing the final lattice
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.c
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.c
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/ocl.h
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.c
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.c
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/output.h
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/hpvm/readatom.c
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c
rename to hpvm/test/parboil/benchmarks/cutcp/src/hpvm/readatom.c
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c
index 06f856c1a0fa43dc95cb896450baa42f74c047fd..dfd7f1ff388be0c0a51dadbeee80345355c8bf4c 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c
@@ -427,7 +427,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   /*CHECK_ERROR("clCreateCommandQueue")*/
 
   /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
-  pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION);
   if (verbose)
     printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
   for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc.cl b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc.cl
rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm.cl
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll
similarity index 99%
rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll
rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll
index 7f614e66ff564c661e2388c7e9aef6d70eb4add8..85a73b291f407feae2d407385679fc0bb05b589f 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.ll
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.ll
@@ -1,4 +1,4 @@
-; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel_visc.cl'
+; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel_hpvm.cl'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
 
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.spir b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.spir
similarity index 100%
rename from hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_visc_x64.spir
rename to hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_hpvm_x64.spir
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll
index 370e3c0f8ffec89a85e9a884a4ebcea7664a5723..5a3c1fcd5d853dcda7ba55a9a9ab84a376b1a2f0 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/kernel_x64.ll
@@ -1,4 +1,4 @@
-; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel.cl'
+; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/cutcp/src/opencl_nvidia/kernel.cl'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
 
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c
index 96ebeafbdf377a2d2e6e8e7f2cf5e1e58a3e7a6a..076532b709b6fa49a552f777975f596fc72e2ed3 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c
@@ -423,7 +423,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   /*CHECK_ERROR("clCreateCommandQueue")*/
 
   /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
-  pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION);
   if (verbose)
     printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
   for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
diff --git a/hpvm/test/parboil/benchmarks/lbm/Makefile b/hpvm/test/parboil/benchmarks/lbm/Makefile
index 4ebf6fc0af2f05cd10f6d556e0b52bee186540d8..af7215ff7039795e2d09ce98af675a851b32b0cb 100644
--- a/hpvm/test/parboil/benchmarks/lbm/Makefile
+++ b/hpvm/test/parboil/benchmarks/lbm/Makefile
@@ -5,9 +5,9 @@ ifeq ($(NUM_CORES),)
   NUM_CORES=8
 endif
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile
similarity index 85%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile
index d1664ee9880312ccfa2677e6a284851ecadf1f24..5aa206f758e87a94cdaa1cbaadfa3bf9b661d120 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=lbm.ll
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm
 APP_CFLAGS=-ffast-math -O3 -DNUM_CORES=$(NUM_CORES)
 APP_CXXFLAGS=-ffast-math -O3 -DNUM_CORES=$(NUM_CORES)
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/layout_config.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/layout_config.h
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.cpp
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.cpp
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.h
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm_macros.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm_macros.h
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp
similarity index 86%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp
index b51864366b500fc796d9073fe1893be2f402797f..32db8e9b2c4d153a28ee1da2dd91877ba2b2a680 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp
@@ -8,11 +8,11 @@
 
 /*############################################################################*/
 
+#include <hpvm.h>
 #include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <visc.h>
 
 #include "layout_config.h"
 #include "lbm.h"
@@ -92,18 +92,18 @@ typedef struct __attribute__((__packed__)) {
 
 void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG,
                                  size_t bytes_dstG) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, srcG, dstG, 1, dstG);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, srcG, dstG, 1, dstG);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
 
   srcG += MARGIN;
   dstG += MARGIN;
 
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gy = __visc__getNodeInstanceID_y(parentNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gy = __hpvm__getNodeInstanceID_y(parentNode);
 
   // Using some predefined macros here.  Consider this the declaration
   //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
@@ -274,40 +274,40 @@ void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG,
 
 void lbmLvl1(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
              size_t dim_X1) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, srcG, dstG, 1, dstG);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, srcG, dstG, 1, dstG);
   void *lbm_node =
-      __visc__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1);
-  __visc__bindIn(lbm_node, 0, 0, 0);
-  __visc__bindIn(lbm_node, 1, 1, 0);
-  __visc__bindIn(lbm_node, 2, 2, 0);
-  __visc__bindIn(lbm_node, 3, 3, 0);
+      __hpvm__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1);
+  __hpvm__bindIn(lbm_node, 0, 0, 0);
+  __hpvm__bindIn(lbm_node, 1, 1, 0);
+  __hpvm__bindIn(lbm_node, 2, 2, 0);
+  __hpvm__bindIn(lbm_node, 3, 3, 0);
 }
 
 void lbmLvl2(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
              size_t dim_X1, size_t dim_X2, size_t dim_Y2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, srcG, dstG, 1, dstG);
-  void *lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2);
-  __visc__bindIn(lbm_node, 0, 0, 0);
-  __visc__bindIn(lbm_node, 1, 1, 0);
-  __visc__bindIn(lbm_node, 2, 2, 0);
-  __visc__bindIn(lbm_node, 3, 3, 0);
-  __visc__bindIn(lbm_node, 4, 4, 0);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, srcG, dstG, 1, dstG);
+  void *lbm_node = __hpvm__createNodeND(2, lbmLvl1, dim_X2, dim_Y2);
+  __hpvm__bindIn(lbm_node, 0, 0, 0);
+  __hpvm__bindIn(lbm_node, 1, 1, 0);
+  __hpvm__bindIn(lbm_node, 2, 2, 0);
+  __hpvm__bindIn(lbm_node, 3, 3, 0);
+  __hpvm__bindIn(lbm_node, 4, 4, 0);
 }
 
 void lbmLvl3(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
              size_t dim_X1, size_t dim_X2, size_t dim_Y2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, srcG, dstG, 1, dstG);
-  void *lbm_node = __visc__createNodeND(0, lbmLvl2);
-  __visc__bindIn(lbm_node, 0, 0, 0);
-  __visc__bindIn(lbm_node, 1, 1, 0);
-  __visc__bindIn(lbm_node, 2, 2, 0);
-  __visc__bindIn(lbm_node, 3, 3, 0);
-  __visc__bindIn(lbm_node, 4, 4, 0);
-  __visc__bindIn(lbm_node, 5, 5, 0);
-  __visc__bindIn(lbm_node, 6, 6, 0);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, srcG, dstG, 1, dstG);
+  void *lbm_node = __hpvm__createNodeND(0, lbmLvl2);
+  __hpvm__bindIn(lbm_node, 0, 0, 0);
+  __hpvm__bindIn(lbm_node, 1, 1, 0);
+  __hpvm__bindIn(lbm_node, 2, 2, 0);
+  __hpvm__bindIn(lbm_node, 3, 3, 0);
+  __hpvm__bindIn(lbm_node, 4, 4, 0);
+  __hpvm__bindIn(lbm_node, 5, 5, 0);
+  __hpvm__bindIn(lbm_node, 6, 6, 0);
 }
 
 __attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src,
@@ -321,9 +321,9 @@ __attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src,
   RootIn root_in_local = {src - MARGIN, size,   dst - MARGIN, size,
                           SIZE_X,       SIZE_Y, SIZE_Z};
   *(RootIn *)root_in = root_in_local;
-  void *lbmDFG = __visc__launch(0, lbmLvl3, root_in);
+  void *lbmDFG = __hpvm__launch(0, lbmLvl3, root_in);
 
-  __visc__wait(lbmDFG);
+  __hpvm__wait(lbmDFG);
 }
 
 void MAIN_initialize(const MAIN_Param *param) {
@@ -379,12 +379,12 @@ int main(int nArgs, char *arg[]) {
   MAIN_initialize(&param);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-  llvm_visc_track_mem(srcGrid - MARGIN, size);
-  llvm_visc_track_mem(dstGrid - MARGIN, size);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(srcGrid - MARGIN, size);
+  llvm_hpvm_track_mem(dstGrid - MARGIN, size);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   for (t = 1; t <= param.nTimeSteps; t++) {
@@ -404,15 +404,15 @@ int main(int nArgs, char *arg[]) {
   }
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(srcGrid - MARGIN, size);
+  llvm_hpvm_request_mem(srcGrid - MARGIN, size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(srcGrid - MARGIN);
-  llvm_visc_untrack_mem(dstGrid - MARGIN);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(srcGrid - MARGIN);
+  llvm_hpvm_untrack_mem(dstGrid - MARGIN);
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
   MAIN_finalize(&param);
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/main.h
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.h
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c
index 59aa8daf9a018348274e20653c9c92f6995a96e4..a55f0ce785e635e1c840de8000a68b85b7295807 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c
@@ -79,7 +79,7 @@ int main(int nArgs, char *arg[]) {
   OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
   clFinish(prm.clCommandQueue);
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
   for (t = 1; t <= param.nTimeSteps; t++) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -101,7 +101,7 @@ int main(int nArgs, char *arg[]) {
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
   OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c
index d93a919df300c520c7105612cc54f9684f052678..64fe482b81503c4ef4ac5a88f9b0eb0a16f9a806 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c
@@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) {
   LBM_showGridStatistics(TEMP_srcGrid);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   OpenCL_initialize(&prm);
 
@@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) {
   OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
   OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
   for (t = 1; t <= param.nTimeSteps; t++) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) {
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
   OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
@@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
   LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
   LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   // Setup DEVICE datastructures
   OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
   OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
@@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
 
   LBM_freeGrid((float **)&TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c
index d93a919df300c520c7105612cc54f9684f052678..64fe482b81503c4ef4ac5a88f9b0eb0a16f9a806 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c
@@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) {
   LBM_showGridStatistics(TEMP_srcGrid);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   OpenCL_initialize(&prm);
 
@@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) {
   OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
   OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
   for (t = 1; t <= param.nTimeSteps; t++) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) {
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
   OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
@@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
   LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
   LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   // Setup DEVICE datastructures
   OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
   OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
@@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
 
   LBM_freeGrid((float **)&TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
index 18320b7394e5d499339ee820a992b00acd9b368e..54399ee119a6c905baffae6c116ba890cafe44a8 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
@@ -79,7 +79,7 @@ int main(int nArgs, char *arg[]) {
   OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
   for (t = 1; t <= param.nTimeSteps; t++) {
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+    pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
     OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
     LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c
index 5e43b754279910d3ca3b45d40184df666138f9e5..6d682e98e6c4df3b05bb197ef36a21623b545f96 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c
@@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) {
   LBM_showGridStatistics(TEMP_srcGrid);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   OpenCL_initialize(&prm);
 
@@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) {
   OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
   OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 1; i++) {
     for (t = 1; t <= param.nTimeSteps; t++) {
       /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) {
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
   OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
@@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
   LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
   LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   // Setup DEVICE datastructures
   OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
   OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
@@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
 
   LBM_freeGrid((float **)&TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c
index e66cb2c47cc5bd1f62d774952a7e2397005f1e47..9dc95e7d856a5425f84d4063d7a7ba7bfddcebf6 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c
@@ -65,7 +65,7 @@ int main(int nArgs, char *arg[]) {
   LBM_showGridStatistics(TEMP_srcGrid);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   OpenCL_initialize(&prm);
 
@@ -78,7 +78,7 @@ int main(int nArgs, char *arg[]) {
   OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
   OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 4; i++) {
     for (t = 1; t <= param.nTimeSteps; t++) {
       /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -100,7 +100,7 @@ int main(int nArgs, char *arg[]) {
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
   OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
@@ -197,7 +197,7 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
   LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
   LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   // Setup DEVICE datastructures
   OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
   OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
@@ -233,7 +233,7 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
 
   LBM_freeGrid((float **)&TEMP_srcGrid);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   OpenCL_LBM_freeGrid(OpenCL_srcGrid);
   OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
diff --git a/hpvm/test/parboil/benchmarks/sgemm/Makefile b/hpvm/test/parboil/benchmarks/sgemm/Makefile
index ace9ded22b6ef365c9cd0f6262245dd2e086643d..4757432d224ea5a1aaa762bfc89c1c89e869bd32 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = sgemm
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc_sh
+  VERSION = hpvm_sh
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile
similarity index 83%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile
index d1f6c96d0c279bc2f2e3e70313369d49881b62b8..6e63f8384190ff75c281592df1ab3843b017d07f 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O1
 APP_CXXFLAGS=-ffast-math -O1
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/io.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/io.cc
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/kernel.cl
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc
similarity index 69%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc
index 627f5a82412374cff4a9061620ce1f27ea3c14a6..de36705707d7062b4cef2042197902c2c415e312 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc
@@ -10,6 +10,7 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -19,7 +20,6 @@
 #include <string.h>
 #include <sys/time.h>
 #include <vector>
-#include <visc.h>
 
 // I/O routines
 extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
@@ -57,17 +57,17 @@ typedef struct __attribute__((__packed__)) {
 void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
                int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha,
                float beta) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(3, A, B, C, 1, C);
-
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int ly = __visc__getNodeInstanceID_y(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gy = __visc__getNodeInstanceID_y(parentNode);
-  int gridx = __visc__getNumNodeInstances_x(thisNode);
-  int gridy = __visc__getNumNodeInstances_y(thisNode);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(3, A, B, C, 1, C);
+
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int ly = __hpvm__getNodeInstanceID_y(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gy = __hpvm__getNodeInstanceID_y(parentNode);
+  int gridx = __hpvm__getNumNodeInstances_x(thisNode);
+  int gridy = __hpvm__getNumNodeInstances_y(thisNode);
   int m = gx * gridx + lx;
   int n = gy * gridy + ly;
 
@@ -83,46 +83,46 @@ void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
 void basicSgemmLvl1(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
                     int ldb, float *C, size_t bytes_C, int ldc, int k,
                     float alpha, float beta, size_t dim_X1, size_t dim_Y1) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(3, A, B, C, 1, C);
   void *sgemm_node =
-      __visc__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1);
-  __visc__bindIn(sgemm_node, 0, 0, 0);
-  __visc__bindIn(sgemm_node, 1, 1, 0);
-  __visc__bindIn(sgemm_node, 2, 2, 0);
-  __visc__bindIn(sgemm_node, 3, 3, 0);
-  __visc__bindIn(sgemm_node, 4, 4, 0);
-  __visc__bindIn(sgemm_node, 5, 5, 0);
-  __visc__bindIn(sgemm_node, 6, 6, 0);
-  __visc__bindIn(sgemm_node, 7, 7, 0);
-  __visc__bindIn(sgemm_node, 8, 8, 0);
-  __visc__bindIn(sgemm_node, 9, 9, 0);
-  __visc__bindIn(sgemm_node, 10, 10, 0);
-  __visc__bindIn(sgemm_node, 11, 11, 0);
+      __hpvm__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1);
+  __hpvm__bindIn(sgemm_node, 0, 0, 0);
+  __hpvm__bindIn(sgemm_node, 1, 1, 0);
+  __hpvm__bindIn(sgemm_node, 2, 2, 0);
+  __hpvm__bindIn(sgemm_node, 3, 3, 0);
+  __hpvm__bindIn(sgemm_node, 4, 4, 0);
+  __hpvm__bindIn(sgemm_node, 5, 5, 0);
+  __hpvm__bindIn(sgemm_node, 6, 6, 0);
+  __hpvm__bindIn(sgemm_node, 7, 7, 0);
+  __hpvm__bindIn(sgemm_node, 8, 8, 0);
+  __hpvm__bindIn(sgemm_node, 9, 9, 0);
+  __hpvm__bindIn(sgemm_node, 10, 10, 0);
+  __hpvm__bindIn(sgemm_node, 11, 11, 0);
 }
 
 void basicSgemmLvl2(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
                     int ldb, float *C, size_t bytes_C, int ldc, int k,
                     float alpha, float beta, size_t dim_X1, size_t dim_Y1,
                     size_t dim_X2, size_t dim_Y2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
   void *sgemm_node =
-      __visc__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2);
-  __visc__bindIn(sgemm_node, 0, 0, 0);
-  __visc__bindIn(sgemm_node, 1, 1, 0);
-  __visc__bindIn(sgemm_node, 2, 2, 0);
-  __visc__bindIn(sgemm_node, 3, 3, 0);
-  __visc__bindIn(sgemm_node, 4, 4, 0);
-  __visc__bindIn(sgemm_node, 5, 5, 0);
-  __visc__bindIn(sgemm_node, 6, 6, 0);
-  __visc__bindIn(sgemm_node, 7, 7, 0);
-  __visc__bindIn(sgemm_node, 8, 8, 0);
-  __visc__bindIn(sgemm_node, 9, 9, 0);
-  __visc__bindIn(sgemm_node, 10, 10, 0);
-  __visc__bindIn(sgemm_node, 11, 11, 0);
-  __visc__bindIn(sgemm_node, 12, 12, 0);
-  __visc__bindIn(sgemm_node, 13, 13, 0);
+      __hpvm__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2);
+  __hpvm__bindIn(sgemm_node, 0, 0, 0);
+  __hpvm__bindIn(sgemm_node, 1, 1, 0);
+  __hpvm__bindIn(sgemm_node, 2, 2, 0);
+  __hpvm__bindIn(sgemm_node, 3, 3, 0);
+  __hpvm__bindIn(sgemm_node, 4, 4, 0);
+  __hpvm__bindIn(sgemm_node, 5, 5, 0);
+  __hpvm__bindIn(sgemm_node, 6, 6, 0);
+  __hpvm__bindIn(sgemm_node, 7, 7, 0);
+  __hpvm__bindIn(sgemm_node, 8, 8, 0);
+  __hpvm__bindIn(sgemm_node, 9, 9, 0);
+  __hpvm__bindIn(sgemm_node, 10, 10, 0);
+  __hpvm__bindIn(sgemm_node, 11, 11, 0);
+  __hpvm__bindIn(sgemm_node, 12, 12, 0);
+  __hpvm__bindIn(sgemm_node, 13, 13, 0);
 }
 
 // A wrapper level used in codegen for some backends
@@ -130,25 +130,25 @@ void basicSgemmLvl3(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
                     int ldb, float *C, size_t bytes_C, int ldc, int k,
                     float alpha, float beta, size_t dim_X1, size_t dim_Y1,
                     size_t dim_X2, size_t dim_Y2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
-  void *sgemm_node = __visc__createNodeND(0, basicSgemmLvl2);
-  __visc__bindIn(sgemm_node, 0, 0, 0);
-  __visc__bindIn(sgemm_node, 1, 1, 0);
-  __visc__bindIn(sgemm_node, 2, 2, 0);
-  __visc__bindIn(sgemm_node, 3, 3, 0);
-  __visc__bindIn(sgemm_node, 4, 4, 0);
-  __visc__bindIn(sgemm_node, 5, 5, 0);
-  __visc__bindIn(sgemm_node, 6, 6, 0);
-  __visc__bindIn(sgemm_node, 7, 7, 0);
-  __visc__bindIn(sgemm_node, 8, 8, 0);
-  __visc__bindIn(sgemm_node, 9, 9, 0);
-  __visc__bindIn(sgemm_node, 10, 10, 0);
-  __visc__bindIn(sgemm_node, 11, 11, 0);
-  __visc__bindIn(sgemm_node, 12, 12, 0);
-  __visc__bindIn(sgemm_node, 13, 13, 0);
-  __visc__bindIn(sgemm_node, 14, 14, 0);
-  __visc__bindIn(sgemm_node, 15, 15, 0);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+  void *sgemm_node = __hpvm__createNodeND(0, basicSgemmLvl2);
+  __hpvm__bindIn(sgemm_node, 0, 0, 0);
+  __hpvm__bindIn(sgemm_node, 1, 1, 0);
+  __hpvm__bindIn(sgemm_node, 2, 2, 0);
+  __hpvm__bindIn(sgemm_node, 3, 3, 0);
+  __hpvm__bindIn(sgemm_node, 4, 4, 0);
+  __hpvm__bindIn(sgemm_node, 5, 5, 0);
+  __hpvm__bindIn(sgemm_node, 6, 6, 0);
+  __hpvm__bindIn(sgemm_node, 7, 7, 0);
+  __hpvm__bindIn(sgemm_node, 8, 8, 0);
+  __hpvm__bindIn(sgemm_node, 9, 9, 0);
+  __hpvm__bindIn(sgemm_node, 10, 10, 0);
+  __hpvm__bindIn(sgemm_node, 11, 11, 0);
+  __hpvm__bindIn(sgemm_node, 12, 12, 0);
+  __hpvm__bindIn(sgemm_node, 13, 13, 0);
+  __hpvm__bindIn(sgemm_node, 14, 14, 0);
+  __hpvm__bindIn(sgemm_node, 15, 15, 0);
 }
 
 __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
@@ -194,8 +194,8 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
                           dg[0] / db[0],
                           dg[1] / db[1]};
   *(RootIn *)root_in = root_in_local;
-  void *sgemmDFG = __visc__launch(0, basicSgemmLvl3, root_in);
-  __visc__wait(sgemmDFG);
+  void *sgemmDFG = __hpvm__launch(0, basicSgemmLvl3, root_in);
+  __hpvm__wait(sgemmDFG);
 }
 
 int main(int argc, char *argv[]) {
@@ -233,7 +233,7 @@ int main(int argc, char *argv[]) {
   readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
@@ -246,9 +246,9 @@ int main(int argc, char *argv[]) {
   // OpenCL memory allocation
   std::vector<float> matC(matArow * matBcol);
 
-  llvm_visc_track_mem(&matA.front(), A_sz);
-  llvm_visc_track_mem(&matBT.front(), B_sz);
-  llvm_visc_track_mem(&matC.front(), C_sz);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
   // Copy A and B^T into device memory
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -263,16 +263,16 @@ int main(int argc, char *argv[]) {
              matArow);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(&matC.front(), C_sz);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(&matA.front());
-  llvm_visc_untrack_mem(&matBT.front());
-  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (params->outFile) {
 
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile
similarity index 83%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile
index f74ee8921a534b6963ba06d089398114571d070b..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/kernel.cl
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc
similarity index 90%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc
index 62f9285e8a8054e5597fe45adc5257470b147622..a1db2e56a5c5639319d7be5f6a890d44c3a28421 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc
@@ -10,6 +10,7 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -19,7 +20,6 @@
 #include <string.h>
 #include <sys/time.h>
 #include <vector>
-#include <visc.h>
 
 // I/O routines
 extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
@@ -42,8 +42,8 @@ extern char *readFile(const char *);
 
 void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
                float alpha, float beta) {
-  __visc__hint(visc::GPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__hint(hpvm::GPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
 
   float c[TILE_N];
   for (int i = 0; i < TILE_N; i++)
@@ -96,10 +96,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
   //    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
   unsigned dg[2] = {m * db[0] / TILE_M, n * db[1] / TILE_N};
 
-  unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
                                    dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
                                    ldb, C, bytesC, ldc, k, alpha, beta, 0);
-  __visc__wait(sgemmDFG);
+  __hpvm__wait(sgemmDFG);
 }
 
 int main(int argc, char *argv[]) {
@@ -129,7 +129,7 @@ int main(int argc, char *argv[]) {
   readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
@@ -142,9 +142,9 @@ int main(int argc, char *argv[]) {
   // OpenCL memory allocation
   std::vector<float> matC(matArow * matBcol);
 
-  llvm_visc_track_mem(&matA.front(), A_sz);
-  llvm_visc_track_mem(&matBT.front(), B_sz);
-  llvm_visc_track_mem(&matC.front(), C_sz);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
   // Copy A and B^T into device memory
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -159,16 +159,16 @@ int main(int argc, char *argv[]) {
              matArow);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(&matC.front(), C_sz);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(&matA.front());
-  llvm_visc_untrack_mem(&matBT.front());
-  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (params->outFile) {
 
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile
similarity index 86%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile
index a0fd0e95753970ad1c0db1038cf243635d259899..f81bac47072bc017dcdcdccf373cdfbd0f21ceac 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc
similarity index 65%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc
index 05d143b5884164926213ca060da341a254399bf3..de0d473ed6fe6724ef81f99b13e02d0de29b103b 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc
@@ -10,6 +10,7 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -19,7 +20,6 @@
 #include <string.h>
 #include <sys/time.h>
 #include <vector>
-#include <visc.h>
 
 // I/O routines
 extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
@@ -82,29 +82,29 @@ void packData(RootIn *args, float *A, size_t bytesA, int lda, float *B,
 }
 
 void Allocation(long block_x, long block_y) {
-  void *shB = __visc__malloc(block_x * block_y * sizeof(float));
-  __visc__return(2, shB, block_x * block_y * sizeof(float));
+  void *shB = __hpvm__malloc(block_x * block_y * sizeof(float));
+  __hpvm__return(2, shB, block_x * block_y * sizeof(float));
 }
 
 void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB,
                int ldb, float *C, size_t bytesC, int ldc, int k, float alpha,
                float beta, float *shB, size_t bytesshB) {
-  __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::SPIR_TARGET);
-  //__visc__hint(visc::GPU_TARGET);
+  __hpvm__hint(hpvm::DEVICE);
+  //__hpvm__hint(hpvm::SPIR_TARGET);
+  //__hpvm__hint(hpvm::GPU_TARGET);
 
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__attributes(3, A, B, C, 1, C);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
 
-  long lx = __visc__getNodeInstanceID_x(thisNode);
-  long ly = __visc__getNodeInstanceID_y(thisNode);
+  long lx = __hpvm__getNodeInstanceID_x(thisNode);
+  long ly = __hpvm__getNodeInstanceID_y(thisNode);
 
-  long gx = __visc__getNodeInstanceID_x(parentNode);
-  long gy = __visc__getNodeInstanceID_y(parentNode);
+  long gx = __hpvm__getNodeInstanceID_x(parentNode);
+  long gy = __hpvm__getNodeInstanceID_y(parentNode);
 
-  long dimx = __visc__getNumNodeInstances_x(thisNode);
+  long dimx = __hpvm__getNumNodeInstances_x(thisNode);
 
   float c[TILE_N];
   for (int i = 0; i < TILE_N; i++)
@@ -119,7 +119,7 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB,
     // shB[ly][lx] = B[n+(i+ly)*ldb];
     shB[ly * dimx + lx] = B[n + (i + ly) * ldb];
 
-    __visc__barrier();
+    __hpvm__barrier();
     for (int j = 0; j < TILE_TB_HEIGHT; j++) {
       a = A[m + (i + j) * lda];
       for (int kk = 0; kk < TILE_N; kk++) {
@@ -127,7 +127,7 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB,
         c[kk] += a * shB[j * dimx + kk];
       }
     }
-    __visc__barrier();
+    __hpvm__barrier();
   }
 
   int t = ldc * gy * TILE_N + m;
@@ -140,31 +140,31 @@ void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB,
 void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb,
              float *C, size_t bytesC, int ldc, int k, float alpha, float beta,
              long block_x, long block_y) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
-  void *AllocationNode = __visc__createNodeND(0, Allocation);
-  void *SgemmLeafNode = __visc__createNodeND(2, SgemmLeaf, block_x, block_y);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+  void *AllocationNode = __hpvm__createNodeND(0, Allocation);
+  void *SgemmLeafNode = __hpvm__createNodeND(2, SgemmLeaf, block_x, block_y);
 
   // Bind edges
-  __visc__bindIn(SgemmLeafNode, 0, 0, 0);   // Bind A
-  __visc__bindIn(SgemmLeafNode, 1, 1, 0);   // Bind bytesA
-  __visc__bindIn(SgemmLeafNode, 2, 2, 0);   // Bind lda
-  __visc__bindIn(SgemmLeafNode, 3, 3, 0);   // Bind B
-  __visc__bindIn(SgemmLeafNode, 4, 4, 0);   // Bind bytesB
-  __visc__bindIn(SgemmLeafNode, 5, 5, 0);   // Bind ldb
-  __visc__bindIn(SgemmLeafNode, 6, 6, 0);   // Bind C
-  __visc__bindIn(SgemmLeafNode, 7, 7, 0);   // Bind bytesC
-  __visc__bindIn(SgemmLeafNode, 8, 8, 0);   // Bind ldc
-  __visc__bindIn(SgemmLeafNode, 9, 9, 0);   // Bind k
-  __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
-  __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
-
-  __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
-  __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
+  __hpvm__bindIn(SgemmLeafNode, 0, 0, 0);   // Bind A
+  __hpvm__bindIn(SgemmLeafNode, 1, 1, 0);   // Bind bytesA
+  __hpvm__bindIn(SgemmLeafNode, 2, 2, 0);   // Bind lda
+  __hpvm__bindIn(SgemmLeafNode, 3, 3, 0);   // Bind B
+  __hpvm__bindIn(SgemmLeafNode, 4, 4, 0);   // Bind bytesB
+  __hpvm__bindIn(SgemmLeafNode, 5, 5, 0);   // Bind ldb
+  __hpvm__bindIn(SgemmLeafNode, 6, 6, 0);   // Bind C
+  __hpvm__bindIn(SgemmLeafNode, 7, 7, 0);   // Bind bytesC
+  __hpvm__bindIn(SgemmLeafNode, 8, 8, 0);   // Bind ldc
+  __hpvm__bindIn(SgemmLeafNode, 9, 9, 0);   // Bind k
+  __hpvm__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
+  __hpvm__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
+
+  __hpvm__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
+  __hpvm__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
 
   // Create Edges between AllocationNode and BFSLeafNodeNode
-  __visc__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B
-  __visc__edge(AllocationNode, SgemmLeafNode, 1, 1, 13,
+  __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B
+  __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 1, 13,
                0); // Edge bytes_local_B
 }
 
@@ -175,25 +175,25 @@ void SgemmRoot(float *A, size_t bytesA, int lda,                    // 0-2
                int k, float alpha, float beta,                      // 9-11
                long block_x, long block_y, long grid_x, long grid_y // 12-15
 ) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
-  void *SgemmTBNode = __visc__createNodeND(2, SgemmTB, grid_x, grid_y);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+  void *SgemmTBNode = __hpvm__createNodeND(2, SgemmTB, grid_x, grid_y);
 
   // Bind edges
-  __visc__bindIn(SgemmTBNode, 0, 0, 0);   // Bind A
-  __visc__bindIn(SgemmTBNode, 1, 1, 0);   // Bind bytesA
-  __visc__bindIn(SgemmTBNode, 2, 2, 0);   // Bind lda
-  __visc__bindIn(SgemmTBNode, 3, 3, 0);   // Bind B
-  __visc__bindIn(SgemmTBNode, 4, 4, 0);   // Bind bytesB
-  __visc__bindIn(SgemmTBNode, 5, 5, 0);   // Bind ldb
-  __visc__bindIn(SgemmTBNode, 6, 6, 0);   // Bind C
-  __visc__bindIn(SgemmTBNode, 7, 7, 0);   // Bind bytesC
-  __visc__bindIn(SgemmTBNode, 8, 8, 0);   // Bind ldc
-  __visc__bindIn(SgemmTBNode, 9, 9, 0);   // Bind k
-  __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
-  __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
-  __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
-  __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
+  __hpvm__bindIn(SgemmTBNode, 0, 0, 0);   // Bind A
+  __hpvm__bindIn(SgemmTBNode, 1, 1, 0);   // Bind bytesA
+  __hpvm__bindIn(SgemmTBNode, 2, 2, 0);   // Bind lda
+  __hpvm__bindIn(SgemmTBNode, 3, 3, 0);   // Bind B
+  __hpvm__bindIn(SgemmTBNode, 4, 4, 0);   // Bind bytesB
+  __hpvm__bindIn(SgemmTBNode, 5, 5, 0);   // Bind ldb
+  __hpvm__bindIn(SgemmTBNode, 6, 6, 0);   // Bind C
+  __hpvm__bindIn(SgemmTBNode, 7, 7, 0);   // Bind bytesC
+  __hpvm__bindIn(SgemmTBNode, 8, 8, 0);   // Bind ldc
+  __hpvm__bindIn(SgemmTBNode, 9, 9, 0);   // Bind k
+  __hpvm__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
+  __hpvm__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
+  __hpvm__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
+  __hpvm__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
 }
 
 void SgemmWrapper(float *A, size_t bytesA, int lda,                    // 0-2
@@ -202,27 +202,27 @@ void SgemmWrapper(float *A, size_t bytesA, int lda,                    // 0-2
                   int k, float alpha, float beta,                      // 9-11
                   long block_x, long block_y, long grid_x, long grid_y // 12-15
 ) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
-  void *SgemmRootNode = __visc__createNodeND(0, SgemmRoot);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+  void *SgemmRootNode = __hpvm__createNodeND(0, SgemmRoot);
 
   // Bind edges
-  __visc__bindIn(SgemmRootNode, 0, 0, 0);   // Bind A
-  __visc__bindIn(SgemmRootNode, 1, 1, 0);   // Bind bytesA
-  __visc__bindIn(SgemmRootNode, 2, 2, 0);   // Bind lda
-  __visc__bindIn(SgemmRootNode, 3, 3, 0);   // Bind B
-  __visc__bindIn(SgemmRootNode, 4, 4, 0);   // Bind bytesB
-  __visc__bindIn(SgemmRootNode, 5, 5, 0);   // Bind ldb
-  __visc__bindIn(SgemmRootNode, 6, 6, 0);   // Bind C
-  __visc__bindIn(SgemmRootNode, 7, 7, 0);   // Bind bytesC
-  __visc__bindIn(SgemmRootNode, 8, 8, 0);   // Bind ldc
-  __visc__bindIn(SgemmRootNode, 9, 9, 0);   // Bind k
-  __visc__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha
-  __visc__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta
-  __visc__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x
-  __visc__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y
-  __visc__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x
-  __visc__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y
+  __hpvm__bindIn(SgemmRootNode, 0, 0, 0);   // Bind A
+  __hpvm__bindIn(SgemmRootNode, 1, 1, 0);   // Bind bytesA
+  __hpvm__bindIn(SgemmRootNode, 2, 2, 0);   // Bind lda
+  __hpvm__bindIn(SgemmRootNode, 3, 3, 0);   // Bind B
+  __hpvm__bindIn(SgemmRootNode, 4, 4, 0);   // Bind bytesB
+  __hpvm__bindIn(SgemmRootNode, 5, 5, 0);   // Bind ldb
+  __hpvm__bindIn(SgemmRootNode, 6, 6, 0);   // Bind C
+  __hpvm__bindIn(SgemmRootNode, 7, 7, 0);   // Bind bytesC
+  __hpvm__bindIn(SgemmRootNode, 8, 8, 0);   // Bind ldc
+  __hpvm__bindIn(SgemmRootNode, 9, 9, 0);   // Bind k
+  __hpvm__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha
+  __hpvm__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta
+  __hpvm__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x
+  __hpvm__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y
+  __hpvm__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x
+  __hpvm__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y
 }
 
 // Creates root node for sgemm
@@ -262,10 +262,10 @@ __attribute__((noinline)) void basicSgemm(struct pb_TimerSet *timers,
   packData(args, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta,
            block_x, block_y, grid_x, grid_y);
 
-  pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
-  void *sgemmDFG = __visc__launch(0, SgemmWrapper, (void *)args);
+  pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION);
+  void *sgemmDFG = __hpvm__launch(0, SgemmWrapper, (void *)args);
 
-  __visc__wait(sgemmDFG);
+  __hpvm__wait(sgemmDFG);
   pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
 }
 
@@ -296,7 +296,7 @@ int main(int argc, char *argv[]) {
   readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
@@ -309,9 +309,9 @@ int main(int argc, char *argv[]) {
   // OpenCL memory allocation
   std::vector<float> matC(matArow * matBcol);
 
-  llvm_visc_track_mem(&matA.front(), A_sz);
-  llvm_visc_track_mem(&matBT.front(), B_sz);
-  llvm_visc_track_mem(&matC.front(), C_sz);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
   // Copy A and B^T into device memory
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -324,16 +324,16 @@ int main(int argc, char *argv[]) {
              C_sz, matArow);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(&matC.front(), C_sz);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  llvm_visc_untrack_mem(&matA.front());
-  llvm_visc_untrack_mem(&matBT.front());
-  llvm_visc_untrack_mem(&matC.front());
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (params->outFile) {
     /* Write C to file */
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile
similarity index 83%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile
index f74ee8921a534b6963ba06d089398114571d070b..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc
similarity index 90%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc
index 0dfcdfb835e73fb2a0c7db9d1f24e67b11375ad8..be39d713d55d1cb518083679fb1ea1ce717a4ca9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc
@@ -10,6 +10,7 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -19,7 +20,6 @@
 #include <string.h>
 #include <sys/time.h>
 #include <vector>
-#include <visc.h>
 
 // I/O routines
 extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
@@ -40,7 +40,7 @@ extern char *readFile(const char *);
 
 void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
                float alpha, float beta) {
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__attributes(3, A, B, C, 1, C);
   float c0, c1, c2, c3;
   c0 = c1 = c2 = c3 = 0.0f;
   int m = 4 * get_global_id(0);
@@ -90,10 +90,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
   unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
   unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
 
-  unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
                                    dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
                                    ldb, C, bytesC, ldc, k, alpha, beta, 0);
-  __visc__wait(sgemmDFG);
+  __hpvm__wait(sgemmDFG);
 }
 
 int main(int argc, char *argv[]) {
@@ -107,7 +107,7 @@ int main(int argc, char *argv[]) {
   std::vector<float> matA, matBT;
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
@@ -138,9 +138,9 @@ int main(int argc, char *argv[]) {
   // OpenCL memory allocation
   std::vector<float> matC(matArow * matBcol);
 
-  llvm_visc_track_mem(&matA.front(), A_sz);
-  llvm_visc_track_mem(&matBT.front(), B_sz);
-  llvm_visc_track_mem(&matC.front(), C_sz);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
   // Copy A and B^T into device memory
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -158,22 +158,22 @@ int main(int argc, char *argv[]) {
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
     /* Write C to file */
-    llvm_visc_request_mem(&matC.front(), C_sz);
+    llvm_hpvm_request_mem(&matC.front(), C_sz);
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
     writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(&matA.front());
-  llvm_visc_untrack_mem(&matBT.front());
-  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
   std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
             << std::endl;
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
   pb_FreeParameters(params);
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile
similarity index 83%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile
index f74ee8921a534b6963ba06d089398114571d070b..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc
similarity index 90%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc
index 0dfcdfb835e73fb2a0c7db9d1f24e67b11375ad8..be39d713d55d1cb518083679fb1ea1ce717a4ca9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc
@@ -10,6 +10,7 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -19,7 +20,6 @@
 #include <string.h>
 #include <sys/time.h>
 #include <vector>
-#include <visc.h>
 
 // I/O routines
 extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
@@ -40,7 +40,7 @@ extern char *readFile(const char *);
 
 void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
                float alpha, float beta) {
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__attributes(3, A, B, C, 1, C);
   float c0, c1, c2, c3;
   c0 = c1 = c2 = c3 = 0.0f;
   int m = 4 * get_global_id(0);
@@ -90,10 +90,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
   unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
   unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
 
-  unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
                                    dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
                                    ldb, C, bytesC, ldc, k, alpha, beta, 0);
-  __visc__wait(sgemmDFG);
+  __hpvm__wait(sgemmDFG);
 }
 
 int main(int argc, char *argv[]) {
@@ -107,7 +107,7 @@ int main(int argc, char *argv[]) {
   std::vector<float> matA, matBT;
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
@@ -138,9 +138,9 @@ int main(int argc, char *argv[]) {
   // OpenCL memory allocation
   std::vector<float> matC(matArow * matBcol);
 
-  llvm_visc_track_mem(&matA.front(), A_sz);
-  llvm_visc_track_mem(&matBT.front(), B_sz);
-  llvm_visc_track_mem(&matC.front(), C_sz);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
   // Copy A and B^T into device memory
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -158,22 +158,22 @@ int main(int argc, char *argv[]) {
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
     /* Write C to file */
-    llvm_visc_request_mem(&matC.front(), C_sz);
+    llvm_hpvm_request_mem(&matC.front(), C_sz);
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
     writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(&matA.front());
-  llvm_visc_untrack_mem(&matBT.front());
-  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
   std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
             << std::endl;
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
   pb_FreeParameters(params);
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile
similarity index 83%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile
index f74ee8921a534b6963ba06d089398114571d070b..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/kernel.cl
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc
similarity index 90%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc
index 76d0cefc817ea28f2ffb15cd48d8dd5c7a97d0e0..286297d6fefe0b6f72bdc9e8a9079a131a7b16bf 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc
@@ -10,6 +10,7 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -19,7 +20,6 @@
 #include <string.h>
 #include <sys/time.h>
 #include <vector>
-#include <visc.h>
 
 // I/O routines
 extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
@@ -41,8 +41,8 @@ extern char *readFile(const char *);
 
 void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
                float alpha, float beta) {
-  __visc__hint(visc::GPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__hint(hpvm::GPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
 
   float c = 0.0f;
   int m = get_global_id(0);
@@ -99,10 +99,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
   unsigned db[2] = {TILE_SZ / VEC_SZ, TILE_SZ};
   unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
 
-  unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
                                    dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
                                    ldb, C, bytesC, ldc, k, alpha, beta, 0);
-  __visc__wait(sgemmDFG);
+  __hpvm__wait(sgemmDFG);
 }
 
 int main(int argc, char *argv[]) {
@@ -132,7 +132,7 @@ int main(int argc, char *argv[]) {
   readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
@@ -145,9 +145,9 @@ int main(int argc, char *argv[]) {
   // OpenCL memory allocation
   std::vector<float> matC(matArow * matBcol);
 
-  llvm_visc_track_mem(&matA.front(), A_sz);
-  llvm_visc_track_mem(&matBT.front(), B_sz);
-  llvm_visc_track_mem(&matC.front(), C_sz);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
   // Copy A and B^T into device memory
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -162,16 +162,16 @@ int main(int argc, char *argv[]) {
              matArow);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(&matC.front(), C_sz);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(&matA.front());
-  llvm_visc_untrack_mem(&matBT.front());
-  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (params->outFile) {
 
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=hpvm
+SRCDIR_OBJS=io.ll #compute_gold.o
+HPVM_OBJS=main.hpvm.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/kernel.cl
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc
similarity index 91%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc
index a4c252d8f183e76f91349d97872dbca0b3766acf..8fbc45e08a9e2fd1e3af6cc03360086b354665d7 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc
@@ -10,6 +10,7 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -19,7 +20,6 @@
 #include <string.h>
 #include <sys/time.h>
 #include <vector>
-#include <visc.h>
 
 // I/O routines
 extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
@@ -42,8 +42,8 @@ extern char *readFile(const char *);
 
 void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
                float alpha, float beta) {
-  __visc__hint(visc::SPIR_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__hint(hpvm::SPIR_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
 
   float c[TILE_N];
   for (int i = 0; i < TILE_N; i++)
@@ -135,10 +135,10 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
   unsigned db[2] = {TILE_N, TILE_TB_HEIGHT};
   unsigned dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
 
-  void *sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+  void *sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
                                 dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
                                 ldb, C, bytesC, ldc, k, alpha, beta, 0);
-  __visc__wait(sgemmDFG);
+  __hpvm__wait(sgemmDFG);
 }
 
 int main(int argc, char *argv[]) {
@@ -168,7 +168,7 @@ int main(int argc, char *argv[]) {
   readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
@@ -181,10 +181,10 @@ int main(int argc, char *argv[]) {
   // OpenCL memory allocation
   std::vector<float> matC(matArow * matBcol);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-  llvm_visc_track_mem(&matA.front(), A_sz);
-  llvm_visc_track_mem(&matBT.front(), B_sz);
-  llvm_visc_track_mem(&matC.front(), C_sz);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
 
   // Copy A and B^T into device memory
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
@@ -200,16 +200,16 @@ int main(int argc, char *argv[]) {
              matArow);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(&matC.front(), C_sz);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(&matA.front());
-  llvm_visc_untrack_mem(&matBT.front());
-  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (params->outFile) {
 
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc
index 5489f6a55ce6e8ba3676b0c98ad4b37ac7f4a7fd..e8d1c69ec9a63c3328f573195a66ceaa02b73aab 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc
@@ -109,7 +109,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -212,7 +212,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc
index 105baf590da13dd2ffc3cb803d63291daef0854d..4285a52a01adec3b17084c058ada68b6dbe23836 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc
@@ -120,7 +120,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -208,7 +208,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc
index f72c18c293c52e322a35814b13c000f9b64548b0..7edbf05a4bc423d2f30b01ebde457a02263d1fa0 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc
@@ -112,7 +112,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -204,7 +204,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc
index 744ee4096664e2f11620fae388a0a848a8cd49ac..cccec04beba6122632347b1339ec6caaeac16f29 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc
@@ -110,7 +110,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
 
   cl_uint numPlatforms;
@@ -254,7 +254,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc
index 45ed8e942a1a69475b75a63a24b70655f1ffa2aa..36e7b93571c24aad59c206d18f69293689bf395f 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc
@@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
   clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 1; i++) {
 
     clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
 
   cl_uint numPlatforms;
@@ -247,7 +247,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc
index d8275be777079f1a57e585b3057685f737f38ed3..2cc311d1eff010bb3c4820bb517083ac33ad8c58 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc
@@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
   clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  // pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  // pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   // for(int i=0; i<15; i++) {
 
   clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
 
   cl_uint numPlatforms;
@@ -212,7 +212,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc
index b4e561ded6b82bf2b84aa4dbab2f5f4b5bceab7b..678b4d8131515b68b52f8c12d5384b849c1b54ae 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc
@@ -83,7 +83,7 @@ void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
   clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   // for(int i=0; i<15; i++) {
 
   clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
 
   cl_uint numPlatforms;
@@ -247,7 +247,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll
index ca9fcca0608a891f800e5c5a68f10d36aff268d9..9b4cf7702d777fea811ad800bacf09db63fe7e1d 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel-spir64.ll
@@ -1,4 +1,4 @@
-; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel.cl'
+; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/sgemm/src/opencl_cpu_sm/kernel.cl'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
 
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc
index 8de437a4f8935d5746dbcfbbe5345e0e66ae484a..79fecfb84b536388136932789f00b9e40491df0b 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc
@@ -195,7 +195,7 @@ int main(int argc, char *argv[]) {
                                   &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
   // Use standard sgemm interface
   regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll
index 908c7104bb776bcade055ae430762e3eeab45b9d..2f72a6cebad6829711f2c8a4c33dd649497a9a30 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test-spir64.ll
@@ -1,4 +1,4 @@
-; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test.cl'
+; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/sgemm/src/opencl_cpu_sm/test.cl'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
 
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc
index 06f5da5c319811ebfc5aa8937559219b2feed625..22f66ca0a8cfe3bd7789b93e4f96f3adbf323a31 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc
@@ -190,7 +190,7 @@ int main(int argc, char *argv[]) {
                                   &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
   // Use standard sgemm interface
   regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc
index b22ebd8804bdb1204c42e2859aab69209dc77e4c..10e044545385162e2d682e77c98f801bba36dbed 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc
@@ -119,7 +119,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -214,7 +214,7 @@ int main(int argc, char *argv[]) {
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc
index a7cb9793e8c1ec991d5a3f3cd1676f7a88ff8e26..59da9562a1169c27a20b699eaf49383090e7c977 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc
@@ -79,7 +79,7 @@ void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
   clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 4; i++) {
     clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
                                       0, NULL, NULL);
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -211,7 +211,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc
index 713fd9e88966f885919bfba7df3bb0386c815f9a..5069484492c50e921276378615df3972987559a3 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc
@@ -79,7 +79,7 @@ void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
   clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 200; i++) {
     clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
                                       0, NULL, NULL);
@@ -123,7 +123,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -211,7 +211,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc
index 7d5d75c53341060d5d61e21ffdd4d8123aa019a9..bad82538709cc06a07f11853c1dbd01458f034e4 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc
@@ -120,7 +120,7 @@ int main(int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -208,7 +208,7 @@ int main(int argc, char *argv[]) {
   clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
                       NULL, NULL);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll
deleted file mode 100644
index ea1e7b3b7cc4092f69dd0de9b33ad9b693bcac1c..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll
+++ /dev/null
@@ -1,894 +0,0 @@
-; ModuleID = 'build/visc_tc_vec_default/main.ll'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%"class.std::ios_base::Init" = type { i8 }
-%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" }
-%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* }
-%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" }
-%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 }
-%"struct.std::ios_base::_Words" = type { i8*, i64 }
-%"class.std::locale" = type { %"class.std::locale::_Impl"* }
-%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** }
-%"class.std::locale::facet" = type { i32 (...)**, i32 }
-%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" }
-%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 }
-%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] }
-%struct.__locale_data = type opaque
-%"class.std::num_put" = type { %"class.std::locale::facet" }
-%"class.std::num_get" = type { %"class.std::locale::facet" }
-%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
-%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
-%rtype = type {}
-%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }>
-%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] }
-%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* }
-%struct.pb_Timer = type { i32, i64, i64 }
-%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* }
-%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* }
-%"class.std::vector" = type { %"struct.std::_Vector_base" }
-%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" }
-%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* }
-%struct.pb_Parameters = type { i8*, i8** }
-
-@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1
-@__dso_handle = external global i8
-@_ZSt4cerr = external global %"class.std::basic_ostream"
-@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1
-@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1
-@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. m should be multiple of \00", align 1
-@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1
-@stderr = external global %struct._IO_FILE*
-@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1
-@_ZSt4cout = external global %"class.std::basic_ostream"
-@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
-@viscTimerSet_GenVISC = common global i8* null
-@0 = internal constant [14 x i8] c"GenVISC_Timer\00"
-
-declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0
-
-declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0
-
-; Function Attrs: nounwind
-declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 {
-entry:
-  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode()
-  %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-  %1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %2 = mul i32 %0, %1
-  %3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %4 = add i32 %2, %3
-  %mul = shl nsw i32 %4, 2
-  %5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-  %6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %7 = mul i32 %5, %6
-  %8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %9 = add i32 %7, %8
-  %cmp147 = icmp sgt i32 %k, 0
-  %add3144 = or i32 %mul, 1
-  %add8145 = or i32 %mul, 2
-  %add13146 = or i32 %mul, 3
-
-  %mul.tmp1 = insertelement <4 x i32> < i32 0, i32 0, i32 0, i32 0 >, i32 %mul, i32 0
-  %mul.tmp2 = insertelement <4 x i32> %mul.tmp1, i32 %add3144, i32 1
-  %mul.tmp3 = insertelement <4 x i32> %mul.tmp2, i32 %add8145, i32 2
-  %mul.vector = insertelement <4 x i32> %mul.tmp2, i32 %add13146, i32 3
-
-  %lda.tmp = insertelement <1 x i32> < i32 0 >, i32 %lda, i32 0
-  %lda.vector = shufflevector <1 x i32> %lda.tmp, <1 x i32> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 >
-
-  br i1 %cmp147, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-
-;  %c0.0152 = phi float [ %add23, %for.body ], [ 0.000000e+00, %entry ]
-;  %c1.0151 = phi float [ %add25, %for.body ], [ 0.000000e+00, %entry ]
-;  %c2.0150 = phi float [ %add27, %for.body ], [ 0.000000e+00, %entry ]
-;  %c3.0149 = phi float [ %add29, %for.body ], [ 0.000000e+00, %entry ]
-  %c.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ]
-
-  %10 = trunc i64 %indvars.iv to i32
-  %mul2 = mul nsw i32 %10, %lda
-
-;  %add = add nsw i32 %mul2, %mul
-;  %idxprom = sext i32 %add to i64
-;  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-;  %11 = load float* %arrayidx, align 4, !tbaa !0
-;  %add5 = add nsw i32 %mul2, %add3144
-;  %idxprom6 = sext i32 %add5 to i64
-;  %arrayidx7 = getelementptr inbounds float* %A, i64 %idxprom6
-;  %12 = load float* %arrayidx7, align 4, !tbaa !0
-;  %add10 = add nsw i32 %mul2, %add8145
-;  %idxprom11 = sext i32 %add10 to i64
-;  %arrayidx12 = getelementptr inbounds float* %A, i64 %idxprom11
-;  %13 = load float* %arrayidx12, align 4, !tbaa !0
-;  %add15 = add nsw i32 %mul2, %add13146
-;  %idxprom16 = sext i32 %add15 to i64
-;  %arrayidx17 = getelementptr inbounds float* %A, i64 %idxprom16
-;  %14 = load float* %arrayidx17, align 4, !tbaa !0
-  %add = add nsw i32 %mul2, %mul
-  %idxprom = sext i32 %add to i64
-  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-  %arrayidx.cast = bitcast float* %arrayidx to <4 x float>*
-  %11 = load <4 x float>* %arrayidx.cast, align 4
-
-  %mul18 = mul nsw i32 %10, %ldb
-  %add19 = add nsw i32 %mul18, %9
-  %idxprom20 = sext i32 %add19 to i64
-  %arrayidx21 = getelementptr inbounds float* %B, i64 %idxprom20
-;  %15 = load float* %arrayidx21, align 4, !tbaa !0
-  %12 = load float* %arrayidx21, align 4, !tbaa !0
-
-  %b.tmp = insertelement <1 x float> < float 0.000000e+00 >, float %12, i32 0
-  %b.vector = shufflevector <1 x float> %b.tmp, <1 x float> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 >
-
-;  %mul22 = fmul fast float %11, %15
-;  %add23 = fadd fast float %c0.0152, %mul22
-;  %mul24 = fmul fast float %12, %15
-;  %add25 = fadd fast float %c1.0151, %mul24
-;  %mul26 = fmul fast float %13, %15
-;  %add27 = fadd fast float %c2.0150, %mul26
-;  %mul28 = fmul fast float %14, %15
-;  %add29 = fadd fast float %c3.0149, %mul28
-  %mul22 = fmul fast <4 x float> %11, %b.vector
-  %add23 = fadd fast <4 x float> %c.vector, %mul22
-
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %k
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-;  %c0.0.lcssa = phi float [ %add23, %for.body ], [ 0.000000e+00, %entry ]
-;  %c1.0.lcssa = phi float [ %add25, %for.body ], [ 0.000000e+00, %entry ]
-;  %c2.0.lcssa = phi float [ %add27, %for.body ], [ 0.000000e+00, %entry ]
-;  %c3.0.lcssa = phi float [ %add29, %for.body ], [ 0.000000e+00, %entry ]
-  %c.end.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ]
-
-  %c0.0.lcssa = extractelement <4 x float> %c.end.vector, i32 0
-  %c1.0.lcssa = extractelement <4 x float> %c.end.vector, i32 1
-  %c2.0.lcssa = extractelement <4 x float> %c.end.vector, i32 2
-  %c3.0.lcssa = extractelement <4 x float> %c.end.vector, i32 3
-
-  %mul30 = mul nsw i32 %9, %ldc
-  %add31 = add nsw i32 %mul30, %mul
-  %idxprom32 = sext i32 %add31 to i64
-  %arrayidx33 = getelementptr inbounds float* %C, i64 %idxprom32
-
-;  %16 = load float* %arrayidx33, align 4, !tbaa !0
-;  %mul34 = fmul fast float %16, %beta
-  %13 = load float* %arrayidx33, align 4, !tbaa !0
-  %mul34 = fmul fast float %13, %beta
-
-  %mul35 = fmul fast float %c0.0.lcssa, %alpha
-  %add36 = fadd fast float %mul35, %mul34
-  store float %add36, float* %arrayidx33, align 4, !tbaa !0
-  %add43 = add nsw i32 %add3144, %mul30
-  %idxprom44 = sext i32 %add43 to i64
-  %arrayidx45 = getelementptr inbounds float* %C, i64 %idxprom44
-
-;  %17 = load float* %arrayidx45, align 4, !tbaa !0
-;  %mul46 = fmul fast float %17, %beta
-  %14 = load float* %arrayidx45, align 4, !tbaa !0
-  %mul46 = fmul fast float %14, %beta
-
-  %mul47 = fmul fast float %c1.0.lcssa, %alpha
-  %add48 = fadd fast float %mul47, %mul46
-  store float %add48, float* %arrayidx45, align 4, !tbaa !0
-  %add56 = add nsw i32 %add8145, %mul30
-  %idxprom57 = sext i32 %add56 to i64
-  %arrayidx58 = getelementptr inbounds float* %C, i64 %idxprom57
-
-;  %18 = load float* %arrayidx58, align 4, !tbaa !0
-;  %mul59 = fmul fast float %18, %beta
-  %15 = load float* %arrayidx58, align 4, !tbaa !0
-  %mul59 = fmul fast float %15, %beta
-
-  %mul60 = fmul fast float %c2.0.lcssa, %alpha
-  %add61 = fadd fast float %mul60, %mul59
-  store float %add61, float* %arrayidx58, align 4, !tbaa !0
-  %add69 = add nsw i32 %add13146, %mul30
-  %idxprom70 = sext i32 %add69 to i64
-  %arrayidx71 = getelementptr inbounds float* %C, i64 %idxprom70
-
-;  %19 = load float* %arrayidx71, align 4, !tbaa !0
-;  %mul72 = fmul fast float %19, %beta
-  %16 = load float* %arrayidx71, align 4, !tbaa !0
-  %mul72 = fmul fast float %16, %beta
-
-  %mul73 = fmul fast float %c3.0.lcssa, %alpha
-  %add74 = fadd fast float %mul73, %mul72
-  store float %add74, float* %arrayidx71, align 4, !tbaa !0
-  ret %rtype undef
-}
-
-; Function Attrs: noinline nounwind uwtable
-define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 {
-entry:
-  switch i8 %transa, label %if.then [
-    i8 78, label %if.end
-    i8 110, label %if.end
-  ]
-
-if.then:                                          ; preds = %entry
-  %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1
-  %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3
-  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
-  %0 = bitcast i8* %vbase.offset.ptr.i to i64*
-  %vbase.offset.i = load i64* %0, align 8
-  %add.ptr.i.sum = add i64 %vbase.offset.i, 240
-  %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum
-  %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"**
-  %2 = load %"class.std::ctype"** %1, align 8, !tbaa !4
-  %tobool.i97 = icmp eq %"class.std::ctype"* %2, null
-  br i1 %tobool.i97, label %if.then.i98, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-
-if.then.i98:                                      ; preds = %if.then
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit:    ; preds = %if.then
-  %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6
-  %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !1
-  %tobool.i = icmp eq i8 %3, 0
-  br i1 %tobool.i, label %if.end.i, label %if.then.i
-
-if.then.i:                                        ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-  %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10
-  %4 = load i8* %arrayidx.i, align 1, !tbaa !1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit
-
-if.end.i:                                         ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1
-  %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i71 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !3
-  %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i71, i64 6
-  %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8
-  %call.i72 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit
-
-_ZNKSt5ctypeIcE5widenEc.exit:                     ; preds = %if.end.i, %if.then.i
-  %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i72, %if.end.i ]
-  %call1.i47 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1
-  %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i47) #1
-  br label %return
-
-if.end:                                           ; preds = %entry, %entry
-  switch i8 %transb, label %if.then9 [
-    i8 84, label %if.end12
-    i8 116, label %if.end12
-  ]
-
-if.then9:                                         ; preds = %if.end
-  %call1.i49 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1
-  %vtable.i51 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3
-  %vbase.offset.ptr.i52 = getelementptr i8* %vtable.i51, i64 -24
-  %7 = bitcast i8* %vbase.offset.ptr.i52 to i64*
-  %vbase.offset.i53 = load i64* %7, align 8
-  %add.ptr.i54.sum = add i64 %vbase.offset.i53, 240
-  %_M_ctype.i73 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i54.sum
-  %8 = bitcast i8* %_M_ctype.i73 to %"class.std::ctype"**
-  %9 = load %"class.std::ctype"** %8, align 8, !tbaa !4
-  %tobool.i100 = icmp eq %"class.std::ctype"* %9, null
-  br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-
-if.then.i101:                                     ; preds = %if.then9
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then9
-  %_M_widen_ok.i75 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6
-  %10 = load i8* %_M_widen_ok.i75, align 1, !tbaa !1
-  %tobool.i76 = icmp eq i8 %10, 0
-  br i1 %tobool.i76, label %if.end.i82, label %if.then.i78
-
-if.then.i78:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-  %arrayidx.i77 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10
-  %11 = load i8* %arrayidx.i77, align 1, !tbaa !1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit84
-
-if.end.i82:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1
-  %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i79 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !3
-  %vfn.i80 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i79, i64 6
-  %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i80, align 8
-  %call.i81 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit84
-
-_ZNKSt5ctypeIcE5widenEc.exit84:                   ; preds = %if.end.i82, %if.then.i78
-  %retval.0.i83 = phi i8 [ %11, %if.then.i78 ], [ %call.i81, %if.end.i82 ]
-  %call1.i56 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i83) #1
-  %call.i57 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i56) #1
-  br label %return
-
-if.end12:                                         ; preds = %if.end, %if.end
-  %rem44 = and i32 %m, 15
-  %tobool = icmp eq i32 %rem44, 0
-  br i1 %tobool, label %lor.lhs.false, label %if.then15
-
-lor.lhs.false:                                    ; preds = %if.end12
-  %rem1345 = and i32 %n, 15
-  %tobool14 = icmp eq i32 %rem1345, 0
-  br i1 %tobool14, label %if.end21, label %if.then15
-
-if.then15:                                        ; preds = %lor.lhs.false, %if.end12
-  %call1.i59 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1
-  %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 16) #1
-  %call1.i61 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1
-  %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 16) #1
-  %14 = bitcast %"class.std::basic_ostream"* %call19 to i8**
-  %vtable.i63 = load i8** %14, align 8, !tbaa !3
-  %vbase.offset.ptr.i64 = getelementptr i8* %vtable.i63, i64 -24
-  %15 = bitcast i8* %vbase.offset.ptr.i64 to i64*
-  %vbase.offset.i65 = load i64* %15, align 8
-  %16 = bitcast %"class.std::basic_ostream"* %call19 to i8*
-  %add.ptr.i66.sum = add i64 %vbase.offset.i65, 240
-  %_M_ctype.i85 = getelementptr inbounds i8* %16, i64 %add.ptr.i66.sum
-  %17 = bitcast i8* %_M_ctype.i85 to %"class.std::ctype"**
-  %18 = load %"class.std::ctype"** %17, align 8, !tbaa !4
-  %tobool.i104 = icmp eq %"class.std::ctype"* %18, null
-  br i1 %tobool.i104, label %if.then.i105, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
-
-if.then.i105:                                     ; preds = %if.then15
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107: ; preds = %if.then15
-  %_M_widen_ok.i87 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6
-  %19 = load i8* %_M_widen_ok.i87, align 1, !tbaa !1
-  %tobool.i88 = icmp eq i8 %19, 0
-  br i1 %tobool.i88, label %if.end.i94, label %if.then.i90
-
-if.then.i90:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
-  %arrayidx.i89 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10
-  %20 = load i8* %arrayidx.i89, align 1, !tbaa !1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit96
-
-if.end.i94:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1
-  %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i91 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !3
-  %vfn.i92 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i91, i64 6
-  %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i92, align 8
-  %call.i93 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit96
-
-_ZNKSt5ctypeIcE5widenEc.exit96:                   ; preds = %if.end.i94, %if.then.i90
-  %retval.0.i95 = phi i8 [ %20, %if.then.i90 ], [ %call.i93, %if.end.i94 ]
-  %call1.i68 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i95) #1
-  %call.i69 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i68) #1
-  br label %if.end21
-
-if.end21:                                         ; preds = %_ZNKSt5ctypeIcE5widenEc.exit96, %lor.lhs.false
-  %div = sdiv i32 %m, 16
-  %mul = and i32 %div, 1073741823
-  %div22 = sdiv i32 %n, 16
-  %mul24 = and i32 %div22, 268435455
-  %conv33 = fpext float %alpha to double
-  %conv34 = fpext float %beta to double
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19)
-  %in.addr = alloca %struct.arg
-  %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  store float* %A, float** %in.addr.A
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  store i64 %bytesA, i64* %in.addr.bytes_A
-  %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  store i32 %lda, i32* %in.addr.lda
-  %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  store float* %B, float** %in.addr.B
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  store i64 %bytesB, i64* %in.addr.bytes_B
-  %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  store i32 %ldb, i32* %in.addr.ldb
-  %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  store float* %C, float** %in.addr.C
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  store i64 %bytesC, i64* %in.addr.bytes_C
-  %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-  store i32 %ldc, i32* %in.addr.ldc
-  %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  store i32 %k, i32* %in.addr.k
-  %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10
-  %in.addr.alpha.cast = fptrunc double %conv33 to float
-  store float %in.addr.alpha.cast, float* %in.addr.alpha
-  %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11
-  %in.addr.beta.cast = fptrunc double %conv34 to float
-  store float %in.addr.beta.cast, float* %in.addr.beta
-  %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12
-  store i32 4, i32* %in.addr.dimX0
-  %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13
-  store i32 16, i32* %in.addr.dimY0
-  %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14
-  store i32 %mul, i32* %in.addr.dimX1
-  %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15
-  store i32 %mul24, i32* %in.addr.dimY1
-  %args = bitcast %struct.arg* %in.addr to i8*
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args)
-  call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2)
-  br label %return
-
-return:                                           ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit84, %_ZNKSt5ctypeIcE5widenEc.exit
-  ret void
-}
-
-declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #2 {
-entry:
-  %argc.addr = alloca i32, align 4
-  %timers = alloca %struct.pb_TimerSet, align 8
-  %matArow = alloca i32, align 4
-  %matAcol = alloca i32, align 4
-  %matBrow = alloca i32, align 4
-  %matBcol = alloca i32, align 4
-  %matA = alloca %"class.std::vector", align 8
-  %matBT = alloca %"class.std::vector", align 8
-  %matC = alloca %"class.std::vector", align 8
-  store i32 %argc, i32* %argc.addr, align 4, !tbaa !5
-  %0 = bitcast %struct.pb_TimerSet* %timers to i8*
-  call void @llvm.lifetime.start(i64 800, i8* %0) #1
-  %1 = bitcast %"class.std::vector"* %matA to i8*
-  call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1
-  %2 = bitcast %"class.std::vector"* %matBT to i8*
-  call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1
-  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  %3 = call i8* @llvm_visc_initializeTimerSet()
-  store i8* %3, i8** @viscTimerSet_GenVISC
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  call void @llvm.visc.init()
-  %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1
-  %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1
-  %4 = load i8*** %inpFiles, align 8, !tbaa !4
-  %5 = load i8** %4, align 8, !tbaa !4
-  %cmp = icmp eq i8* %5, null
-  br i1 %cmp, label %if.then, label %lor.lhs.false
-
-lor.lhs.false:                                    ; preds = %entry
-  %arrayidx2 = getelementptr inbounds i8** %4, i64 1
-  %6 = load i8** %arrayidx2, align 8, !tbaa !4
-  %cmp3 = icmp eq i8* %6, null
-  br i1 %cmp3, label %if.then, label %lor.lhs.false4
-
-lor.lhs.false4:                                   ; preds = %lor.lhs.false
-  %arrayidx6 = getelementptr inbounds i8** %4, i64 2
-  %7 = load i8** %arrayidx6, align 8, !tbaa !4
-  %cmp7 = icmp eq i8* %7, null
-  br i1 %cmp7, label %if.then, label %lor.lhs.false8
-
-lor.lhs.false8:                                   ; preds = %lor.lhs.false4
-  %arrayidx10 = getelementptr inbounds i8** %4, i64 3
-  %8 = load i8** %arrayidx10, align 8, !tbaa !4
-  %cmp11 = icmp eq i8* %8, null
-  br i1 %cmp11, label %if.end, label %if.then
-
-if.then:                                          ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry
-  %9 = load %struct._IO_FILE** @stderr, align 8, !tbaa !4
-  %10 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %9)
-  call void @exit(i32 -1) #7
-  unreachable
-
-if.end:                                           ; preds = %lor.lhs.false8
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1
-  %11 = load i8*** %inpFiles, align 8, !tbaa !4
-  %12 = load i8** %11, align 8, !tbaa !4
-  %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %12, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1
-  %13 = load i8*** %inpFiles, align 8, !tbaa !4
-  %arrayidx17 = getelementptr inbounds i8** %13, i64 2
-  %14 = load i8** %arrayidx17, align 8, !tbaa !4
-  %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %14, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %15 = load i32* %matArow, align 4, !tbaa !5
-  %16 = load i32* %matAcol, align 4, !tbaa !5
-  %mul = mul nsw i32 %16, %15
-  %conv = sext i32 %mul to i64
-  %mul19 = shl nsw i64 %conv, 2
-  %17 = load i32* %matBrow, align 4, !tbaa !5
-  %18 = load i32* %matBcol, align 4, !tbaa !5
-  %mul20 = mul nsw i32 %18, %17
-  %conv21 = sext i32 %mul20 to i64
-  %mul22 = shl nsw i64 %conv21, 2
-  %mul23 = mul nsw i32 %18, %15
-  %conv24 = sext i32 %mul23 to i64
-  %mul25 = shl nsw i64 %conv24, 2
-  %19 = bitcast %"class.std::vector"* %matC to i8*
-  call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 24, i32 8, i1 false) #1
-  %cmp.i.i.i.i = icmp eq i32 %mul23, 0
-  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i
-
-cond.true.i.i.i.i:                                ; preds = %if.end
-  %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0
-  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !6
-
-if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
-  call void @_ZSt17__throw_bad_allocv() #7
-  unreachable
-
-_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i
-  %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1
-  %20 = bitcast i8* %call2.i.i.i.i.i to float*
-  br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-
-_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i:    ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end
-  %cond.i.i.i.i = phi float* [ %20, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ]
-  %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0
-  store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !4
-  %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1
-  store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4
-  %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24
-  %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2
-  store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !4
-  br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i
-
-for.body.lr.ph.i.i.i.i.i.i.i.i:                   ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-  %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7
-  %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i
-  %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24
-  %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i
-  br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
-
-vector.body.i.i.i.i.i.i.i.i:                      ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
-  %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ]
-  %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i
-  %21 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>*
-  store <4 x float> zeroinitializer, <4 x float>* %21, align 4
-  %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4
-  %22 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i
-  %23 = bitcast float* %22 to <4 x float>*
-  store <4 x float> zeroinitializer, <4 x float>* %23, align 4
-  %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8
-  %24 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i
-  br i1 %24, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
-
-middle.block.i.i.i.i.i.i.i.i:                     ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
-  %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24
-  br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader
-
-for.body.i.i.i.i.i.i.i.i.preheader:               ; preds = %middle.block.i.i.i.i.i.i.i.i
-  %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8*
-  %25 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2
-  call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %25, i32 4, i1 false)
-  br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-
-_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit:            ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-  store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4
-  %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0
-  %26 = load float** %_M_start.i.i, align 8, !tbaa !4
-  %27 = bitcast float* %26 to i8*
-  call void @llvm_visc_track_mem(i8* %27, i64 %mul19) #1
-  %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0
-  %28 = load float** %_M_start.i.i82, align 8, !tbaa !4
-  %29 = bitcast float* %28 to i8*
-  call void @llvm_visc_track_mem(i8* %29, i64 %mul22) #1
-  %30 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
-  %31 = bitcast float* %30 to i8*
-  call void @llvm_visc_track_mem(i8* %31, i64 %mul25) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %32 = load float** %_M_finish.i.i.i, align 8, !tbaa !4
-  %33 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
-  %cmp3399 = icmp eq float* %32, %33
-  br i1 %cmp3399, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-  %sub.ptr.lhs.cast.i = ptrtoint float* %32 to i64
-  %sub.ptr.rhs.cast.i = ptrtoint float* %33 to i64
-  %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i
-  %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %add.ptr.i = getelementptr inbounds float* %33, i64 %i.0100
-  store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !0
-  %inc = add i64 %i.0100, 1
-  %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i
-  br i1 %cmp33, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  %34 = load i32* %matArow, align 4, !tbaa !5
-  %35 = load i32* %matBcol, align 4, !tbaa !5
-  %36 = load i32* %matAcol, align 4, !tbaa !5
-  %37 = load float** %_M_start.i.i, align 8, !tbaa !4
-  %38 = load float** %_M_start.i.i82, align 8, !tbaa !4
-  %39 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
-  call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %34, i32 %35, i32 %36, float 1.000000e+00, float* %37, i64 %mul19, i32 %34, float* %38, i64 %mul22, i32 %35, float 0.000000e+00, float* %39, i64 %mul25, i32 %34)
-  %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0
-  %40 = load i8** %outFile, align 8, !tbaa !4
-  %tobool = icmp eq i8* %40, null
-  br i1 %tobool, label %if.end42, label %if.then38
-
-if.then38:                                        ; preds = %for.end
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1
-  %41 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
-  %42 = bitcast float* %41 to i8*
-  call void @llvm_visc_request_mem(i8* %42, i64 %mul25) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1
-  %43 = load i8** %outFile, align 8, !tbaa !4
-  %44 = load i32* %matArow, align 4, !tbaa !5
-  %45 = load i32* %matBcol, align 4, !tbaa !5
-  %call41 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %43, i32 %44, i32 %45, %"class.std::vector"* %matC) #1
-  br label %if.end42
-
-if.end42:                                         ; preds = %if.then38, %for.end
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1
-  %46 = load float** %_M_start.i.i, align 8, !tbaa !4
-  %47 = bitcast float* %46 to i8*
-  call void @llvm_visc_untrack_mem(i8* %47) #1
-  %48 = load float** %_M_start.i.i82, align 8, !tbaa !4
-  %49 = bitcast float* %48 to i8*
-  call void @llvm_visc_untrack_mem(i8* %49) #1
-  %50 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
-  %51 = bitcast float* %50 to i8*
-  call void @llvm_visc_untrack_mem(i8* %51) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2
-  %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1
-  %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1
-  %52 = load i32* %matArow, align 4, !tbaa !5
-  %conv50 = sitofp i32 %52 to double
-  %mul51 = fmul fast double %conv50, 2.000000e+00
-  %53 = load i32* %matBcol, align 4, !tbaa !5
-  %conv52 = sitofp i32 %53 to double
-  %mul53 = fmul fast double %mul51, %conv52
-  %54 = load i32* %matAcol, align 4, !tbaa !5
-  %conv54 = sitofp i32 %54 to double
-  %mul55 = fmul fast double %mul53, %conv54
-  %div = fdiv fast double %mul55, %call48
-  %div56 = fmul double %div, 1.000000e-09
-  %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1
-  %55 = bitcast %"class.std::basic_ostream"* %call.i to i8**
-  %vtable.i = load i8** %55, align 8, !tbaa !3
-  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
-  %56 = bitcast i8* %vbase.offset.ptr.i to i64*
-  %vbase.offset.i = load i64* %56, align 8
-  %57 = bitcast %"class.std::basic_ostream"* %call.i to i8*
-  %add.ptr.sum.i = add i64 %vbase.offset.i, 240
-  %_M_ctype.i.i = getelementptr inbounds i8* %57, i64 %add.ptr.sum.i
-  %58 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"**
-  %59 = load %"class.std::ctype"** %58, align 8, !tbaa !4
-  %tobool.i.i.i = icmp eq %"class.std::ctype"* %59, null
-  br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-
-if.then.i.i.i:                                    ; preds = %if.end42
-  call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end42
-  %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 6
-  %60 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !1
-  %tobool.i3.i.i = icmp eq i8 %60, 0
-  br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i
-
-if.then.i4.i.i:                                   ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-  %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 7, i64 10
-  %61 = load i8* %arrayidx.i.i.i, align 1, !tbaa !1
-  br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-
-if.end.i.i.i:                                     ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-  call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %59) #1
-  %62 = bitcast %"class.std::ctype"* %59 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %62, align 8, !tbaa !3
-  %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6
-  %63 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8
-  %call.i.i.i = call signext i8 %63(%"class.std::ctype"* %59, i8 signext 10) #1
-  br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-
-_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i
-  %retval.0.i.i.i = phi i8 [ %61, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ]
-  %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1
-  %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1
-  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0
-  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr)
-  call void @llvm.visc.cleanup()
-  call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1
-  %64 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
-  %tobool.i.i.i.i78 = icmp eq float* %64, null
-  br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79
-
-if.then.i.i.i.i79:                                ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-  %65 = bitcast float* %64 to i8*
-  call void @_ZdlPv(i8* %65) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit80
-
-_ZNSt6vectorIfSaIfEED1Ev.exit80:                  ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-  %66 = load float** %_M_start.i.i82, align 8, !tbaa !4
-  %tobool.i.i.i.i74 = icmp eq float* %66, null
-  br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75
-
-if.then.i.i.i.i75:                                ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80
-  %67 = bitcast float* %66 to i8*
-  call void @_ZdlPv(i8* %67) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit76
-
-_ZNSt6vectorIfSaIfEED1Ev.exit76:                  ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80
-  %68 = load float** %_M_start.i.i, align 8, !tbaa !4
-  %tobool.i.i.i.i = icmp eq float* %68, null
-  br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i
-
-if.then.i.i.i.i:                                  ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76
-  %69 = bitcast float* %68 to i8*
-  call void @_ZdlPv(i8* %69) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit
-
-_ZNSt6vectorIfSaIfEED1Ev.exit:                    ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76
-  call void @llvm.lifetime.end(i64 800, i8* %0) #1
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-
-declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0
-
-declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0
-
-; Function Attrs: noreturn nounwind
-declare void @exit(i32) #4
-
-declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
-
-declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0
-
-declare void @llvm_visc_track_mem(i8*, i64) #0
-
-declare void @llvm_visc_request_mem(i8*, i64) #0
-
-declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0
-
-declare void @llvm_visc_untrack_mem(i8*) #0
-
-declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0
-
-declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
-
-declare void @pb_FreeParameters(%struct.pb_Parameters*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
-
-declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0
-
-; Function Attrs: noreturn
-declare void @_ZSt17__throw_bad_allocv() #5
-
-declare noalias i8* @_Znwm(i64) #0
-
-; Function Attrs: nounwind
-declare void @_ZdlPv(i8*) #6
-
-declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0
-
-declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0
-
-; Function Attrs: noreturn
-declare void @_ZSt16__throw_bad_castv() #5
-
-declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0
-
-declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0
-
-; Function Attrs: nounwind
-define internal void @_GLOBAL__I_a() #1 section ".text.startup" {
-entry:
-  tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1
-  %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
-
-; Function Attrs: nounwind
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
-
-declare i8* @llvm_visc_initializeTimerSet()
-
-declare void @llvm_visc_switchToTimer(i8**, i32)
-
-declare void @llvm_visc_printTimerSet(i8**, i8*)
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 {
-entry:
-  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11)
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 {
-entry:
-  %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13)
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #7 = { noreturn nounwind }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"vtable pointer", metadata !2}
-!4 = metadata !{metadata !"any pointer", metadata !1}
-!5 = metadata !{metadata !"int", metadata !1}
-!6 = metadata !{metadata !"branch_weights", i32 4, i32 64}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll
deleted file mode 100644
index b6e9e3818e997156517574f16e6fd12a1bbebc52..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.visc.ll
+++ /dev/null
@@ -1,869 +0,0 @@
-; ModuleID = 'build/visc_vec_default/main.ll'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%"class.std::ios_base::Init" = type { i8 }
-%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" }
-%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* }
-%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" }
-%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 }
-%"struct.std::ios_base::_Words" = type { i8*, i64 }
-%"class.std::locale" = type { %"class.std::locale::_Impl"* }
-%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** }
-%"class.std::locale::facet" = type { i32 (...)**, i32 }
-%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" }
-%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 }
-%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] }
-%struct.__locale_data = type opaque
-%"class.std::num_put" = type { %"class.std::locale::facet" }
-%"class.std::num_get" = type { %"class.std::locale::facet" }
-%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
-%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
-%rtype = type {}
-%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }>
-%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] }
-%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* }
-%struct.pb_Timer = type { i32, i64, i64 }
-%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* }
-%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* }
-%"class.std::vector" = type { %"struct.std::_Vector_base" }
-%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" }
-%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* }
-%struct.pb_Parameters = type { i8*, i8** }
-
-@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1
-@__dso_handle = external global i8
-@_ZSt4cerr = external global %"class.std::basic_ostream"
-@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1
-@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1
-@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. m should be multiple of \00", align 1
-@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1
-@stderr = external global %struct._IO_FILE*
-@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1
-@_ZSt4cout = external global %"class.std::basic_ostream"
-@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
-@viscTimerSet_GenVISC = common global i8* null
-@0 = internal constant [14 x i8] c"GenVISC_Timer\00"
-
-declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0
-
-declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0
-
-; Function Attrs: nounwind
-declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1
-
-; Function Attrs: nounwind readnone
-declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 {
-  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode()
-  %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %a0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-  %1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %2 = mul i32 %a0, %1
-  %3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %4 = add i32 %2, %3
-  %5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-  %a6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %a7 = mul i32 %5, %a6
-  %a8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  %a9 = add i32 %a7, %a8
-
-  %a10 = shl i32 %4, 3
-
-
-  ;a10 = %3, a9 = %5 
-  ;%1 = tail call i64 @_Z13get_global_idj(i32 0) #1
-  ;%2 = shl i64 %1, 3
-  ;%3 = trunc i64 %2 to i32
-  ;%4 = tail call i64 @_Z13get_global_idj(i32 1) #1
-  ;%5 = trunc i64 %4 to i32
-
-
-
-
-  %6 = icmp sgt i32 %k, 0
-  br i1 %6, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %.lr.ph, %0
-  %cp.021 = phi <8 x float> [ %20, %.lr.ph ], [ zeroinitializer, %0 ]
-  %i.020 = phi i32 [ %21, %.lr.ph ], [ 0, %0 ]
-  %7 = mul nsw i32 %i.020, %lda
-  %8 = add nsw i32 %7, %a10
-  %9 = sext i32 %8 to i64
-  %10 = getelementptr inbounds float* %A, i64 %9
-  %v10 = bitcast float* %10 to <8 x float>*
-  %11 = load <8 x float>* %v10
-  %12 = mul nsw i32 %i.020, %ldb
-  %13 = add nsw i32 %12, %a9
-  %14 = sext i32 %13 to i64
-  %15 = getelementptr inbounds float* %B, i64 %14
-  %16 = load float* %15, align 4, !tbaa !9
-  %17 = insertelement <8 x float> undef, float %16, i32 0
-  %18 = shufflevector <8 x float> %17, <8 x float> undef, <8 x i32> zeroinitializer
-  %19 = fmul <8 x float> %11, %18
-  %20 = fadd <8 x float> %cp.021, %19
-  %21 = add nsw i32 %i.020, 1
-  %22 = icmp slt i32 %21, %k
-  br i1 %22, label %.lr.ph, label %._crit_edge
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %cp.0.lcssa = phi <8 x float> [ zeroinitializer, %0 ], [ %20, %.lr.ph ]
-  %23 = mul nsw i32 %a9, %ldc
-  %24 = add nsw i32 %23, %a10
-  %25 = sext i32 %24 to i64
-  %26 = getelementptr inbounds float* %C, i64 %25
-  %v26 = bitcast float* %26 to <8 x float>*
-  %27 = load <8 x float>* %v26
-  %28 = insertelement <8 x float> undef, float %beta, i32 0
-  %29 = shufflevector <8 x float> %28, <8 x float> undef, <8 x i32> zeroinitializer
-  %30 = insertelement <8 x float> undef, float %alpha, i32 0
-  %31 = shufflevector <8 x float> %30, <8 x float> undef, <8 x i32> zeroinitializer
-  %32 = fmul <8 x float> %31, %cp.0.lcssa
-  
-  ;%33 = tail call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %27, <8 x float> %29, <8 x float> %32)
-  
-  %mul = fmul <8 x float> %27, %29
-  %33 = fadd <8 x float> %mul, %32
-  store <8 x float> %33, <8 x float>* %v26
-
-;entry:
-  ;%_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode()
-  ;%_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  ;%0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-  ;%1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  ;%2 = mul i32 %0, %1
-  ;%3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  ;%4 = add i32 %2, %3
-  ;%5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-  ;%6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  ;%7 = mul i32 %5, %6
-  ;%8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-  ;%9 = add i32 %7, %8
-  ;%cmp32 = icmp sgt i32 %k, 0
-  ;br i1 %cmp32, label %for.body, label %for.end
-
-;for.body:                                         ; preds = %for.body, %entry
-  ;%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  ;%c.034 = phi float [ %add7, %for.body ], [ 0.000000e+00, %entry ]
-  ;%10 = trunc i64 %indvars.iv to i32
-  ;%mul = mul nsw i32 %10, %lda
-  ;%add = add nsw i32 %mul, %4
-  ;%idxprom = sext i32 %add to i64
-  ;%arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-  ;%11 = load float* %arrayidx, align 4, !tbaa !3
-  ;%mul2 = mul nsw i32 %10, %ldb
-  ;%add3 = add nsw i32 %mul2, %9
-  ;%idxprom4 = sext i32 %add3 to i64
-  ;%arrayidx5 = getelementptr inbounds float* %B, i64 %idxprom4
-  ;%12 = load float* %arrayidx5, align 4, !tbaa !3
-  ;%mul6 = fmul fast float %11, %12
-  ;%add7 = fadd fast float %c.034, %mul6
-  ;%indvars.iv.next = add i64 %indvars.iv, 1
-  ;%lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  ;%exitcond = icmp eq i32 %lftr.wideiv, %k
-  ;br i1 %exitcond, label %for.end, label %for.body
-
-;for.end:                                          ; preds = %for.body, %entry
-  ;%c.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add7, %for.body ]
-  ;%mul8 = mul nsw i32 %9, %ldc
-  ;%add9 = add nsw i32 %mul8, %4
-  ;%idxprom10 = sext i32 %add9 to i64
-  ;%arrayidx11 = getelementptr inbounds float* %C, i64 %idxprom10
-  ;%13 = load float* %arrayidx11, align 4, !tbaa !3
-  ;%mul12 = fmul fast float %13, %beta
-  ;%mul13 = fmul fast float %c.0.lcssa, %alpha
-  ;%add14 = fadd fast float %mul13, %mul12
-  ;store float %add14, float* %arrayidx11, align 4, !tbaa !3
-  ret %rtype undef
-
-
-}
-
-; Function Attrs: noinline nounwind uwtable
-define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 {
-entry:
-  switch i8 %transa, label %if.then [
-    i8 78, label %if.end
-    i8 110, label %if.end
-  ]
-
-if.then:                                          ; preds = %entry
-  %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1
-  %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6
-  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
-  %0 = bitcast i8* %vbase.offset.ptr.i to i64*
-  %vbase.offset.i = load i64* %0, align 8
-  %add.ptr.i.sum = add i64 %vbase.offset.i, 240
-  %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum
-  %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"**
-  %2 = load %"class.std::ctype"** %1, align 8, !tbaa !7
-  %tobool.i97 = icmp eq %"class.std::ctype"* %2, null
-  br i1 %tobool.i97, label %if.then.i98, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-
-if.then.i98:                                      ; preds = %if.then
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit:    ; preds = %if.then
-  %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6
-  %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !4
-  %tobool.i = icmp eq i8 %3, 0
-  br i1 %tobool.i, label %if.end.i, label %if.then.i
-
-if.then.i:                                        ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-  %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10
-  %4 = load i8* %arrayidx.i, align 1, !tbaa !4
-  br label %_ZNKSt5ctypeIcE5widenEc.exit
-
-if.end.i:                                         ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1
-  %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i71 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !6
-  %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i71, i64 6
-  %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8
-  %call.i72 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit
-
-_ZNKSt5ctypeIcE5widenEc.exit:                     ; preds = %if.end.i, %if.then.i
-  %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i72, %if.end.i ]
-  %call1.i47 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1
-  %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i47) #1
-  br label %return
-
-if.end:                                           ; preds = %entry, %entry
-  switch i8 %transb, label %if.then9 [
-    i8 84, label %if.end12
-    i8 116, label %if.end12
-  ]
-
-if.then9:                                         ; preds = %if.end
-  %call1.i49 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1
-  %vtable.i51 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6
-  %vbase.offset.ptr.i52 = getelementptr i8* %vtable.i51, i64 -24
-  %7 = bitcast i8* %vbase.offset.ptr.i52 to i64*
-  %vbase.offset.i53 = load i64* %7, align 8
-  %add.ptr.i54.sum = add i64 %vbase.offset.i53, 240
-  %_M_ctype.i73 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i54.sum
-  %8 = bitcast i8* %_M_ctype.i73 to %"class.std::ctype"**
-  %9 = load %"class.std::ctype"** %8, align 8, !tbaa !7
-  %tobool.i100 = icmp eq %"class.std::ctype"* %9, null
-  br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-
-if.then.i101:                                     ; preds = %if.then9
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then9
-  %_M_widen_ok.i75 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6
-  %10 = load i8* %_M_widen_ok.i75, align 1, !tbaa !4
-  %tobool.i76 = icmp eq i8 %10, 0
-  br i1 %tobool.i76, label %if.end.i82, label %if.then.i78
-
-if.then.i78:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-  %arrayidx.i77 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10
-  %11 = load i8* %arrayidx.i77, align 1, !tbaa !4
-  br label %_ZNKSt5ctypeIcE5widenEc.exit84
-
-if.end.i82:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1
-  %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i79 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !6
-  %vfn.i80 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i79, i64 6
-  %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i80, align 8
-  %call.i81 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit84
-
-_ZNKSt5ctypeIcE5widenEc.exit84:                   ; preds = %if.end.i82, %if.then.i78
-  %retval.0.i83 = phi i8 [ %11, %if.then.i78 ], [ %call.i81, %if.end.i82 ]
-  %call1.i56 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i83) #1
-  %call.i57 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i56) #1
-  br label %return
-
-if.end12:                                         ; preds = %if.end, %if.end
-  %rem44 = and i32 %m, 15
-  %tobool = icmp eq i32 %rem44, 0
-  br i1 %tobool, label %lor.lhs.false, label %if.then15
-
-lor.lhs.false:                                    ; preds = %if.end12
-  %rem1345 = and i32 %n, 15
-  %tobool14 = icmp eq i32 %rem1345, 0
-  br i1 %tobool14, label %if.end21, label %if.then15
-
-if.then15:                                        ; preds = %lor.lhs.false, %if.end12
-  %call1.i59 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1
-  %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 16) #1
-  %call1.i61 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1
-  %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 16) #1
-  %14 = bitcast %"class.std::basic_ostream"* %call19 to i8**
-  %vtable.i63 = load i8** %14, align 8, !tbaa !6
-  %vbase.offset.ptr.i64 = getelementptr i8* %vtable.i63, i64 -24
-  %15 = bitcast i8* %vbase.offset.ptr.i64 to i64*
-  %vbase.offset.i65 = load i64* %15, align 8
-  %16 = bitcast %"class.std::basic_ostream"* %call19 to i8*
-  %add.ptr.i66.sum = add i64 %vbase.offset.i65, 240
-  %_M_ctype.i85 = getelementptr inbounds i8* %16, i64 %add.ptr.i66.sum
-  %17 = bitcast i8* %_M_ctype.i85 to %"class.std::ctype"**
-  %18 = load %"class.std::ctype"** %17, align 8, !tbaa !7
-  %tobool.i104 = icmp eq %"class.std::ctype"* %18, null
-  br i1 %tobool.i104, label %if.then.i105, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
-
-if.then.i105:                                     ; preds = %if.then15
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107: ; preds = %if.then15
-  %_M_widen_ok.i87 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6
-  %19 = load i8* %_M_widen_ok.i87, align 1, !tbaa !4
-  %tobool.i88 = icmp eq i8 %19, 0
-  br i1 %tobool.i88, label %if.end.i94, label %if.then.i90
-
-if.then.i90:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
-  %arrayidx.i89 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10
-  %20 = load i8* %arrayidx.i89, align 1, !tbaa !4
-  br label %_ZNKSt5ctypeIcE5widenEc.exit96
-
-if.end.i94:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1
-  %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i91 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !6
-  %vfn.i92 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i91, i64 6
-  %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i92, align 8
-  %call.i93 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit96
-
-_ZNKSt5ctypeIcE5widenEc.exit96:                   ; preds = %if.end.i94, %if.then.i90
-  %retval.0.i95 = phi i8 [ %20, %if.then.i90 ], [ %call.i93, %if.end.i94 ]
-  %call1.i68 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i95) #1
-  %call.i69 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i68) #1
-  br label %if.end21
-
-if.end21:                                         ; preds = %_ZNKSt5ctypeIcE5widenEc.exit96, %lor.lhs.false
-  %div = sdiv i32 %m, 16
-  %mul = and i32 %div, 2147483647
-  %div22 = sdiv i32 %n, 16
-  %mul24 = and i32 %div22, 268435455
-  %conv33 = fpext float %alpha to double
-  %conv34 = fpext float %beta to double
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19)
-  %in.addr = alloca %struct.arg
-  %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  store float* %A, float** %in.addr.A
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  store i64 %bytesA, i64* %in.addr.bytes_A
-  %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  store i32 %lda, i32* %in.addr.lda
-  %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  store float* %B, float** %in.addr.B
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  store i64 %bytesB, i64* %in.addr.bytes_B
-  %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  store i32 %ldb, i32* %in.addr.ldb
-  %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  store float* %C, float** %in.addr.C
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  store i64 %bytesC, i64* %in.addr.bytes_C
-  %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-  store i32 %ldc, i32* %in.addr.ldc
-  %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  store i32 %k, i32* %in.addr.k
-  %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10
-  %in.addr.alpha.cast = fptrunc double %conv33 to float
-  store float %in.addr.alpha.cast, float* %in.addr.alpha
-  %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11
-  %in.addr.beta.cast = fptrunc double %conv34 to float
-  store float %in.addr.beta.cast, float* %in.addr.beta
-  %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12
-  store i32 2, i32* %in.addr.dimX0
-  %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13
-  store i32 16, i32* %in.addr.dimY0
-  %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14
-  store i32 %mul, i32* %in.addr.dimX1
-  %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15
-  store i32 %mul24, i32* %in.addr.dimY1
-  %args = bitcast %struct.arg* %in.addr to i8*
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args)
-  call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2)
-  br label %return
-
-return:                                           ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit84, %_ZNKSt5ctypeIcE5widenEc.exit
-  ret void
-}
-
-declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #2 {
-entry:
-  %argc.addr = alloca i32, align 4
-  %timers = alloca %struct.pb_TimerSet, align 8
-  %matArow = alloca i32, align 4
-  %matAcol = alloca i32, align 4
-  %matBrow = alloca i32, align 4
-  %matBcol = alloca i32, align 4
-  %matA = alloca %"class.std::vector", align 8
-  %matBT = alloca %"class.std::vector", align 8
-  %matC = alloca %"class.std::vector", align 8
-  store i32 %argc, i32* %argc.addr, align 4, !tbaa !8
-  %0 = bitcast %struct.pb_TimerSet* %timers to i8*
-  call void @llvm.lifetime.start(i64 800, i8* %0) #1
-  %1 = bitcast %"class.std::vector"* %matA to i8*
-  call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1
-  %2 = bitcast %"class.std::vector"* %matBT to i8*
-  call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1
-  %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1
-  %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1
-  %3 = load i8*** %inpFiles, align 8, !tbaa !7
-  %4 = load i8** %3, align 8, !tbaa !7
-  %cmp = icmp eq i8* %4, null
-  br i1 %cmp, label %if.then, label %lor.lhs.false
-
-lor.lhs.false:                                    ; preds = %entry
-  %arrayidx2 = getelementptr inbounds i8** %3, i64 1
-  %5 = load i8** %arrayidx2, align 8, !tbaa !7
-  %cmp3 = icmp eq i8* %5, null
-  br i1 %cmp3, label %if.then, label %lor.lhs.false4
-
-lor.lhs.false4:                                   ; preds = %lor.lhs.false
-  %arrayidx6 = getelementptr inbounds i8** %3, i64 2
-  %6 = load i8** %arrayidx6, align 8, !tbaa !7
-  %cmp7 = icmp eq i8* %6, null
-  br i1 %cmp7, label %if.then, label %lor.lhs.false8
-
-lor.lhs.false8:                                   ; preds = %lor.lhs.false4
-  %arrayidx10 = getelementptr inbounds i8** %3, i64 3
-  %7 = load i8** %arrayidx10, align 8, !tbaa !7
-  %cmp11 = icmp eq i8* %7, null
-  br i1 %cmp11, label %if.end, label %if.then
-
-if.then:                                          ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry
-  %8 = load %struct._IO_FILE** @stderr, align 8, !tbaa !7
-  %9 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %8)
-  call void @exit(i32 -1) #7
-  unreachable
-
-if.end:                                           ; preds = %lor.lhs.false8
-  %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %4, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1
-  %10 = load i8*** %inpFiles, align 8, !tbaa !7
-  %arrayidx17 = getelementptr inbounds i8** %10, i64 2
-  %11 = load i8** %arrayidx17, align 8, !tbaa !7
-  %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %11, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1
-  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  %12 = call i8* @llvm_visc_initializeTimerSet()
-  store i8* %12, i8** @viscTimerSet_GenVISC
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  call void @llvm.visc.init()
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %13 = load i32* %matArow, align 4, !tbaa !8
-  %14 = load i32* %matAcol, align 4, !tbaa !8
-  %mul = mul nsw i32 %14, %13
-  %conv = sext i32 %mul to i64
-  %mul19 = shl nsw i64 %conv, 2
-  %15 = load i32* %matBrow, align 4, !tbaa !8
-  %16 = load i32* %matBcol, align 4, !tbaa !8
-  %mul20 = mul nsw i32 %16, %15
-  %conv21 = sext i32 %mul20 to i64
-  %mul22 = shl nsw i64 %conv21, 2
-  %mul23 = mul nsw i32 %16, %13
-  %conv24 = sext i32 %mul23 to i64
-  %mul25 = shl nsw i64 %conv24, 2
-  %17 = bitcast %"class.std::vector"* %matC to i8*
-  call void @llvm.memset.p0i8.i64(i8* %17, i8 0, i64 24, i32 8, i1 false) #1
-  %cmp.i.i.i.i = icmp eq i32 %mul23, 0
-  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i
-
-cond.true.i.i.i.i:                                ; preds = %if.end
-  %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0
-  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !9
-
-if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
-  call void @_ZSt17__throw_bad_allocv() #7
-  unreachable
-
-_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i
-  %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1
-  %18 = bitcast i8* %call2.i.i.i.i.i to float*
-  br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-
-_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i:    ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end
-  %cond.i.i.i.i = phi float* [ %18, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ]
-  %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0
-  store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1
-  store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7
-  %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24
-  %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2
-  store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !7
-  br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i
-
-for.body.lr.ph.i.i.i.i.i.i.i.i:                   ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-  %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7
-  %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i
-  %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24
-  %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i
-  br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
-
-vector.body.i.i.i.i.i.i.i.i:                      ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
-  %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ]
-  %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i
-  %19 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>*
-  store <4 x float> zeroinitializer, <4 x float>* %19, align 4
-  %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4
-  %20 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i
-  %21 = bitcast float* %20 to <4 x float>*
-  store <4 x float> zeroinitializer, <4 x float>* %21, align 4
-  %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8
-  %22 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i
-  br i1 %22, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
-
-middle.block.i.i.i.i.i.i.i.i:                     ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
-  %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24
-  br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader
-
-for.body.i.i.i.i.i.i.i.i.preheader:               ; preds = %middle.block.i.i.i.i.i.i.i.i
-  %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8*
-  %23 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2
-  call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %23, i32 4, i1 false)
-  br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-
-_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit:            ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-  store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7
-  %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0
-  %24 = load float** %_M_start.i.i, align 8, !tbaa !7
-  %25 = bitcast float* %24 to i8*
-  call void @llvm_visc_track_mem(i8* %25, i64 %mul19) #1
-  %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0
-  %26 = load float** %_M_start.i.i82, align 8, !tbaa !7
-  %27 = bitcast float* %26 to i8*
-  call void @llvm_visc_track_mem(i8* %27, i64 %mul22) #1
-  %28 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %29 = bitcast float* %28 to i8*
-  call void @llvm_visc_track_mem(i8* %29, i64 %mul25) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %30 = load float** %_M_finish.i.i.i, align 8, !tbaa !7
-  %31 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %cmp3399 = icmp eq float* %30, %31
-  br i1 %cmp3399, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-  %sub.ptr.lhs.cast.i = ptrtoint float* %30 to i64
-  %sub.ptr.rhs.cast.i = ptrtoint float* %31 to i64
-  %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i
-  %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %add.ptr.i = getelementptr inbounds float* %31, i64 %i.0100
-  store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !3
-  %inc = add i64 %i.0100, 1
-  %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i
-  br i1 %cmp33, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  %32 = load i32* %matArow, align 4, !tbaa !8
-  %33 = load i32* %matBcol, align 4, !tbaa !8
-  %34 = load i32* %matAcol, align 4, !tbaa !8
-  %35 = load float** %_M_start.i.i, align 8, !tbaa !7
-  %36 = load float** %_M_start.i.i82, align 8, !tbaa !7
-  %37 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %32, i32 %33, i32 %34, float 1.000000e+00, float* %35, i64 %mul19, i32 %32, float* %36, i64 %mul22, i32 %33, float 0.000000e+00, float* %37, i64 %mul25, i32 %32)
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1
-  %38 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %39 = bitcast float* %38 to i8*
-  call void @llvm_visc_request_mem(i8* %39, i64 %mul25) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1
-  %40 = load float** %_M_start.i.i, align 8, !tbaa !7
-  %41 = bitcast float* %40 to i8*
-  call void @llvm_visc_untrack_mem(i8* %41) #1
-  %42 = load float** %_M_start.i.i82, align 8, !tbaa !7
-  %43 = bitcast float* %42 to i8*
-  call void @llvm_visc_untrack_mem(i8* %43) #1
-  %44 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %45 = bitcast float* %44 to i8*
-  call void @llvm_visc_untrack_mem(i8* %45) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0
-  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr)
-  call void @llvm.visc.cleanup()
-  %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0
-  %46 = load i8** %outFile, align 8, !tbaa !7
-  %tobool = icmp eq i8* %46, null
-  br i1 %tobool, label %if.end45, label %if.then42
-
-if.then42:                                        ; preds = %for.end
-  %47 = load i32* %matArow, align 4, !tbaa !8
-  %48 = load i32* %matBcol, align 4, !tbaa !8
-  %call44 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %46, i32 %47, i32 %48, %"class.std::vector"* %matC) #1
-  br label %if.end45
-
-if.end45:                                         ; preds = %if.then42, %for.end
-  %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2
-  %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1
-  %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1
-  %49 = load i32* %matArow, align 4, !tbaa !8
-  %conv50 = sitofp i32 %49 to double
-  %mul51 = fmul fast double %conv50, 2.000000e+00
-  %50 = load i32* %matBcol, align 4, !tbaa !8
-  %conv52 = sitofp i32 %50 to double
-  %mul53 = fmul fast double %mul51, %conv52
-  %51 = load i32* %matAcol, align 4, !tbaa !8
-  %conv54 = sitofp i32 %51 to double
-  %mul55 = fmul fast double %mul53, %conv54
-  %div = fdiv fast double %mul55, %call48
-  %div56 = fmul double %div, 1.000000e-09
-  %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1
-  %52 = bitcast %"class.std::basic_ostream"* %call.i to i8**
-  %vtable.i = load i8** %52, align 8, !tbaa !6
-  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
-  %53 = bitcast i8* %vbase.offset.ptr.i to i64*
-  %vbase.offset.i = load i64* %53, align 8
-  %54 = bitcast %"class.std::basic_ostream"* %call.i to i8*
-  %add.ptr.sum.i = add i64 %vbase.offset.i, 240
-  %_M_ctype.i.i = getelementptr inbounds i8* %54, i64 %add.ptr.sum.i
-  %55 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"**
-  %56 = load %"class.std::ctype"** %55, align 8, !tbaa !7
-  %tobool.i.i.i = icmp eq %"class.std::ctype"* %56, null
-  br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-
-if.then.i.i.i:                                    ; preds = %if.end45
-  call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end45
-  %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 6
-  %57 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !4
-  %tobool.i3.i.i = icmp eq i8 %57, 0
-  br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i
-
-if.then.i4.i.i:                                   ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-  %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 7, i64 10
-  %58 = load i8* %arrayidx.i.i.i, align 1, !tbaa !4
-  br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-
-if.end.i.i.i:                                     ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-  call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %56) #1
-  %59 = bitcast %"class.std::ctype"* %56 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %59, align 8, !tbaa !6
-  %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6
-  %60 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8
-  %call.i.i.i = call signext i8 %60(%"class.std::ctype"* %56, i8 signext 10) #1
-  br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-
-_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i
-  %retval.0.i.i.i = phi i8 [ %58, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ]
-  %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1
-  %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1
-  call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1
-  %61 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %tobool.i.i.i.i78 = icmp eq float* %61, null
-  br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79
-
-if.then.i.i.i.i79:                                ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-  %62 = bitcast float* %61 to i8*
-  call void @_ZdlPv(i8* %62) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit80
-
-_ZNSt6vectorIfSaIfEED1Ev.exit80:                  ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-  %63 = load float** %_M_start.i.i82, align 8, !tbaa !7
-  %tobool.i.i.i.i74 = icmp eq float* %63, null
-  br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75
-
-if.then.i.i.i.i75:                                ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80
-  %64 = bitcast float* %63 to i8*
-  call void @_ZdlPv(i8* %64) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit76
-
-_ZNSt6vectorIfSaIfEED1Ev.exit76:                  ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80
-  %65 = load float** %_M_start.i.i, align 8, !tbaa !7
-  %tobool.i.i.i.i = icmp eq float* %65, null
-  br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i
-
-if.then.i.i.i.i:                                  ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76
-  %66 = bitcast float* %65 to i8*
-  call void @_ZdlPv(i8* %66) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit
-
-_ZNSt6vectorIfSaIfEED1Ev.exit:                    ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76
-  call void @llvm.lifetime.end(i64 800, i8* %0) #1
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-
-declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0
-
-; Function Attrs: noreturn nounwind
-declare void @exit(i32) #4
-
-declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0
-
-declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0
-
-declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
-
-declare void @llvm_visc_track_mem(i8*, i64) #0
-
-declare void @llvm_visc_request_mem(i8*, i64) #0
-
-declare void @llvm_visc_untrack_mem(i8*) #0
-
-declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
-
-declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0
-
-declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0
-
-declare void @pb_FreeParameters(%struct.pb_Parameters*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
-
-declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0
-
-; Function Attrs: noreturn
-declare void @_ZSt17__throw_bad_allocv() #5
-
-declare noalias i8* @_Znwm(i64) #0
-
-; Function Attrs: nounwind
-declare void @_ZdlPv(i8*) #6
-
-declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0
-
-declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0
-
-; Function Attrs: noreturn
-declare void @_ZSt16__throw_bad_castv() #5
-
-declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0
-
-declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0
-
-; Function Attrs: nounwind
-define internal void @_GLOBAL__I_a() #1 section ".text.startup" {
-entry:
-  tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1
-  %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
-
-; Function Attrs: nounwind
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
-
-declare i8* @llvm_visc_initializeTimerSet()
-
-declare void @llvm_visc_switchToTimer(i8**, i32)
-
-declare void @llvm_visc_printTimerSet(i8**, i8*)
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 {
-entry:
-  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11)
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 {
-entry:
-  %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13)
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #7 = { noreturn nounwind }
-
-!visc_hint_gpu = !{!0, !1}
-!visc_hint_cpu = !{!2}
-
-!0 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff}
-!1 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1}
-!2 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2}
-!3 = metadata !{metadata !"float", metadata !4}
-!4 = metadata !{metadata !"omnipotent char", metadata !5}
-!5 = metadata !{metadata !"Simple C/C++ TBAA"}
-!6 = metadata !{metadata !"vtable pointer", metadata !5}
-!7 = metadata !{metadata !"any pointer", metadata !4}
-!8 = metadata !{metadata !"int", metadata !4}
-!9 = metadata !{metadata !"branch_weights", i32 4, i32 64}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile
deleted file mode 100644
index f74ee8921a534b6963ba06d089398114571d070b..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# (c) 2010 The Board of Trustees of the University of Illinois.
-
-LANGUAGE=visc
-SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll
deleted file mode 100644
index b997cf7ebcabcb339e90258dd78f0b141483bbf9..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.visc.ll
+++ /dev/null
@@ -1,889 +0,0 @@
-; ModuleID = 'build/visc_vec_opt_default/main.ll'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%"class.std::ios_base::Init" = type { i8 }
-%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" }
-%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* }
-%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" }
-%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 }
-%"struct.std::ios_base::_Words" = type { i8*, i64 }
-%"class.std::locale" = type { %"class.std::locale::_Impl"* }
-%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** }
-%"class.std::locale::facet" = type { i32 (...)**, i32 }
-%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" }
-%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 }
-%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] }
-%struct.__locale_data = type opaque
-%"class.std::num_put" = type { %"class.std::locale::facet" }
-%"class.std::num_get" = type { %"class.std::locale::facet" }
-%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
-%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
-%rtype = type {}
-%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }>
-%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] }
-%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* }
-%struct.pb_Timer = type { i32, i64, i64 }
-%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* }
-%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* }
-%"class.std::vector" = type { %"struct.std::_Vector_base" }
-%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" }
-%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* }
-%struct.pb_Parameters = type { i8*, i8** }
-
-@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1
-@__dso_handle = external global i8
-@_ZSt4cerr = external global %"class.std::basic_ostream"
-@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1
-@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1
-@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. m should be multiple of \00", align 1
-@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1
-@stderr = external global %struct._IO_FILE*
-@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1
-@_ZSt4cout = external global %"class.std::basic_ostream"
-@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
-@viscTimerSet_GenVISC = common global i8* null
-@0 = internal constant [14 x i8] c"GenVISC_Timer\00"
-
-declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0
-
-declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0
-
-; Function Attrs: nounwind
-declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 {
-entry:
-  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode()
-  %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-
-;  %call = call i32 @get_local_id(i32 1) #2
-  %call = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-
-;  %call1 = call i32 @get_local_size(i32 0) #2
-  %call1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-
-  %mul = mul i32 %call1, %call
-
-;  %call2 = call i32 @get_local_id(i32 0) #2
-  %call2 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
-
-  %add = add i32 %mul, %call2
-
-;  %call3 = call i32 @get_group_id(i32 0) #2
-  %call3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-
-  %mul4 = shl i32 %call3, 6
-  %add5 = add i32 %add, %mul4
-  %cmp89 = icmp sgt i32 %k, 0
-
-  %call6 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-  %mul7 = shl i32 %call6, 3
-
-  br i1 %cmp89, label %for.body, label %for.end23
-
-for.body:                                         ; preds = %entry, %for.inc21
-  %cp.091 = phi <8 x float> [ %add20, %for.inc21 ], [ zeroinitializer, %entry ]
-  %i.090 = phi i32 [ %add22, %for.inc21 ], [ 0, %entry ]
-;  %call6 = call i32 @get_group_id(i32 1) #2
-;  %call6 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-;  %mul7 = shl i32 %call6, 3
-  br label %for.body12
-
-for.body12:                                       ; preds = %for.body12, %for.body
-  %cp.188 = phi <8 x float> [ %cp.091, %for.body ], [ %add20, %for.body12 ]
-  %j.087 = phi i32 [ 0, %for.body ], [ %inc, %for.body12 ]
-  %add13 = add i32 %j.087, %i.090
-  %mul14 = mul nsw i32 %add13, %lda
-  %add15 = add nsw i32 %mul14, %add5
-  %arrayidx = getelementptr inbounds float* %A, i32 %add15
-  %0 = load float* %arrayidx, align 4, !tbaa !3
-  %splat.splatinsert = insertelement <8 x float> undef, float %0, i32 0
-  %splat.splat = shufflevector <8 x float> %splat.splatinsert, <8 x float> undef, <8 x i32> zeroinitializer
-  %tmp83 = mul i32 %add13, %ldb
-  %add.ptr.sum = add i32 %tmp83, %mul7
-  %add.ptr17 = getelementptr inbounds float* %B, i32 %add.ptr.sum
-
-;  %call18 = call <8 x float> @_Z6vload8jPKU3AS1f(i32 0, float* %add.ptr17) #2
-  %add.ptr17.cast = bitcast float* %add.ptr17 to <8 x float>*
-  %call18 = load <8 x float>* %add.ptr17.cast, align 8
-
-  %mul19 = fmul fast <8 x float> %call18, %splat.splat
-  %add20 = fadd fast <8 x float> %cp.188, %mul19
-  %inc = add nsw i32 %j.087, 1
-  %exitcond92 = icmp eq i32 %inc, 8
-  br i1 %exitcond92, label %for.inc21, label %for.body12
-
-for.inc21:                                        ; preds = %for.body12
-  %add22 = add nsw i32 %i.090, 8
-  %cmp = icmp slt i32 %add22, %k
-  br i1 %cmp, label %for.body, label %for.end23
-
-for.end23:                                        ; preds = %for.inc21, %entry
-  %cp.0.lcssa = phi <8 x float> [ zeroinitializer, %entry ], [ %add20, %for.inc21 ]
-  %splat.splatinsert24 = insertelement <8 x float> undef, float %alpha, i32 0
-  %splat.splat25 = shufflevector <8 x float> %splat.splatinsert24, <8 x float> undef, <8 x i32> zeroinitializer
-  %mul26 = fmul fast <8 x float> %splat.splat25, %cp.0.lcssa
-  %1 = extractelement <8 x float> %mul26, i32 0
-  %2 = extractelement <8 x float> %mul26, i32 1
-  %3 = extractelement <8 x float> %mul26, i32 2
-  %4 = extractelement <8 x float> %mul26, i32 3
-  %5 = extractelement <8 x float> %mul26, i32 4
-  %6 = extractelement <8 x float> %mul26, i32 5
-  %7 = extractelement <8 x float> %mul26, i32 6
-  %8 = extractelement <8 x float> %mul26, i32 7
-;  %call35 = call i32 @get_group_id(i32 1) #2
-;  %call35 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
-  %mul37 = shl i32 %call6, 3
-  %tmp85 = mul i32 %mul37, %ldc
-  %add44 = add i32 %tmp85, %add5
-  %arrayidx45 = getelementptr inbounds float* %C, i32 %add44
-  %9 = load float* %arrayidx45, align 4, !tbaa !3
-  %mul46 = fmul fast float %9, %beta
-  %add48 = fadd fast float %mul46, %1
-  store float %add48, float* %arrayidx45, align 4, !tbaa !3
-  %tmp84.193 = or i32 %mul37, 1
-  %tmp85.1 = mul i32 %tmp84.193, %ldc
-  %add44.1 = add i32 %tmp85.1, %add5
-  %arrayidx45.1 = getelementptr inbounds float* %C, i32 %add44.1
-  %10 = load float* %arrayidx45.1, align 4, !tbaa !3
-  %mul46.1 = fmul fast float %10, %beta
-  %add48.1 = fadd fast float %mul46.1, %2
-  store float %add48.1, float* %arrayidx45.1, align 4, !tbaa !3
-  %tmp84.294 = or i32 %mul37, 2
-  %tmp85.2 = mul i32 %tmp84.294, %ldc
-  %add44.2 = add i32 %tmp85.2, %add5
-  %arrayidx45.2 = getelementptr inbounds float* %C, i32 %add44.2
-  %11 = load float* %arrayidx45.2, align 4, !tbaa !3
-  %mul46.2 = fmul fast float %11, %beta
-  %add48.2 = fadd fast float %mul46.2, %3
-  store float %add48.2, float* %arrayidx45.2, align 4, !tbaa !3
-  %tmp84.395 = or i32 %mul37, 3
-  %tmp85.3 = mul i32 %tmp84.395, %ldc
-  %add44.3 = add i32 %tmp85.3, %add5
-  %arrayidx45.3 = getelementptr inbounds float* %C, i32 %add44.3
-  %12 = load float* %arrayidx45.3, align 4, !tbaa !3
-  %mul46.3 = fmul fast float %12, %beta
-  %add48.3 = fadd fast float %mul46.3, %4
-  store float %add48.3, float* %arrayidx45.3, align 4, !tbaa !3
-  %tmp84.496 = or i32 %mul37, 4
-  %tmp85.4 = mul i32 %tmp84.496, %ldc
-  %add44.4 = add i32 %tmp85.4, %add5
-  %arrayidx45.4 = getelementptr inbounds float* %C, i32 %add44.4
-  %13 = load float* %arrayidx45.4, align 4, !tbaa !3
-  %mul46.4 = fmul fast float %13, %beta
-  %add48.4 = fadd fast float %mul46.4, %5
-  store float %add48.4, float* %arrayidx45.4, align 4, !tbaa !3
-  %tmp84.597 = or i32 %mul37, 5
-  %tmp85.5 = mul i32 %tmp84.597, %ldc
-  %add44.5 = add i32 %tmp85.5, %add5
-  %arrayidx45.5 = getelementptr inbounds float* %C, i32 %add44.5
-  %14 = load float* %arrayidx45.5, align 4, !tbaa !3
-  %mul46.5 = fmul fast float %14, %beta
-  %add48.5 = fadd fast float %mul46.5, %6
-  store float %add48.5, float* %arrayidx45.5, align 4, !tbaa !3
-  %tmp84.698 = or i32 %mul37, 6
-  %tmp85.6 = mul i32 %tmp84.698, %ldc
-  %add44.6 = add i32 %tmp85.6, %add5
-  %arrayidx45.6 = getelementptr inbounds float* %C, i32 %add44.6
-  %15 = load float* %arrayidx45.6, align 4, !tbaa !3
-  %mul46.6 = fmul fast float %15, %beta
-  %add48.6 = fadd fast float %mul46.6, %7
-  store float %add48.6, float* %arrayidx45.6, align 4, !tbaa !3
-  %tmp84.799 = or i32 %mul37, 7
-  %tmp85.7 = mul i32 %tmp84.799, %ldc
-  %add44.7 = add i32 %tmp85.7, %add5
-  %arrayidx45.7 = getelementptr inbounds float* %C, i32 %add44.7
-  %16 = load float* %arrayidx45.7, align 4, !tbaa !3
-  %mul46.7 = fmul fast float %16, %beta
-  %add48.7 = fadd fast float %mul46.7, %8
-  store float %add48.7, float* %arrayidx45.7, align 4, !tbaa !3
-
-
-  ret %rtype undef
-}
-
-; Function Attrs: noinline nounwind uwtable
-define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 {
-entry:
-  switch i8 %transa, label %if.then [
-    i8 78, label %if.end
-    i8 110, label %if.end
-  ]
-
-if.then:                                          ; preds = %entry
-  %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1
-  %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6
-  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
-  %0 = bitcast i8* %vbase.offset.ptr.i to i64*
-  %vbase.offset.i = load i64* %0, align 8
-  %add.ptr.i.sum = add i64 %vbase.offset.i, 240
-  %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum
-  %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"**
-  %2 = load %"class.std::ctype"** %1, align 8, !tbaa !7
-  %tobool.i93 = icmp eq %"class.std::ctype"* %2, null
-  br i1 %tobool.i93, label %if.then.i94, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-
-if.then.i94:                                      ; preds = %if.then
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit:    ; preds = %if.then
-  %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6
-  %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !4
-  %tobool.i = icmp eq i8 %3, 0
-  br i1 %tobool.i, label %if.end.i, label %if.then.i
-
-if.then.i:                                        ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-  %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10
-  %4 = load i8* %arrayidx.i, align 1, !tbaa !4
-  br label %_ZNKSt5ctypeIcE5widenEc.exit
-
-if.end.i:                                         ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1
-  %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i67 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !6
-  %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i67, i64 6
-  %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8
-  %call.i68 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit
-
-_ZNKSt5ctypeIcE5widenEc.exit:                     ; preds = %if.end.i, %if.then.i
-  %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i68, %if.end.i ]
-  %call1.i43 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1
-  %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i43) #1
-  br label %return
-
-if.end:                                           ; preds = %entry, %entry
-  switch i8 %transb, label %if.then9 [
-    i8 84, label %if.end12
-    i8 116, label %if.end12
-  ]
-
-if.then9:                                         ; preds = %if.end
-  %call1.i45 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1
-  %vtable.i47 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !6
-  %vbase.offset.ptr.i48 = getelementptr i8* %vtable.i47, i64 -24
-  %7 = bitcast i8* %vbase.offset.ptr.i48 to i64*
-  %vbase.offset.i49 = load i64* %7, align 8
-  %add.ptr.i50.sum = add i64 %vbase.offset.i49, 240
-  %_M_ctype.i69 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i50.sum
-  %8 = bitcast i8* %_M_ctype.i69 to %"class.std::ctype"**
-  %9 = load %"class.std::ctype"** %8, align 8, !tbaa !7
-  %tobool.i96 = icmp eq %"class.std::ctype"* %9, null
-  br i1 %tobool.i96, label %if.then.i97, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99
-
-if.then.i97:                                      ; preds = %if.then9
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99:  ; preds = %if.then9
-  %_M_widen_ok.i71 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6
-  %10 = load i8* %_M_widen_ok.i71, align 1, !tbaa !4
-  %tobool.i72 = icmp eq i8 %10, 0
-  br i1 %tobool.i72, label %if.end.i78, label %if.then.i74
-
-if.then.i74:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99
-  %arrayidx.i73 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10
-  %11 = load i8* %arrayidx.i73, align 1, !tbaa !4
-  br label %_ZNKSt5ctypeIcE5widenEc.exit80
-
-if.end.i78:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit99
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1
-  %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i75 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !6
-  %vfn.i76 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i75, i64 6
-  %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i76, align 8
-  %call.i77 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit80
-
-_ZNKSt5ctypeIcE5widenEc.exit80:                   ; preds = %if.end.i78, %if.then.i74
-  %retval.0.i79 = phi i8 [ %11, %if.then.i74 ], [ %call.i77, %if.end.i78 ]
-  %call1.i52 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i79) #1
-  %call.i53 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i52) #1
-  br label %return
-
-if.end12:                                         ; preds = %if.end, %if.end
-  %rem40 = and i32 %m, 63
-  %tobool = icmp eq i32 %rem40, 0
-  br i1 %tobool, label %lor.lhs.false, label %if.then15
-
-lor.lhs.false:                                    ; preds = %if.end12
-  %rem1341 = and i32 %n, 7
-  %tobool14 = icmp eq i32 %rem1341, 0
-  br i1 %tobool14, label %if.end21, label %if.then15
-
-if.then15:                                        ; preds = %lor.lhs.false, %if.end12
-  %call1.i55 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1
-  %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 64) #1
-  %call1.i57 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1
-  %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 8) #1
-  %14 = bitcast %"class.std::basic_ostream"* %call19 to i8**
-  %vtable.i59 = load i8** %14, align 8, !tbaa !6
-  %vbase.offset.ptr.i60 = getelementptr i8* %vtable.i59, i64 -24
-  %15 = bitcast i8* %vbase.offset.ptr.i60 to i64*
-  %vbase.offset.i61 = load i64* %15, align 8
-  %16 = bitcast %"class.std::basic_ostream"* %call19 to i8*
-  %add.ptr.i62.sum = add i64 %vbase.offset.i61, 240
-  %_M_ctype.i81 = getelementptr inbounds i8* %16, i64 %add.ptr.i62.sum
-  %17 = bitcast i8* %_M_ctype.i81 to %"class.std::ctype"**
-  %18 = load %"class.std::ctype"** %17, align 8, !tbaa !7
-  %tobool.i100 = icmp eq %"class.std::ctype"* %18, null
-  br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-
-if.then.i101:                                     ; preds = %if.then15
-  tail call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then15
-  %_M_widen_ok.i83 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6
-  %19 = load i8* %_M_widen_ok.i83, align 1, !tbaa !4
-  %tobool.i84 = icmp eq i8 %19, 0
-  br i1 %tobool.i84, label %if.end.i90, label %if.then.i86
-
-if.then.i86:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-  %arrayidx.i85 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10
-  %20 = load i8* %arrayidx.i85, align 1, !tbaa !4
-  br label %_ZNKSt5ctypeIcE5widenEc.exit92
-
-if.end.i90:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
-  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1
-  %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i87 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !6
-  %vfn.i88 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i87, i64 6
-  %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i88, align 8
-  %call.i89 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1
-  br label %_ZNKSt5ctypeIcE5widenEc.exit92
-
-_ZNKSt5ctypeIcE5widenEc.exit92:                   ; preds = %if.end.i90, %if.then.i86
-  %retval.0.i91 = phi i8 [ %20, %if.then.i86 ], [ %call.i89, %if.end.i90 ]
-  %call1.i64 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i91) #1
-  %call.i65 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i64) #1
-  br label %return
-
-if.end21:                                         ; preds = %lor.lhs.false
-  %mul = shl nsw i32 %m, 3
-  %div = sdiv i32 %mul, 64
-  %div27 = lshr i32 %div, 3
-  %div30 = lshr i32 %n, 3
-  %conv31 = fpext float %alpha to double
-  %conv32 = fpext float %beta to double
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19)
-  %in.addr = alloca %struct.arg
-  %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  store float* %A, float** %in.addr.A
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  store i64 %bytesA, i64* %in.addr.bytes_A
-  %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  store i32 %lda, i32* %in.addr.lda
-  %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  store float* %B, float** %in.addr.B
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  store i64 %bytesB, i64* %in.addr.bytes_B
-  %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  store i32 %ldb, i32* %in.addr.ldb
-  %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  store float* %C, float** %in.addr.C
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  store i64 %bytesC, i64* %in.addr.bytes_C
-  %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-  store i32 %ldc, i32* %in.addr.ldc
-  %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  store i32 %k, i32* %in.addr.k
-  %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10
-  %in.addr.alpha.cast = fptrunc double %conv31 to float
-  store float %in.addr.alpha.cast, float* %in.addr.alpha
-  %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11
-  %in.addr.beta.cast = fptrunc double %conv32 to float
-  store float %in.addr.beta.cast, float* %in.addr.beta
-  %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12
-  store i32 8, i32* %in.addr.dimX0
-  %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13
-  store i32 8, i32* %in.addr.dimY0
-  %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14
-  store i32 %div27, i32* %in.addr.dimX1
-  %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15
-  store i32 %div30, i32* %in.addr.dimY1
-  %args = bitcast %struct.arg* %in.addr to i8*
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args)
-  call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2)
-  br label %return
-
-return:                                           ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit92, %_ZNKSt5ctypeIcE5widenEc.exit80, %_ZNKSt5ctypeIcE5widenEc.exit
-  ret void
-}
-
-declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #2 {
-entry:
-  %argc.addr = alloca i32, align 4
-  %timers = alloca %struct.pb_TimerSet, align 8
-  %matArow = alloca i32, align 4
-  %matAcol = alloca i32, align 4
-  %matBrow = alloca i32, align 4
-  %matBcol = alloca i32, align 4
-  %matA = alloca %"class.std::vector", align 8
-  %matBT = alloca %"class.std::vector", align 8
-  %matC = alloca %"class.std::vector", align 8
-  store i32 %argc, i32* %argc.addr, align 4, !tbaa !8
-  %0 = bitcast %struct.pb_TimerSet* %timers to i8*
-  call void @llvm.lifetime.start(i64 800, i8* %0) #1
-  %1 = bitcast %"class.std::vector"* %matA to i8*
-  call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1
-  %2 = bitcast %"class.std::vector"* %matBT to i8*
-  call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1
-  %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1
-  %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1
-  %3 = load i8*** %inpFiles, align 8, !tbaa !7
-  %4 = load i8** %3, align 8, !tbaa !7
-  %cmp = icmp eq i8* %4, null
-  br i1 %cmp, label %if.then, label %lor.lhs.false
-
-lor.lhs.false:                                    ; preds = %entry
-  %arrayidx2 = getelementptr inbounds i8** %3, i64 1
-  %5 = load i8** %arrayidx2, align 8, !tbaa !7
-  %cmp3 = icmp eq i8* %5, null
-  br i1 %cmp3, label %if.then, label %lor.lhs.false4
-
-lor.lhs.false4:                                   ; preds = %lor.lhs.false
-  %arrayidx6 = getelementptr inbounds i8** %3, i64 2
-  %6 = load i8** %arrayidx6, align 8, !tbaa !7
-  %cmp7 = icmp eq i8* %6, null
-  br i1 %cmp7, label %if.then, label %lor.lhs.false8
-
-lor.lhs.false8:                                   ; preds = %lor.lhs.false4
-  %arrayidx10 = getelementptr inbounds i8** %3, i64 3
-  %7 = load i8** %arrayidx10, align 8, !tbaa !7
-  %cmp11 = icmp eq i8* %7, null
-  br i1 %cmp11, label %if.end, label %if.then
-
-if.then:                                          ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry
-  %8 = load %struct._IO_FILE** @stderr, align 8, !tbaa !7
-  %9 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %8)
-  call void @exit(i32 -1) #7
-  unreachable
-
-if.end:                                           ; preds = %lor.lhs.false8
-  %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %4, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1
-  %10 = load i8*** %inpFiles, align 8, !tbaa !7
-  %arrayidx17 = getelementptr inbounds i8** %10, i64 2
-  %11 = load i8** %arrayidx17, align 8, !tbaa !7
-  %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %11, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1
-  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  %12 = call i8* @llvm_visc_initializeTimerSet()
-  store i8* %12, i8** @viscTimerSet_GenVISC
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  call void @llvm.visc.init()
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %13 = load i32* %matArow, align 4, !tbaa !8
-  %14 = load i32* %matAcol, align 4, !tbaa !8
-  %mul = mul nsw i32 %14, %13
-  %conv = sext i32 %mul to i64
-  %mul19 = shl nsw i64 %conv, 2
-  %15 = load i32* %matBrow, align 4, !tbaa !8
-  %16 = load i32* %matBcol, align 4, !tbaa !8
-  %mul20 = mul nsw i32 %16, %15
-  %conv21 = sext i32 %mul20 to i64
-  %mul22 = shl nsw i64 %conv21, 2
-  %mul23 = mul nsw i32 %16, %13
-  %conv24 = sext i32 %mul23 to i64
-  %mul25 = shl nsw i64 %conv24, 2
-  %17 = bitcast %"class.std::vector"* %matC to i8*
-  call void @llvm.memset.p0i8.i64(i8* %17, i8 0, i64 24, i32 8, i1 false) #1
-  %cmp.i.i.i.i = icmp eq i32 %mul23, 0
-  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i
-
-cond.true.i.i.i.i:                                ; preds = %if.end
-  %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0
-  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !9
-
-if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
-  call void @_ZSt17__throw_bad_allocv() #7
-  unreachable
-
-_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i
-  %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1
-  %18 = bitcast i8* %call2.i.i.i.i.i to float*
-  br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-
-_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i:    ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end
-  %cond.i.i.i.i = phi float* [ %18, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ]
-  %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0
-  store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1
-  store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7
-  %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24
-  %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2
-  store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !7
-  br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i
-
-for.body.lr.ph.i.i.i.i.i.i.i.i:                   ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-  %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7
-  %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i
-  %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24
-  %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i
-  br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
-
-vector.body.i.i.i.i.i.i.i.i:                      ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
-  %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ]
-  %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i
-  %19 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>*
-  store <4 x float> zeroinitializer, <4 x float>* %19, align 4
-  %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4
-  %20 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i
-  %21 = bitcast float* %20 to <4 x float>*
-  store <4 x float> zeroinitializer, <4 x float>* %21, align 4
-  %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8
-  %22 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i
-  br i1 %22, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
-
-middle.block.i.i.i.i.i.i.i.i:                     ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
-  %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
-  %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24
-  br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader
-
-for.body.i.i.i.i.i.i.i.i.preheader:               ; preds = %middle.block.i.i.i.i.i.i.i.i
-  %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8*
-  %23 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2
-  call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %23, i32 4, i1 false)
-  br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-
-_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit:            ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
-  store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !7
-  %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0
-  %24 = load float** %_M_start.i.i, align 8, !tbaa !7
-  %25 = bitcast float* %24 to i8*
-  call void @llvm_visc_track_mem(i8* %25, i64 %mul19) #1
-  %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0
-  %26 = load float** %_M_start.i.i82, align 8, !tbaa !7
-  %27 = bitcast float* %26 to i8*
-  call void @llvm_visc_track_mem(i8* %27, i64 %mul22) #1
-  %28 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %29 = bitcast float* %28 to i8*
-  call void @llvm_visc_track_mem(i8* %29, i64 %mul25) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %30 = load float** %_M_finish.i.i.i, align 8, !tbaa !7
-  %31 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %cmp3399 = icmp eq float* %30, %31
-  br i1 %cmp3399, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-  %sub.ptr.lhs.cast.i = ptrtoint float* %30 to i64
-  %sub.ptr.rhs.cast.i = ptrtoint float* %31 to i64
-  %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i
-  %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  %add.ptr.i = getelementptr inbounds float* %31, i64 %i.0100
-  store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !3
-  %inc = add i64 %i.0100, 1
-  %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i
-  br i1 %cmp33, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  %32 = load i32* %matArow, align 4, !tbaa !8
-  %33 = load i32* %matBcol, align 4, !tbaa !8
-  %34 = load i32* %matAcol, align 4, !tbaa !8
-  %35 = load float** %_M_start.i.i, align 8, !tbaa !7
-  %36 = load float** %_M_start.i.i82, align 8, !tbaa !7
-  %37 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %32, i32 %33, i32 %34, float 1.000000e+00, float* %35, i64 %mul19, i32 %32, float* %36, i64 %mul22, i32 %33, float 0.000000e+00, float* %37, i64 %mul25, i32 %32)
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1
-  %38 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %39 = bitcast float* %38 to i8*
-  call void @llvm_visc_request_mem(i8* %39, i64 %mul25) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1
-  %40 = load float** %_M_start.i.i, align 8, !tbaa !7
-  %41 = bitcast float* %40 to i8*
-  call void @llvm_visc_untrack_mem(i8* %41) #1
-  %42 = load float** %_M_start.i.i82, align 8, !tbaa !7
-  %43 = bitcast float* %42 to i8*
-  call void @llvm_visc_untrack_mem(i8* %43) #1
-  %44 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %45 = bitcast float* %44 to i8*
-  call void @llvm_visc_untrack_mem(i8* %45) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0
-  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr)
-  call void @llvm.visc.cleanup()
-  %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0
-  %46 = load i8** %outFile, align 8, !tbaa !7
-  %tobool = icmp eq i8* %46, null
-  br i1 %tobool, label %if.end45, label %if.then42
-
-if.then42:                                        ; preds = %for.end
-  %47 = load i32* %matArow, align 4, !tbaa !8
-  %48 = load i32* %matBcol, align 4, !tbaa !8
-  %call44 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %46, i32 %47, i32 %48, %"class.std::vector"* %matC) #1
-  br label %if.end45
-
-if.end45:                                         ; preds = %if.then42, %for.end
-  %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2
-  %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1
-  %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1
-  %49 = load i32* %matArow, align 4, !tbaa !8
-  %conv50 = sitofp i32 %49 to double
-  %mul51 = fmul fast double %conv50, 2.000000e+00
-  %50 = load i32* %matBcol, align 4, !tbaa !8
-  %conv52 = sitofp i32 %50 to double
-  %mul53 = fmul fast double %mul51, %conv52
-  %51 = load i32* %matAcol, align 4, !tbaa !8
-  %conv54 = sitofp i32 %51 to double
-  %mul55 = fmul fast double %mul53, %conv54
-  %div = fdiv fast double %mul55, %call48
-  %div56 = fmul double %div, 1.000000e-09
-  %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1
-  %52 = bitcast %"class.std::basic_ostream"* %call.i to i8**
-  %vtable.i = load i8** %52, align 8, !tbaa !6
-  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
-  %53 = bitcast i8* %vbase.offset.ptr.i to i64*
-  %vbase.offset.i = load i64* %53, align 8
-  %54 = bitcast %"class.std::basic_ostream"* %call.i to i8*
-  %add.ptr.sum.i = add i64 %vbase.offset.i, 240
-  %_M_ctype.i.i = getelementptr inbounds i8* %54, i64 %add.ptr.sum.i
-  %55 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"**
-  %56 = load %"class.std::ctype"** %55, align 8, !tbaa !7
-  %tobool.i.i.i = icmp eq %"class.std::ctype"* %56, null
-  br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-
-if.then.i.i.i:                                    ; preds = %if.end45
-  call void @_ZSt16__throw_bad_castv() #7
-  unreachable
-
-_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end45
-  %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 6
-  %57 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !4
-  %tobool.i3.i.i = icmp eq i8 %57, 0
-  br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i
-
-if.then.i4.i.i:                                   ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-  %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %56, i64 0, i32 7, i64 10
-  %58 = load i8* %arrayidx.i.i.i, align 1, !tbaa !4
-  br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-
-if.end.i.i.i:                                     ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
-  call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %56) #1
-  %59 = bitcast %"class.std::ctype"* %56 to i8 (%"class.std::ctype"*, i8)***
-  %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %59, align 8, !tbaa !6
-  %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6
-  %60 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8
-  %call.i.i.i = call signext i8 %60(%"class.std::ctype"* %56, i8 signext 10) #1
-  br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-
-_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i
-  %retval.0.i.i.i = phi i8 [ %58, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ]
-  %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1
-  %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1
-  call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1
-  %61 = load float** %_M_start.i.i.i81, align 8, !tbaa !7
-  %tobool.i.i.i.i78 = icmp eq float* %61, null
-  br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79
-
-if.then.i.i.i.i79:                                ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-  %62 = bitcast float* %61 to i8*
-  call void @_ZdlPv(i8* %62) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit80
-
-_ZNSt6vectorIfSaIfEED1Ev.exit80:                  ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
-  %63 = load float** %_M_start.i.i82, align 8, !tbaa !7
-  %tobool.i.i.i.i74 = icmp eq float* %63, null
-  br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75
-
-if.then.i.i.i.i75:                                ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80
-  %64 = bitcast float* %63 to i8*
-  call void @_ZdlPv(i8* %64) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit76
-
-_ZNSt6vectorIfSaIfEED1Ev.exit76:                  ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80
-  %65 = load float** %_M_start.i.i, align 8, !tbaa !7
-  %tobool.i.i.i.i = icmp eq float* %65, null
-  br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i
-
-if.then.i.i.i.i:                                  ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76
-  %66 = bitcast float* %65 to i8*
-  call void @_ZdlPv(i8* %66) #1
-  br label %_ZNSt6vectorIfSaIfEED1Ev.exit
-
-_ZNSt6vectorIfSaIfEED1Ev.exit:                    ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76
-  call void @llvm.lifetime.end(i64 800, i8* %0) #1
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-
-declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0
-
-; Function Attrs: noreturn nounwind
-declare void @exit(i32) #4
-
-declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0
-
-declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0
-
-declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
-
-declare void @llvm_visc_track_mem(i8*, i64) #0
-
-declare void @llvm_visc_request_mem(i8*, i64) #0
-
-declare void @llvm_visc_untrack_mem(i8*) #0
-
-declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
-
-declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0
-
-declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0
-
-declare void @pb_FreeParameters(%struct.pb_Parameters*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
-
-declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0
-
-; Function Attrs: noreturn
-declare void @_ZSt17__throw_bad_allocv() #5
-
-declare noalias i8* @_Znwm(i64) #0
-
-; Function Attrs: nounwind
-declare void @_ZdlPv(i8*) #6
-
-declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0
-
-declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0
-
-; Function Attrs: noreturn
-declare void @_ZSt16__throw_bad_castv() #5
-
-declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0
-
-declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0
-
-; Function Attrs: nounwind
-define internal void @_GLOBAL__I_a() #1 section ".text.startup" {
-entry:
-  tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1
-  %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
-
-; Function Attrs: nounwind
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
-
-declare i8* @llvm_visc_initializeTimerSet()
-
-declare void @llvm_visc_switchToTimer(i8**, i32)
-
-declare void @llvm_visc_printTimerSet(i8**, i8*)
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 {
-entry:
-  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11)
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 {
-entry:
-  %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12)
-  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13)
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #7 = { noreturn nounwind }
-
-!visc_hint_gpu = !{!0, !1}
-!visc_hint_cpu = !{!2}
-
-!0 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff}
-!1 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1}
-!2 = metadata !{%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2}
-!3 = metadata !{metadata !"float", metadata !4}
-!4 = metadata !{metadata !"omnipotent char", metadata !5}
-!5 = metadata !{metadata !"Simple C/C++ TBAA"}
-!6 = metadata !{metadata !"vtable pointer", metadata !5}
-!7 = metadata !{metadata !"any pointer", metadata !4}
-!8 = metadata !{metadata !"int", metadata !4}
-!9 = metadata !{metadata !"branch_weights", i32 4, i32 64}
diff --git a/hpvm/test/parboil/benchmarks/spmv/Makefile b/hpvm/test/parboil/benchmarks/spmv/Makefile
index 23e1d4990031404b8e365d9430499b5fddb2af01..aff3e54712256348ebd9d0054d87fd62616fa15b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/Makefile
+++ b/hpvm/test/parboil/benchmarks/spmv/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = spmv
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile
similarity index 88%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile
index a289d68f342ba488f8ce4d90faf26816d4d00829..06af6bebea2aa6a94f56196e0399a25ebfdda030 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile
@@ -1,9 +1,9 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 TOOLS_SRC=common_src/convert-dataset
 SRCDIR_OBJS=gpu_info.ll file.ll
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm
 APP_CFLAGS=-ffast-math -O1 -I$(TOOLS_SRC)
 APP_CXXFLAGS=-ffast-math -O1 -I$(TOOLS_SRC)
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.cpp
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.cpp
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/file.h
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.h
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.cpp
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.cpp
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.h
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/kernel.cl
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp
similarity index 68%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp
index f6ce5ccfb2412036f4eadcdab419ceca0a6c8f30..4414744b4995a9ae09bb88fdda297150dfbe1031 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp
@@ -8,11 +8,11 @@
 
 //#include <CL/cl.h>
 //#include <CL/cl_ext.h>
+#include <hpvm.h>
 #include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <visc.h>
 
 #include "convert_dataset.h"
 #include "file.h"
@@ -54,15 +54,15 @@ void spmv_jds(float *dst_vector, size_t bytes_dst_vector, float *d_data,
               size_t bytes_x_vec, int dim, int *jds_ptr_int,
               size_t bytes_jds_ptr_int, int *sh_zcnt_int,
               size_t bytes_sh_zcnt_int) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
                      sh_zcnt_int, 1, dst_vector);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gridx = __visc__getNumNodeInstances_x(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gridx = __hpvm__getNumNodeInstances_x(thisNode);
 
   int ix = gx * gridx + lx;
   int warp_id = ix >> WARP_BITS;
@@ -126,25 +126,25 @@ void spmvLvl1(float *dst_vector, size_t bytes_dst_vector, float *d_data,
               size_t bytes_x_vec, int dim, int *jds_ptr_int,
               size_t bytes_jds_ptr_int, int *sh_zcnt_int,
               size_t bytes_sh_zcnt_int, size_t dim_X1) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
                      sh_zcnt_int, 1, dst_vector);
-  void *spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1);
-  __visc__bindIn(spmv_node, 0, 0, 0);
-  __visc__bindIn(spmv_node, 1, 1, 0);
-  __visc__bindIn(spmv_node, 2, 2, 0);
-  __visc__bindIn(spmv_node, 3, 3, 0);
-  __visc__bindIn(spmv_node, 4, 4, 0);
-  __visc__bindIn(spmv_node, 5, 5, 0);
-  __visc__bindIn(spmv_node, 6, 6, 0);
-  __visc__bindIn(spmv_node, 7, 7, 0);
-  __visc__bindIn(spmv_node, 8, 8, 0);
-  __visc__bindIn(spmv_node, 9, 9, 0);
-  __visc__bindIn(spmv_node, 10, 10, 0);
-  __visc__bindIn(spmv_node, 11, 11, 0);
-  __visc__bindIn(spmv_node, 12, 12, 0);
-  __visc__bindIn(spmv_node, 13, 13, 0);
-  __visc__bindIn(spmv_node, 14, 14, 0);
+  void *spmv_node = __hpvm__createNodeND(1, spmv_jds, dim_X1);
+  __hpvm__bindIn(spmv_node, 0, 0, 0);
+  __hpvm__bindIn(spmv_node, 1, 1, 0);
+  __hpvm__bindIn(spmv_node, 2, 2, 0);
+  __hpvm__bindIn(spmv_node, 3, 3, 0);
+  __hpvm__bindIn(spmv_node, 4, 4, 0);
+  __hpvm__bindIn(spmv_node, 5, 5, 0);
+  __hpvm__bindIn(spmv_node, 6, 6, 0);
+  __hpvm__bindIn(spmv_node, 7, 7, 0);
+  __hpvm__bindIn(spmv_node, 8, 8, 0);
+  __hpvm__bindIn(spmv_node, 9, 9, 0);
+  __hpvm__bindIn(spmv_node, 10, 10, 0);
+  __hpvm__bindIn(spmv_node, 11, 11, 0);
+  __hpvm__bindIn(spmv_node, 12, 12, 0);
+  __hpvm__bindIn(spmv_node, 13, 13, 0);
+  __hpvm__bindIn(spmv_node, 14, 14, 0);
 }
 
 void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data,
@@ -153,26 +153,26 @@ void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data,
               size_t bytes_x_vec, int dim, int *jds_ptr_int,
               size_t bytes_jds_ptr_int, int *sh_zcnt_int,
               size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
                      sh_zcnt_int, 1, dst_vector);
-  void *spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2);
-  __visc__bindIn(spmv_node, 0, 0, 0);
-  __visc__bindIn(spmv_node, 1, 1, 0);
-  __visc__bindIn(spmv_node, 2, 2, 0);
-  __visc__bindIn(spmv_node, 3, 3, 0);
-  __visc__bindIn(spmv_node, 4, 4, 0);
-  __visc__bindIn(spmv_node, 5, 5, 0);
-  __visc__bindIn(spmv_node, 6, 6, 0);
-  __visc__bindIn(spmv_node, 7, 7, 0);
-  __visc__bindIn(spmv_node, 8, 8, 0);
-  __visc__bindIn(spmv_node, 9, 9, 0);
-  __visc__bindIn(spmv_node, 10, 10, 0);
-  __visc__bindIn(spmv_node, 11, 11, 0);
-  __visc__bindIn(spmv_node, 12, 12, 0);
-  __visc__bindIn(spmv_node, 13, 13, 0);
-  __visc__bindIn(spmv_node, 14, 14, 0);
-  __visc__bindIn(spmv_node, 15, 15, 0);
+  void *spmv_node = __hpvm__createNodeND(1, spmvLvl1, dim_X2);
+  __hpvm__bindIn(spmv_node, 0, 0, 0);
+  __hpvm__bindIn(spmv_node, 1, 1, 0);
+  __hpvm__bindIn(spmv_node, 2, 2, 0);
+  __hpvm__bindIn(spmv_node, 3, 3, 0);
+  __hpvm__bindIn(spmv_node, 4, 4, 0);
+  __hpvm__bindIn(spmv_node, 5, 5, 0);
+  __hpvm__bindIn(spmv_node, 6, 6, 0);
+  __hpvm__bindIn(spmv_node, 7, 7, 0);
+  __hpvm__bindIn(spmv_node, 8, 8, 0);
+  __hpvm__bindIn(spmv_node, 9, 9, 0);
+  __hpvm__bindIn(spmv_node, 10, 10, 0);
+  __hpvm__bindIn(spmv_node, 11, 11, 0);
+  __hpvm__bindIn(spmv_node, 12, 12, 0);
+  __hpvm__bindIn(spmv_node, 13, 13, 0);
+  __hpvm__bindIn(spmv_node, 14, 14, 0);
+  __hpvm__bindIn(spmv_node, 15, 15, 0);
 }
 
 void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data,
@@ -181,27 +181,27 @@ void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data,
               size_t bytes_x_vec, int dim, int *jds_ptr_int,
               size_t bytes_jds_ptr_int, int *sh_zcnt_int,
               size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
                      sh_zcnt_int, 1, dst_vector);
-  void *spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2);
-  __visc__bindIn(spmv_node, 0, 0, 0);
-  __visc__bindIn(spmv_node, 1, 1, 0);
-  __visc__bindIn(spmv_node, 2, 2, 0);
-  __visc__bindIn(spmv_node, 3, 3, 0);
-  __visc__bindIn(spmv_node, 4, 4, 0);
-  __visc__bindIn(spmv_node, 5, 5, 0);
-  __visc__bindIn(spmv_node, 6, 6, 0);
-  __visc__bindIn(spmv_node, 7, 7, 0);
-  __visc__bindIn(spmv_node, 8, 8, 0);
-  __visc__bindIn(spmv_node, 9, 9, 0);
-  __visc__bindIn(spmv_node, 10, 10, 0);
-  __visc__bindIn(spmv_node, 11, 11, 0);
-  __visc__bindIn(spmv_node, 12, 12, 0);
-  __visc__bindIn(spmv_node, 13, 13, 0);
-  __visc__bindIn(spmv_node, 14, 14, 0);
-  __visc__bindIn(spmv_node, 15, 15, 0);
-  __visc__bindIn(spmv_node, 16, 16, 0);
+  void *spmv_node = __hpvm__createNodeND(1, spmvLvl2, dim_X2);
+  __hpvm__bindIn(spmv_node, 0, 0, 0);
+  __hpvm__bindIn(spmv_node, 1, 1, 0);
+  __hpvm__bindIn(spmv_node, 2, 2, 0);
+  __hpvm__bindIn(spmv_node, 3, 3, 0);
+  __hpvm__bindIn(spmv_node, 4, 4, 0);
+  __hpvm__bindIn(spmv_node, 5, 5, 0);
+  __hpvm__bindIn(spmv_node, 6, 6, 0);
+  __hpvm__bindIn(spmv_node, 7, 7, 0);
+  __hpvm__bindIn(spmv_node, 8, 8, 0);
+  __hpvm__bindIn(spmv_node, 9, 9, 0);
+  __hpvm__bindIn(spmv_node, 10, 10, 0);
+  __hpvm__bindIn(spmv_node, 11, 11, 0);
+  __hpvm__bindIn(spmv_node, 12, 12, 0);
+  __hpvm__bindIn(spmv_node, 13, 13, 0);
+  __hpvm__bindIn(spmv_node, 14, 14, 0);
+  __hpvm__bindIn(spmv_node, 15, 15, 0);
+  __hpvm__bindIn(spmv_node, 16, 16, 0);
 }
 
 int main(int argc, char **argv) {
@@ -261,7 +261,7 @@ int main(int argc, char **argv) {
   input_vec(parameters->inpFiles[1], h_x_vector, dim);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memset(h_Ax_vector, 0, dim * sizeof(float));
@@ -271,14 +271,14 @@ int main(int argc, char **argv) {
 
   compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-  llvm_visc_track_mem(h_Ax_vector, dim * sizeof(float));
-  llvm_visc_track_mem(h_data, len * sizeof(float));
-  llvm_visc_track_mem(h_indices, len * sizeof(int));
-  llvm_visc_track_mem(h_perm, dim * sizeof(int));
-  llvm_visc_track_mem(h_x_vector, dim * sizeof(float));
-  llvm_visc_track_mem(h_ptr, depth * sizeof(int));
-  llvm_visc_track_mem(h_nzcnt, nzcnt_len * sizeof(int));
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(h_Ax_vector, dim * sizeof(float));
+  llvm_hpvm_track_mem(h_data, len * sizeof(float));
+  llvm_hpvm_track_mem(h_indices, len * sizeof(int));
+  llvm_hpvm_track_mem(h_perm, dim * sizeof(int));
+  llvm_hpvm_track_mem(h_x_vector, dim * sizeof(float));
+  llvm_hpvm_track_mem(h_ptr, depth * sizeof(int));
+  llvm_hpvm_track_mem(h_nzcnt, nzcnt_len * sizeof(int));
 
   // main execution
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
@@ -306,9 +306,9 @@ int main(int argc, char **argv) {
                             block,
                             (grid / block)};
     *(RootIn *)root_in = root_in_local;
-    void *spmvDFG = __visc__launch(0, spmvLvl3, root_in);
+    void *spmvDFG = __hpvm__launch(0, spmvLvl3, root_in);
 
-    __visc__wait(spmvDFG);
+    __hpvm__wait(spmvDFG);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
     /******************************* Issues *******************************
@@ -326,21 +326,21 @@ int main(int argc, char **argv) {
 
   // HtoD memory copy
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(h_Ax_vector, dim * sizeof(float));
+  llvm_hpvm_request_mem(h_Ax_vector, dim * sizeof(float));
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
 
-  llvm_visc_untrack_mem(h_Ax_vector);
-  llvm_visc_untrack_mem(h_data);
-  llvm_visc_untrack_mem(h_indices);
-  llvm_visc_untrack_mem(h_perm);
-  llvm_visc_untrack_mem(h_x_vector);
-  llvm_visc_untrack_mem(h_ptr);
-  llvm_visc_untrack_mem(h_nzcnt);
+  llvm_hpvm_untrack_mem(h_Ax_vector);
+  llvm_hpvm_untrack_mem(h_data);
+  llvm_hpvm_untrack_mem(h_indices);
+  llvm_hpvm_untrack_mem(h_perm);
+  llvm_hpvm_untrack_mem(h_x_vector);
+  llvm_hpvm_untrack_mem(h_ptr);
+  llvm_hpvm_untrack_mem(h_nzcnt);
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c
index a19184a9659eaa91223da57e1b926ac6bff54b4e..8bff8a1d0af3c22348daad7bde0fed51f4c6f58d 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c
@@ -81,7 +81,7 @@ int main(int argc, char **argv) {
   input_vec(parameters->inpFiles[1], h_x_vector, dim);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // parameters declaration
   cl_int clStatus;
@@ -127,7 +127,7 @@ int main(int argc, char **argv) {
   cl_mem jds_ptr_int;
   cl_mem sh_zcnt_int;
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
   /*cl_program clProgram =
    * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
@@ -217,7 +217,7 @@ int main(int argc, char **argv) {
   //  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is
   //  %d=\n",grid,block); printf("!!! dim is %d\n",dim);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
   CHECK_ERROR("clSetKernelArg")
   clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
@@ -240,7 +240,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   int i;
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int j = 0; j < 20; j++) {
     for (i = 0; i < 50; i++) {
       /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -260,7 +260,7 @@ int main(int argc, char **argv) {
                           dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
 
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c
index d4fc026b73894e47c94dd7f2c9ef8f31e366eec6..f704f96ed291269457d99563b2779dae93da78c7 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c
@@ -236,7 +236,7 @@ int main(int argc, char **argv) {
   // main execution
 
   int i;
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   /*for(int j=0; j<20; j++) {*/
   for (i = 0; i < 50; i++) {
     clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c
index 42ffab597d028eacba7f9975473908bdf812524e..a6fe5012f96ee73f54af85c20e665517b22c1b1e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c
@@ -81,7 +81,7 @@ int main(int argc, char **argv) {
   input_vec(parameters->inpFiles[1], h_x_vector, dim);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // parameters declaration
   cl_int clStatus;
@@ -127,7 +127,7 @@ int main(int argc, char **argv) {
   cl_mem jds_ptr_int;
   cl_mem sh_zcnt_int;
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
   /*cl_program clProgram =
    * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
@@ -217,7 +217,7 @@ int main(int argc, char **argv) {
   //  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is
   //  %d=\n",grid,block); printf("!!! dim is %d\n",dim);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
   CHECK_ERROR("clSetKernelArg")
   clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
@@ -240,7 +240,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   int i;
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int j = 0; j < 1; j++) {
     for (i = 0; i < 50; i++) {
       /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -260,7 +260,7 @@ int main(int argc, char **argv) {
                           dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
 
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c
index fbd272b32f7f60fbd0c651b0f329550b47e4db27..bc3655c4abfec2463cef9082e1f3d3e0b25b7d3b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c
@@ -81,7 +81,7 @@ int main(int argc, char **argv) {
   input_vec(parameters->inpFiles[1], h_x_vector, dim);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // parameters declaration
   cl_int clStatus;
@@ -127,7 +127,7 @@ int main(int argc, char **argv) {
   cl_mem jds_ptr_int;
   cl_mem sh_zcnt_int;
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
   /*cl_program clProgram =
    * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
@@ -217,7 +217,7 @@ int main(int argc, char **argv) {
   //  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is
   //  %d=\n",grid,block); printf("!!! dim is %d\n",dim);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
   CHECK_ERROR("clSetKernelArg")
   clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
@@ -240,7 +240,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   int i;
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int j = 0; j < 20; j++) {
     for (i = 0; i < 50; i++) {
       /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -260,7 +260,7 @@ int main(int argc, char **argv) {
                           dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
 
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
index 343814149aa74139930380c2178e2f447c64e806..88fd0c878bb8e128c3790716b82b5aec8acbe41a 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
@@ -224,7 +224,7 @@ int main(int argc, char **argv) {
 
   // main execution
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   int i;
   for (i = 0; i < 50; i++) {
     clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c
index 4600a3e6b8d580ad6fc3986d24a712ad592e25eb..ca538e3a95f56498c8ba8deb90b7820035dcbe11 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c
@@ -83,7 +83,7 @@ int main(int argc, char **argv) {
   printf("Col count = %d, dim = %d\n", col_count, dim);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   cl_int clStatus;
   cl_platform_id clPlatform;
@@ -137,7 +137,7 @@ int main(int argc, char **argv) {
   cl_mem jds_ptr_int;
   cl_mem sh_zcnt_int;
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   OpenCLDeviceProp clDeviceProp;
   clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
@@ -215,7 +215,7 @@ int main(int argc, char **argv) {
   compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
                         clDeviceProp.minor, clDeviceProp.multiProcessorCount);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
   CHECK_ERROR("clSetKernelArg")
   clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
@@ -237,7 +237,7 @@ int main(int argc, char **argv) {
   // main execution
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   int i;
   for (int j = 0; j < 5; j++) {
     for (i = 0; i < 50; i++) {
@@ -260,7 +260,7 @@ int main(int argc, char **argv) {
                           dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
 
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c
index d2375af91dd8d4812fcb82b78b856e85feda376f..21973c2fa75fc95f4496d26b3d2c8870d9a1e577 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c
@@ -83,7 +83,7 @@ int main(int argc, char **argv) {
   printf("Col count = %d, dim = %d\n", col_count, dim);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   cl_int clStatus;
   cl_platform_id clPlatform;
@@ -137,7 +137,7 @@ int main(int argc, char **argv) {
   cl_mem jds_ptr_int;
   cl_mem sh_zcnt_int;
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   OpenCLDeviceProp clDeviceProp;
   clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
@@ -215,7 +215,7 @@ int main(int argc, char **argv) {
   compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
                         clDeviceProp.minor, clDeviceProp.multiProcessorCount);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
   CHECK_ERROR("clSetKernelArg")
   clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
@@ -237,7 +237,7 @@ int main(int argc, char **argv) {
   // main execution
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   int i;
   for (int j = 0; j < 100; j++) {
     for (i = 0; i < 50; i++) {
@@ -260,7 +260,7 @@ int main(int argc, char **argv) {
                           dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
 
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc
deleted file mode 100644
index b804d14d16cff805c0c1850d1f5079ab6e973ecf..0000000000000000000000000000000000000000
Binary files a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc and /dev/null differ
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll
deleted file mode 100644
index 5604d70e8a005ee7e21c5ae9bf6dbf0dbac77d15..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll
+++ /dev/null
@@ -1,138 +0,0 @@
-; ModuleID = 'build/visc_default/main.visc.ll.kernels.bc'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-target triple = "spir64-unknown-unknown"
-
-%rtype = type {}
-
-; Function Attrs: optsize zeroext
-define void @spmv_jds(float* %dst_vector, i64 %bytes_dst_vector, float* %d_data, i64 %bytes_d_data, i32* %d_index, i64 %bytes_d_index, i32* %d_perm, i64 %bytes_d_perm, float* %x_vec, i64 %bytes_x_vec, i32 %dim, i32* %jds_ptr_int, i64 %bytes_jds_ptr_int, i32* %sh_zcnt_int, i64 %bytes_sh_zcnt_int) #0 {
-entry:
-  ;%0 = call i64 @_Z12get_group_idj(i32 0)
-  ;%1 = trunc i64 %0 to i32
-  ;%2 = call i64 @_Z14get_local_sizej(i32 0)
-  ;%3 = trunc i64 %2 to i32
-  ;%4 = mul i32 %1, %3
-  ;%5 = call i64 @_Z12get_local_idj(i32 0)
-  ;%6 = trunc i64 %5 to i32
-  ;%7 = add i32 %4, %6
-  %0 = add i32 0, 0
-  %1 = add i32 0, 0
-  %2 = add i32 0, 0
-  %3 = add i32 0, 0
-  %4 = add i32 0, 0
-  %5 = add i32 0, 0
-  %6 = call i64 @_Z13get_global_idj(i32 0)
-  %7 = trunc i64 %6 to i32
-  %cmp = icmp slt i32 %7, %dim
-  br i1 %cmp, label %if.then, label %if.end38
-
-if.then:                                          ; preds = %entry
-  %shr = ashr i32 %7, 5
-  %idxprom = sext i32 %shr to i64
-  %arrayidx = getelementptr inbounds i32* %sh_zcnt_int, i64 %idxprom
-  %8 = load i32* %arrayidx, align 4, !tbaa !4
-  %9 = load i32* %jds_ptr_int, align 4, !tbaa !4
-  %add = add nsw i32 %9, %7
-  %idxprom3 = sext i32 %add to i64
-  %arrayidx4 = getelementptr inbounds float* %d_data, i64 %idxprom3
-  %10 = load float* %arrayidx4, align 4, !tbaa !8
-  %arrayidx6 = getelementptr inbounds i32* %d_index, i64 %idxprom3
-  %11 = load i32* %arrayidx6, align 4, !tbaa !4
-  %idxprom7 = sext i32 %11 to i64
-  %arrayidx8 = getelementptr inbounds float* %x_vec, i64 %idxprom7
-  %12 = load float* %arrayidx8, align 4, !tbaa !8
-  %cmp9 = icmp sgt i32 %8, 1
-  br i1 %cmp9, label %if.then10, label %if.end
-
-if.then10:                                        ; preds = %if.then
-  %arrayidx11 = getelementptr inbounds i32* %jds_ptr_int, i64 1
-  %.pn77 = load i32* %arrayidx11, align 4
-  %idxprom13.pn.in78 = add nsw i32 %.pn77, %7
-  %idxprom13.pn79 = sext i32 %idxprom13.pn.in78 to i64
-  %i.0.in80 = getelementptr inbounds i32* %d_index, i64 %idxprom13.pn79
-  %i.081 = load i32* %i.0.in80, align 4
-  %cmp1582 = icmp sgt i32 %8, 2
-  %arrayidx1783 = getelementptr inbounds float* %d_data, i64 %idxprom13.pn79
-  %13 = load float* %arrayidx1783, align 4, !tbaa !8
-  br i1 %cmp1582, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.body, %if.then10
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 2, %if.then10 ]
-  %14 = phi float [ %16, %for.body ], [ %13, %if.then10 ]
-  %i.088 = phi i32 [ %i.0, %for.body ], [ %i.081, %if.then10 ]
-  %sum.086 = phi float [ %add25, %for.body ], [ 0.000000e+00, %if.then10 ]
-  %t.085 = phi float [ %15, %for.body ], [ %12, %if.then10 ]
-  %d.084 = phi float [ %14, %for.body ], [ %10, %if.then10 ]
-  %arrayidx19 = getelementptr inbounds i32* %jds_ptr_int, i64 %indvars.iv
-  %idxprom23 = sext i32 %i.088 to i64
-  %arrayidx24 = getelementptr inbounds float* %x_vec, i64 %idxprom23
-  %15 = load float* %arrayidx24, align 4, !tbaa !8
-  %mul = fmul fast float %d.084, %t.085
-  %add25 = fadd fast float %sum.086, %mul
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %.pn = load i32* %arrayidx19, align 4
-  %idxprom13.pn.in = add nsw i32 %.pn, %7
-  %idxprom13.pn = sext i32 %idxprom13.pn.in to i64
-  %i.0.in = getelementptr inbounds i32* %d_index, i64 %idxprom13.pn
-  %i.0 = load i32* %i.0.in, align 4
-  %arrayidx17 = getelementptr inbounds float* %d_data, i64 %idxprom13.pn
-  %16 = load float* %arrayidx17, align 4, !tbaa !8
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %8
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %if.then10
-  %.lcssa = phi float [ %13, %if.then10 ], [ %16, %for.body ]
-  %i.0.lcssa = phi i32 [ %i.081, %if.then10 ], [ %i.0, %for.body ]
-  %sum.0.lcssa = phi float [ 0.000000e+00, %if.then10 ], [ %add25, %for.body ]
-  %t.0.lcssa = phi float [ %12, %if.then10 ], [ %15, %for.body ]
-  %d.0.lcssa = phi float [ %10, %if.then10 ], [ %14, %for.body ]
-  %idxprom28 = sext i32 %i.0.lcssa to i64
-  %arrayidx29 = getelementptr inbounds float* %x_vec, i64 %idxprom28
-  %17 = load float* %arrayidx29, align 4, !tbaa !8
-  %mul30 = fmul fast float %d.0.lcssa, %t.0.lcssa
-  %add31 = fadd fast float %sum.0.lcssa, %mul30
-  br label %if.end
-
-if.end:                                           ; preds = %for.end, %if.then
-  %d.1 = phi float [ %.lcssa, %for.end ], [ %10, %if.then ]
-  %t.1 = phi float [ %17, %for.end ], [ %12, %if.then ]
-  %sum.1 = phi float [ %add31, %for.end ], [ 0.000000e+00, %if.then ]
-  %mul32 = fmul fast float %d.1, %t.1
-  %add33 = fadd fast float %sum.1, %mul32
-  %idxprom34 = sext i32 %7 to i64
-  %arrayidx35 = getelementptr inbounds i32* %d_perm, i64 %idxprom34
-  %18 = load i32* %arrayidx35, align 4, !tbaa !4
-  %idxprom36 = sext i32 %18 to i64
-  %arrayidx37 = getelementptr inbounds float* %dst_vector, i64 %idxprom36
-  store float %add33, float* %arrayidx37, align 4, !tbaa !8
-  br label %if.end38
-
-if.end38:                                         ; preds = %if.end, %entry
-  ret void
-}
-
-declare i64 @_Z13get_global_idj(i32)
-
-declare i64 @_Z12get_group_idj(i32)
-
-declare i64 @_Z14get_local_sizej(i32)
-
-declare i64 @_Z12get_local_idj(i32)
-
-attributes #0 = { optsize zeroext "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-
-!visc_hint_gpu = !{}
-!visc_hint_cpu = !{!0, !1}
-!opencl.kernels = !{!2}
-
-!0 = metadata !{%rtype (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64, i32)* undef}
-!1 = metadata !{%rtype (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64, i32, i32)* undef}
-!2 = metadata !{void (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64)* @spmv_jds, metadata !3}
-!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"i64", metadata !"float*", metadata !"i64", metadata !"i32*", metadata !"i64", metadata !"i32*", metadata !"i64", metadata !"float*", metadata !"i64", metadata !"i32", metadata !"i32*", metadata !"i64", metadata !"i32*", metadata !"i64"}
-!4 = metadata !{metadata !5, metadata !5, i64 0}
-!5 = metadata !{metadata !"int", metadata !6}
-!6 = metadata !{metadata !"omnipotent char", metadata !7}
-!7 = metadata !{metadata !"Simple C/C++ TBAA"}
-!8 = metadata !{metadata !9, metadata !9, i64 0}
-!9 = metadata !{metadata !"float", metadata !6}
diff --git a/hpvm/test/parboil/benchmarks/stencil/Makefile b/hpvm/test/parboil/benchmarks/stencil/Makefile
index a44dd0dbf0d678c7e8417345854254a1c2676653..e761d7b4f5f020fc19c5f59040ca5eb82b117381 100644
--- a/hpvm/test/parboil/benchmarks/stencil/Makefile
+++ b/hpvm/test/parboil/benchmarks/stencil/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = stencil
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile
similarity index 80%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile
index cf61fb3a6c77e07bf8ccc67902bd1a1997902763..35b36dcf3c053da03017c72d442204590675ecb4 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=file.ll
-VISC_OBJS=stencil.visc.ll
+HPVM_OBJS=stencil.hpvm.ll
 APP_CUDALDFLAGS=-lm
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/common.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/common.h
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/common.h
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.cc
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/file.h
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.h
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/kernel.cl
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp
similarity index 66%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp
index 5672a3ee490917d1374783eae5ab0ba1956ef441..e5810fc8101bef72dd4636b0b6c11826a8b18318 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
+++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp
@@ -9,11 +9,11 @@
 
 #include "common.h"
 #include "file.h"
+#include <hpvm.h>
 #include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <visc.h>
 
 static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
   int s = 0;
@@ -42,23 +42,23 @@ typedef struct __attribute__((__packed__)) {
 
 void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
                   size_t bytes_Anext, int nx, int ny, int nz) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, A0, Anext, 1, Anext);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
 
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int ly = __visc__getNodeInstanceID_y(thisNode);
-  int lz = __visc__getNodeInstanceID_z(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int ly = __hpvm__getNodeInstanceID_y(thisNode);
+  int lz = __hpvm__getNodeInstanceID_z(thisNode);
 
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gy = __visc__getNodeInstanceID_y(parentNode);
-  int gz = __visc__getNodeInstanceID_z(parentNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gy = __hpvm__getNodeInstanceID_y(parentNode);
+  int gz = __hpvm__getNodeInstanceID_z(parentNode);
 
-  int gridx = __visc__getNumNodeInstances_x(thisNode);
-  int gridy = __visc__getNumNodeInstances_y(thisNode);
-  int gridz = __visc__getNumNodeInstances_z(thisNode);
+  int gridx = __hpvm__getNumNodeInstances_x(thisNode);
+  int gridy = __hpvm__getNumNodeInstances_y(thisNode);
+  int gridz = __hpvm__getNumNodeInstances_z(thisNode);
 
   int i = gx * gridx + lx + 1;
   int j = gy * gridy + ly + 1;
@@ -78,65 +78,65 @@ void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
 void stencilLvl1(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
                  size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
                  size_t dim_Y1, size_t dim_Z1) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, A0, Anext, 1, Anext);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
   void *stencil_node =
-      __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1);
-  __visc__bindIn(stencil_node, 0, 0, 0);
-  __visc__bindIn(stencil_node, 1, 1, 0);
-  __visc__bindIn(stencil_node, 2, 2, 0);
-  __visc__bindIn(stencil_node, 3, 3, 0);
-  __visc__bindIn(stencil_node, 4, 4, 0);
-  __visc__bindIn(stencil_node, 5, 5, 0);
-  __visc__bindIn(stencil_node, 6, 6, 0);
-  __visc__bindIn(stencil_node, 7, 7, 0);
-  __visc__bindIn(stencil_node, 8, 8, 0);
+      __hpvm__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1);
+  __hpvm__bindIn(stencil_node, 0, 0, 0);
+  __hpvm__bindIn(stencil_node, 1, 1, 0);
+  __hpvm__bindIn(stencil_node, 2, 2, 0);
+  __hpvm__bindIn(stencil_node, 3, 3, 0);
+  __hpvm__bindIn(stencil_node, 4, 4, 0);
+  __hpvm__bindIn(stencil_node, 5, 5, 0);
+  __hpvm__bindIn(stencil_node, 6, 6, 0);
+  __hpvm__bindIn(stencil_node, 7, 7, 0);
+  __hpvm__bindIn(stencil_node, 8, 8, 0);
 }
 
 void stencilLvl2(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
                  size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
                  size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2,
                  size_t dim_Z2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, A0, Anext, 1, Anext);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
   void *stencil_node =
-      __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2);
-  __visc__bindIn(stencil_node, 0, 0, 0);
-  __visc__bindIn(stencil_node, 1, 1, 0);
-  __visc__bindIn(stencil_node, 2, 2, 0);
-  __visc__bindIn(stencil_node, 3, 3, 0);
-  __visc__bindIn(stencil_node, 4, 4, 0);
-  __visc__bindIn(stencil_node, 5, 5, 0);
-  __visc__bindIn(stencil_node, 6, 6, 0);
-  __visc__bindIn(stencil_node, 7, 7, 0);
-  __visc__bindIn(stencil_node, 8, 8, 0);
-  __visc__bindIn(stencil_node, 9, 9, 0);
-  __visc__bindIn(stencil_node, 10, 10, 0);
-  __visc__bindIn(stencil_node, 11, 11, 0);
+      __hpvm__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2);
+  __hpvm__bindIn(stencil_node, 0, 0, 0);
+  __hpvm__bindIn(stencil_node, 1, 1, 0);
+  __hpvm__bindIn(stencil_node, 2, 2, 0);
+  __hpvm__bindIn(stencil_node, 3, 3, 0);
+  __hpvm__bindIn(stencil_node, 4, 4, 0);
+  __hpvm__bindIn(stencil_node, 5, 5, 0);
+  __hpvm__bindIn(stencil_node, 6, 6, 0);
+  __hpvm__bindIn(stencil_node, 7, 7, 0);
+  __hpvm__bindIn(stencil_node, 8, 8, 0);
+  __hpvm__bindIn(stencil_node, 9, 9, 0);
+  __hpvm__bindIn(stencil_node, 10, 10, 0);
+  __hpvm__bindIn(stencil_node, 11, 11, 0);
 }
 
 void stencilLvl3(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
                  size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
                  size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2,
                  size_t dim_Z2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, A0, Anext, 1, Anext);
-  void *stencil_node = __visc__createNodeND(0, stencilLvl2);
-  __visc__bindIn(stencil_node, 0, 0, 0);
-  __visc__bindIn(stencil_node, 1, 1, 0);
-  __visc__bindIn(stencil_node, 2, 2, 0);
-  __visc__bindIn(stencil_node, 3, 3, 0);
-  __visc__bindIn(stencil_node, 4, 4, 0);
-  __visc__bindIn(stencil_node, 5, 5, 0);
-  __visc__bindIn(stencil_node, 6, 6, 0);
-  __visc__bindIn(stencil_node, 7, 7, 0);
-  __visc__bindIn(stencil_node, 8, 8, 0);
-  __visc__bindIn(stencil_node, 9, 9, 0);
-  __visc__bindIn(stencil_node, 10, 10, 0);
-  __visc__bindIn(stencil_node, 11, 11, 0);
-  __visc__bindIn(stencil_node, 12, 12, 0);
-  __visc__bindIn(stencil_node, 13, 13, 0);
-  __visc__bindIn(stencil_node, 14, 14, 0);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
+  void *stencil_node = __hpvm__createNodeND(0, stencilLvl2);
+  __hpvm__bindIn(stencil_node, 0, 0, 0);
+  __hpvm__bindIn(stencil_node, 1, 1, 0);
+  __hpvm__bindIn(stencil_node, 2, 2, 0);
+  __hpvm__bindIn(stencil_node, 3, 3, 0);
+  __hpvm__bindIn(stencil_node, 4, 4, 0);
+  __hpvm__bindIn(stencil_node, 5, 5, 0);
+  __hpvm__bindIn(stencil_node, 6, 6, 0);
+  __hpvm__bindIn(stencil_node, 7, 7, 0);
+  __hpvm__bindIn(stencil_node, 8, 8, 0);
+  __hpvm__bindIn(stencil_node, 9, 9, 0);
+  __hpvm__bindIn(stencil_node, 10, 10, 0);
+  __hpvm__bindIn(stencil_node, 11, 11, 0);
+  __hpvm__bindIn(stencil_node, 12, 12, 0);
+  __hpvm__bindIn(stencil_node, 13, 13, 0);
+  __hpvm__bindIn(stencil_node, 14, 14, 0);
 }
 
 int main(int argc, char **argv) {
@@ -195,11 +195,11 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-  llvm_visc_track_mem(h_A0, sizeof(float) * size);
-  llvm_visc_track_mem(h_Anext, sizeof(float) * size);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(h_A0, sizeof(float) * size);
+  llvm_hpvm_track_mem(h_Anext, sizeof(float) * size);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -241,9 +241,9 @@ int main(int argc, char **argv) {
                             grid[1] / block[1],
                             grid[2] / block[2]};
     *(RootIn *)root_in = root_in_local;
-    void *stencilDFG = __visc__launch(0, stencilLvl3, root_in);
+    void *stencilDFG = __hpvm__launch(0, stencilLvl3, root_in);
 
-    __visc__wait(stencilDFG);
+    __hpvm__wait(stencilDFG);
     // printf("iteration %d\n",t);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
     float *h_temp = h_A0;
@@ -255,19 +255,19 @@ int main(int argc, char **argv) {
   h_A0 = h_Anext;
   h_Anext = h_temp;
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(h_Anext, bytes);
+  llvm_hpvm_request_mem(h_Anext, bytes);
   printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]);
   printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
 
-  llvm_visc_untrack_mem(h_A0);
-  llvm_visc_untrack_mem(h_Anext);
+  llvm_hpvm_untrack_mem(h_A0);
+  llvm_hpvm_untrack_mem(h_Anext);
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c
similarity index 90%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c
index bb6e45c932a68d951f5559bd856017ecf71aade6..35c5ed960c2031b0b84124bbdd1aeb95042625ee 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c
@@ -9,11 +9,11 @@
 
 #include "common.h"
 #include "file.h"
+#include <hpvm.h>
 #include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <visc.h>
 
 static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
   int s = 0;
@@ -31,7 +31,7 @@ static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
 
 void naive_kernel(float c0, float c1, float *A0, float *Anext, int nx, int ny,
                   int nz) {
-  __visc__attributes(2, A0, Anext, 1, Anext);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
   int i = get_global_id(0) + 1;
   int j = get_global_id(1) + 1;
   int k = get_global_id(2) + 1;
@@ -106,11 +106,11 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-  llvm_visc_track_mem(h_A0, sizeof(float) * size);
-  llvm_visc_track_mem(h_Anext, sizeof(float) * size);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(h_A0, sizeof(float) * size);
+  llvm_hpvm_track_mem(h_Anext, sizeof(float) * size);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -133,11 +133,11 @@ int main(int argc, char **argv) {
   printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125, 1, 1)]);
   for (t = 0; t < iteration; t++) {
     pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    unsigned stencilDFG = __visc__node(
+    unsigned stencilDFG = __hpvm__node(
         naive_kernel, 2, 3, block[0], block[1], block[2], grid[0] / block[0],
         grid[1] / block[1], grid[2] / block[2], 9, (float)c0, (float)c1, h_A0,
         bytes, h_Anext, bytes, nx, ny, nz, 0);
-    __visc__wait(stencilDFG);
+    __hpvm__wait(stencilDFG);
     // printf("iteration %d\n",t);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
     float *h_temp = h_A0;
@@ -149,19 +149,19 @@ int main(int argc, char **argv) {
   h_A0 = h_Anext;
   h_Anext = h_temp;
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(h_Anext, bytes);
+  llvm_hpvm_request_mem(h_Anext, bytes);
   printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]);
   printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
 
-  llvm_visc_untrack_mem(h_A0);
-  llvm_visc_untrack_mem(h_Anext);
+  llvm_hpvm_untrack_mem(h_A0);
+  llvm_hpvm_untrack_mem(h_Anext);
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c
index ec47c22227648df094cbf03ea1b667943207207e..1157b6198888a547a7d9c29b6f17970410ddb865 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c
@@ -174,7 +174,7 @@ int main(int argc, char **argv) {
   CHECK_ERROR("clSetKernelArg")
 
   // main execution
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   int t;
   for (t = 0; t < iteration; t++) {
     clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c
index 61382182d1c8b406a2e2ba9dee250327914dbac4..70a86245b75e98e93607d135949af5637c8ab32f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -140,7 +140,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memcpy(h_Anext, h_A0, sizeof(float) * size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // memory allocation
   d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
@@ -170,7 +170,7 @@ int main(int argc, char **argv) {
   //  printf("block x is %d and y is %d z \n",block[0],block[1]);
   //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
   clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
   clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
@@ -182,7 +182,7 @@ int main(int argc, char **argv) {
 
   // main execution
   /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 10; i++) {
     int t;
     for (t = 0; t < iteration; t++) {
@@ -219,7 +219,7 @@ int main(int argc, char **argv) {
                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseMemObject(d_A0);
   clStatus = clReleaseMemObject(d_Anext);
   clStatus = clReleaseKernel(clKernel);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c
index 217352e036b0d03bcc578286fd62c4339dedfe94..3a5dfa3b3a5d00395e01e71a54e71154a34f02c3 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -140,7 +140,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memcpy(h_Anext, h_A0, sizeof(float) * size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // memory allocation
   d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
@@ -170,7 +170,7 @@ int main(int argc, char **argv) {
   //  printf("block x is %d and y is %d z \n",block[0],block[1]);
   //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
   clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
   clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
@@ -182,7 +182,7 @@ int main(int argc, char **argv) {
 
   // main execution
   /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 1; i++) {
     int t;
     for (t = 0; t < iteration; t++) {
@@ -219,7 +219,7 @@ int main(int argc, char **argv) {
                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseMemObject(d_A0);
   clStatus = clReleaseMemObject(d_Anext);
   clStatus = clReleaseKernel(clKernel);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c
index 28c0e5fd7bf24ac79857b3488dc28f12b3c354df..264cec20a92a1ce6a6b5f821773a65ca727ecba9 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -140,7 +140,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memcpy(h_Anext, h_A0, sizeof(float) * size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // memory allocation
   d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
@@ -170,7 +170,7 @@ int main(int argc, char **argv) {
   //  printf("block x is %d and y is %d z \n",block[0],block[1]);
   //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
   clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
   clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
@@ -195,7 +195,7 @@ int main(int argc, char **argv) {
     d_A0 = d_Anext;
     d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
     clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
     clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
@@ -213,7 +213,7 @@ int main(int argc, char **argv) {
                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseMemObject(d_A0);
   clStatus = clReleaseMemObject(d_Anext);
   clStatus = clReleaseKernel(clKernel);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c
index f767f6a9d29094623296e012a6b2671954b0546a..7b5db72237cadd39a3b560f26dc5c65e58f8f6f9 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
   clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
@@ -140,7 +140,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memcpy(h_Anext, h_A0, sizeof(float) * size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // memory allocation
   d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
@@ -170,7 +170,7 @@ int main(int argc, char **argv) {
   //  printf("block x is %d and y is %d z \n",block[0],block[1]);
   //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
   clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
   clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
@@ -195,7 +195,7 @@ int main(int argc, char **argv) {
     d_A0 = d_Anext;
     d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
     clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
     clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
@@ -213,7 +213,7 @@ int main(int argc, char **argv) {
                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseMemObject(d_A0);
   clStatus = clReleaseMemObject(d_Anext);
   clStatus = clReleaseKernel(clKernel);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c
index 10626bed59111d3ded3429626463966914218a5c..51c263f0efaa2ef561d471af396530f7f6113d94 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
 
   cl_uint numPlatforms;
@@ -145,7 +145,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memcpy(h_Anext, h_A0, sizeof(float) * size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // memory allocation
   d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
@@ -175,7 +175,7 @@ int main(int argc, char **argv) {
   //  printf("block x is %d and y is %d z \n",block[0],block[1]);
   //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
   clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
   clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
@@ -200,7 +200,7 @@ int main(int argc, char **argv) {
     d_A0 = d_Anext;
     d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
     clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
     clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
@@ -218,7 +218,7 @@ int main(int argc, char **argv) {
                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseMemObject(d_A0);
   clStatus = clReleaseMemObject(d_Anext);
   clStatus = clReleaseKernel(clKernel);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c
index 1d03111f209173dfc2462cb274e1bb0ac56e9c8c..a2a98e923364de634a4ba3e3cc6db2ce23203d7b 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   cl_int clStatus;
 
   cl_uint numPlatforms;
@@ -184,7 +184,7 @@ int main(int argc, char **argv) {
   // main execution
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   int t;
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   /*for(int i=0; i<1; i++) {*/
   for (t = 0; t < iteration; t++) {
     clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
@@ -216,7 +216,7 @@ int main(int argc, char **argv) {
                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   clStatus = clReleaseMemObject(d_A0);
   clStatus = clReleaseMemObject(d_Anext);
   clStatus = clReleaseKernel(clKernel);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll
index 9ea545c1841fcf2afa6dab59a6fd695aa25d0188..a288b7649ac6bb5c9a1cc90abea8e40bfe069c17 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/kernel-spir64.ll
@@ -1,4 +1,4 @@
-; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl'
+; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
 
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c
index cf86734a8639ce38eb2b1ac8280582e7bde4531c..9fc78af4b9a911fd0ef857209e04f13d3c931171 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
 
   cl_uint numPlatforms;
@@ -145,7 +145,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memcpy(h_Anext, h_A0, sizeof(float) * size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // memory allocation
   d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
@@ -177,7 +177,7 @@ int main(int argc, char **argv) {
   printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1],
          grid[2], block[0], block[1], block[2]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
   clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
   clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
@@ -190,7 +190,7 @@ int main(int argc, char **argv) {
   // main execution
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   int t;
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 2; i++) {
     for (t = 0; t < iteration; t++) {
       /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -204,7 +204,7 @@ int main(int argc, char **argv) {
       d_A0 = d_Anext;
       d_Anext = d_temp;
 
-      /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/
+      /*pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);*/
       clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
       clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
       /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
@@ -226,7 +226,7 @@ int main(int argc, char **argv) {
                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseMemObject(d_A0);
   clStatus = clReleaseMemObject(d_Anext);
   clStatus = clReleaseKernel(clKernel);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll
index 9ea545c1841fcf2afa6dab59a6fd695aa25d0188..a288b7649ac6bb5c9a1cc90abea8e40bfe069c17 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/kernel-spir64.ll
@@ -1,4 +1,4 @@
-; ModuleID = '/home/psrivas2/visc/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl'
+; ModuleID = '/home/psrivas2/hpvm/llvm/test/HPVM/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
 target triple = "spir64-unknown-unknown"
 
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c
index 3b009e370e284a5b5b705bcc3a8122547a83c177..a1e1c4e74ebd305236e9b2c5e27eda6eca3457c7 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   cl_int clStatus;
 
   cl_uint numPlatforms;
@@ -145,7 +145,7 @@ int main(int argc, char **argv) {
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memcpy(h_Anext, h_A0, sizeof(float) * size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
 
   // memory allocation
   d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
@@ -177,7 +177,7 @@ int main(int argc, char **argv) {
   printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1],
          grid[2], block[0], block[1], block[2]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
   clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
   clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
@@ -190,7 +190,7 @@ int main(int argc, char **argv) {
   // main execution
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   int t;
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
   for (int i = 0; i < 1; i++) {
     for (t = 0; t < iteration; t++) {
       /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
@@ -204,7 +204,7 @@ int main(int argc, char **argv) {
       d_A0 = d_Anext;
       d_Anext = d_temp;
 
-      /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/
+      /*pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);*/
       clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
       clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
       /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
@@ -226,7 +226,7 @@ int main(int argc, char **argv) {
                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_SETUP);
   clStatus = clReleaseMemObject(d_A0);
   clStatus = clReleaseMemObject(d_Anext);
   clStatus = clReleaseKernel(clKernel);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll
deleted file mode 100644
index 7dc32f37603e16c20a72d6a4b4b808c7b38afb79..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.visc.ll
+++ /dev/null
@@ -1,673 +0,0 @@
-; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
-; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
-; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll parboil.ll  -S -o %t.linked.ll
-; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
-; ModuleID = 'build/visc_vec_default/stencil.ll'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%rtype = type {}
-%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] }
-%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* }
-%struct.pb_Timer = type { i32, i64, i64 }
-%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* }
-%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* }
-%struct.pb_Parameters = type { i8*, i8** }
-%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
-%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
-%struct.arg = type <{ float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, %rtype }>
-
-@.str3 = private unnamed_addr constant [3 x i8] c"rb\00", align 1
-@.str4 = private unnamed_addr constant [37 x i8] c"grid(%d, %d, %d), block(%d, %d, %d)\0A\00", align 1
-@.str5 = private unnamed_addr constant [17 x i8] c"A[126,1,1] = %f\0A\00", align 1
-@.str6 = private unnamed_addr constant [17 x i8] c"A[125,1,1] = %f\0A\00", align 1
-@str = private unnamed_addr constant [46 x i8] c"OpenCL accelerated 7 points stencil codes****\00"
-@str7 = private unnamed_addr constant [45 x i8] c"Author: Li-Wen Chang <lchang20@illinois.edu>\00"
-@str8 = private unnamed_addr constant [106 x i8] c"Usage: probe nx ny nz t\0Anx: the grid size x\0Any: the grid size y\0Anz: the grid size z\0At: the iteration time\00"
-@viscTimerSet_GenVISC = common global i8* null
-@0 = internal constant [14 x i8] c"GenVISC_Timer\00"
-
-; Function Attrs: nounwind uwtable
-define %rtype @naive_kernel(float %c0, float %c1, float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz) #0 {
-entry:
-  %naive_kernel.node = call i8* @llvm.visc.getNode()
-  %naive_kernel.parentNode = call i8* @llvm.visc.getParentNode(i8* %naive_kernel.node)
-  %a0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %naive_kernel.parentNode)
-  %a1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %naive_kernel.node)
-  %a2 = mul i32 %a0, %a1
-  %a3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %naive_kernel.node)
-  %a4 = add i32 %a2, %a3
-  ;%add = add nsw i32 %4, 1
-  %a5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %naive_kernel.parentNode)
-  %a6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %naive_kernel.node)
-  %a7 = mul i32 %a5, %a6
-  %a8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %naive_kernel.node)
-  %a9 = add i32 %a7, %a8
-  ;%add3 = add nsw i32 %9, 1
-  %a10 = call i32 @llvm.visc.getNodeInstanceID.z(i8* %naive_kernel.parentNode)
-  %a11 = call i32 @llvm.visc.getNumNodeInstances.z(i8* %naive_kernel.node)
-  %a12 = mul i32 %a10, %a11
-  %a13 = call i32 @llvm.visc.getNodeInstanceID.z(i8* %naive_kernel.node)
-  %a14 = add i32 %a12, %a13
-  ;%sub = add nsw i32 %nx, -1
-  ;%cmp = icmp slt i32 %add, %sub
-  ;br i1 %cmp, label %if.then, label %if.end
-
-
-  ;%call = tail call i32 @get_global_id(i32 0) #2
-  ;%mul = shl i32 %call, 2
-  %mul = shl i32 %a4, 2
-  %add258 = or i32 %mul, 1
-  ;%call1 = tail call i32 @get_global_id(i32 1) #2
-  ;%add2 = add i32 %call1, 1
-  %add2 = add i32 %a9, 1
-  ;%call3 = tail call i32 @get_global_id(i32 2) #2
-  ;%add4 = add i32 %call3, 1
-  %add4 = add i32 %a14, 1
-  %sub = add i32 %add258, 3
-  %sub6 = add i32 %nx, -1
-  %cmp = icmp slt i32 %sub, %sub6
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:                                          ; preds = %entry
-  %mul7 = mul nsw i32 %add4, %ny
-  %add8 = add nsw i32 %mul7, %add2
-  %mul9 = mul nsw i32 %add8, %nx
-  %add11 = add i32 %sub, %mul9
-  %add.ptr = getelementptr inbounds float* %A0, i32 %add11
-  ;%call12 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr) #2
-  %vadd.ptr = bitcast float* %add.ptr to <4 x float>*
-  %call12 = load <4 x float>* %vadd.ptr
-
-  %add13 = add i32 %a14, 2
-  %mul14 = mul nsw i32 %add13, %ny
-  %add15 = add nsw i32 %mul14, %add2
-  %mul16 = mul nsw i32 %add15, %nx
-  %add18 = add i32 %sub, %mul16
-  %add.ptr19 = getelementptr inbounds float* %A0, i32 %add18
-  ;%call20 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr19) #2
-  %vadd.ptr19 = bitcast float* %add.ptr19 to <4 x float>*
-  %call20 = load <4 x float>* %vadd.ptr19
-
-  %mul22 = mul nsw i32 %a14, %ny
-  %add23 = add nsw i32 %mul22, %add2
-  %mul24 = mul nsw i32 %add23, %nx
-  %add26 = add i32 %sub, %mul24
-  %add.ptr27 = getelementptr inbounds float* %A0, i32 %add26
-  ;%call28 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr27) #2
-  %vadd.ptr27 = bitcast float* %add.ptr27 to <4 x float>*
-  %call28 = load <4 x float>* %vadd.ptr27
-
-  %add29 = add i32 %a9, 2
-  %add31 = add nsw i32 %add29, %mul7
-  %mul32 = mul nsw i32 %add31, %nx
-  %add34 = add i32 %sub, %mul32
-  %add.ptr35 = getelementptr inbounds float* %A0, i32 %add34
-  ;%call36 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr35) #2
-  %vadd.ptr35 = bitcast float* %add.ptr35 to <4 x float>*
-  %call36 = load <4 x float>* %vadd.ptr35
-
-  %add39 = add nsw i32 %mul7, %a9
-  %mul40 = mul nsw i32 %add39, %nx
-  %add42 = add i32 %sub, %mul40
-  %add.ptr43 = getelementptr inbounds float* %A0, i32 %add42
-  ;%call44 = tail call <4 x float> @_Z6vload4jPKU3AS1f(i32 0, float* %add.ptr43) #2
-  %vadd.ptr43 = bitcast float* %add.ptr43 to <4 x float>*
-  %call44 = load <4 x float>* %vadd.ptr43
-
-  %add49 = add i32 %add258, 4
-  %add50 = add i32 %add49, %mul9
-  %arrayidx = getelementptr inbounds float* %A0, i32 %add50
-  %0 = load float* %arrayidx, align 4, !tbaa !2
-  %add55261 = or i32 %mul, 3
-  %add56 = add i32 %add55261, %mul9
-  %arrayidx57 = getelementptr inbounds float* %A0, i32 %add56
-  %1 = load float* %arrayidx57, align 4, !tbaa !2
-  %2 = shufflevector <4 x float> %call12, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
-  %vext = shufflevector <3 x float> %2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-  %vecinit58 = insertelement <4 x float> %vext, float %0, i32 3
-  %vecinit60 = insertelement <4 x float> undef, float %1, i32 0
-  %vecinit62 = shufflevector <4 x float> %vecinit60, <4 x float> %call12, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
-  %splat.splatinsert = insertelement <4 x float> undef, float %c1, i32 0
-  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
-  %add63 = fadd fast <4 x float> %call20, %call28
-  %add64 = fadd fast <4 x float> %add63, %call36
-  %add65 = fadd fast <4 x float> %add64, %call44
-  %add66 = fadd fast <4 x float> %add65, %vecinit58
-  %add67 = fadd fast <4 x float> %add66, %vecinit62
-  %mul68 = fmul fast <4 x float> %splat.splat, %add67
-  %splat.splatinsert69 = insertelement <4 x float> undef, float %c0, i32 0
-  %splat.splat70 = shufflevector <4 x float> %splat.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer
-  %mul71 = fmul fast <4 x float> %splat.splat70, %call12
-  %sub72 = fsub fast <4 x float> %mul68, %mul71
-  %add.ptr78 = getelementptr inbounds float* %Anext, i32 %add11
-  ;tail call void @_Z7vstore4Dv4_fjPU3AS1f(<4 x float> %sub72, i32 0, float* %add.ptr78) #2
-  %vadd.ptr78 = bitcast float* %add.ptr78 to <4 x float>*
-  store <4 x float> %sub72, <4 x float>* %vadd.ptr78
-
-  br label %if.end146
-
-if.else:                                          ; preds = %entry
-  %cmp80 = icmp slt i32 %add258, %sub6
-  br i1 %cmp80, label %for.body.lr.ph, label %if.end146
-
-for.body.lr.ph:                                   ; preds = %if.else
-  %add84 = add i32 %a14, 2
-  %mul85 = mul nsw i32 %add84, %ny
-  %add86 = add nsw i32 %mul85, %add2
-  %mul87 = mul nsw i32 %add86, %nx
-  %add88 = add i32 %mul87, 3
-  %mul92 = mul nsw i32 %a14, %ny
-  %add93 = add nsw i32 %mul92, %add2
-  %mul94 = mul nsw i32 %add93, %nx
-  %add95 = add i32 %mul94, 3
-  %add99 = add i32 %a9, 2
-  %mul100 = mul nsw i32 %add4, %ny
-  %add101 = add nsw i32 %add99, %mul100
-  %mul102 = mul nsw i32 %add101, %nx
-  %add103 = add i32 %mul102, 3
-  %add109 = add nsw i32 %mul100, %a9
-  %mul110 = mul nsw i32 %add109, %nx
-  %add111 = add i32 %mul110, 3
-  %add117 = add nsw i32 %mul100, %add2
-  %mul118 = mul nsw i32 %add117, %nx
-  %add119 = add i32 %mul118, 3
-  %add127 = add i32 %mul118, 2
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %vid.0260 = phi i32 [ %add258, %for.body.lr.ph ], [ %add115, %for.body ]
-  %add89 = add i32 %add88, %vid.0260
-  %arrayidx90 = getelementptr inbounds float* %A0, i32 %add89
-  %3 = load float* %arrayidx90, align 4, !tbaa !2
-  %add96 = add i32 %add95, %vid.0260
-  %arrayidx97 = getelementptr inbounds float* %A0, i32 %add96
-  %4 = load float* %arrayidx97, align 4, !tbaa !2
-  %add98 = fadd fast float %3, %4
-  %add104 = add i32 %add103, %vid.0260
-  %arrayidx105 = getelementptr inbounds float* %A0, i32 %add104
-  %5 = load float* %arrayidx105, align 4, !tbaa !2
-  %add106 = fadd fast float %add98, %5
-  %add112 = add i32 %add111, %vid.0260
-  %arrayidx113 = getelementptr inbounds float* %A0, i32 %add112
-  %6 = load float* %arrayidx113, align 4, !tbaa !2
-  %add114 = fadd fast float %add106, %6
-  %add115 = add nsw i32 %vid.0260, 1
-  %add120 = add i32 %add119, %add115
-  %arrayidx121 = getelementptr inbounds float* %A0, i32 %add120
-  %7 = load float* %arrayidx121, align 4, !tbaa !2
-  %add122 = fadd fast float %add114, %7
-  %add128 = add i32 %add127, %vid.0260
-  %arrayidx129 = getelementptr inbounds float* %A0, i32 %add128
-  %8 = load float* %arrayidx129, align 4, !tbaa !2
-  %add130 = fadd fast float %add122, %8
-  %mul131 = fmul fast float %add130, %c1
-  %add136 = add i32 %add119, %vid.0260
-  %arrayidx137 = getelementptr inbounds float* %A0, i32 %add136
-  %9 = load float* %arrayidx137, align 4, !tbaa !2
-  %mul138 = fmul fast float %9, %c0
-  %sub139 = fsub fast float %mul131, %mul138
-  %arrayidx145 = getelementptr inbounds float* %Anext, i32 %add136
-  store float %sub139, float* %arrayidx145, align 4, !tbaa !2
-  %exitcond = icmp eq i32 %add115, %sub6
-  br i1 %exitcond, label %if.end146, label %for.body
-
-if.end146:                                        ; preds = %for.body, %if.else, %if.then
-  ;ret void
-
-
-
-
-
-;if.then:                                          ; preds = %entry
-  ;%add5 = add nsw i32 %14, 1
-  ;%add6 = add nsw i32 %14, 2
-  ;%mul = mul nsw i32 %add6, %ny
-  ;%add7 = add nsw i32 %mul, %add3
-  ;%mul8 = mul nsw i32 %add7, %nx
-  ;%add9 = add i32 %4, 4
-  ;%add10 = add i32 %add9, %mul8
-  ;%idxprom = sext i32 %add10 to i64
-  ;%arrayidx = getelementptr inbounds float* %A0, i64 %idxprom
-  ;%15 = load float* %arrayidx, align 4, !tbaa !2
-  ;%mul12 = mul nsw i32 %14, %ny
-  ;%add13 = add nsw i32 %mul12, %add3
-  ;%mul14 = mul nsw i32 %add13, %nx
-  ;%add16 = add i32 %add9, %mul14
-  ;%idxprom17 = sext i32 %add16 to i64
-  ;%arrayidx18 = getelementptr inbounds float* %A0, i64 %idxprom17
-  ;%16 = load float* %arrayidx18, align 4, !tbaa !2
-  ;%add19 = fadd fast float %15, %16
-  ;%add20 = add nsw i32 %9, 2
-  ;%mul21 = mul nsw i32 %add5, %ny
-  ;%add22 = add nsw i32 %add20, %mul21
-  ;%mul23 = mul nsw i32 %add22, %nx
-  ;%add25 = add i32 %add9, %mul23
-  ;%idxprom26 = sext i32 %add25 to i64
-  ;%arrayidx27 = getelementptr inbounds float* %A0, i64 %idxprom26
-  ;%17 = load float* %arrayidx27, align 4, !tbaa !2
-  ;%add28 = fadd fast float %add19, %17
-  ;%add31 = add nsw i32 %mul21, %9
-  ;%mul32 = mul nsw i32 %add31, %nx
-  ;%add34 = add i32 %add9, %mul32
-  ;%idxprom35 = sext i32 %add34 to i64
-  ;%arrayidx36 = getelementptr inbounds float* %A0, i64 %idxprom35
-  ;%18 = load float* %arrayidx36, align 4, !tbaa !2
-  ;%add37 = fadd fast float %add28, %18
-  ;%add40 = add nsw i32 %mul21, %add3
-  ;%mul41 = mul nsw i32 %add40, %nx
-  ;%add42 = add i32 %4, 5
-  ;%add43 = add i32 %add42, %mul41
-  ;%idxprom44 = sext i32 %add43 to i64
-  ;%arrayidx45 = getelementptr inbounds float* %A0, i64 %idxprom44
-  ;%19 = load float* %arrayidx45, align 4, !tbaa !2
-  ;%add46 = fadd fast float %add37, %19
-  ;%add51 = add i32 %4, 3
-  ;%add52 = add i32 %add51, %mul41
-  ;%idxprom53 = sext i32 %add52 to i64
-  ;%arrayidx54 = getelementptr inbounds float* %A0, i64 %idxprom53
-  ;%20 = load float* %arrayidx54, align 4, !tbaa !2
-  ;%add55 = fadd fast float %add46, %20
-  ;%mul56 = fmul fast float %add55, %c1
-  ;%add61 = add i32 %add9, %mul41
-  ;%idxprom62 = sext i32 %add61 to i64
-  ;%arrayidx63 = getelementptr inbounds float* %A0, i64 %idxprom62
-  ;%21 = load float* %arrayidx63, align 4, !tbaa !2
-  ;%mul64 = fmul fast float %21, %c0
-  ;%sub65 = fsub fast float %mul56, %mul64
-  ;%arrayidx72 = getelementptr inbounds float* %Anext, i64 %idxprom62
-  ;store float %sub65, float* %arrayidx72, align 4, !tbaa !2
-  ;br label %if.end
-
-;if.end:                                           ; preds = %if.then, %entry
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** %argv) #0 {
-entry:
-  %argc.addr = alloca i32, align 4
-  %timers = alloca %struct.pb_TimerSet, align 8
-  store i32 %argc, i32* %argc.addr, align 4, !tbaa !5
-  %0 = bitcast %struct.pb_TimerSet* %timers to i8*
-  call void @llvm.lifetime.start(i64 800, i8* %0) #1
-  %puts = call i32 @puts(i8* getelementptr inbounds ([46 x i8]* @str, i64 0, i64 0))
-  %puts186 = call i32 @puts(i8* getelementptr inbounds ([45 x i8]* @str7, i64 0, i64 0))
-  %call2 = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1
-  %1 = load i32* %argc.addr, align 4, !tbaa !5
-  %cmp = icmp slt i32 %1, 5
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  %puts187 = call i32 @puts(i8* getelementptr inbounds ([106 x i8]* @str8, i64 0, i64 0))
-  br label %cleanup
-
-if.end:                                           ; preds = %entry
-  %arrayidx = getelementptr inbounds i8** %argv, i64 1
-  %2 = load i8** %arrayidx, align 8, !tbaa !6
-  %call.i = call i64 @strtol(i8* nocapture %2, i8** null, i32 10) #1
-  %conv.i = trunc i64 %call.i to i32
-  %cmp5 = icmp slt i32 %conv.i, 1
-  br i1 %cmp5, label %cleanup, label %if.end7
-
-if.end7:                                          ; preds = %if.end
-  %arrayidx8 = getelementptr inbounds i8** %argv, i64 2
-  %3 = load i8** %arrayidx8, align 8, !tbaa !6
-  %call.i188 = call i64 @strtol(i8* nocapture %3, i8** null, i32 10) #1
-  %conv.i189 = trunc i64 %call.i188 to i32
-  %cmp10 = icmp slt i32 %conv.i189, 1
-  br i1 %cmp10, label %cleanup, label %if.end12
-
-if.end12:                                         ; preds = %if.end7
-  %arrayidx13 = getelementptr inbounds i8** %argv, i64 3
-  %4 = load i8** %arrayidx13, align 8, !tbaa !6
-  %call.i190 = call i64 @strtol(i8* nocapture %4, i8** null, i32 10) #1
-  %conv.i191 = trunc i64 %call.i190 to i32
-  %cmp15 = icmp slt i32 %conv.i191, 1
-  br i1 %cmp15, label %cleanup, label %if.end17
-
-if.end17:                                         ; preds = %if.end12
-  %arrayidx18 = getelementptr inbounds i8** %argv, i64 4
-  %5 = load i8** %arrayidx18, align 8, !tbaa !6
-  %call.i192 = call i64 @strtol(i8* nocapture %5, i8** null, i32 10) #1
-  %conv.i193 = trunc i64 %call.i192 to i32
-  %cmp20 = icmp slt i32 %conv.i193, 1
-  br i1 %cmp20, label %cleanup, label %for.cond1.preheader.lr.ph.i
-
-for.cond1.preheader.lr.ph.i:                      ; preds = %if.end17
-  %mul = shl i64 %call.i, 32
-  %mul23 = mul i64 %mul, %call.i188
-  %sext = mul i64 %mul23, %call.i190
-  %add = ashr exact i64 %sext, 30
-  %mul24 = add i64 %add, 12
-  %call25 = call noalias i8* @malloc(i64 %mul24) #1
-  %6 = bitcast i8* %call25 to float*
-  %call27 = call noalias i8* @malloc(i64 %mul24) #1
-  %7 = bitcast i8* %call27 to float*
-  %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call2, i64 0, i32 1
-  %8 = load i8*** %inpFiles, align 8, !tbaa !6
-  %9 = load i8** %8, align 8, !tbaa !6
-  %call29 = call %struct._IO_FILE* @fopen(i8* %9, i8* getelementptr inbounds ([3 x i8]* @.str3, i64 0, i64 0)) #1
-  %add.ptr = getelementptr inbounds i8* %call25, i64 12
-  %10 = bitcast i8* %add.ptr to float*
-  %cmp24.i = icmp sgt i32 %conv.i189, 0
-  %cmp51.i = icmp sgt i32 %conv.i, 0
-  %or.cond = and i1 %cmp24.i, %cmp51.i
-  br i1 %or.cond, label %for.cond4.preheader.lr.ph.us.i.preheader.split.us, label %read_data.exit
-
-for.cond4.preheader.lr.ph.us.i.preheader.split.us: ; preds = %for.cond1.preheader.lr.ph.i
-  %11 = mul i32 %conv.i, %conv.i189
-  br label %for.body6.lr.ph.us.us.i.preheader.us
-
-for.body6.lr.ph.us.us.i.us:                       ; preds = %for.body6.lr.ph.us.us.i.preheader.us, %for.inc8.us.us.i.us
-  %j.06.us.us.i.us = phi i32 [ %inc9.us.us.i.us, %for.inc8.us.us.i.us ], [ 0, %for.body6.lr.ph.us.us.i.preheader.us ]
-  %s.15.us.us.i.us = phi i32 [ %14, %for.inc8.us.us.i.us ], [ %s.09.us.i.us, %for.body6.lr.ph.us.us.i.preheader.us ]
-  %12 = sext i32 %s.15.us.us.i.us to i64
-  br label %for.body6.us.us.i.us
-
-for.body6.us.us.i.us:                             ; preds = %for.body6.us.us.i.us, %for.body6.lr.ph.us.us.i.us
-  %indvars.iv.i.us = phi i64 [ %indvars.iv.next.i.us, %for.body6.us.us.i.us ], [ %12, %for.body6.lr.ph.us.us.i.us ]
-  %k.03.us.us.i.us = phi i32 [ %inc7.us.us.i.us, %for.body6.us.us.i.us ], [ 0, %for.body6.lr.ph.us.us.i.us ]
-  %add.ptr.us.us.i.us = getelementptr inbounds float* %10, i64 %indvars.iv.i.us
-  %13 = bitcast float* %add.ptr.us.us.i.us to i8*
-  %call.us.us.i.us = call i64 @fread(i8* %13, i64 4, i64 1, %struct._IO_FILE* %call29) #1
-  %indvars.iv.next.i.us = add i64 %indvars.iv.i.us, 1
-  %inc7.us.us.i.us = add nsw i32 %k.03.us.us.i.us, 1
-  %exitcond.i.us = icmp eq i32 %inc7.us.us.i.us, %conv.i
-  br i1 %exitcond.i.us, label %for.inc8.us.us.i.us, label %for.body6.us.us.i.us
-
-for.inc8.us.us.i.us:                              ; preds = %for.body6.us.us.i.us
-  %14 = add i32 %s.15.us.us.i.us, %conv.i
-  %inc9.us.us.i.us = add nsw i32 %j.06.us.us.i.us, 1
-  %exitcond33.i.us = icmp eq i32 %inc9.us.us.i.us, %conv.i189
-  br i1 %exitcond33.i.us, label %for.inc11.us.i.us, label %for.body6.lr.ph.us.us.i.us
-
-for.inc11.us.i.us:                                ; preds = %for.inc8.us.us.i.us
-  %15 = add i32 %11, %s.09.us.i.us
-  %inc12.us.i.us = add nsw i32 %i.010.us.i.us, 1
-  %exitcond34.i.us = icmp eq i32 %inc12.us.i.us, %conv.i191
-  br i1 %exitcond34.i.us, label %read_data.exit, label %for.body6.lr.ph.us.us.i.preheader.us
-
-for.body6.lr.ph.us.us.i.preheader.us:             ; preds = %for.inc11.us.i.us, %for.cond4.preheader.lr.ph.us.i.preheader.split.us
-  %i.010.us.i.us = phi i32 [ %inc12.us.i.us, %for.inc11.us.i.us ], [ 0, %for.cond4.preheader.lr.ph.us.i.preheader.split.us ]
-  %s.09.us.i.us = phi i32 [ %15, %for.inc11.us.i.us ], [ 0, %for.cond4.preheader.lr.ph.us.i.preheader.split.us ]
-  br label %for.body6.lr.ph.us.us.i.us
-
-read_data.exit:                                   ; preds = %for.inc11.us.i.us, %for.cond1.preheader.lr.ph.i
-  %call31 = call i32 @fclose(%struct._IO_FILE* %call29) #1
-  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  %16 = call i8* @llvm_visc_initializeTimerSet()
-  store i8* %16, i8** @viscTimerSet_GenVISC
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  call void @llvm.visc.init()
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 15) #1
-  call void @llvm_visc_track_mem(i8* %call25, i64 %mul24) #1
-  call void @llvm_visc_track_mem(i8* %call27, i64 %mul24) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call27, i8* %call25, i64 %mul24, i32 4, i1 false)
-  %sub40 = add nsw i32 %conv.i, 253
-  %div = sdiv i32 %sub40, 256
-  %mul42 = shl nsw i32 %div, 6
-  %sub44 = add nsw i32 %conv.i189, -2
-  %sub46 = add nsw i32 %conv.i191, -2
-  %call53 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([37 x i8]* @.str4, i64 0, i64 0), i32 %mul42, i32 %sub44, i32 %sub46, i32 64, i32 1, i32 1) #1
-  %add56 = add nsw i32 %conv.i189, 1
-  %mul57 = mul nsw i32 %add56, %conv.i
-  %add59 = add nsw i32 %mul57, 129
-  %idxprom = sext i32 %add59 to i64
-  %arrayidx60 = getelementptr inbounds float* %6, i64 %idxprom
-  %17 = load float* %arrayidx60, align 4, !tbaa !2
-  %conv61 = fpext float %17 to double
-  %call62 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str5, i64 0, i64 0), double %conv61) #1
-  %add67 = add nsw i32 %mul57, 128
-  %idxprom68 = sext i32 %add67 to i64
-  %arrayidx69 = getelementptr inbounds float* %6, i64 %idxprom68
-  %18 = load float* %arrayidx69, align 4, !tbaa !2
-  %conv70 = fpext float %18 to double
-  %call71 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str6, i64 0, i64 0), double %conv70) #1
-  %cmp72194 = icmp sgt i32 %conv.i193, 0
-  br i1 %cmp72194, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.body, %read_data.exit
-  %h_A0.0197 = phi float* [ %h_Anext.0196, %for.body ], [ %6, %read_data.exit ]
-  %h_Anext.0196 = phi float* [ %h_A0.0197, %for.body ], [ %7, %read_data.exit ]
-  %t.0195 = phi i32 [ %inc, %for.body ], [ 0, %read_data.exit ]
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19)
-  %in.addr = alloca %struct.arg
-  %in.addr.c0 = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  %in.addr.c0.cast = fptrunc double 0x3FC5555560000000 to float
-  store float %in.addr.c0.cast, float* %in.addr.c0
-  %in.addr.c1 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  %in.addr.c1.cast = fptrunc double 0x3F9C71C720000000 to float
-  store float %in.addr.c1.cast, float* %in.addr.c1
-  %in.addr.A0 = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  store float* %h_A0.0197, float** %in.addr.A0
-  %in.addr.bytes_A0 = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  store i64 %mul24, i64* %in.addr.bytes_A0
-  %in.addr.Anext = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  store float* %h_Anext.0196, float** %in.addr.Anext
-  %in.addr.bytes_Anext = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  store i64 %mul24, i64* %in.addr.bytes_Anext
-  %in.addr.nx = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  store i32 %conv.i, i32* %in.addr.nx
-  %in.addr.ny = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  store i32 %conv.i189, i32* %in.addr.ny
-  %in.addr.nz = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-  store i32 %conv.i191, i32* %in.addr.nz
-  %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  store i32 64, i32* %in.addr.dimX0
-  %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 10
-  store i32 1, i32* %in.addr.dimY0
-  %in.addr.dimZ1 = getelementptr %struct.arg* %in.addr, i32 0, i32 11
-  store i32 1, i32* %in.addr.dimZ1
-  %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 12
-  store i32 %div, i32* %in.addr.dimX1
-  %in.addr.dimY2 = getelementptr %struct.arg* %in.addr, i32 0, i32 13
-  store i32 %sub44, i32* %in.addr.dimY2
-  %in.addr.dimZ2 = getelementptr %struct.arg* %in.addr, i32 0, i32 14
-  store i32 %sub46, i32* %in.addr.dimZ2
-  %args = bitcast %struct.arg* %in.addr to i8*
-  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
-  %graphnaive_kernelInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level2 to i8*), i8* %args)
-  call void @llvm.visc.wait(i8* %graphnaive_kernelInternal_level2)
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
-  %inc = add nsw i32 %t.0195, 1
-  %exitcond = icmp eq i32 %inc, %conv.i193
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %read_data.exit
-  %h_A0.0.lcssa = phi float* [ %6, %read_data.exit ], [ %h_Anext.0196, %for.body ]
-  %h_Anext.0.lcssa = phi float* [ %7, %read_data.exit ], [ %h_A0.0197, %for.body ]
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1
-  %19 = bitcast float* %h_A0.0.lcssa to i8*
-  call void @llvm_visc_request_mem(i8* %19, i64 %mul24) #1
-  %arrayidx97 = getelementptr inbounds float* %h_A0.0.lcssa, i64 %idxprom
-  %20 = load float* %arrayidx97, align 4, !tbaa !2
-  %conv98 = fpext float %20 to double
-  %call99 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str5, i64 0, i64 0), double %conv98) #1
-  %arrayidx106 = getelementptr inbounds float* %h_A0.0.lcssa, i64 %idxprom68
-  %21 = load float* %arrayidx106, align 4, !tbaa !2
-  %conv107 = fpext float %21 to double
-  %call108 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str6, i64 0, i64 0), double %conv107) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1
-  %22 = bitcast float* %h_Anext.0.lcssa to i8*
-  call void @llvm_visc_untrack_mem(i8* %22) #1
-  call void @llvm_visc_untrack_mem(i8* %19) #1
-  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
-  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0
-  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr)
-  call void @llvm.visc.cleanup()
-  %outFile = getelementptr inbounds %struct.pb_Parameters* %call2, i64 0, i32 0
-  %23 = load i8** %outFile, align 8, !tbaa !6
-  %tobool = icmp eq i8* %23, null
-  br i1 %tobool, label %if.end113, label %if.then110
-
-if.then110:                                       ; preds = %for.end
-  %add.ptr112 = getelementptr inbounds float* %h_A0.0.lcssa, i64 3
-  call void @outputData(i8* %23, float* %add.ptr112, i32 %conv.i, i32 %conv.i189, i32 %conv.i191) #1
-  br label %if.end113
-
-if.end113:                                        ; preds = %if.then110, %for.end
-  call void @free(i8* %22) #1
-  call void @free(i8* %19) #1
-  call void @pb_FreeParameters(%struct.pb_Parameters* %call2) #1
-  br label %cleanup
-
-cleanup:                                          ; preds = %if.end113, %if.end17, %if.end12, %if.end7, %if.end, %if.then
-  %retval.0 = phi i32 [ -1, %if.then ], [ 0, %if.end113 ], [ -1, %if.end ], [ -1, %if.end7 ], [ -1, %if.end12 ], [ -1, %if.end17 ]
-  call void @llvm.lifetime.end(i64 800, i8* %0) #1
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-
-; Function Attrs: nounwind
-declare i32 @printf(i8* nocapture, ...) #2
-
-declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #3
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #2
-
-; Function Attrs: nounwind
-declare noalias %struct._IO_FILE* @fopen(i8* nocapture, i8* nocapture) #2
-
-; Function Attrs: nounwind
-declare i32 @fclose(%struct._IO_FILE* nocapture) #2
-
-declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #3
-
-declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #3
-
-declare void @llvm_visc_track_mem(i8*, i64) #3
-
-; Function Attrs: nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #1
-
-declare void @llvm_visc_request_mem(i8*, i64) #3
-
-declare void @llvm_visc_untrack_mem(i8*) #3
-
-declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #3
-
-declare void @outputData(i8*, float*, i32, i32, i32) #3
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #2
-
-declare void @pb_FreeParameters(%struct.pb_Parameters*) #3
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
-
-; Function Attrs: nounwind
-declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #2
-
-; Function Attrs: nounwind
-declare i64 @strtol(i8*, i8** nocapture, i32) #2
-
-; Function Attrs: nounwind
-declare i32 @puts(i8* nocapture) #1
-
-declare i8* @llvm_visc_initializeTimerSet()
-
-declare void @llvm_visc_switchToTimer(i8**, i32)
-
-declare void @llvm_visc_printTimerSet(i8**, i8*)
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.z(i8*) #1
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.z(i8*) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @naive_kernelInternal_level1(float %c0, float %c1, float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz, i32 %dimX, i32 %dimY, i32 %dimZ) #0 {
-entry:
-  %naive_kernel.node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32)* @naive_kernel to i8*), i32 %dimX, i32 %dimY, i32 %dimZ)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 5, i32 5)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 6, i32 6)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 7, i32 7)
-  call void @llvm.visc.bind.input(i8* %naive_kernel.node, i32 8, i32 8)
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32) #1
-
-; Function Attrs: nounwind uwtable
-define %rtype @naive_kernelInternal_level2(float %c0, float %c1, float* in %A0, i64 %bytes_A0, float* in out %Anext, i64 %bytes_Anext, i32 %nx, i32 %ny, i32 %nz, i32 %dimX, i32 %dimY, i32 %dimZ, i32 %dimX1, i32 %dimY2, i32 %dimZ3) #0 {
-entry:
-  %naive_kernelInternal_level1.node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level1 to i8*), i32 %dimX1, i32 %dimY2, i32 %dimZ3)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 1, i32 1)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 2, i32 2)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 3, i32 3)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 4, i32 4)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 5, i32 5)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 6, i32 6)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 7, i32 7)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 8, i32 8)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 9, i32 9)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 10, i32 10)
-  call void @llvm.visc.bind.input(i8* %naive_kernelInternal_level1.node, i32 11, i32 11)
-  ret %rtype undef
-}
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-
-!visc_hint_gpu = !{}
-!visc_hint_cpu = !{!0, !1}
-
-!0 = metadata !{%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level1}
-!1 = metadata !{%rtype (float, float, float*, i64, float*, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32)* @naive_kernelInternal_level2}
-!2 = metadata !{metadata !"float", metadata !3}
-!3 = metadata !{metadata !"omnipotent char", metadata !4}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
-!5 = metadata !{metadata !"int", metadata !3}
-!6 = metadata !{metadata !"any pointer", metadata !3}
diff --git a/hpvm/test/parboil/benchmarks/tpacf/Makefile b/hpvm/test/parboil/benchmarks/tpacf/Makefile
index 6140acd5ac3a196c8750b997c2e5904ba9585839..e76139ba384fed18f7487e723d0859e4e44075f6 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/Makefile
+++ b/hpvm/test/parboil/benchmarks/tpacf/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = tpacf
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile
similarity index 82%
rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile
index ba6459d78a16e381f4f3b75ee026b380583f87c5..040e2c7994ff0c0ace28099f6f193a7cb7b3d272 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=args.ll model.ll
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc
rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.cc
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h
rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/args.h
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc
similarity index 76%
rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc
rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc
index 3239be6c92f641422f2ba6910894ae68cc8b220e..49208f579c87545dcbfccc01fd054c20e1123d40 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/main.cc
@@ -14,7 +14,7 @@
 #include "args.h"
 #include "model.h"
 
-#include <visc.h>
+#include <hpvm.h>
 
 extern unsigned int NUM_SETS;
 extern unsigned int NUM_ELEMENTS;
@@ -62,13 +62,13 @@ void packData(RootIn *args, hist_t *histograms, size_t bytes_histograms,
 
 void Allocation(long block) {
   // Memory shared between threadblocks
-  // void* data_s = __visc__malloc(sizeof(struct cartesian)*BLOCK_SIZE);
+  // void* data_s = __hpvm__malloc(sizeof(struct cartesian)*BLOCK_SIZE);
   void *warp_hists =
-      __visc__malloc(sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS);
+      __hpvm__malloc(sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS);
 
-  //__visc__return(data_s, sizeof(struct cartesian)*BLOCK_SIZE,
+  //__hpvm__return(data_s, sizeof(struct cartesian)*BLOCK_SIZE,
   // warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS);
-  __visc__return(2, warp_hists,
+  __hpvm__return(2, warp_hists,
                  sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS);
 }
 
@@ -80,14 +80,14 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms,
                // struct cartesian* data_s, size_t bytes_data_s,
                unsigned int *warp_hists, size_t bytes_warp_hists) {
 
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, all_x_data, binb, 1, histograms);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, all_x_data, binb, 1, histograms);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int dimx = __visc__getNumNodeInstances_x(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int dimx = __hpvm__getNumNodeInstances_x(thisNode);
 
   float *all_y_data = all_x_data + NUM_ELEMENTS * (NUM_SETS + 1);
   float *all_z_data = all_y_data + NUM_ELEMENTS * (NUM_SETS + 1);
@@ -170,7 +170,7 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms,
       unsigned int warpnum = tid / (WARP_SIZE / HISTS_PER_WARP);
       if ((distance < binb[min]) && (distance >= binb[max]) &&
           (!do_self || (tid + j > k)) && ((tid + j) < NUM_ELEMENTS)) {
-        __visc__atomic_add((int *)&(warp_hists(bin_index, warpnum)), 1);
+        __hpvm__atomic_add((int *)&(warp_hists(bin_index, warpnum)), 1);
       }
     }
   }
@@ -181,7 +181,7 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms,
   for (unsigned int offset = NUM_HISTOGRAMS >> 1; offset > 0; offset >>= 1) {
     for (unsigned int bin_base = 0; bin_base < NUM_BINS;
          bin_base += BLOCK_SIZE / (NUM_HISTOGRAMS >> 1)) {
-      __visc__barrier();
+      __hpvm__barrier();
       if (warp_index < offset && bin_base + bin_index < NUM_BINS) {
         unsigned long sum =
             warp_hists(bin_base + bin_index, warp_index) +
@@ -191,7 +191,7 @@ void TPACFLeaf(hist_t *histograms, size_t bytes_histograms,
     }
   }
 
-  __visc__barrier();
+  __hpvm__barrier();
 
   // Put the results back in the real histogram
   // warp_hists(x, 0) holds sum of all locations of bin x
@@ -207,26 +207,26 @@ void BlockingTPACF(hist_t *histograms, size_t bytes_histograms,
                    float *binb, size_t bytes_binb, int NUM_SETS,
                    int NUM_ELEMENTS, long block) {
 
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, all_x_data, binb, 1, histograms);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, all_x_data, binb, 1, histograms);
 
-  void *AllocationNode = __visc__createNodeND(0, Allocation);
-  void *TPACFLeafNode = __visc__createNodeND(1, TPACFLeaf, block);
+  void *AllocationNode = __hpvm__createNodeND(0, Allocation);
+  void *TPACFLeafNode = __hpvm__createNodeND(1, TPACFLeaf, block);
 
   // Bind Inputs
-  __visc__bindIn(AllocationNode, 8, 0, 0); // Bind block
-  __visc__bindIn(TPACFLeafNode, 0, 0, 0);  // Bind histograms
-  __visc__bindIn(TPACFLeafNode, 1, 1, 0);  // Bind bytes_histograms
-  __visc__bindIn(TPACFLeafNode, 2, 2, 0);  // Bind all_x_data
-  __visc__bindIn(TPACFLeafNode, 3, 3, 0);  // Bind bytes_all_data
-  __visc__bindIn(TPACFLeafNode, 4, 4, 0);  // Bind binb
-  __visc__bindIn(TPACFLeafNode, 5, 5, 0);  // Bind bytes_binb
-  __visc__bindIn(TPACFLeafNode, 6, 6, 0);  // Bind NUM_SETS
-  __visc__bindIn(TPACFLeafNode, 7, 7, 0);  // Bind NUM_ELEMENTS
+  __hpvm__bindIn(AllocationNode, 8, 0, 0); // Bind block
+  __hpvm__bindIn(TPACFLeafNode, 0, 0, 0);  // Bind histograms
+  __hpvm__bindIn(TPACFLeafNode, 1, 1, 0);  // Bind bytes_histograms
+  __hpvm__bindIn(TPACFLeafNode, 2, 2, 0);  // Bind all_x_data
+  __hpvm__bindIn(TPACFLeafNode, 3, 3, 0);  // Bind bytes_all_data
+  __hpvm__bindIn(TPACFLeafNode, 4, 4, 0);  // Bind binb
+  __hpvm__bindIn(TPACFLeafNode, 5, 5, 0);  // Bind bytes_binb
+  __hpvm__bindIn(TPACFLeafNode, 6, 6, 0);  // Bind NUM_SETS
+  __hpvm__bindIn(TPACFLeafNode, 7, 7, 0);  // Bind NUM_ELEMENTS
 
   // Create Edges
-  __visc__edge(AllocationNode, TPACFLeafNode, 1, 0, 8, 0); // Edge warp_hists
-  __visc__edge(AllocationNode, TPACFLeafNode, 1, 1, 9,
+  __hpvm__edge(AllocationNode, TPACFLeafNode, 1, 0, 8, 0); // Edge warp_hists
+  __hpvm__edge(AllocationNode, TPACFLeafNode, 1, 1, 9,
                0); // Edge bytes_warp_hists
 }
 
@@ -236,21 +236,21 @@ void TPACFRoot(hist_t *histograms, size_t bytes_histograms, float *all_x_data,
                float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS,
                long block, long grid) {
 
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, all_x_data, binb, 1, histograms);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, all_x_data, binb, 1, histograms);
 
-  void *BlockingTPACFNode = __visc__createNodeND(1, BlockingTPACF, grid);
+  void *BlockingTPACFNode = __hpvm__createNodeND(1, BlockingTPACF, grid);
 
   // Bind Inputs
-  __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms
-  __visc__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms
-  __visc__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data
-  __visc__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data
-  __visc__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb
-  __visc__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb
-  __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS
-  __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS
-  __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block
+  __hpvm__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms
+  __hpvm__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms
+  __hpvm__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data
+  __hpvm__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data
+  __hpvm__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb
+  __hpvm__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb
+  __hpvm__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS
+  __hpvm__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS
+  __hpvm__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block
 }
 
 void TPACFWrapper(hist_t *histograms, size_t bytes_histograms,
@@ -258,22 +258,22 @@ void TPACFWrapper(hist_t *histograms, size_t bytes_histograms,
                   // next arg is read-only constant
                   float *binb, size_t bytes_binb, int NUM_SETS,
                   int NUM_ELEMENTS, long block, long grid) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, all_x_data, binb, 1, histograms);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, all_x_data, binb, 1, histograms);
 
-  void *BlockingTPACFNode = __visc__createNodeND(0, TPACFRoot);
+  void *BlockingTPACFNode = __hpvm__createNodeND(0, TPACFRoot);
 
   // Bind Inputs
-  __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms
-  __visc__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms
-  __visc__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data
-  __visc__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data
-  __visc__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb
-  __visc__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb
-  __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS
-  __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS
-  __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block
-  __visc__bindIn(BlockingTPACFNode, 9, 9, 0); // Bind grid
+  __hpvm__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms
+  __hpvm__bindIn(BlockingTPACFNode, 1, 1, 0); // Bind bytes_histograms
+  __hpvm__bindIn(BlockingTPACFNode, 2, 2, 0); // Bind all_x_data
+  __hpvm__bindIn(BlockingTPACFNode, 3, 3, 0); // Bind bytes_all_data
+  __hpvm__bindIn(BlockingTPACFNode, 4, 4, 0); // Bind binb
+  __hpvm__bindIn(BlockingTPACFNode, 5, 5, 0); // Bind bytes_binb
+  __hpvm__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS
+  __hpvm__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS
+  __hpvm__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block
+  __hpvm__bindIn(BlockingTPACFNode, 9, 9, 0); // Bind grid
 }
 
 // **===-----------------------------------------------------------===**
@@ -324,14 +324,14 @@ int main(int argc, char **argv) {
   }
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // split into x, y, and z arrays
   // AOS to SOA transformation
   size_t bytes_h_x_data = 3 * f_mem_size;
   float *h_x_data = (float *)malloc(bytes_h_x_data);
-  llvm_visc_track_mem(h_x_data, bytes_h_x_data);
+  llvm_hpvm_track_mem(h_x_data, bytes_h_x_data);
 
   float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1);
   float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1);
@@ -349,12 +349,12 @@ int main(int argc, char **argv) {
   // allocate system memory for final histograms
   size_t bytes_hists = NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t);
   hist_t *hists = (hist_t *)malloc(bytes_hists);
-  llvm_visc_track_mem(hists, bytes_hists);
+  llvm_hpvm_track_mem(hists, bytes_hists);
 
   // Initialize the boundary constants for bin search
   size_t bytes_binb = (NUM_BINS + 1) * sizeof(float);
   float *binb = (float *)malloc(bytes_binb);
-  llvm_visc_track_mem(binb, bytes_binb);
+  llvm_hpvm_track_mem(binb, bytes_binb);
 
   for (int k = 0; k < NUM_BINS + 1; k++) {
     binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) /
@@ -369,17 +369,17 @@ int main(int argc, char **argv) {
   RootIn *graph_args = (RootIn *)malloc(sizeof(RootIn));
   packData(graph_args, hists, bytes_hists, h_x_data, bytes_h_x_data, binb,
            bytes_binb, NUM_SETS, NUM_ELEMENTS, block, grid);
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
-  void *TPACF_DFG = __visc__launch(0, TPACFRoot, (void *)graph_args);
-  __visc__wait(TPACF_DFG);
+  void *TPACF_DFG = __hpvm__launch(0, TPACFRoot, (void *)graph_args);
+  __hpvm__wait(TPACF_DFG);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // **===-------------------------------------------------------------===**
 
-  llvm_visc_request_mem(hists, bytes_hists);
+  llvm_hpvm_request_mem(hists, bytes_hists);
   // references into output histograms
   hist_t *dd_hist = hists;
   hist_t *rr_hist = dd_hist + NUM_BINS;
@@ -407,7 +407,7 @@ int main(int argc, char **argv) {
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   FILE *outfile;
   if ((outfile = fopen(params->outFile, "w")) == NULL) {
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc
rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.cc
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h
rename to hpvm/test/parboil/benchmarks/tpacf/src/hpvm/model.h
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc
index d945bccf4eae7f296394d74ac0617f3e20426dcd..d89d556a100157164445ec46f649828791edfd29 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc
@@ -199,7 +199,7 @@ int main(int argc, char **argv) {
                                   3 * f_mem_size, h_x_data, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
   TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel);
 
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc
index 791b5fbdd6aa70359d37ca5a85139c7f8374c56d..ef2a21daed14a9ada398130c4cf4ac650621056e 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc
@@ -203,7 +203,7 @@ int main(int argc, char **argv) {
                                   3 * f_mem_size, h_x_data, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_COMPUTATION);
 
   TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel);
 
diff --git a/hpvm/test/parboil/common/include/parboil.h b/hpvm/test/parboil/common/include/parboil.h
index 30ad6721c3190610dd08ec131603b6fe622f897e..ba25726c027a5c67283c68a703216ad7ee785ef5 100644
--- a/hpvm/test/parboil/common/include/parboil.h
+++ b/hpvm/test/parboil/common/include/parboil.h
@@ -102,23 +102,23 @@ enum pb_TimerID {
                           * host activity: automatically filled in,
                           * not intended for direct usage */
   // GPU FUNCTION
-  visc_TimerID_INIT_CTX,
-  visc_TimerID_CLEAR_CTX,
-  visc_TimerID_COPY_SCALAR,
-  visc_TimerID_COPY_PTR,
-  visc_TimerID_MEM_FREE,
-  visc_TimerID_READ_OUTPUT,
-  visc_TimerID_SETUP,
-  visc_TimerID_MEM_TRACK,
-  visc_TimerID_MEM_UNTRACK,
-  visc_TimerID_MISC,
+  hpvm_TimerID_INIT_CTX,
+  hpvm_TimerID_CLEAR_CTX,
+  hpvm_TimerID_COPY_SCALAR,
+  hpvm_TimerID_COPY_PTR,
+  hpvm_TimerID_MEM_FREE,
+  hpvm_TimerID_READ_OUTPUT,
+  hpvm_TimerID_SETUP,
+  hpvm_TimerID_MEM_TRACK,
+  hpvm_TimerID_MEM_UNTRACK,
+  hpvm_TimerID_MISC,
   // LAUNCH FUNCTION
-  visc_TimerID_PTHREAD_CREATE,
-  visc_TimerID_ARG_PACK,
-  visc_TimerID_ARG_UNPACK,
-  visc_TimerID_COMPUTATION,
-  visc_TimerID_OUTPUT_PACK,
-  visc_TimerID_OUTPUT_UNPACK,
+  hpvm_TimerID_PTHREAD_CREATE,
+  hpvm_TimerID_ARG_PACK,
+  hpvm_TimerID_ARG_UNPACK,
+  hpvm_TimerID_COMPUTATION,
+  hpvm_TimerID_OUTPUT_PACK,
+  hpvm_TimerID_OUTPUT_UNPACK,
 
   pb_TimerID_LAST /* Number of timer IDs */
 };
diff --git a/hpvm/test/parboil/common/mk/visc.mk b/hpvm/test/parboil/common/mk/hpvm.mk
similarity index 81%
rename from hpvm/test/parboil/common/mk/visc.mk
rename to hpvm/test/parboil/common/mk/hpvm.mk
index eb11371ccdb931d5160e5143af907a308215eb54..1c59d4d8fd7802698df9fcc78cfd16adc64ad641 100755
--- a/hpvm/test/parboil/common/mk/visc.mk
+++ b/hpvm/test/parboil/common/mk/hpvm.mk
@@ -9,38 +9,38 @@ CFLAGS=$(LANG_CFLAGS) $(PLATFORM_CFLAGS) $(APP_CFLAGS)
 CXXFLAGS=$(LANG_CXXFLAGS) $(PLATFORM_CXXFLAGS) $(APP_CXXFLAGS)
 LDFLAGS=$(LANG_LDFLAGS) $(PLATFORM_LDFLAGS) $(APP_LDFLAGS)
 
-# VISC
+# HPVM
 LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../libclc/built_libs
-#VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt
-VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt
+#HPVM_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/hpvm-rt
+HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt
 
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
+HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll
 #LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx--nvidiacl.bc
 LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx64--nvidiacl.bc
 #LIBCLC_NVPTX_LIB = nvptx64--nvidiacl.bc
 
 LLVM_34_AS = /opt/llvm/bin/llvm-as
 
-TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
 KERNEL_GEN_FLAGS = -O3 -target nvptx64-nvidia-nvcl
 
 ifeq ($(TARGET),x86)
   DEVICE = SPIR_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
   CFLAGS += -DOPENCL_CPU
 else ifeq ($(TARGET),seq)
   DEVICE = CPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
 else ifeq ($(TARGET),seqx86)
   DEVICE = CPU_OR_SPIR_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
   CFLAGS += -DOPENCL_CPU
 else ifeq ($(TARGET),seqgpu)
   DEVICE = CPU_OR_GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
 else
   DEVICE = GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
 endif
 
 CFLAGS += -DDEVICE=$(DEVICE)
@@ -49,31 +49,31 @@ CXXFLAGS += -DDEVICE=$(DEVICE)
 HOST_LINKFLAGS =
 
 ifeq ($(TIMER),x86)
-  VISC_OPTFLAGS += -visc-timers-x86
+  HPVM_OPTFLAGS += -hpvm-timers-x86
 else ifeq ($(TIMER),ptx)
-  VISC_OPTFLAGS += -visc-timers-ptx
+  HPVM_OPTFLAGS += -hpvm-timers-ptx
 else ifeq ($(TIMER),gen)
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 else ifeq ($(TIMER),spir)
-  TESTGEN_OPTFLAGS += -visc-timers-spir
+  TESTGEN_OPTFLAGS += -hpvm-timers-spir
 else ifeq ($(TIMER),no)
 else
   ifeq ($(TARGET),x86)
-    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir
+    HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-spir
   else ifeq ($(TARGET),seq)
-    VISC_OPTFLAGS += -visc-timers-x86
+    HPVM_OPTFLAGS += -hpvm-timers-x86
   else ifeq ($(TARGET),seqx86)
-    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir
+    HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-spir
   else ifeq ($(TARGET),seqgpu)
-    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+    HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
   else
-    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+    HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
   endif
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 endif
 
 ifeq ($(DABSTRACTION),true)
-  VISC_OPTFLAGS += -visc-eda
+  HPVM_OPTFLAGS += -hpvm-eda
 endif
 
 # Rules common to all makefiles
@@ -121,7 +121,7 @@ endif
 ########################################
 
 OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
 PARBOIL_OBJS = $(call INBUILDDIR,parboil.ll)
 KERNEL = $(TEST_OBJS).kernels.ll
 KERNEL_OPT = $(BUILDDIR)/$(APP).kernels.opt.ll
@@ -190,14 +190,14 @@ $(KERNEL_OPT) : $(KERNEL)
 $(BIN) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
 
-$(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(VISC_RT_LIB)
+$(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(HPVM_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
+$(HPVM_RT_LIB) : $(HPVM_RT_PATH)/hpvm-rt.cpp
 	make -C $(LLVM_LIB_PATH)
 
-$(HOST) $(KERNEL): $(BUILDDIR)/$(VISC_OBJS)
-	$(OPT) --debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+$(HOST) $(KERNEL): $(BUILDDIR)/$(HPVM_OBJS)
+	$(OPT) --debug $(HPVM_OPTFLAGS) -S $< -o $(HOST)
 
 $(RUNDIR) :
 	mkdir -p $(RUNDIR)
@@ -214,11 +214,11 @@ $(BUILDDIR)/%.ll : $(SRCDIR)/%.cc
 $(BUILDDIR)/%.ll : $(SRCDIR)/%.cpp
 	$(CXX) $(CXXFLAGS) -S -emit-llvm $< -o $@
 
-$(BUILDDIR)/%.visc.ll: $(BUILDDIR)/%.ll
+$(BUILDDIR)/%.hpvm.ll: $(BUILDDIR)/%.ll
 	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
 	cat $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil/RUN.parboil.script $@ > $@.tmp
-	mv $@.tmp $(BUILDDIR)/$(APP).visc.ll
-	#@cp $(VISC_OBJS) $(BUILDDIR)/$(VISC_OBJS)
+	mv $@.tmp $(BUILDDIR)/$(APP).hpvm.ll
+	#@cp $(HPVM_OBJS) $(BUILDDIR)/$(HPVM_OBJS)
 
 $(BUILDDIR)/%.o : $(SRCDIR)/%.c
 	$(CC) $(CFLAGS) -c $< -o $@
diff --git a/hpvm/test/parboil/common/platform/visc.default.mk b/hpvm/test/parboil/common/platform/hpvm.default.mk
similarity index 61%
rename from hpvm/test/parboil/common/platform/visc.default.mk
rename to hpvm/test/parboil/common/platform/hpvm.default.mk
index 03a9b0874aa2b2617afab71b27470b97f5b1f4b0..ca90d453a38d0b63d16e850b57de5622cbd1f2e1 100644
--- a/hpvm/test/parboil/common/platform/visc.default.mk
+++ b/hpvm/test/parboil/common/platform/hpvm.default.mk
@@ -12,20 +12,20 @@
 #OPENCL_LIB_PATH=$(OPENCL_PATH)/lib/x86_64
 
 #build
-VISC_BUILD_DIR = $(LLVM_SRC_ROOT)/../build
+HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build
 # gcc (default)
-CC = $(VISC_BUILD_DIR)/bin/clang
-OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe
-PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -I../../../include
+CC = $(HPVM_BUILD_DIR)/bin/clang
+OCLBE = $(HPVM_BUILD_DIR)/bin/llvm-cbe
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -I../../../include
 
-CXX = $(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -I../../../include
+CXX = $(HPVM_BUILD_DIR)/bin/clang++
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -I../../../include
 
-LINKER = $(VISC_BUILD_DIR)/bin/clang++
+LINKER = $(HPVM_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lOpenCL
 
-LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib
-LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin
+LLVM_LIB_PATH = $(HPVM_BUILD_DIR)/lib
+LLVM_BIN_PATH = $(HPVM_BUILD_DIR)/bin
 
 OPT = $(LLVM_BIN_PATH)/opt
 LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link
diff --git a/hpvm/test/parboil/driver/options.py b/hpvm/test/parboil/driver/options.py
index b80fc16168b54a326d16aaa99703e0bc172385f6..e15883b753c71a0e2d4fa68294a589fa7324aeca 100644
--- a/hpvm/test/parboil/driver/options.py
+++ b/hpvm/test/parboil/driver/options.py
@@ -264,7 +264,7 @@ def time_options(progname, cmd, args):
             label_ptx = 'NVPTX_Timer'
             #label_ptx = 'SPIR_Timer'
             label_x86 = 'X86_Timer'
-            label_gen = 'GenVISC_Timer'
+            label_gen = 'GenHPVM_Timer'
             timings[label_f] = {}
             timings[label_f]['IO'] = addTime([(label_pb, 'IO')], timings)
             timings[label_f]['Memory Track'] = addTime([(label_pb, 'Mem_Track')], timings)
@@ -297,11 +297,11 @@ def time_options(progname, cmd, args):
                 timerName = 'Parboil'
                 timings[timerName] = {}
                 continue
-            if line.startswith('Printing VISC Timer'):
-                regex = re.search('Printing VISC Timer: *(?P<name>[a-zA-Z0-9 _]+)', line)
+            if line.startswith('Printing HPVM Timer'):
+                regex = re.search('Printing HPVM Timer: *(?P<name>[a-zA-Z0-9 _]+)', line)
                 timerName = regex.group('name').strip()
                 timings[timerName] = {}
-                if timerName != 'NVPTX_Timer' and timerName != 'X86_Timer' and timerName != 'GenVISC_Timer' and timerName != 'KernelTimer' and timerName != 'SPIR_Timer':
+                if timerName != 'NVPTX_Timer' and timerName != 'X86_Timer' and timerName != 'GenHPVM_Timer' and timerName != 'KernelTimer' and timerName != 'SPIR_Timer':
                     print "Warning: Found unknown timer " + timerName
                 continue
             m = re.search('(?P<timerID>[a-zA-Z _/]+) *: *(?P<value>[0-9]*\.[0-9]*) *$', line)
@@ -352,67 +352,67 @@ def time_options(progname, cmd, args):
         globals.verbose = opts.verbose
 
         configs = [
-                    ('spmv',    {  'VERSION'   : ["opencl_nvidia", "visc"],
+                    ('spmv',    {  'VERSION'   : ["opencl_nvidia", "hpvm"],
                                     'TEST'      : [("large", 10)]
                                 }
                     )
-                    ,('sgemm',   {  'VERSION'   : ["opencl_nvidia", "visc_sh"],
+                    ,('sgemm',   {  'VERSION'   : ["opencl_nvidia", "hpvm_sh"],
                                     'TEST'      : [("4K", 10)]
                                 }
                     )
-                    ,('lbm',     {   'VERSION'   : ["opencl_nvidia", "visc"],
+                    ,('lbm',     {   'VERSION'   : ["opencl_nvidia", "hpvm"],
                                     'TEST'      : [("long", 10)]
                                 }
                     )
-                    ,('stencil', {   'VERSION'   : ["opencl_base", "visc"],
+                    ,('stencil', {   'VERSION'   : ["opencl_base", "hpvm"],
                                     'TEST'      : [("large", 10)]
                                  }
                     )
-                    ,('bfs',    {   'VERSION'   : ["opencl_nvidia", "visc"],
+                    ,('bfs',    {   'VERSION'   : ["opencl_nvidia", "hpvm"],
                                     'TEST'      : [("1M", 10), ("SF", 10)]
                                 }
                     )
-                    ,('tpacf',  {   'VERSION'   : ["opencl_base", "visc"],
+                    ,('tpacf',  {   'VERSION'   : ["opencl_base", "hpvm"],
                                     'TEST'      : [("large", 10)]
                                 }
                     )
-                    ,('cutcp',  {   'VERSION'   : ["opencl_nvidia", "visc"],
+                    ,('cutcp',  {   'VERSION'   : ["opencl_nvidia", "hpvm"],
                                     'TEST'      : [("large", 10)]
                                 }
                     )
-                    #('histo',  {   'VERSION'   : ["opencl_nvidia", "visc"],
+                    #('histo',  {   'VERSION'   : ["opencl_nvidia", "hpvm"],
                                     #'TEST'      : [("default", 10), ("large", 10)]
                                 #}
                     #)
-                    #('spmv',    {  'VERSION'   : ["opencl_cpu_baseline", "visc"],
+                    #('spmv',    {  'VERSION'   : ["opencl_cpu_baseline", "hpvm"],
                                     #'TEST'      : [("large", 10), ("huge", 10)]
                                 #}
                     #)
-                    #('sgemm',   {   'VERSION'   : ["opencl_cpu_sm", "visc_sh"],
+                    #('sgemm',   {   'VERSION'   : ["opencl_cpu_sm", "hpvm_sh"],
                                     #'TEST'       : [("medium", 1), ("4K", 1)]
                                 #}
                     #)
-                   #('lbm',     {   'VERSION'   : ["opencl_cpu_baseline", "visc"],
+                   #('lbm',     {   'VERSION'   : ["opencl_cpu_baseline", "hpvm"],
                                     #'TEST'      : [("short", 10), ("long", 10)]
                                 #}
                     #)
-                   #,('stencil', {   'VERSION'   : ["opencl_cpu_baseline", "visc"],
+                   #,('stencil', {   'VERSION'   : ["opencl_cpu_baseline", "hpvm"],
                                     #'TEST'      : [("default", 10), ("large", 10)]
                                  #}
                     #)
-                    #('bfs',    {   'VERSION'   : ["opencl_cpu_baseline", "visc_base"],
+                    #('bfs',    {   'VERSION'   : ["opencl_cpu_baseline", "hpvm_base"],
                                     #'TEST'      : [("1M", 5), ("SF", 5)]
                                 #}
                     #)
-                    #,('tpacf',  {   'VERSION'   : ["opencl_cpu_base", "visc"],
+                    #,('tpacf',  {   'VERSION'   : ["opencl_cpu_base", "hpvm"],
                                     #'TEST'      : [("medium", 1), ("large", 1)]
                                 #}
                     #)
-                    #,('cutcp',  {   'VERSION'   : ["opencl_cpu_baseline", "visc"],
+                    #,('cutcp',  {   'VERSION'   : ["opencl_cpu_baseline", "hpvm"],
                                     #'TEST'      : [("small", 1), ("large", 1)]
                                 #}
                     #)
-                    #,('histo',  {   'VERSION'   : ["opencl_cpu_baseline", "visc"],
+                    #,('histo',  {   'VERSION'   : ["opencl_cpu_baseline", "hpvm"],
                                     #'TEST'      : [("default", 1), ("large", 1)]
                                 #}
                     #)
diff --git a/hpvm/test/parboil/parboilParser.py b/hpvm/test/parboil/parboilParser.py
index 0d1f10b6862c15fb8f591972dea8f13dfba45e30..5ea1346349d124c08cfe63a79a8c07e54f8f3e18 100755
--- a/hpvm/test/parboil/parboilParser.py
+++ b/hpvm/test/parboil/parboilParser.py
@@ -77,7 +77,7 @@ def parseCSVFile(filename):
   file.close()
 
   #print csvDict['a']['b']['c']['d']['e']
-  #print csvDict['sgemm']['visc']['c']['d']['e']
+  #print csvDict['sgemm']['hpvm']['c']['d']['e']
   #print csvDict['sgemm']['opencl_base']['c']['d']['e']
   #print csvDict['sgemm']['opencl_base']['small']['d']['e']
   #print csvDict['sgemm']['opencl_base']['small']['Final']['e']
@@ -96,14 +96,14 @@ def parseCSVFile(filename):
 
 
 # returns a list of available tests for the given application
-# the tests are found based on the visc version, because it exists
+# the tests are found based on the hpvm version, because it exists
 # for all apps in the dict
 def getTests(app, csvDict):
-  return csvDict[app]["visc"].keys()
+  return csvDict[app]["hpvm"].keys()
 
 
-def isViscVersion(version):
-  return version.startswith("visc")
+def isHPVMVersion(version):
+  return version.startswith("hpvm")
 
 
 def getAllVersions(csvDict):
@@ -142,7 +142,7 @@ def printTimerDecomposition(csvDict, version):
   # get apps
   apps = csvDict.keys()
 
-  isVisc = isViscVersion(version)
+  isHPVM = isHPVMVersion(version)
 
   # get tests for each app
   tests = dict()
@@ -150,7 +150,7 @@ def printTimerDecomposition(csvDict, version):
     tests[app] = csvDict[app][version].keys()
 
   # list of timer-category pairs
-  if isVisc:
+  if isHPVM:
     timers =[('Final', 'Kernel'), 
              ('Final', 'Load Program Binary'), 
              ('Final', 'Argument Unpack'), 
@@ -170,7 +170,7 @@ def printTimerDecomposition(csvDict, version):
              ('Parboil', 'Clear_Ctx'),
              ('Final', 'Timer Wall - IO'),
              ('Final', 'IO'), 
-             ('GenVISC_Timer', 'Timer Wall Time')]
+             ('GenHPVM_Timer', 'Timer Wall Time')]
   else: 
     timers =[('Final', 'Init_Ctx'),
              ('Final', 'Arg_Unpack'), 
diff --git a/hpvm/test/pipeline/Makefile b/hpvm/test/pipeline/Makefile
index 421c9a853264854a2ec943035a41244f892f93ab..3fc794393cf6342d949940ea74ed3bcb5002258f 100644
--- a/hpvm/test/pipeline/Makefile
+++ b/hpvm/test/pipeline/Makefile
@@ -23,12 +23,12 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 EXE = pipeline-$(TARGET)
 
 INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR)
-INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
+INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include
 
 ## BEGIN HPVM MAKEFILE
 SRCDIR_OBJS= io.ll
 OBJS_SRC=src/io.cc
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP = $(EXE)
 APP_CFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
 APP_CXXFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
@@ -39,21 +39,21 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
 LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
 
-VISC_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/visc-rt
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
+HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt
+HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.ll
 
-TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
 
 ifeq ($(TARGET),seq)
   DEVICE = CPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86
 else
   DEVICE = GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
 endif
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 
 CFLAGS += -DDEVICE=$(DEVICE)
 CXXFLAGS += -DDEVICE=$(DEVICE)
@@ -64,7 +64,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
 .PRECIOUS: $(BUILD_DIR)/%.ll
 
 OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
 KERNEL = $(TEST_OBJS).kernels.ll
 
 ifeq ($(TARGET),seq)
@@ -91,14 +91,14 @@ $(KERNEL_OCL) : $(KERNEL)
 $(EXE) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
 
-$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
+$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
+$(HPVM_RT_LIB) : $(HPVM_RT_PATH)/hpvm-rt.cpp
 	make -C $(LLVM_LIB_PATH)
 
-$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
-	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS)
+	$(OPT) -debug $(HPVM_OPTFLAGS) -S $< -o $(HOST)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -109,7 +109,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.cc
 $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.cc
 	$(CC) $(CXXFLAGS) -emit-llvm -S -o $@ $<
 
-$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
-	$(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@
+$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll
+	$(OPT) -debug-only=genhpvm $(TESTGEN_OPTFLAGS) $< -S -o $@
 
 ## END HPVM MAKEFILE
diff --git a/hpvm/test/pipeline/copyToVersions.sh b/hpvm/test/pipeline/copyToVersions.sh
index 3b9c19bad6dd86de7eb9a82edc7f17b92265155e..67551aff2f1b47fb2ad9c69be44936e8145a68da 100755
--- a/hpvm/test/pipeline/copyToVersions.sh
+++ b/hpvm/test/pipeline/copyToVersions.sh
@@ -1,12 +1,12 @@
 
-declare -a versionList=("viscGPU" "viscVector" "viscScalar" "viscGPU-Scalar-MaxG" "viscVector-Scalar-MaxG" "viscGPU-Scalar-ZC" "viscVector-Scalar-ZC")
+declare -a versionList=("hpvmGPU" "hpvmVector" "hpvmScalar" "hpvmGPU-Scalar-MaxG" "hpvmVector-Scalar-MaxG" "hpvmGPU-Scalar-ZC" "hpvmVector-Scalar-ZC")
 declare -a fileList=("Makefile" "io.cc" "main.cc")
 
 for version in "${versionList[@]}"; do
   echo $version
   for filename in "${fileList[@]}"; do
-    echo cp ./src/visc_parallel/$filename ./src/$version/
-    cp ./src/visc_parallel/$filename ./src/$version/
+    echo cp ./src/hpvm_parallel/$filename ./src/$version/
+    cp ./src/hpvm_parallel/$filename ./src/$version/
   done
   echo
 done
diff --git a/hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll b/hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll
similarity index 95%
rename from hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll
rename to hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll
index 06ec055bb746c7cc0cd58f75ed1f8090e0afa459..8056cc12eed0e4d20d45e294bf674dfc689f6bb8 100644
--- a/hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll
+++ b/hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll
@@ -1,4 +1,4 @@
-; ModuleID = 'build/Gradient_default/main.visc.ll'
+; ModuleID = 'build/Gradient_default/main.hpvm.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -167,9 +167,9 @@ entry:
 ; Function Attrs: nounwind uwtable
 define %emptyStruct @squareRoot(float* nocapture in %Gx, i64 %bytesGx, float* nocapture in %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n, i32 %dummyH, i32 %dummyV) #2 {
 entry:
-  %call3 = tail call i8* @llvm.visc.getNode()
-  %call14 = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3)
-  %call25 = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3)
+  %call3 = tail call i8* @llvm.hpvm.getNode()
+  %call14 = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3)
+  %call25 = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3)
   %cmp = icmp slt i32 %call14, %n
   %cmp3 = icmp slt i32 %call25, %m
   %or.cond = and i1 %cmp, %cmp3
@@ -198,51 +198,51 @@ if.end:                                           ; preds = %if.then, %entry
 ; Function Attrs: nounwind uwtable
 define %emptyStruct.23 @WrapperSquareRoot(float* nocapture in %Gx, i64 %bytesGx, float* nocapture in %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n, i32 %dummyH, i32 %dummyV) #2 {
 entry:
-  %squareRoot.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot to i8*), i32 %m, i32 %n)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 6, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 7, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 8, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 9, i32 9, i1 false)
+  %squareRoot.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot to i8*), i32 %m, i32 %n)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 6, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 7, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 8, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 9, i32 9, i1 false)
   ret %emptyStruct.23 undef
 }
 
 ; Function Attrs: nounwind uwtable
 define %emptyStruct.24 @Gradient(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %Sx, i64 %bytesSx, float* nocapture in %Sy, i64 %bytesSy, float* nocapture out %Gx, i64 %bytesGx, float* nocapture out %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n) #2 {
 entry:
-  %WrapperHorizontal_WrapperVertical.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%WrapperHorizontal.WrapperVertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperHorizontal_WrapperVertical to i8*))
-  %WrapperSquareRoot.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperSquareRoot to i8*))
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 6, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 7, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 4, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 5, i32 11, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 8, i32 12, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 9, i32 13, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 14, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 15, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 6, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 7, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 8, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 9, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 10, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 11, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 12, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 13, i32 7, i1 false)
-  %output.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 0, i32 8, i1 false)
-  %output1.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 1, i32 9, i1 false)
+  %WrapperHorizontal_WrapperVertical.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%WrapperHorizontal.WrapperVertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperHorizontal_WrapperVertical to i8*))
+  %WrapperSquareRoot.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperSquareRoot to i8*))
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 6, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 7, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 4, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 5, i32 11, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 8, i32 12, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 9, i32 13, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 14, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 15, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 6, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 7, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 8, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 9, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 10, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 11, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 12, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 13, i32 7, i1 false)
+  %output.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 0, i32 8, i1 false)
+  %output1.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 1, i32 9, i1 false)
   ret %emptyStruct.24 undef
 }
 
@@ -866,7 +866,7 @@ cond.false:                                       ; preds = %land.lhs.true58, %l
 
 cond.end:                                         ; preds = %land.lhs.true58
   call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %103 = load i32** %p.i.i.i.i, align 8, !tbaa !5
   %104 = load i32* %103, align 4, !tbaa !9
   %arrayidx.i296 = getelementptr inbounds i32* %103, i64 1
@@ -1137,15 +1137,15 @@ cond.false87:                                     ; preds = %_Z12getNextFrameRN2
   unreachable
 
 cond.end88:                                       ; preds = %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit335
-  call void @llvm_visc_track_mem(i8* %150, i64 %mul65) #1
-  call void @llvm_visc_track_mem(i8* %106, i64 36) #1
-  call void @llvm_visc_track_mem(i8* %113, i64 36) #1
+  call void @llvm_hpvm_track_mem(i8* %150, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %106, i64 36) #1
+  call void @llvm_hpvm_track_mem(i8* %113, i64 36) #1
   %176 = load i8** %data73, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %176, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %176, i64 %mul65) #1
   %177 = load i8** %data74, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %177, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %177, i64 %mul65) #1
   %178 = load i8** %data75, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %178, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %178, i64 %mul65) #1
   %179 = load i8** %data, align 8, !tbaa !5
   %180 = bitcast i8* %179 to float*
   store float* %180, float** %I1.i, align 1, !tbaa !5
@@ -1154,8 +1154,8 @@ cond.end88:                                       ; preds = %_Z12getNextFrameRN2
 
 for.body:                                         ; preds = %for.body, %cond.end88
   %j.0480 = phi i32 [ 0, %cond.end88 ], [ %inc, %for.body ]
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @Gradient to i8*), i8* %call66, i1 false)
-  call void @llvm.visc.wait(i8* %graphID)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @Gradient to i8*), i8* %call66, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %inc = add i32 %j.0480, 1
   %exitcond = icmp eq i32 %inc, 2994
   br i1 %exitcond, label %for.end, label %for.body
@@ -1163,19 +1163,19 @@ for.body:                                         ; preds = %for.body, %cond.end
 for.end:                                          ; preds = %for.body
   call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
   %181 = load i8** %data75, align 8, !tbaa !5
-  call void @llvm_visc_request_mem(i8* %181, i64 %mul65) #1
+  call void @llvm_hpvm_request_mem(i8* %181, i64 %mul65) #1
   %182 = load i8** %data, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %182) #1
-  call void @llvm_visc_untrack_mem(i8* %106) #1
-  call void @llvm_visc_untrack_mem(i8* %113) #1
+  call void @llvm_hpvm_untrack_mem(i8* %182) #1
+  call void @llvm_hpvm_untrack_mem(i8* %106) #1
+  call void @llvm_hpvm_untrack_mem(i8* %113) #1
   %183 = load i8** %data73, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %183) #1
+  call void @llvm_hpvm_untrack_mem(i8* %183) #1
   %184 = load i8** %data74, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %184) #1
+  call void @llvm_hpvm_untrack_mem(i8* %184) #1
   %185 = load i8** %data75, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %185) #1
+  call void @llvm_hpvm_untrack_mem(i8* %185) #1
   call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   call void @pb_FreeParameters(%struct.pb_Parameters* %call3) #1
   %u.i.i.i342 = getelementptr inbounds %"class.cv::Mat"* %out, i64 0, i32 9
   %186 = load %"struct.cv::UMatData"** %u.i.i.i342, align 8, !tbaa !5
@@ -1647,13 +1647,13 @@ declare noalias i8* @malloc(i64) #5
 
 declare void @_ZN2cv12VideoCaptureD1Ev(%"class.cv::VideoCapture"*) #0
 
-declare void @llvm_visc_track_mem(i8*, i64) #0
+declare void @llvm_hpvm_track_mem(i8*, i64) #0
 
 declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
 
-declare void @llvm_visc_request_mem(i8*, i64) #0
+declare void @llvm_hpvm_request_mem(i8*, i64) #0
 
-declare void @llvm_visc_untrack_mem(i8*) #0
+declare void @llvm_hpvm_untrack_mem(i8*) #0
 
 declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
 
@@ -1713,50 +1713,50 @@ entry:
 declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
 
 ; Function Attrs: nounwind readnone
-declare i8* @llvm.visc.getNode() #7
+declare i8* @llvm.hpvm.getNode() #7
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #7
+declare i32 @llvm.hpvm.getNodeInstanceID.x(i8*) #7
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #7
+declare i32 @llvm.hpvm.getNodeInstanceID.y(i8*) #7
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
+declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #1
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #1
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #1
 
 ; Function Attrs: nounwind readonly
 declare float @llvm.sqrt.f32(float) #8
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #1
+declare i8* @llvm.hpvm.createNode(i8*) #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #1
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #1
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
+declare void @llvm.hpvm.wait(i8*) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
 define %horizontal.vertical.ty @horizontal_vertical(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_Sx, i64 %n1_bytesSx, float* nocapture out %n1_Gx, i64 %n1_bytesGx, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_Sy, i64 %n2_bytesSy, float* nocapture out %n2_Gy, i64 %n2_bytesGy, i32 %n2_m, i32 %n2_n) #1 {
 entry:
-  %call3.i = tail call i8* @llvm.visc.getNode() #1
-  %call14.i = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3.i) #1
-  %call25.i = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3.i) #1
+  %call3.i = tail call i8* @llvm.hpvm.getNode() #1
+  %call14.i = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3.i) #1
+  %call25.i = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3.i) #1
   %mul.i = mul nsw i32 %call25.i, %n1_n
   %add.i = add nsw i32 %mul.i, %call14.i
   %cmp.i = icmp slt i32 %call14.i, %n1_n
@@ -2139,25 +2139,25 @@ vertical.exit:                                    ; preds = %if.end42.2.i67.us,
 ; Function Attrs: nounwind
 define %WrapperHorizontal.WrapperVertical.ty @WrapperHorizontal_WrapperVertical(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_Sx, i64 %n1_bytesSx, float* nocapture out %n1_Gx, i64 %n1_bytesGx, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_Sy, i64 %n2_bytesSy, float* nocapture out %n2_Gy, i64 %n2_bytesGy, i32 %n2_m, i32 %n2_n) #1 {
 entry:
-  %horizontal_vertical.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical to i8*), i32 %n1_m, i32 %n1_n)
-  tail call void @llvm.visc.bind.output(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 7, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 6, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.output(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 15, i32 15, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 14, i32 14, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 13, i32 13, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 12, i32 12, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 11, i32 11, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 10, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 9, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 8, i32 8, i1 false)
+  %horizontal_vertical.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical to i8*), i32 %n1_m, i32 %n1_n)
+  tail call void @llvm.hpvm.bind.output(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 7, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 6, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.output(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 15, i32 15, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 14, i32 14, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 13, i32 13, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 12, i32 12, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 11, i32 11, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 10, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 9, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 8, i32 8, i1 false)
   ret %WrapperHorizontal.WrapperVertical.ty undef
 }
 
@@ -2172,9 +2172,9 @@ attributes #7 = { nounwind readnone }
 attributes #8 = { nounwind readonly }
 attributes #9 = { noreturn nounwind }
 
-!visc_hint_gpu = !{!0, !1}
-!visc_hint_cpu = !{!2, !3, !4}
-!visc_hint_spir = !{}
+!hpvm_hint_gpu = !{!0, !1}
+!hpvm_hint_cpu = !{!2, !3, !4}
+!hpvm_hint_spir = !{}
 
 !0 = metadata !{%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot}
 !1 = metadata !{%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical}
diff --git a/hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll b/hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll
similarity index 95%
rename from hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll
rename to hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll
index 4b0458625157e1c6535941ec5c663f8a16660c22..aa4a0d19a0ec80910b8d82b03de018ad41470a22 100644
--- a/hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll
+++ b/hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll
@@ -1,4 +1,4 @@
-; ModuleID = 'build/Laplacian_default/main.visc.ll'
+; ModuleID = 'build/Laplacian_default/main.hpvm.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -170,9 +170,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) #1
 ; Function Attrs: nounwind uwtable
 define %emptyStruct @lincomb(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %D, i64 %bytesD, float* nocapture in %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n, i32 %dummyD, i32 %dummyE) #2 {
 entry:
-  %call3 = tail call i8* @llvm.visc.getNode()
-  %call14 = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3)
-  %call25 = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3)
+  %call3 = tail call i8* @llvm.hpvm.getNode()
+  %call14 = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3)
+  %call25 = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3)
   %cmp = icmp slt i32 %call14, %n
   %cmp3 = icmp slt i32 %call25, %m
   %or.cond = and i1 %cmp, %cmp3
@@ -202,55 +202,55 @@ if.end:                                           ; preds = %if.then, %entry
 ; Function Attrs: nounwind uwtable
 define %emptyStruct.23 @WrapperLincomb(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %D, i64 %bytesD, float* nocapture in %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n, i32 %dummyD, i32 %dummyE) #2 {
 entry:
-  %lincomb.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb to i8*), i32 %m, i32 %n)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 6, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 7, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 8, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 9, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 10, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 11, i32 11, i1 false)
+  %lincomb.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb to i8*), i32 %m, i32 %n)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 6, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 7, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 8, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 9, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 10, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 11, i32 11, i1 false)
   ret %emptyStruct.23 undef
 }
 
 ; Function Attrs: nounwind uwtable
 define %emptyStruct.24 @LaplacianEstimate(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %B, i64 %bytesB, float* nocapture out %D, i64 %bytesD, float* nocapture out %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n) #2 {
 entry:
-  %WrapperDilate_WrapperErode.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%WrapperDilate.WrapperErode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperDilate_WrapperErode to i8*))
-  %WrapperLincomb.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperLincomb to i8*))
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 11, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 6, i32 12, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 7, i32 13, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 14, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 15, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 4, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 5, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 6, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 7, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 8, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 9, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 10, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 11, i32 9, i1 false)
-  %output.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 0, i32 10, i1 false)
-  %output1.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 1, i32 11, i1 false)
+  %WrapperDilate_WrapperErode.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%WrapperDilate.WrapperErode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperDilate_WrapperErode to i8*))
+  %WrapperLincomb.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperLincomb to i8*))
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 11, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 6, i32 12, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 7, i32 13, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 14, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 15, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 4, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 5, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 6, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 7, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 8, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 9, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 10, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 11, i32 9, i1 false)
+  %output.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 0, i32 10, i1 false)
+  %output1.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 1, i32 11, i1 false)
   ret %emptyStruct.24 undef
 }
 
@@ -873,7 +873,7 @@ cond.false:                                       ; preds = %land.lhs.true58, %l
 
 cond.end:                                         ; preds = %land.lhs.true58
   call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %103 = load i32** %p.i.i.i.i, align 8, !tbaa !5
   %104 = load i32* %103, align 4, !tbaa !9
   %arrayidx.i290 = getelementptr inbounds i32* %103, i64 1
@@ -1062,18 +1062,18 @@ _Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332: ; preds = %if.then.i328,
   call void @llvm.lifetime.end(i64 24, i8* %134) #1
   %data = getelementptr inbounds %"class.cv::Mat"* %src, i64 0, i32 4
   %139 = load i8** %data, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %139, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %139, i64 %mul65) #1
   %arraydecay = getelementptr inbounds [9 x float]* %B, i64 0, i64 0
-  call void @llvm_visc_track_mem(i8* %106, i64 36) #1
+  call void @llvm_hpvm_track_mem(i8* %106, i64 36) #1
   %data81 = getelementptr inbounds %"class.cv::Mat"* %D, i64 0, i32 4
   %140 = load i8** %data81, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %140, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %140, i64 %mul65) #1
   %data82 = getelementptr inbounds %"class.cv::Mat"* %E, i64 0, i32 4
   %141 = load i8** %data82, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %141, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %141, i64 %mul65) #1
   %data83 = getelementptr inbounds %"class.cv::Mat"* %L, i64 0, i32 4
   %142 = load i8** %data83, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %142, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %142, i64 %mul65) #1
   %143 = load i8** %data, align 8, !tbaa !5
   %144 = bitcast i8* %143 to float*
   %145 = load i8** %data81, align 8, !tbaa !5
@@ -1126,8 +1126,8 @@ _Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332: ; preds = %if.then.i328,
 
 for.body:                                         ; preds = %for.body, %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332
   %j.0474 = phi i32 [ 0, %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332 ], [ %inc, %for.body ]
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @LaplacianEstimate to i8*), i8* %call66, i1 false)
-  call void @llvm.visc.wait(i8* %graphID)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @LaplacianEstimate to i8*), i8* %call66, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %inc = add nsw i32 %j.0474, 1
   %exitcond = icmp eq i32 %inc, 2994
   br i1 %exitcond, label %for.end, label %for.body
@@ -1135,18 +1135,18 @@ for.body:                                         ; preds = %for.body, %_Z12getN
 for.end:                                          ; preds = %for.body
   call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
   %165 = load i8** %data83, align 8, !tbaa !5
-  call void @llvm_visc_request_mem(i8* %165, i64 %mul65) #1
+  call void @llvm_hpvm_request_mem(i8* %165, i64 %mul65) #1
   %166 = load i8** %data, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %166) #1
-  call void @llvm_visc_untrack_mem(i8* %106) #1
+  call void @llvm_hpvm_untrack_mem(i8* %166) #1
+  call void @llvm_hpvm_untrack_mem(i8* %106) #1
   %167 = load i8** %data81, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %167) #1
+  call void @llvm_hpvm_untrack_mem(i8* %167) #1
   %168 = load i8** %data82, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %168) #1
+  call void @llvm_hpvm_untrack_mem(i8* %168) #1
   %169 = load i8** %data83, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %169) #1
+  call void @llvm_hpvm_untrack_mem(i8* %169) #1
   call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   call void @pb_FreeParameters(%struct.pb_Parameters* %call3) #1
   %u.i.i.i336 = getelementptr inbounds %"class.cv::Mat"* %out, i64 0, i32 9
   %170 = load %"struct.cv::UMatData"** %u.i.i.i336, align 8, !tbaa !5
@@ -1614,13 +1614,13 @@ declare noalias i8* @malloc(i64) #5
 
 declare void @_ZN2cv12VideoCaptureD1Ev(%"class.cv::VideoCapture"*) #0
 
-declare void @llvm_visc_track_mem(i8*, i64) #0
+declare void @llvm_hpvm_track_mem(i8*, i64) #0
 
 declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
 
-declare void @llvm_visc_request_mem(i8*, i64) #0
+declare void @llvm_hpvm_request_mem(i8*, i64) #0
 
-declare void @llvm_visc_untrack_mem(i8*) #0
+declare void @llvm_hpvm_untrack_mem(i8*) #0
 
 declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
 
@@ -1677,47 +1677,47 @@ declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
 
 ; Function Attrs: nounwind readnone
-declare i8* @llvm.visc.getNode() #7
+declare i8* @llvm.hpvm.getNode() #7
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #7
+declare i32 @llvm.hpvm.getNodeInstanceID.x(i8*) #7
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #7
+declare i32 @llvm.hpvm.getNodeInstanceID.y(i8*) #7
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
+declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #1
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #1
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #1
+declare i8* @llvm.hpvm.createNode(i8*) #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #1
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #1
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
+declare void @llvm.hpvm.wait(i8*) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
 define %dilate.erode.ty @dilate_erode(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_B, i64 %n1_bytesB, float* nocapture out %n1_D, i64 %n1_bytesD, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_B, i64 %n2_bytesB, float* nocapture out %n2_E, i64 %n2_bytesE, i32 %n2_m, i32 %n2_n) #1 {
 entry:
-  %call3.i = tail call i8* @llvm.visc.getNode() #1
-  %call14.i = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3.i) #1
-  %call25.i = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3.i) #1
+  %call3.i = tail call i8* @llvm.hpvm.getNode() #1
+  %call14.i = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3.i) #1
+  %call25.i = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3.i) #1
   %cmp.i = icmp slt i32 %call14.i, %n1_n
   %cmp3.i = icmp slt i32 %call25.i, %n1_m
   %or.cond.i = and i1 %cmp.i, %cmp3.i
@@ -2070,25 +2070,25 @@ erode.exit:                                       ; preds = %dilate.exit, %cond.
 ; Function Attrs: nounwind
 define %WrapperDilate.WrapperErode.ty @WrapperDilate_WrapperErode(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_B, i64 %n1_bytesB, float* nocapture out %n1_D, i64 %n1_bytesD, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_B, i64 %n2_bytesB, float* nocapture out %n2_E, i64 %n2_bytesE, i32 %n2_m, i32 %n2_n) #1 {
 entry:
-  %dilate_erode.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode to i8*), i32 %n1_m, i32 %n1_n)
-  tail call void @llvm.visc.bind.output(i8* %dilate_erode.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 7, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 6, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.output(i8* %dilate_erode.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 15, i32 15, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 14, i32 14, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 13, i32 13, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 12, i32 12, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 11, i32 11, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 10, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 9, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 8, i32 8, i1 false)
+  %dilate_erode.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode to i8*), i32 %n1_m, i32 %n1_n)
+  tail call void @llvm.hpvm.bind.output(i8* %dilate_erode.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 7, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 6, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.output(i8* %dilate_erode.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 15, i32 15, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 14, i32 14, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 13, i32 13, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 12, i32 12, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 11, i32 11, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 10, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 9, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 8, i32 8, i1 false)
   ret %WrapperDilate.WrapperErode.ty undef
 }
 
@@ -2103,9 +2103,9 @@ attributes #7 = { nounwind readnone }
 attributes #8 = { noreturn nounwind }
 attributes #9 = { nounwind readonly }
 
-!visc_hint_gpu = !{!0, !1}
-!visc_hint_cpu = !{!2, !3, !4}
-!visc_hint_spir = !{}
+!hpvm_hint_gpu = !{!0, !1}
+!hpvm_hint_cpu = !{!2, !3, !4}
+!hpvm_hint_spir = !{}
 
 !0 = metadata !{%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb}
 !1 = metadata !{%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode}
diff --git a/hpvm/test/pipeline/run.sh b/hpvm/test/pipeline/run.sh
index 0c8435764bd87c92dd30ad51aa97011ddb07b339..5ac734026bf839c511dfdfb843b07382e6d8d4d6 100755
--- a/hpvm/test/pipeline/run.sh
+++ b/hpvm/test/pipeline/run.sh
@@ -4,7 +4,7 @@ echo Pipeline Script $1 $2
 version=$1
 pos=$2
 
-if [[ ($version == *"GPU"*) ||  ($version == "visc_parallel") ]]
+if [[ ($version == *"GPU"*) ||  ($version == "hpvm_parallel") ]]
 then
   target=""
 elif [[ $version == *"Vector"* ]]
diff --git a/hpvm/test/pipeline/runscript.sh b/hpvm/test/pipeline/runscript.sh
index 5a2933e78801993ee440ead6e19f84aae66b3577..c95af8f831eeeb7f5f464e4acbc90dd49fcb67a1 100755
--- a/hpvm/test/pipeline/runscript.sh
+++ b/hpvm/test/pipeline/runscript.sh
@@ -2,21 +2,21 @@
 echo Pipeline Script
 
 # Compile all version
-make VERSION=viscGPU clean
-make VERSION=viscVector TARGET=x86 clean
-make VERSION=viscScalar TARGET=seq clean
+make VERSION=hpvmGPU clean
+make VERSION=hpvmVector TARGET=x86 clean
+make VERSION=hpvmScalar TARGET=seq clean
 
 
-make VERSION=viscGPU
-make VERSION=viscVector TARGET=x86
-make VERSION=viscScalar TARGET=seq
+make VERSION=hpvmGPU
+make VERSION=hpvmVector TARGET=x86
+make VERSION=hpvmScalar TARGET=seq
 
 #Run all version
-make VERSION=viscGPU run &
+make VERSION=hpvmGPU run &
 ID_GPU=$!
-make VERSION=viscVector TARGET=x86 run &
+make VERSION=hpvmVector TARGET=x86 run &
 ID_Vector=$!
-make VERSION=viscScalar TARGET=seq run
+make VERSION=hpvmScalar TARGET=seq run
 ID_Scalar=$!
 
 #echo Wait 60 seconds
diff --git a/hpvm/test/pipeline/src/Makefile b/hpvm/test/pipeline/src/Makefile
index ec39b86f1cf71e2e8b6131b076c2953b566cbb56..55acb2e0982edc2a914340f2bfacbbfc1d06397f 100644
--- a/hpvm/test/pipeline/src/Makefile
+++ b/hpvm/test/pipeline/src/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
 APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
diff --git a/hpvm/test/pipeline/src/main.cc b/hpvm/test/pipeline/src/main.cc
index 9314833d25d0a3a25f13dfb24fb8a239b94956b1..ef9d8412c70813fcae123b0ef84de1850fa6b28c 100644
--- a/hpvm/test/pipeline/src/main.cc
+++ b/hpvm/test/pipeline/src/main.cc
@@ -13,6 +13,7 @@
 #include "opencv2/ocl/ocl.hpp"
 #include "opencv2/opencv.hpp"
 #include <cassert>
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -20,7 +21,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <visc.h>
 
 #define NUM_RUNS 100
 #define DEPTH 3
@@ -147,12 +147,12 @@ void packData(struct InStruct *args, float *I, size_t bytesI, float *Is,
 void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs,
                        float *Is, size_t bytesIs, long m, long n) {
 
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, I, Gs, 1, Is);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, I, Gs, 1, Is);
 
-  void *thisNode = __visc__getNode();
-  long gx = __visc__getNodeInstanceID_x(thisNode);
-  long gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  long gx = __hpvm__getNodeInstanceID_x(thisNode);
+  long gy = __hpvm__getNodeInstanceID_y(thisNode);
 
   int gloc = gx + gy * n;
 
@@ -187,26 +187,26 @@ void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs,
 
     Is[gloc] = smoothedVal;
   }
-  __visc__return(2, bytesIs, bytesIs);
+  __hpvm__return(2, bytesIs, bytesIs);
 }
 
 void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs,
                               size_t bytesGs, float *Is, size_t bytesIs, long m,
                               long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, I, Gs, 1, Is);
-  void *GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n);
-  __visc__bindIn(GSNode, 0, 0, 0); // Bind I
-  __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI
-  __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs
-  __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs
-  __visc__bindIn(GSNode, 4, 4, 0); // Bind Is
-  __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs
-  __visc__bindIn(GSNode, 6, 6, 0); // Bind m
-  __visc__bindIn(GSNode, 7, 7, 0); // Bind n
-
-  __visc__bindOut(GSNode, 0, 0, 0); // bind output bytesIs
-  __visc__bindOut(GSNode, 1, 1, 0); // bind output bytesIs
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, I, Gs, 1, Is);
+  void *GSNode = __hpvm__createNodeND(2, gaussianSmoothing, m, n);
+  __hpvm__bindIn(GSNode, 0, 0, 0); // Bind I
+  __hpvm__bindIn(GSNode, 1, 1, 0); // Bind bytesI
+  __hpvm__bindIn(GSNode, 2, 2, 0); // Bind Gs
+  __hpvm__bindIn(GSNode, 3, 3, 0); // Bind bytesGs
+  __hpvm__bindIn(GSNode, 4, 4, 0); // Bind Is
+  __hpvm__bindIn(GSNode, 5, 5, 0); // Bind bytesIs
+  __hpvm__bindIn(GSNode, 6, 6, 0); // Bind m
+  __hpvm__bindIn(GSNode, 7, 7, 0); // Bind n
+
+  __hpvm__bindOut(GSNode, 0, 0, 0); // bind output bytesIs
+  __hpvm__bindOut(GSNode, 1, 1, 0); // bind output bytesIs
 }
 
 /* Compute a non-linear laplacian estimate of input image I of size m x n */
@@ -220,14 +220,14 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs,
 void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB,
                        float *L, size_t bytesL, long m, long n) {
 
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, Is, B, 1, L);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, Is, B, 1, L);
   // 3x3 image area
   float imageArea[SZB * SZB];
 
-  void *thisNode = __visc__getNode();
-  long gx = __visc__getNodeInstanceID_x(thisNode);
-  long gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  long gx = __hpvm__getNodeInstanceID_x(thisNode);
+  long gy = __hpvm__getNodeInstanceID_y(thisNode);
   int i, j;
 
   if ((gx < n) && (gy < m)) {
@@ -300,25 +300,25 @@ void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB,
     float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB + 1];
     L[gy * n + gx] = laplacian;
   }
-  __visc__return(1, bytesL);
+  __hpvm__return(1, bytesL);
 }
 
 void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B,
                               size_t bytesB, float *L, size_t bytesL, long m,
                               long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, Is, B, 1, L);
-  void *LNode = __visc__createNodeND(2, laplacianEstimate, m, n);
-  __visc__bindIn(LNode, 0, 0, 0); // Bind Is
-  __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs
-  __visc__bindIn(LNode, 2, 2, 0); // Bind B
-  __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB
-  __visc__bindIn(LNode, 4, 4, 0); // Bind L
-  __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL
-  __visc__bindIn(LNode, 6, 6, 0); // Bind m
-  __visc__bindIn(LNode, 7, 7, 0); // Bind n
-
-  __visc__bindOut(LNode, 0, 0, 0); // bind output bytesL
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, Is, B, 1, L);
+  void *LNode = __hpvm__createNodeND(2, laplacianEstimate, m, n);
+  __hpvm__bindIn(LNode, 0, 0, 0); // Bind Is
+  __hpvm__bindIn(LNode, 1, 1, 0); // Bind bytesIs
+  __hpvm__bindIn(LNode, 2, 2, 0); // Bind B
+  __hpvm__bindIn(LNode, 3, 3, 0); // Bind bytesB
+  __hpvm__bindIn(LNode, 4, 4, 0); // Bind L
+  __hpvm__bindIn(LNode, 5, 5, 0); // Bind bytesL
+  __hpvm__bindIn(LNode, 6, 6, 0); // Bind m
+  __hpvm__bindIn(LNode, 7, 7, 0); // Bind n
+
+  __hpvm__bindOut(LNode, 0, 0, 0); // bind output bytesL
 }
 
 /* Compute the zero crossings of input image L of size m x n */
@@ -331,16 +331,16 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B,
  */
 void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB,
                           float *S, size_t bytesS, long m, long n) {
-  __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, L, B, 1, S);
+  __hpvm__hint(hpvm::DEVICE);
+  //__hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, L, B, 1, S);
 
   // 3x3 image area
   float imageArea[SZB][SZB];
 
-  void *thisNode = __visc__getNode();
-  long gx = __visc__getNodeInstanceID_x(thisNode);
-  long gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  long gx = __hpvm__getNodeInstanceID_x(thisNode);
+  long gy = __hpvm__getNodeInstanceID_y(thisNode);
   int i, j;
 
   if ((gx < n) && (gy < m)) {
@@ -416,25 +416,25 @@ void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB,
     float pixelSign = dilatedPixel - erodedPixel;
     S[gy * n + gx] = pixelSign;
   }
-  __visc__return(1, bytesS);
+  __hpvm__return(1, bytesS);
 }
 
 void WrapperComputeZeroCrossings(float *L, size_t bytesL, float *B,
                                  size_t bytesB, float *S, size_t bytesS, long m,
                                  long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, L, B, 1, S);
-  void *ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n);
-  __visc__bindIn(ZCNode, 0, 0, 0); // Bind L
-  __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
-  __visc__bindIn(ZCNode, 2, 2, 0); // Bind B
-  __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB
-  __visc__bindIn(ZCNode, 4, 4, 0); // Bind S
-  __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS
-  __visc__bindIn(ZCNode, 6, 6, 0); // Bind m
-  __visc__bindIn(ZCNode, 7, 7, 0); // Bind n
-
-  __visc__bindOut(ZCNode, 0, 0, 0); // bind output bytesS
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, L, B, 1, S);
+  void *ZCNode = __hpvm__createNodeND(2, computeZeroCrossings, m, n);
+  __hpvm__bindIn(ZCNode, 0, 0, 0); // Bind L
+  __hpvm__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
+  __hpvm__bindIn(ZCNode, 2, 2, 0); // Bind B
+  __hpvm__bindIn(ZCNode, 3, 3, 0); // Bind bytesB
+  __hpvm__bindIn(ZCNode, 4, 4, 0); // Bind S
+  __hpvm__bindIn(ZCNode, 5, 5, 0); // Bind bytesS
+  __hpvm__bindIn(ZCNode, 6, 6, 0); // Bind m
+  __hpvm__bindIn(ZCNode, 7, 7, 0); // Bind n
+
+  __hpvm__bindOut(ZCNode, 0, 0, 0); // bind output bytesS
 }
 
 /*
@@ -458,12 +458,12 @@ void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx,
                      float *Sy, size_t bytesSy, float *G, size_t bytesG, long m,
                      long n) {
 
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(3, Is, Sx, Sy, 1, G);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(3, Is, Sx, Sy, 1, G);
 
-  void *thisNode = __visc__getNode();
-  long gx = __visc__getNodeInstanceID_x(thisNode);
-  long gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  long gx = __hpvm__getNodeInstanceID_x(thisNode);
+  long gy = __hpvm__getNodeInstanceID_y(thisNode);
 
   int gloc = gx + gy * n;
 
@@ -498,27 +498,27 @@ void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx,
 
     G[gloc] = sqrt(Gx * Gx + Gy * Gy);
   }
-  __visc__return(1, bytesG);
+  __hpvm__return(1, bytesG);
 }
 
 void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx,
                             size_t bytesSx, float *Sy, size_t bytesSy, float *G,
                             size_t bytesG, long m, long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, Is, Sx, Sy, 1, G);
-  void *CGNode = __visc__createNodeND(2, computeGradient, m, n);
-  __visc__bindIn(CGNode, 0, 0, 0); // Bind Is
-  __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
-  __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx
-  __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx
-  __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy
-  __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy
-  __visc__bindIn(CGNode, 6, 6, 0); // Bind G
-  __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG
-  __visc__bindIn(CGNode, 8, 8, 0); // Bind m
-  __visc__bindIn(CGNode, 9, 9, 0); // Bind n
-
-  __visc__bindOut(CGNode, 0, 0, 0); // bind output bytesG
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, Is, Sx, Sy, 1, G);
+  void *CGNode = __hpvm__createNodeND(2, computeGradient, m, n);
+  __hpvm__bindIn(CGNode, 0, 0, 0); // Bind Is
+  __hpvm__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
+  __hpvm__bindIn(CGNode, 2, 2, 0); // Bind Sx
+  __hpvm__bindIn(CGNode, 3, 3, 0); // Bind bytesSx
+  __hpvm__bindIn(CGNode, 4, 4, 0); // Bind Sy
+  __hpvm__bindIn(CGNode, 5, 5, 0); // Bind bytesSy
+  __hpvm__bindIn(CGNode, 6, 6, 0); // Bind G
+  __hpvm__bindIn(CGNode, 7, 7, 0); // Bind bytesG
+  __hpvm__bindIn(CGNode, 8, 8, 0); // Bind m
+  __hpvm__bindIn(CGNode, 9, 9, 0); // Bind n
+
+  __hpvm__bindOut(CGNode, 0, 0, 0); // bind output bytesG
 }
 
 /*
@@ -531,13 +531,13 @@ void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx,
 void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG,
                             size_t bytesMaxG, long m, long n) {
 
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(1, G, 1, maxG);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(1, G, 1, maxG);
 
-  void *thisNode = __visc__getNode();
+  void *thisNode = __hpvm__getNode();
 
-  long lx = __visc__getNodeInstanceID_x(thisNode);     // threadIdx.x
-  long dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x
+  long lx = __hpvm__getNodeInstanceID_x(thisNode);     // threadIdx.x
+  long dimx = __hpvm__getNumNodeInstances_x(thisNode); // blockDim.x
 
   // Assume a single thread block
   // Thread block iterates over all elements
@@ -556,39 +556,39 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG,
     *maxG = G[lx];
   }
 
-  __visc__return(1, bytesMaxG);
+  __hpvm__return(1, bytesMaxG);
 }
 
 void computeMaxGradientTB(float *G, size_t bytesG, float *maxG,
                           size_t bytesMaxG, long m, long n, long block_x) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, G, maxG, 1, maxG);
-  void *CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x);
-  __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
-  __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
-  __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
-  __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG
-  __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
-  __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
-
-  __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, G, maxG, 1, maxG);
+  void *CMGLeafNode = __hpvm__createNodeND(1, computeMaxGradientLeaf, block_x);
+  __hpvm__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
+  __hpvm__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
+  __hpvm__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
+  __hpvm__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG
+  __hpvm__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
+  __hpvm__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
+
+  __hpvm__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG
 }
 
 void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG,
                                size_t bytesMaxG, long m, long n, long block_x,
                                long grid_x) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, G, maxG, 1, maxG);
-  void *CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x);
-  __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G
-  __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
-  __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
-  __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG
-  __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m
-  __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n
-  __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
-
-  __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, G, maxG, 1, maxG);
+  void *CMGTBNode = __hpvm__createNodeND(1, computeMaxGradientTB, grid_x);
+  __hpvm__bindIn(CMGTBNode, 0, 0, 0); // Bind G
+  __hpvm__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
+  __hpvm__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
+  __hpvm__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG
+  __hpvm__bindIn(CMGTBNode, 4, 4, 0); // Bind m
+  __hpvm__bindIn(CMGTBNode, 5, 5, 0); // Bind n
+  __hpvm__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
+
+  __hpvm__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG
 }
 
 /* Reject the zero crossings where the gradient is below a threshold */
@@ -604,39 +604,39 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG,
 void rejectZeroCrossings(float *S, size_t bytesS, float *G, size_t bytesG,
                          float *maxG, size_t bytesMaxG, float *E, size_t bytesE,
                          long m, long n) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(3, S, G, maxG, 1, E);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(3, S, G, maxG, 1, E);
 
-  void *thisNode = __visc__getNode();
-  int gx = __visc__getNodeInstanceID_x(thisNode);
-  int gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  int gx = __hpvm__getNodeInstanceID_x(thisNode);
+  int gy = __hpvm__getNodeInstanceID_y(thisNode);
 
   float mG = *maxG;
   if ((gx < n) && (gy < m)) {
     E[gy * n + gx] =
         ((S[gy * n + gx] > 0.0) && (G[gy * n + gx] > THETA * mG)) ? 1.0 : 0.0;
   }
-  __visc__return(1, bytesE);
+  __hpvm__return(1, bytesE);
 }
 
 void WrapperRejectZeroCrossings(float *S, size_t bytesS, float *G,
                                 size_t bytesG, float *maxG, size_t bytesMaxG,
                                 float *E, size_t bytesE, long m, long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, S, G, maxG, 1, E);
-  void *RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n);
-  __visc__bindIn(RZCNode, 0, 0, 0); // Bind S
-  __visc__bindIn(RZCNode, 1, 1, 0); // Bind bytesS
-  __visc__bindIn(RZCNode, 2, 2, 0); // Bind G
-  __visc__bindIn(RZCNode, 3, 3, 0); // Bind bytesG
-  __visc__bindIn(RZCNode, 4, 4, 0); // Bind maxG
-  __visc__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG
-  __visc__bindIn(RZCNode, 6, 6, 0); // Bind E
-  __visc__bindIn(RZCNode, 7, 7, 0); // Bind bytesE
-  __visc__bindIn(RZCNode, 8, 8, 0); // Bind m
-  __visc__bindIn(RZCNode, 9, 9, 0); // Bind n
-
-  __visc__bindOut(RZCNode, 0, 0, 0); // bind output bytesE
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, S, G, maxG, 1, E);
+  void *RZCNode = __hpvm__createNodeND(2, rejectZeroCrossings, m, n);
+  __hpvm__bindIn(RZCNode, 0, 0, 0); // Bind S
+  __hpvm__bindIn(RZCNode, 1, 1, 0); // Bind bytesS
+  __hpvm__bindIn(RZCNode, 2, 2, 0); // Bind G
+  __hpvm__bindIn(RZCNode, 3, 3, 0); // Bind bytesG
+  __hpvm__bindIn(RZCNode, 4, 4, 0); // Bind maxG
+  __hpvm__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG
+  __hpvm__bindIn(RZCNode, 6, 6, 0); // Bind E
+  __hpvm__bindIn(RZCNode, 7, 7, 0); // Bind bytesE
+  __hpvm__bindIn(RZCNode, 8, 8, 0); // Bind m
+  __hpvm__bindIn(RZCNode, 9, 9, 0); // Bind n
+
+  __hpvm__bindOut(RZCNode, 0, 0, 0); // bind output bytesE
 }
 
 // Pipelined Root node
@@ -656,80 +656,80 @@ void edgeDetection(float *I, size_t bytesI,       // 0
                    long block_x,                  // 24
                    long grid_x                    // 25
 ) {
-  __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
-  __visc__hint(visc::CPU_TARGET);
-  void *GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing);
-  void *LNode = __visc__createNodeND(0, WrapperlaplacianEstimate);
-  void *CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings);
-  void *CGNode = __visc__createNodeND(0, WrapperComputeGradient);
-  void *CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient);
-  void *RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings);
+  __hpvm__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  void *GSNode = __hpvm__createNodeND(0, WrapperGaussianSmoothing);
+  void *LNode = __hpvm__createNodeND(0, WrapperlaplacianEstimate);
+  void *CZCNode = __hpvm__createNodeND(0, WrapperComputeZeroCrossings);
+  void *CGNode = __hpvm__createNodeND(0, WrapperComputeGradient);
+  void *CMGNode = __hpvm__createNodeND(0, WrapperComputeMaxGradient);
+  void *RZCNode = __hpvm__createNodeND(0, WrapperRejectZeroCrossings);
 
   // Gaussian Inputs
-  __visc__bindIn(GSNode, 0, 0, 1);  // Bind I
-  __visc__bindIn(GSNode, 1, 1, 1);  // Bind bytesI
-  __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs
-  __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs
-  __visc__bindIn(GSNode, 2, 4, 1);  // Bind Is
-  __visc__bindIn(GSNode, 3, 5, 1);  // Bind bytesIs
-  __visc__bindIn(GSNode, 22, 6, 1); // Bind m
-  __visc__bindIn(GSNode, 23, 7, 1); // Bind n
+  __hpvm__bindIn(GSNode, 0, 0, 1);  // Bind I
+  __hpvm__bindIn(GSNode, 1, 1, 1);  // Bind bytesI
+  __hpvm__bindIn(GSNode, 14, 2, 1); // Bind Gs
+  __hpvm__bindIn(GSNode, 15, 3, 1); // Bind bytesGs
+  __hpvm__bindIn(GSNode, 2, 4, 1);  // Bind Is
+  __hpvm__bindIn(GSNode, 3, 5, 1);  // Bind bytesIs
+  __hpvm__bindIn(GSNode, 22, 6, 1); // Bind m
+  __hpvm__bindIn(GSNode, 23, 7, 1); // Bind n
 
   // Laplacian Inputs
-  __visc__bindIn(LNode, 2, 0, 1);          // Bind Is
-  __visc__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs
-  __visc__bindIn(LNode, 16, 2, 1);         // Bind B
-  __visc__bindIn(LNode, 17, 3, 1);         // Bind bytesB
-  __visc__bindIn(LNode, 4, 4, 1);          // Bind L
-  __visc__bindIn(LNode, 5, 5, 1);          // Bind bytesL
-  __visc__bindIn(LNode, 22, 6, 1);         // Bind m
-  __visc__bindIn(LNode, 23, 7, 1);         // Bind n
+  __hpvm__bindIn(LNode, 2, 0, 1);          // Bind Is
+  __hpvm__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs
+  __hpvm__bindIn(LNode, 16, 2, 1);         // Bind B
+  __hpvm__bindIn(LNode, 17, 3, 1);         // Bind bytesB
+  __hpvm__bindIn(LNode, 4, 4, 1);          // Bind L
+  __hpvm__bindIn(LNode, 5, 5, 1);          // Bind bytesL
+  __hpvm__bindIn(LNode, 22, 6, 1);         // Bind m
+  __hpvm__bindIn(LNode, 23, 7, 1);         // Bind n
 
   // Compute ZC Inputs
-  __visc__bindIn(CZCNode, 4, 0, 1);         // Bind L
-  __visc__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL
-  __visc__bindIn(CZCNode, 16, 2, 1);        // Bind B
-  __visc__bindIn(CZCNode, 17, 3, 1);        // Bind bytesB
-  __visc__bindIn(CZCNode, 6, 4, 1);         // Bind S
-  __visc__bindIn(CZCNode, 7, 5, 1);         // Bind bytesS
-  __visc__bindIn(CZCNode, 22, 6, 1);        // Bind m
-  __visc__bindIn(CZCNode, 23, 7, 1);        // Bind n
+  __hpvm__bindIn(CZCNode, 4, 0, 1);         // Bind L
+  __hpvm__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL
+  __hpvm__bindIn(CZCNode, 16, 2, 1);        // Bind B
+  __hpvm__bindIn(CZCNode, 17, 3, 1);        // Bind bytesB
+  __hpvm__bindIn(CZCNode, 6, 4, 1);         // Bind S
+  __hpvm__bindIn(CZCNode, 7, 5, 1);         // Bind bytesS
+  __hpvm__bindIn(CZCNode, 22, 6, 1);        // Bind m
+  __hpvm__bindIn(CZCNode, 23, 7, 1);        // Bind n
 
   // Gradient Inputs
-  __visc__bindIn(CGNode, 2, 0, 1);          // Bind Is
-  __visc__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs
-  __visc__bindIn(CGNode, 18, 2, 1);         // Bind Sx
-  __visc__bindIn(CGNode, 19, 3, 1);         // Bind bytesSx
-  __visc__bindIn(CGNode, 20, 4, 1);         // Bind Sy
-  __visc__bindIn(CGNode, 21, 5, 1);         // Bind bytesSy
-  __visc__bindIn(CGNode, 8, 6, 1);          // Bind G
-  __visc__bindIn(CGNode, 9, 7, 1);          // Bind bytesG
-  __visc__bindIn(CGNode, 22, 8, 1);         // Bind m
-  __visc__bindIn(CGNode, 23, 9, 1);         // Bind n
+  __hpvm__bindIn(CGNode, 2, 0, 1);          // Bind Is
+  __hpvm__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs
+  __hpvm__bindIn(CGNode, 18, 2, 1);         // Bind Sx
+  __hpvm__bindIn(CGNode, 19, 3, 1);         // Bind bytesSx
+  __hpvm__bindIn(CGNode, 20, 4, 1);         // Bind Sy
+  __hpvm__bindIn(CGNode, 21, 5, 1);         // Bind bytesSy
+  __hpvm__bindIn(CGNode, 8, 6, 1);          // Bind G
+  __hpvm__bindIn(CGNode, 9, 7, 1);          // Bind bytesG
+  __hpvm__bindIn(CGNode, 22, 8, 1);         // Bind m
+  __hpvm__bindIn(CGNode, 23, 9, 1);         // Bind n
 
   // Max Gradient Inputs
-  __visc__bindIn(CMGNode, 8, 0, 1);          // Bind G
-  __visc__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG
-  __visc__bindIn(CMGNode, 10, 2, 1);         // Bind maxG
-  __visc__bindIn(CMGNode, 11, 3, 1);         // Bind bytesMaxG
-  __visc__bindIn(CMGNode, 22, 4, 1);         // Bind m
-  __visc__bindIn(CMGNode, 23, 5, 1);         // Bind n
-  __visc__bindIn(CMGNode, 24, 6, 1);         // Bind block_x
-  __visc__bindIn(CMGNode, 25, 7, 1);         // Bind grid_x
+  __hpvm__bindIn(CMGNode, 8, 0, 1);          // Bind G
+  __hpvm__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG
+  __hpvm__bindIn(CMGNode, 10, 2, 1);         // Bind maxG
+  __hpvm__bindIn(CMGNode, 11, 3, 1);         // Bind bytesMaxG
+  __hpvm__bindIn(CMGNode, 22, 4, 1);         // Bind m
+  __hpvm__bindIn(CMGNode, 23, 5, 1);         // Bind n
+  __hpvm__bindIn(CMGNode, 24, 6, 1);         // Bind block_x
+  __hpvm__bindIn(CMGNode, 25, 7, 1);         // Bind grid_x
 
   // Reject ZC Inputs
-  __visc__bindIn(RZCNode, 6, 0, 1);           // Bind S
-  __visc__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS
-  __visc__bindIn(RZCNode, 8, 2, 1);           // Bind G
-  __visc__bindIn(RZCNode, 9, 3, 1);           // Bind bytesG
-  __visc__bindIn(RZCNode, 10, 4, 1);          // Bind maxG
-  __visc__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG
-  __visc__bindIn(RZCNode, 12, 6, 1);          // Bind E
-  __visc__bindIn(RZCNode, 13, 7, 1);          // Bind bytesE
-  __visc__bindIn(RZCNode, 22, 8, 1);          // Bind m
-  __visc__bindIn(RZCNode, 23, 9, 1);          // Bind n
-
-  __visc__bindOut(RZCNode, 0, 0, 1); // Bind output
+  __hpvm__bindIn(RZCNode, 6, 0, 1);           // Bind S
+  __hpvm__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS
+  __hpvm__bindIn(RZCNode, 8, 2, 1);           // Bind G
+  __hpvm__bindIn(RZCNode, 9, 3, 1);           // Bind bytesG
+  __hpvm__bindIn(RZCNode, 10, 4, 1);          // Bind maxG
+  __hpvm__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG
+  __hpvm__bindIn(RZCNode, 12, 6, 1);          // Bind E
+  __hpvm__bindIn(RZCNode, 13, 7, 1);          // Bind bytesE
+  __hpvm__bindIn(RZCNode, 22, 8, 1);          // Bind m
+  __hpvm__bindIn(RZCNode, 23, 9, 1);          // Bind n
+
+  __hpvm__bindOut(RZCNode, 0, 0, 1); // Bind output
 }
 }
 
@@ -796,7 +796,7 @@ int main(int argc, char *argv[]) {
   assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() &&
          S.isContinuous() && G.isContinuous() && E.isContinuous());
 
-  __visc__init();
+  __hpvm__init();
 
   // copy A to device memory
   I_sz = src.size[0] * src.size[1] * sizeof(float);
@@ -843,7 +843,7 @@ int main(int argc, char *argv[]) {
 
   for (unsigned j = 0; j < NUM_RUNS; j++) {
     std::cout << "Run: " << j << "\n";
-    void *DFG = __visc__launch(1, edgeDetection, (void *)args);
+    void *DFG = __hpvm__launch(1, edgeDetection, (void *)args);
 
     cap = VideoCapture(inFile);
     getNextFrame(cap, src);
@@ -855,25 +855,25 @@ int main(int argc, char *argv[]) {
 
         *maxG = 0.0;
 
-        llvm_visc_track_mem(src.data, I_sz);
-        llvm_visc_track_mem(Is.data, I_sz);
-        llvm_visc_track_mem(L.data, I_sz);
-        llvm_visc_track_mem(S.data, I_sz);
-        llvm_visc_track_mem(G.data, I_sz);
-        llvm_visc_track_mem(maxG, bytesMaxG);
-        llvm_visc_track_mem(E.data, I_sz);
-        llvm_visc_track_mem(Gs, bytesGs);
-        llvm_visc_track_mem(B, bytesB);
-        llvm_visc_track_mem(Sx, bytesSx);
-        llvm_visc_track_mem(Sy, bytesSy);
-
-        __visc__push(DFG, args);
-        void *ret = __visc__pop(DFG);
+        llvm_hpvm_track_mem(src.data, I_sz);
+        llvm_hpvm_track_mem(Is.data, I_sz);
+        llvm_hpvm_track_mem(L.data, I_sz);
+        llvm_hpvm_track_mem(S.data, I_sz);
+        llvm_hpvm_track_mem(G.data, I_sz);
+        llvm_hpvm_track_mem(maxG, bytesMaxG);
+        llvm_hpvm_track_mem(E.data, I_sz);
+        llvm_hpvm_track_mem(Gs, bytesGs);
+        llvm_hpvm_track_mem(B, bytesB);
+        llvm_hpvm_track_mem(Sx, bytesSx);
+        llvm_hpvm_track_mem(Sy, bytesSy);
+
+        __hpvm__push(DFG, args);
+        void *ret = __hpvm__pop(DFG);
         std::cout << "Returned size: " << *(size_t *)ret << " expected " << I_sz
                   << '\n';
 
-        llvm_visc_request_mem(maxG, bytesMaxG);
-        llvm_visc_request_mem(E.data, I_sz);
+        llvm_hpvm_request_mem(maxG, bytesMaxG);
+        llvm_hpvm_request_mem(E.data, I_sz);
 
         Mat in, out;
         resize(src, in, Size(HEIGHT, WIDTH));
@@ -882,26 +882,26 @@ int main(int argc, char *argv[]) {
         imshow(input_window, in);
         waitKey(1);
 
-        llvm_visc_untrack_mem(src.data);
-        llvm_visc_untrack_mem(Is.data);
-        llvm_visc_untrack_mem(L.data);
-        llvm_visc_untrack_mem(S.data);
-        llvm_visc_untrack_mem(G.data);
-        llvm_visc_untrack_mem(maxG);
-        llvm_visc_untrack_mem(E.data);
-        llvm_visc_untrack_mem(Gs);
-        llvm_visc_untrack_mem(B);
-        llvm_visc_untrack_mem(Sx);
-        llvm_visc_untrack_mem(Sy);
+        llvm_hpvm_untrack_mem(src.data);
+        llvm_hpvm_untrack_mem(Is.data);
+        llvm_hpvm_untrack_mem(L.data);
+        llvm_hpvm_untrack_mem(S.data);
+        llvm_hpvm_untrack_mem(G.data);
+        llvm_hpvm_untrack_mem(maxG);
+        llvm_hpvm_untrack_mem(E.data);
+        llvm_hpvm_untrack_mem(Gs);
+        llvm_hpvm_untrack_mem(B);
+        llvm_hpvm_untrack_mem(Sx);
+        llvm_hpvm_untrack_mem(Sy);
 
         getNextFrame(cap, src);
       }
     } else {
-      __visc__push(DFG, args);
-      __visc__pop(DFG);
+      __hpvm__push(DFG, args);
+      __hpvm__pop(DFG);
     }
-    __visc__wait(DFG);
+    __hpvm__wait(DFG);
   }
-  __visc__cleanup();
+  __hpvm__cleanup();
   return 0;
 }
diff --git a/hpvm/test/unitTests/CreateNodeAndEdge.c b/hpvm/test/unitTests/CreateNodeAndEdge.c
index 1b6b1cff211d5af5a909065af988aadbe979f2ec..c3f58c95d631b5c49a47de1cbe41ed5ea871f5f4 100644
--- a/hpvm/test/unitTests/CreateNodeAndEdge.c
+++ b/hpvm/test/unitTests/CreateNodeAndEdge.c
@@ -1,4 +1,4 @@
-#include "visc.h"
+#include "hpvm.h"
 #include <stdio.h>
 
 struct Root {
@@ -7,33 +7,33 @@ struct Root {
 };
 
 void Func1(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(1, In, 1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(1, In, 1, Out);
 
-  __visc__return(1, Out);
+  __hpvm__return(1, Out);
 }
 
 void Func2(int *BindIn, int *SrcIn, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, BindIn, SrcIn, 1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, BindIn, SrcIn, 1, Out);
 
-  __visc__return(1, Out);
+  __hpvm__return(1, Out);
 }
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
+  __hpvm__hint(CPU_TARGET);
 
-  __visc__attributes(1, In, 1, Out);
+  __hpvm__attributes(1, In, 1, Out);
 
-  void *SrcNode = __visc__createNodeND(0, Func1);
-  void *DestNode = __visc__createNodeND(0, Func2);
+  void *SrcNode = __hpvm__createNodeND(0, Func1);
+  void *DestNode = __hpvm__createNodeND(0, Func2);
 
-  __visc__bindIn(SrcNode, 0, 0, 0);
+  __hpvm__bindIn(SrcNode, 0, 0, 0);
 
-  __visc__bindIn(DestNode, 0, 0, 0);
-  __visc__edge(SrcNode, DestNode, 1, 0, 1, 0);
+  __hpvm__bindIn(DestNode, 0, 0, 0);
+  __hpvm__edge(SrcNode, DestNode, 1, 0, 1, 0);
 
-  __visc__bindOut(SrcNode, 0, 0, 0);
+  __hpvm__bindOut(SrcNode, 0, 0, 0);
 }
 
 int main(void) {
@@ -41,10 +41,10 @@ int main(void) {
   int Out = 0;
   struct Root RootArgs = {(int *)&In, (int *)&Out};
 
-  __visc__init();
-  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)&RootArgs);
-  __visc__wait(PipeDFG);
-  __visc__cleanup();
+  __hpvm__init();
+  void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)&RootArgs);
+  __hpvm__wait(PipeDFG);
+  __hpvm__cleanup();
 
   return 0;
 }
diff --git a/hpvm/test/unitTests/Makefile b/hpvm/test/unitTests/Makefile
index 539ee5e8fbf010d33663c98470b245bb2710eeea..15580e9300a119f55e4a828b645c27dd00b62ff8 100644
--- a/hpvm/test/unitTests/Makefile
+++ b/hpvm/test/unitTests/Makefile
@@ -2,8 +2,8 @@ PASSES :=
 
 .PHONY: clean
 
-LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
-LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
+LLVM_INSTALL:=/home/psrivas2/Hetero/HPVM/Code/trunk/llvm-install
+LIBCLC:=/home/psrivas2/Hetero/HPVM/Code/trunk/libclc
 HOST:=gemm_opencl
 KERNELS:=matrixMul
 LLVM_CC:=$(LLVM_INSTALL)/bin/clang
diff --git a/hpvm/test/unitTests/MallocIntrinsic.c b/hpvm/test/unitTests/MallocIntrinsic.c
index cfd041a991d976c24b372a81b35842598b571d89..173f6b3b16d1090a98242d345cefa330910d862d 100644
--- a/hpvm/test/unitTests/MallocIntrinsic.c
+++ b/hpvm/test/unitTests/MallocIntrinsic.c
@@ -1,4 +1,4 @@
-#include "visc.h"
+#include "hpvm.h"
 #include <stdlib.h>
 
 struct Root {
@@ -7,12 +7,12 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(1, In, 1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(1, In, 1, Out);
 
-  Out = (int *)__visc__malloc(*In);
+  Out = (int *)__hpvm__malloc(*In);
 
-  __visc__return(1, Out);
+  __hpvm__return(1, Out);
 }
 
 int main(void) {
@@ -26,12 +26,12 @@ int main(void) {
   RootArgs->input = (int *)&In;
   RootArgs->output = (int *)&Out;
 
-  __visc__init();
+  __hpvm__init();
 
-  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
-  __visc__wait(PipeDFG);
+  void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs);
+  __hpvm__wait(PipeDFG);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   return 0;
 }
diff --git a/hpvm/test/unitTests/PipelineIntrinsics.c b/hpvm/test/unitTests/PipelineIntrinsics.c
index 2a9bf83402891beddf13d96c6346e8fed924d17e..43ba0ef56cf160acb1fab6ea334732e56e0359d2 100644
--- a/hpvm/test/unitTests/PipelineIntrinsics.c
+++ b/hpvm/test/unitTests/PipelineIntrinsics.c
@@ -1,4 +1,4 @@
-#include "visc.h"
+#include "hpvm.h"
 #include <stdlib.h>
 
 struct Root {
@@ -7,9 +7,9 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(1, In, 1, Out);
-  __visc__return(1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(1, In, 1, Out);
+  __hpvm__return(1, Out);
 }
 
 int main(void) {
@@ -23,12 +23,12 @@ int main(void) {
   RootArgs->input = (int *)&In;
   RootArgs->output = (int *)&Out;
 
-  __visc__init();
+  __hpvm__init();
 
-  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
-  __visc__wait(PipeDFG);
+  void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs);
+  __hpvm__wait(PipeDFG);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   return 0;
 }
diff --git a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
index 36fc02d22b066025be4a57695265779d8e55652a..c2deed98679bf794316f283acef8e3c1db9ffa88 100644
--- a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
+++ b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
@@ -1,4 +1,4 @@
-#include "visc.h"
+#include "hpvm.h"
 #include <stdlib.h>
 
 struct Root {
@@ -7,24 +7,24 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(1, In, 1, Out);
-  __visc__return(1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(1, In, 1, Out);
+  __hpvm__return(1, Out);
 }
 
 int main(void) {
   int In, Out;
 
-  __visc__init();
+  __hpvm__init();
 
   struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root));
   RootArgs->input = (int *)&In;
   RootArgs->output = (int *)&Out;
 
-  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
-  __visc__wait(PipeDFG);
+  void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs);
+  __hpvm__wait(PipeDFG);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   return 0;
 }
diff --git a/hpvm/test/unitTests/temp/3level.ll b/hpvm/test/unitTests/temp/3level.ll
index 168e7b42322c8f7fa4be83a64cbd06d44dd9e428..2e3753f1400798d0989e2a01be78ab338205a291 100644
--- a/hpvm/test/unitTests/temp/3level.ll
+++ b/hpvm/test/unitTests/temp/3level.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/3level.ll'
@@ -13,31 +13,31 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
@@ -47,18 +47,18 @@ entry:
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output1 = extractvalue %rtype %outputstruct, 0
   %output2 = extractvalue %rtype %outputstruct, 1
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output1) #0
   %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output2) #0
   ret i32 0
@@ -83,21 +83,21 @@ define %rtype_internal @foo(i32 %id) {
 }
 
 define %rtype_internal @subNode(i32 %id) {
-  %foo_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @foo to i8*))
-  call void @llvm.visc.bind.input(i8* %foo_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %foo_node, i32 0, i32 0)
+  %foo_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @foo to i8*))
+  call void @llvm.hpvm.bind.input(i8* %foo_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %foo_node, i32 0, i32 0)
   ret %rtype_internal zeroinitializer
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @consumer to i8*))
-  %sub_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @subNode to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %sub_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %sub_node, i32 0, i32 1)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @consumer to i8*))
+  %sub_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @subNode to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %sub_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %sub_node, i32 0, i32 1)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/Makefile b/hpvm/test/unitTests/temp/Makefile
index 539ee5e8fbf010d33663c98470b245bb2710eeea..15580e9300a119f55e4a828b645c27dd00b62ff8 100644
--- a/hpvm/test/unitTests/temp/Makefile
+++ b/hpvm/test/unitTests/temp/Makefile
@@ -2,8 +2,8 @@ PASSES :=
 
 .PHONY: clean
 
-LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
-LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
+LLVM_INSTALL:=/home/psrivas2/Hetero/HPVM/Code/trunk/llvm-install
+LIBCLC:=/home/psrivas2/Hetero/HPVM/Code/trunk/libclc
 HOST:=gemm_opencl
 KERNELS:=matrixMul
 LLVM_CC:=$(LLVM_INSTALL)/bin/clang
diff --git a/hpvm/test/unitTests/temp/query2D.ll b/hpvm/test/unitTests/temp/query2D.ll
index c994c2a3ff5b166b2f192f4b900982b3b7afc508..48358a3527553c8f4a31ff89454010289d02c072 100644
--- a/hpvm/test/unitTests/temp/query2D.ll
+++ b/hpvm/test/unitTests/temp/query2D.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/query2D.ll'
@@ -12,46 +12,46 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
+declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
+declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
+declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
@@ -61,25 +61,25 @@ entry:
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %dim = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %dim = call i32 @llvm.hpvm.getNumNodeInstances.x(i8* %this_node)
   %sum2 = add i32 %sum, %dim
   %output = insertvalue %rtype undef, i32 %sum2, 0
   ret %rtype %output
@@ -92,11 +92,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %dimension) {
-  %p_node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 %dimension)
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode2D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 %dimension)
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/query3D.ll b/hpvm/test/unitTests/temp/query3D.ll
index 438fe60a3bc6c2dfe718da76d55041addc47367f..d2ff16ef56628752b997577891c44fd904be4405 100644
--- a/hpvm/test/unitTests/temp/query3D.ll
+++ b/hpvm/test/unitTests/temp/query3D.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/query3D.ll'
@@ -12,57 +12,57 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
+declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
+declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
+declare i8* @llvm.hpvm.createNode3D(i8*, i32, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
+declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0
+declare i32 @llvm.hpvm.getNumNodeInstances.y(i8*) #0
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -71,21 +71,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %dim = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %dim = call i32 @llvm.hpvm.getNumNodeInstances.y(i8* %this_node)
   %sum2 = add i32 %sum, %dim
   %output = insertvalue %rtype undef, i32 %sum2, 0
   ret %rtype %output
@@ -98,11 +98,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %dimension) {
-  %p_node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 10, i32 30)
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode3D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 10, i32 30)
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/queryNodeInst.ll b/hpvm/test/unitTests/temp/queryNodeInst.ll
index 24d6a3f0d30e6661c0f1396e082f889d54dc50be..4e3dd7553045d466199c726416db220a6be2d1aa 100644
--- a/hpvm/test/unitTests/temp/queryNodeInst.ll
+++ b/hpvm/test/unitTests/temp/queryNodeInst.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
@@ -12,40 +12,40 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -54,21 +54,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node)
   %sum2 = add i32 %sum, %numDim
   %output = insertvalue %rtype undef, i32 %sum, 0
   ret %rtype %output
@@ -81,11 +81,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/queryNumDim.ll b/hpvm/test/unitTests/temp/queryNumDim.ll
index 500e2ff41bd52f29a56cfd49563927bf6323482b..caa0978dabab0bf6295853e35f23e3ed68f00840 100644
--- a/hpvm/test/unitTests/temp/queryNumDim.ll
+++ b/hpvm/test/unitTests/temp/queryNumDim.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
@@ -12,42 +12,42 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -56,21 +56,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node)
   %sum2 = add i32 %sum, %numDim
   %output = insertvalue %rtype undef, i32 %sum, 0
   ret %rtype %output
@@ -83,11 +83,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/queryNumNodeInst.ll b/hpvm/test/unitTests/temp/queryNumNodeInst.ll
index 48add92f16125bdf33c9691896a8b7259339fe78..07418ff725c277e2e8adbe6a39d8831e2b77bc59 100644
--- a/hpvm/test/unitTests/temp/queryNumNodeInst.ll
+++ b/hpvm/test/unitTests/temp/queryNumNodeInst.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
@@ -12,48 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
+declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
+declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -62,21 +62,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %dim = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %dim = call i32 @llvm.hpvm.getNumNodeInstances.x(i8* %this_node)
   %sum2 = add i32 %sum, %dim
   %output = insertvalue %rtype undef, i32 %sum2, 0
   ret %rtype %output
@@ -89,11 +89,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %dimension) {
-  %p_node = call i8* @llvm.visc.createNode1D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension)
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode1D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension)
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/singleNode.ll b/hpvm/test/unitTests/temp/singleNode.ll
index 20713e955fb457acec2e2968d1b4a2ae61396fe0..99e53181317a6b27a83916682bcf1457895c0bfc 100644
--- a/hpvm/test/unitTests/temp/singleNode.ll
+++ b/hpvm/test/unitTests/temp/singleNode.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/singleNode.ll'
@@ -12,43 +12,43 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -59,8 +59,8 @@ define %rtype @foo() {
 }
 
 define %rtype @Root() {
-  %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo to i8*))
-  call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0)
+  %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo to i8*))
+  call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/singleNodeStream.ll b/hpvm/test/unitTests/temp/singleNodeStream.ll
index fce75df6714240286e9a676e40e37c3f14e537a6..aa0243603c420a21f51f9842d467f9da814f1814 100644
--- a/hpvm/test/unitTests/temp/singleNodeStream.ll
+++ b/hpvm/test/unitTests/temp/singleNodeStream.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll'
@@ -14,39 +14,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.push(i8*, i8*) #0
+declare void @llvm.hpvm.push(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.pop(i8*) #0
+declare i8* @llvm.hpvm.pop(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32, i1)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %num = alloca i32
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
@@ -60,27 +60,27 @@ entry:
   %args = bitcast %struct.arg* %in.addr to i8*
 
   ; Launch the pipeline
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rptype (i32*, i64)* @Root to i8*), i8* %args, i1 1)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rptype (i32*, i64)* @Root to i8*), i8* %args, i1 1)
 
   ; Push arguments into the pipeline
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
 
   ; Pop out arguments and read the output
-  %graph_output = call i8* @llvm.visc.pop(i8* %graphID)
-  %graph_output1 = call i8* @llvm.visc.pop(i8* %graphID)
-  %graph_output2 = call i8* @llvm.visc.pop(i8* %graphID)
-  %graph_output3 = call i8* @llvm.visc.pop(i8* %graphID)
+  %graph_output = call i8* @llvm.hpvm.pop(i8* %graphID)
+  %graph_output1 = call i8* @llvm.hpvm.pop(i8* %graphID)
+  %graph_output2 = call i8* @llvm.hpvm.pop(i8* %graphID)
+  %graph_output3 = call i8* @llvm.hpvm.pop(i8* %graphID)
   %output.addr = bitcast i8* %graph_output to %rptype*
   %outputstruct = load %rptype* %output.addr
   %output = extractvalue %rptype %outputstruct, 0
   %output_val = load i32* %output
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @out.str, i64 0, i64 0), i32 %output_val) #0
 
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -95,11 +95,11 @@ define %rptype @producer(i32* %id, i64 %size) {
 }
 
 define %rptype @Root(i32* %id, i64 %size) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*))
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0, i1 1)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 1, i32 1, i1 1)
-  call void @llvm.visc.bind.output(i8* %p_node, i32 0, i32 0, i1 1)
-  call void @llvm.visc.bind.output(i8* %p_node, i32 1, i32 1, i1 1)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*))
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0, i1 1)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 1, i32 1, i1 1)
+  call void @llvm.hpvm.bind.output(i8* %p_node, i32 0, i32 0, i1 1)
+  call void @llvm.hpvm.bind.output(i8* %p_node, i32 1, i32 1, i1 1)
   ret %rptype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoLaunch.ll b/hpvm/test/unitTests/temp/twoLaunch.ll
index 48c973a7e6f1cc5422fffd8d9e4ae0a0e1a06bf9..ee602f58d82f004a7b19bf54e55e1c0759c17bef 100644
--- a/hpvm/test/unitTests/temp/twoLaunch.ll
+++ b/hpvm/test/unitTests/temp/twoLaunch.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/singleNode.ll'
@@ -12,33 +12,33 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr_1 = alloca %struct.arg
   %in.addr_2= alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
@@ -47,12 +47,12 @@ entry:
   %conv.i = trunc i64 %call.i to i32
   %args_1 = bitcast %struct.arg* %in.addr_1 to i8*
   %args_2 = bitcast %struct.arg* %in.addr_2 to i8*
-  %graphID_1 = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root_1 to i8*), i8* %args_1)
-  %graphID_2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root_2 to i8*), i8* %args_2)
+  %graphID_1 = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root_1 to i8*), i8* %args_1)
+  %graphID_2 = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root_2 to i8*), i8* %args_2)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID_1)
-  call void @llvm.visc.wait(i8* %graphID_2)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID_1)
+  call void @llvm.hpvm.wait(i8* %graphID_2)
+  call void @llvm.hpvm.cleanup()
 
   ret i32 0
 }
@@ -70,14 +70,14 @@ define %rtype @foo_2() {
 }
 
 define %rtype @Root_1() {
-  %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo_1 to i8*))
-  call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0)
+  %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo_1 to i8*))
+  call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
 define %rtype @Root_2() {
-  %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo_2 to i8*))
-  call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0)
+  %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo_2 to i8*))
+  call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoNode.ll b/hpvm/test/unitTests/temp/twoNode.ll
index 5e2899830b835ff50c9d2d8e4157451d4bd26f7f..74e4c64d599f7204b375743687c6da2b7ed8c9f6 100644
--- a/hpvm/test/unitTests/temp/twoNode.ll
+++ b/hpvm/test/unitTests/temp/twoNode.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
@@ -11,33 +11,33 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -46,10 +46,10 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -66,10 +66,10 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoNodeConnect.ll b/hpvm/test/unitTests/temp/twoNodeConnect.ll
index 06652b94e02c2cac66ab4a07e88dec0a04da49f8..6b23ad691bacb42c39fe681967d4c584179644f1 100644
--- a/hpvm/test/unitTests/temp/twoNodeConnect.ll
+++ b/hpvm/test/unitTests/temp/twoNodeConnect.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll'
@@ -11,33 +11,33 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -46,14 +46,14 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -70,11 +70,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoNodeQuery.ll b/hpvm/test/unitTests/temp/twoNodeQuery.ll
index 2e1ea0dba4659d92b9c1b0600732748c87571671..247d1830dadff69ac5380b939d26c5f850bc08ac 100644
--- a/hpvm/test/unitTests/temp/twoNodeQuery.ll
+++ b/hpvm/test/unitTests/temp/twoNodeQuery.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeQuery.ll'
@@ -11,42 +11,42 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -55,21 +55,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node)
   %sum2 = add i32 %sum, %numDim
   %output = insertvalue %rtype undef, i32 %sum, 0
   ret %rtype %output
@@ -82,11 +82,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoNodeStream.ll b/hpvm/test/unitTests/temp/twoNodeStream.ll
index 6e9925951884775e7ba60bb396a97fd9bc0ef52d..f9820abd19eb7b329b2c7184719d9699b15891e6 100644
--- a/hpvm/test/unitTests/temp/twoNodeStream.ll
+++ b/hpvm/test/unitTests/temp/twoNodeStream.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll'
@@ -14,39 +14,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.push(i8*, i8*) #0
+declare void @llvm.hpvm.push(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.pop(i8*) #0
+declare i8* @llvm.hpvm.pop(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32, i1)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %num = alloca i32
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
@@ -60,21 +60,21 @@ entry:
   %args = bitcast %struct.arg* %in.addr to i8*
 
   ; Launch the pipeline
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rctype (i32*, i64)* @Root to i8*), i8* %args, i1 1)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rctype (i32*, i64)* @Root to i8*), i8* %args, i1 1)
 
   ; Push arguments into the pipeline
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
 
   ; Pop out arguments and read the output
-  %graph_output = call i8* @llvm.visc.pop(i8* %graphID)
+  %graph_output = call i8* @llvm.hpvm.pop(i8* %graphID)
   %output.addr = bitcast i8* %graph_output to %rctype*
   %outputstruct = load %rctype* %output.addr
   %output = extractvalue %rctype %outputstruct, 0
   %output_val = load i32* %output
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @out.str, i64 0, i64 0), i32 %output_val) #0
 
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -97,14 +97,14 @@ define %rctype @consumer(i32* %id, i64 %size) {
 }
 
 define %rctype @Root(i32* %id, i64 %size) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rctype (i32*, i64)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0, i1 1)
-  %edge2 = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 1, i32 1, i1 1)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0, i1 1)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 1, i32 1, i1 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0, i1 1)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 1, i32 1, i1 1)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rctype (i32*, i64)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0, i1 1)
+  %edge2 = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 1, i32 1, i1 1)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0, i1 1)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 1, i32 1, i1 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0, i1 1)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 1, i32 1, i1 1)
   ret %rctype zeroinitializer
 }