diff --git a/.gitignore b/.gitignore
index 09b3395deae54cb0cd7145f2d75374090aa7bdb0..fdaa75feb6fbffb716f6f0f874b635e79c42ac71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,5 +33,5 @@ hpvm/install/
 hpvm/llvm/
 hpvm/llvm-*.src.tar.xz
 hpvm/llvm-*.src/
-hpvm/projects/visc-rt/visc-rt.ll
+hpvm/projects/hpvm-rt/hpvm-rt.ll
 hpvm/test/**/build/
diff --git a/hpvm/docs/hpvm-c.md b/hpvm/docs/hpvm-c.md
index 25990c22304a35fe420e83f66b634c4db4489a4c..8644fc7eda7791fb0d1e03b4ae95fb7c2866decf 100644
--- a/hpvm/docs/hpvm-c.md
+++ b/hpvm/docs/hpvm-c.md
@@ -2,110 +2,110 @@
 
 ## Host API
 
-```void __visc__init()```  
+```void __hpvm__init()```  
 Used before all other HPVM calls to initialize the HPVM runtime.
 
-```void __visc__cleanup()```  
+```void __hpvm__cleanup()```  
 Used at the end of HPVM program to clean up all remaining runtime-created HPVM objects.
 
-```void __visc__cleanup()```  
+```void __hpvm__cleanup()```  
 Used at the end of HPVM program to clean up all remaining runtime-created HPVM objects.
 
-```void llvm_visc_track_mem(void* ptr, size_t sz)```  
+```void llvm_hpvm_track_mem(void* ptr, size_t sz)```  
 Insert memory starting at ```ptr``` of size ```sz``` in the memory tracker of HPVM runtime.
 
-```void llvm_visc_untrack_mem(void* ptr)```  
+```void llvm_hpvm_untrack_mem(void* ptr)```  
 Stop tracking the memory object identified by ```ptr```.
 
-```void llvm_visc_request_mem(void* ptr, size_t sz)```  
+```void llvm_hpvm_request_mem(void* ptr, size_t sz)```  
 If the memory object identified by ```ptr``` is not in host memory, copy it to host memory.
 
-```void* __visc__launch(unsigned isStream, void* rootGraph, void* args)```  
+```void* __hpvm__launch(unsigned isStream, void* rootGraph, void* args)```  
 Launches the execution of the dataflow graph with node function ```rootGraph```. ```args``` is a pointer to a packed struct, containing one field per argument of the RootGraph function, consecutively. For non-streaming DFGs with a non empty result type, ```args``` must contain an additional field of the type ```RootGraph.returnTy```, where the result of the graph will be returned. ```isStream``` chooses between a non streaming (0) or streaming (1) graph execution. Returns a handle to the executing graph.
 
-```void __visc__wait(void* G)```  
+```void __hpvm__wait(void* G)```  
 Waits for completion of execution of the dataflow graph with handle ```G```.
 
-```void __visc__push(void* G, void* args)```  
+```void __hpvm__push(void* G, void* args)```  
 Push set of input data items, ```args```, (same as type included in launch) to streaming DFG with handle ```G```.
 
-```void* __visc__pop(void* G)```  
+```void* __hpvm__pop(void* G)```  
 Pop and return data produced from one execution of streaming DFG with handle ```G```.
 
 ## Internal Node API
 
-```void* __visc__createNodeND(unsigned dims, void* F, ...)```  
+```void* __hpvm__createNodeND(unsigned dims, void* F, ...)```  
 Creates a static dataflow node replicated in ```dims``` dimensions (0 to 3), each executing node function ```F```. The arguments following ```F``` are the size of each dimension, respectively, passed in as a ```size_t```. Returns a handle to the created dataflow node.
 
-```void* __visc__edge(void* src, void* dst, unsigned replType, unsigned sp, unsigned dp, unsigned stream)```  
+```void* __hpvm__edge(void* src, void* dst, unsigned replType, unsigned sp, unsigned dp, unsigned isStream)```  
 Creates an edge from output ```sp``` of node ```src``` to input ```dp``` of node ```dst```. If ```replType``` is 0, the edge is a one-to-one edge, otherwise it is an all-to-all edge. ```isStream``` defines whether or not the edge is streaming. Returns a handle to the created edge.
 
-```void __visc__bindIn(void* N, unsigned ip, unsigned ic, unsigned isStream)```  
+```void __hpvm__bindIn(void* N, unsigned ip, unsigned ic, unsigned isStream)```  
 Binds the input ```ip``` of the current node to input ```ic``` of child node function ```N```. ```isStream``` defines whether or not the input bind is streaming.
 
-```void __visc__bindOut(void* N, unsigned op, unsigned oc, unsigned isStream)```  
+```void __hpvm__bindOut(void* N, unsigned op, unsigned oc, unsigned isStream)```  
 Binds the output ```op``` of the current node to output ```oc``` of child node function ```N```. ```isStream``` defines whether or not the output bind is streaming.
 
-```void __visc__hint(enum Target target)``` (C\) / ```void __visc__hint(visc::Target target)``` (C++)  
+```void __hpvm__hint(enum Target target)``` (C\) / ```void __hpvm__hint(hpvm::Target target)``` (C++)  
 Must be called once in each node function. Indicates which hardware target the current function should run in
 
-```void __visc__attributes(unsigned ni, â€¦, unsigned no, â€¦)```  
+```void __hpvm__attributes(unsigned ni, ..., unsigned no, ...)```  
 Must be called once at the beginning of each node function. Defines the properties of the pointer arguments to the current function. ```ni``` represents the number of input arguments, and ```no``` the number of output arguments. The arguments following ```ni``` are the input arguments, and the arguments following ```no``` are the output arguments. Arguments can be marked as both input and output. All pointer arguments must be included.
 
 ## Leaf Node API
-```void __visc__hint(enum Target target)``` (C\) / ```void __visc__hint(visc::Target target)``` (C++)  
+```void __hpvm__hint(enum Target target)``` (C\) / ```void __hpvm__hint(hpvm::Target target)``` (C++)  
 As described in internal node API.
 
-```void __visc__attributes(unsigned ni, â€¦, unsigned no, â€¦)```  
+```void __hpvm__attributes(unsigned ni, ..., unsigned no, ...)```  
 As described in internal node API.
 
-```void __visc__return(unsigned n, ...)```  
-Returns ```n``` values from a leaf node function. The remaining arguments are the values to be returned. All ```__visc__return``` statements within the same function must return the same number of values.
+```void __hpvm__return(unsigned n, ...)```  
+Returns ```n``` values from a leaf node function. The remaining arguments are the values to be returned. All ```__hpvm__return``` statements within the same function must return the same number of values.
 
-```void* __visc__getNode()```  
+```void* __hpvm__getNode()```  
 Returns a handle to the current leaf node.
 
-```void* __visc__getParentNode(void* N)```  
+```void* __hpvm__getParentNode(void* N)```  
 Returns a handle to the parent node of node ```N```.
 
-```long __visc__getNodeInstanceID_{x,y,z}(void* N)```  
+```long __hpvm__getNodeInstanceID_{x,y,z}(void* N)```  
 Returns the dynamic ID of the current instance of node ```N``` in the x, y, or z dimension respectively. The dimension must be one of the dimensions in which the node is replicated.
 
-```long __visc__getNumNodeInstances_{x,y,z}(void* N)```  
+```long __hpvm__getNumNodeInstances_{x,y,z}(void* N)```  
 Returns the number of dynamic instances of node ```N``` in the x, y, or z dimension respectively. The dimension must be one of the dimensions in which the node is replicated.
 
-```void __visc__barrier()```  
+```void __hpvm__barrier()```  
 Local synchronization barrier across dynamic instances of current leaf node.
 
-```void* __visc__malloc(long nBytes)```  
+```void* __hpvm__malloc(long nBytes)```  
 Allocate a block of memory of size ```nBytes``` and returns a pointer to it. The allocated object can be shared by all nodes, although the pointer returned must somehow be communicated explicitly for use by other nodes.
 
-```int __visc__atomic_add(int* m, int v)```  
+```int __hpvm__atomic_add(int* m, int v)```  
 Atomically adds ```v``` to the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_sub(int* m, int v)```  
+```int __hpvm__atomic_sub(int* m, int v)```  
 Atomically subtracts ```v``` from the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_xchg(int* m, int v)```  
+```int __hpvm__atomic_xchg(int* m, int v)```  
 Atomically swaps ```v``` with the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_inc(int* m)```  
+```int __hpvm__atomic_inc(int* m)```  
 Atomically increments the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_dec(int* m)```  
+```int __hpvm__atomic_dec(int* m)```  
 Atomically decrements the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_min(int* m, int v)```  
+```int __hpvm__atomic_min(int* m, int v)```  
 Atomically computes the min of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_max(int* m, int v)```  
+```int __hpvm__atomic_max(int* m, int v)```  
 Atomically computes the max of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_and(int* m, int v)```  
+```int __hpvm__atomic_and(int* m, int v)```  
 Atomically computes the bitwise AND of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_or(int* m, int v)```  
+```int __hpvm__atomic_or(int* m, int v)```  
 Atomically computes the bitwise OR of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
 
-```int __visc__atomic_xor(int* m, int v)```  
+```int __hpvm__atomic_xor(int* m, int v)```  
 Atomically computes the bitwise XOR of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
diff --git a/hpvm/docs/hpvm-specification.md b/hpvm/docs/hpvm-specification.md
index bc19c95f9c03af261b915665cae7b1b996e5bb34..c3dece54945d147daf7885050d6a8f1db4eb014b 100644
--- a/hpvm/docs/hpvm-specification.md
+++ b/hpvm/docs/hpvm-specification.md
@@ -101,7 +101,7 @@ Return a handle to the current dataflow node.
 ```i8* llvm.hpvm.getParentNode(i8* N)```  
 Return a handle to the parent in the hierarchy of node ```N```.
 
-```i32 llvm.visc.getNumDims(i8* N)```  
+```i32 llvm.hpvm.getNumDims(i8* N)```  
 Get the number of dimensions of node ```N```.
 
 ```i64 llvm.hpvm.getNodeInstanceID.{x,y,z}(i8* N)```  
diff --git a/hpvm/include/BuildDFG/BuildDFG.h b/hpvm/include/BuildDFG/BuildDFG.h
index 28230e135beb68c07c998e607fa3d03d40a66791..ca4c616da5f4076528b1294992ec8ad3ab768809 100644
--- a/hpvm/include/BuildDFG/BuildDFG.h
+++ b/hpvm/include/BuildDFG/BuildDFG.h
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SupportVISC/DFGraph.h"
+#include "SupportHPVM/DFGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -58,10 +58,10 @@ public:
   // Functions
   virtual bool runOnModule(Module &M);
 
-  static bool isViscLaunchIntrinsic(Instruction *I);
-  static bool isViscGraphIntrinsic(Instruction *I);
-  static bool isViscQueryIntrinsic(Instruction *I);
-  static bool isViscIntrinsic(Instruction *I);
+  static bool isHPVMLaunchIntrinsic(Instruction *I);
+  static bool isHPVMGraphIntrinsic(Instruction *I);
+  static bool isHPVMQueryIntrinsic(Instruction *I);
+  static bool isHPVMIntrinsic(Instruction *I);
   static bool isTypeCongruent(Type *L, Type *R);
 
   // TODO: Maybe make these fields const
diff --git a/hpvm/include/GenVISC/GenVISC.h b/hpvm/include/GenHPVM/GenHPVM.h
similarity index 67%
rename from hpvm/include/GenVISC/GenVISC.h
rename to hpvm/include/GenHPVM/GenHPVM.h
index 1db9929be70fdc4335e23d7e879248f0ebb45c07..24798bc2740e2299f67cc7f515437339f2fe8310 100644
--- a/hpvm/include/GenVISC/GenVISC.h
+++ b/hpvm/include/GenHPVM/GenHPVM.h
@@ -1,4 +1,4 @@
-//== GenVISC.h - Header file for "LLVM IR to VISC IR Pass" =//
+//== GenHPVM.h - Header file for "LLVM IR to HPVM IR Pass" =//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SupportVISC/VISCTimer.h"
+#include "SupportHPVM/HPVMTimer.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
@@ -18,24 +18,24 @@
 
 using namespace llvm;
 
-namespace genvisc {
-// GenVISC - The first implementation.
-struct GenVISC : public ModulePass {
+namespace genhpvm {
+// GenHPVM - The first implementation.
+struct GenHPVM : public ModulePass {
   static char ID; // Pass identification, replacement for typeid
-  GenVISC() : ModulePass(ID) {}
+  GenHPVM() : ModulePass(ID) {}
 
 private:
   // Member variables
   Module *M;
-  FunctionCallee llvm_visc_initializeTimerSet;
-  FunctionCallee llvm_visc_switchToTimer;
-  FunctionCallee llvm_visc_printTimerSet;
+  FunctionCallee llvm_hpvm_initializeTimerSet;
+  FunctionCallee llvm_hpvm_switchToTimer;
+  FunctionCallee llvm_hpvm_printTimerSet;
 
   GlobalVariable *TimerSet;
 
   // Functions
   void initializeTimerSet(Instruction *);
-  void switchToTimer(enum visc_TimerID, Instruction *);
+  void switchToTimer(enum hpvm_TimerID, Instruction *);
   void printTimerSet(Instruction *);
   Value *getStringPointer(const Twine &S, Instruction *InsertBefore,
                           const Twine &Name = "");
@@ -45,4 +45,4 @@ public:
   virtual bool runOnModule(Module &M);
 };
 
-} // namespace genvisc
+} // namespace genhpvm
diff --git a/hpvm/include/SupportVISC/DFG2LLVM.h b/hpvm/include/SupportHPVM/DFG2LLVM.h
similarity index 82%
rename from hpvm/include/SupportVISC/DFG2LLVM.h
rename to hpvm/include/SupportHPVM/DFG2LLVM.h
index b9e4cc4158b71ab18fbeadf2e4d094055feb6149..07147c6d909f5352dd886b5f8bc1a2b0ae434ffe 100644
--- a/hpvm/include/SupportVISC/DFG2LLVM.h
+++ b/hpvm/include/SupportHPVM/DFG2LLVM.h
@@ -1,7 +1,7 @@
 #ifndef __DFG2LLVM_H__
 #define __DFG2LLVM_H__
 
-//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===//
+//===---- DFG2LLVM.h - Header file for "HPVM Dataflow Graph to Target" ----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,9 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "BuildDFG/BuildDFG.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCTimer.h"
-#include "SupportVISC/VISCUtils.h"
+#include "SupportHPVM/HPVMHint.h"
+#include "SupportHPVM/HPVMTimer.h"
+#include "SupportHPVM/HPVMUtils.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -26,7 +26,7 @@ using namespace builddfg;
 
 #define TIMER(X)                                                               \
   do {                                                                         \
-    if (VISCTimer) {                                                           \
+    if (HPVMTimer) {                                                           \
       X;                                                                       \
     }                                                                          \
   } while (0)
@@ -37,8 +37,8 @@ using namespace builddfg;
 
 namespace dfg2llvm {
 // Helper Functions
-static inline ConstantInt *getTimerID(Module &, enum visc_TimerID);
-static inline ConstantInt *getTimerID(Module &, enum visc::Target);
+static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID);
+static inline ConstantInt *getTargetID(Module &, enum hpvm::Target);
 
 bool hasAttribute(Function *, unsigned, Attribute::AttrKind);
 
@@ -69,7 +69,7 @@ protected:
   // Member variables
   Module &M;
   BuildDFG &DFG;
-  bool VISCTimer = false;
+  bool HPVMTimer = false;
   std::string TargetName = "None";
 
   // Map from Old function associated with DFNode to new cloned function with
@@ -78,12 +78,12 @@ protected:
   // "Have we visited this function before?")
   DenseMap<DFNode *, Value *> OutputMap;
 
-  // VISC Runtime API
+  // HPVM Runtime API
   std::unique_ptr<Module> runtimeModule;
 
-  FunctionCallee llvm_visc_initializeTimerSet;
-  FunctionCallee llvm_visc_switchToTimer;
-  FunctionCallee llvm_visc_printTimerSet;
+  FunctionCallee llvm_hpvm_initializeTimerSet;
+  FunctionCallee llvm_hpvm_switchToTimer;
+  FunctionCallee llvm_hpvm_printTimerSet;
   GlobalVariable *TimerSet;
   GlobalVariable *GraphIDAddr;
   Instruction *InitCall;
@@ -109,7 +109,7 @@ protected:
 
   // Virtual Functions
   virtual void initializeTimerSet(Instruction *);
-  virtual void switchToTimer(enum visc_TimerID, Instruction *);
+  virtual void switchToTimer(enum hpvm_TimerID, Instruction *);
   virtual void printTimerSet(Instruction *);
 
   virtual ~CodeGenTraversal() {}
@@ -118,9 +118,9 @@ public:
   // Constructor
   CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
 
-  static bool checkPreferredTarget(DFNode *N, visc::Target T);
-  static bool preferredTargetIncludes(DFNode *N, visc::Target T);
-  visc::Target getPreferredTarget(DFNode *N);
+  static bool checkPreferredTarget(DFNode *N, hpvm::Target T);
+  static bool preferredTargetIncludes(DFNode *N, hpvm::Target T);
+  hpvm::Target getPreferredTarget(DFNode *N);
 
   virtual void visit(DFInternalNode *N) {
     // If code has already been generated for this internal node, skip the
@@ -157,25 +157,25 @@ public:
 
 // -------------- CodeGenTraversal Implementation -----------------
 
-bool CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) {
+bool CodeGenTraversal::checkPreferredTarget(DFNode *N, hpvm::Target T) {
   Function *F = N->getFuncPointer();
   Module *M = F->getParent();
   NamedMDNode *HintNode;
   switch (T) {
-  case visc::GPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+  case hpvm::GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu");
     break;
-  case visc::SPIR_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+  case hpvm::SPIR_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_spir");
     break;
-  case visc::CUDNN_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn");
+  case hpvm::CUDNN_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cudnn");
     break;
-  case visc::PROMISE_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_promise");
+  case hpvm::PROMISE_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_promise");
     break;
-  case visc::CPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+  case hpvm::CPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu");
     break;
   default:
     llvm_unreachable("Target Not supported yet!");
@@ -190,37 +190,37 @@ bool CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) {
   return false;
 }
 
-visc::Target CodeGenTraversal::getPreferredTarget(DFNode *N) {
-  return viscUtils::getPreferredTarget(N->getFuncPointer());
+hpvm::Target CodeGenTraversal::getPreferredTarget(DFNode *N) {
+  return hpvmUtils::getPreferredTarget(N->getFuncPointer());
 }
 
-bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, visc::Target T) {
+bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, hpvm::Target T) {
 
   Function *F = N->getFuncPointer();
   Module *M = F->getParent();
   std::vector<NamedMDNode *> HintNode;
   switch (T) {
-  case visc::GPU_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu"));
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
+  case hpvm::GPU_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_gpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu"));
     break;
-  case visc::SPIR_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir"));
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+  case hpvm::SPIR_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_spir"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_spir"));
     break;
-  case visc::CPU_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu"));
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+  case hpvm::CPU_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cpu_spir"));
     break;
-  case visc::CUDNN_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn"));
+  case hpvm::CUDNN_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_cudnn"));
     break;
-  case visc::PROMISE_TARGET:
-    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise"));
+  case hpvm::PROMISE_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("hpvm_hint_promise"));
     break;
-  case visc::CPU_OR_GPU_TARGET:
-  case visc::CPU_OR_SPIR_TARGET:
+  case hpvm::CPU_OR_GPU_TARGET:
+  case hpvm::CPU_OR_SPIR_TARGET:
     assert(false && "Target should be one of CPU/GPU/SPIR\n");
     break;
   default:
@@ -308,11 +308,11 @@ Function *CodeGenTraversal::addArgument(Function *F, Type *Ty,
   Function *newF = Function::Create(FTy, F->getLinkage(),
                                     F->getName() + "_cloned", F->getParent());
   renameNewArgument(newF, name);
-  newF = viscUtils::cloneFunction(F, newF, false);
+  newF = hpvmUtils::cloneFunction(F, newF, false);
 
   // Check if the function is used by a metadata node
   if (F->isUsedByMetadata()) {
-    viscUtils::fixHintMetadata(*F->getParent(), F, newF);
+    hpvmUtils::fixHintMetadata(*F->getParent(), F, newF);
   }
 
   return newF;
@@ -396,32 +396,32 @@ Argument *CodeGenTraversal::getArgumentAt(Function *F, unsigned offset) {
 }
 
 void CodeGenTraversal::initTimerAPI() {
-  DECLARE(llvm_visc_initializeTimerSet);
-  DECLARE(llvm_visc_switchToTimer);
-  DECLARE(llvm_visc_printTimerSet);
+  DECLARE(llvm_hpvm_initializeTimerSet);
+  DECLARE(llvm_hpvm_switchToTimer);
+  DECLARE(llvm_hpvm_printTimerSet);
 }
 
 // Timer Routines
 // Initialize the timer set
 void CodeGenTraversal::initializeTimerSet(Instruction *InsertBefore) {
-  // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet <<
+  // DEBUG(errs() << "Inserting call to: " << *llvm_hpvm_initializeTimerSet <<
   // "\n");
   TIMER(TimerSet = new GlobalVariable(
             M, Type::getInt8PtrTy(M.getContext()), false,
             GlobalValue::CommonLinkage,
             Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-            Twine("viscTimerSet_") + TargetName);
+            Twine("hpvmTimerSet_") + TargetName);
         DEBUG(errs() << "New global variable: " << *TimerSet << "\n");
 
-        Value *TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
+        Value *TimerSetAddr = CallInst::Create(llvm_hpvm_initializeTimerSet,
                                                None, "", InsertBefore);
         new StoreInst(TimerSetAddr, TimerSet, InsertBefore););
 }
 
-void CodeGenTraversal::switchToTimer(enum visc_TimerID timer,
+void CodeGenTraversal::switchToTimer(enum hpvm_TimerID timer,
                                      Instruction *InsertBefore) {
   Value *switchArgs[] = {TimerSet, getTimerID(M, timer)};
-  TIMER(CallInst::Create(llvm_visc_switchToTimer,
+  TIMER(CallInst::Create(llvm_hpvm_switchToTimer,
                          ArrayRef<Value *>(switchArgs, 2), "", InsertBefore));
 }
 
@@ -430,16 +430,16 @@ void CodeGenTraversal::printTimerSet(Instruction *InsertBefore) {
   TIMER(TimerName =
             getStringPointer(TargetName + Twine("_Timer"), InsertBefore));
   Value *printArgs[] = {TimerSet, TimerName};
-  TIMER(CallInst::Create(llvm_visc_printTimerSet,
+  TIMER(CallInst::Create(llvm_hpvm_printTimerSet,
                          ArrayRef<Value *>(printArgs, 2), "", InsertBefore));
 }
 
 // Implementation of Helper Functions
-static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) {
+static inline ConstantInt *getTimerID(Module &M, enum hpvm_TimerID timer) {
   return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
 }
 
-static inline ConstantInt *getTargetID(Module &M, enum visc::Target T) {
+static inline ConstantInt *getTargetID(Module &M, enum hpvm::Target T) {
   return ConstantInt::get(Type::getInt32Ty(M.getContext()), T);
 }
 
diff --git a/hpvm/include/SupportVISC/DFGTreeTraversal.h b/hpvm/include/SupportHPVM/DFGTreeTraversal.h
similarity index 100%
rename from hpvm/include/SupportVISC/DFGTreeTraversal.h
rename to hpvm/include/SupportHPVM/DFGTreeTraversal.h
diff --git a/hpvm/include/SupportVISC/DFGraph.h b/hpvm/include/SupportHPVM/DFGraph.h
similarity index 94%
rename from hpvm/include/SupportVISC/DFGraph.h
rename to hpvm/include/SupportHPVM/DFGraph.h
index 0c224a344c4ec342f52f4816280e101518ba43dd..d904e2401d7e9a58a38e9bca024de1a437cd56d1 100644
--- a/hpvm/include/SupportVISC/DFGraph.h
+++ b/hpvm/include/SupportHPVM/DFGraph.h
@@ -20,8 +20,8 @@
 #ifndef LLVM_IR_DFGRAPH_H
 #define LLVM_IR_DFGRAPH_H
 
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
+#include "SupportHPVM/HPVMHint.h"
+#include "SupportHPVM/HPVMUtils.h"
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -158,7 +158,7 @@ public:
   }
 };
 
-// DFNode represents a single VISC Dataflow Node in LLVM.
+// DFNode represents a single HPVM Dataflow Node in LLVM.
 //
 // A Dataflow Node basically consists of
 // 1. Pointer to a function describing this dataflow node
@@ -210,8 +210,8 @@ private:
                                   ///< hierarchy
   unsigned Rank;                  ///< Ordering based on toplogical sort
   const DFNodeKind Kind;          ///< Kind of Node Internal/Leaf
-  visc::Target Tag;               ///< Code Generated for which backend
-  visc::Target Hint;              ///< To store preferred backend
+  hpvm::Target Tag;               ///< Code Generated for which backend
+  hpvm::Target Hint;              ///< To store preferred backend
 
 public:
   virtual ~DFNode() {
@@ -287,13 +287,13 @@ public:
 
   DFNodeKind getKind() const { return Kind; }
 
-  DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
+  DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint,
          DFInternalNode *_Parent, unsigned _NumOfDim,
          std::vector<Value *> _DimLimits, DFNodeKind _K);
 
   bool isRoot() const {
     // It is a root node is it was created from a launch intrinsic
-    if (II->getCalledFunction()->getName().equals("llvm.visc.launch")) {
+    if (II->getCalledFunction()->getName().equals("llvm.hpvm.launch")) {
       assert(Level == 0 && "Root node's level is zero.");
       return true;
     }
@@ -326,9 +326,9 @@ public:
 
   unsigned getRank() const { return Rank; }
 
-  void setTag(visc::Target T) { Tag = T; }
+  void setTag(hpvm::Target T) { Tag = T; }
 
-  visc::Target getTag() const { return Tag; }
+  hpvm::Target getTag() const { return Tag; }
 
   void *getProperty(PropertyKind PType) {
     assert(PropertyList.count(PType) == 1 &&
@@ -342,24 +342,24 @@ public:
     PropertyList[PType] = PValue;
   }
 
-  void setGenFunc(Function *F, visc::Target T) {
+  void setGenFunc(Function *F, hpvm::Target T) {
     GenFunc = F;
     Tag = T;
   }
 
   Function *getGenFunc() const { return GenFunc; }
 
-  void setHasX86FuncForTarget(visc::Target T, bool isX86Func) {
+  void setHasX86FuncForTarget(hpvm::Target T, bool isX86Func) {
     switch (T) {
-    case visc::None:
+    case hpvm::None:
       return; // Do nothing.
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       GenFuncInfo.cpu_hasX86Func = isX86Func;
       break;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       GenFuncInfo.gpu_hasX86Func = isX86Func;
       break;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       break;
     default:
       assert(false && "Unknown target\n");
@@ -368,15 +368,15 @@ public:
     return;
   }
 
-  bool hasX86GenFuncForTarget(visc::Target T) const {
+  bool hasX86GenFuncForTarget(hpvm::Target T) const {
     switch (T) {
-    case visc::None:
+    case hpvm::None:
       return false;
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       return GenFuncInfo.cpu_hasX86Func;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       return GenFuncInfo.gpu_hasX86Func;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n");
     default:
       assert(false && "Unknown target\n");
@@ -384,10 +384,10 @@ public:
     return false;
   }
 
-  void addGenFunc(Function *F, visc::Target T, bool isX86Func) {
+  void addGenFunc(Function *F, hpvm::Target T, bool isX86Func) {
 
     switch (T) {
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       if (GenFuncs.CPUGenFunc != NULL) {
         DEBUG(errs() << "Warning: Second generated CPU function for node "
                      << FuncPointer->getName() << "\n");
@@ -395,7 +395,7 @@ public:
       GenFuncs.CPUGenFunc = F;
       GenFuncInfo.cpu_hasX86Func = isX86Func;
       break;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       if (GenFuncs.GPUGenFunc != NULL) {
         DEBUG(errs() << "Warning: Second generated GPU function for node "
                      << FuncPointer->getName() << "\n");
@@ -403,25 +403,25 @@ public:
       GenFuncs.GPUGenFunc = F;
       GenFuncInfo.gpu_hasX86Func = isX86Func;
       break;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       assert(false && "A node function should be set with a tag specifying its \
                 type, not the node hint itself\n");
     default:
       assert(false && "Unknown target for generated function\n");
     }
 
-    Tag = viscUtils::getUpdatedTag(Tag, T);
+    Tag = hpvmUtils::getUpdatedTag(Tag, T);
   }
 
-  Function *getGenFuncForTarget(visc::Target T) const {
+  Function *getGenFuncForTarget(hpvm::Target T) const {
     switch (T) {
-    case visc::None:
+    case hpvm::None:
       return NULL;
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       return GenFuncs.CPUGenFunc;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       return GenFuncs.GPUGenFunc;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       assert(false &&
              "Requesting genarated node function with dual tag instead of \
                 CPU/GPU/SPIR/CUDNN/PROMISE\n");
@@ -431,19 +431,19 @@ public:
     return NULL;
   }
 
-  void removeGenFuncForTarget(visc::Target T) {
+  void removeGenFuncForTarget(hpvm::Target T) {
     switch (T) {
-    case visc::None:
+    case hpvm::None:
       return;
-    case visc::CPU_TARGET:
+    case hpvm::CPU_TARGET:
       GenFuncs.CPUGenFunc = NULL;
       GenFuncInfo.cpu_hasX86Func = false;
       break;
-    case visc::GPU_TARGET:
+    case hpvm::GPU_TARGET:
       GenFuncs.GPUGenFunc = NULL;
       GenFuncInfo.gpu_hasX86Func = false;
       break;
-    case visc::CPU_OR_GPU_TARGET:
+    case hpvm::CPU_OR_GPU_TARGET:
       assert(false &&
              "Removing genarated node function with dual tag instead of \
                 CPU/GPU/SPIR/CUDNN/PROMISE\n");
@@ -453,9 +453,9 @@ public:
     return;
   }
 
-  void setTargetHint(visc::Target T) { Hint = T; }
+  void setTargetHint(hpvm::Target T) { Hint = T; }
 
-  visc::Target getTargetHint() const { return Hint; }
+  hpvm::Target getTargetHint() const { return Hint; }
 
   bool isDummyNode() const { return isEntryNode() || isExitNode(); }
 
@@ -496,7 +496,7 @@ private:
   DFGraph *childGraph; ///< Pointer to dataflow graph
 
   // Constructor
-  DFInternalNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+  DFInternalNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
                  DFInternalNode *Parent, int NumOfDim,
                  std::vector<Value *> DimLimits)
       : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits,
@@ -508,7 +508,7 @@ private:
 public:
   static DFInternalNode *
   Create(IntrinsicInst *II, Function *FuncPointer,
-         visc::Target Hint = visc::CPU_TARGET, DFInternalNode *Parent = NULL,
+         hpvm::Target Hint = hpvm::CPU_TARGET, DFInternalNode *Parent = NULL,
          int NumOfDim = 0,
          std::vector<Value *> DimLimits = std::vector<Value *>()) {
 
@@ -539,14 +539,14 @@ class DFLeafNode : public DFNode {
 
 private:
   // Constructor
-  DFLeafNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+  DFLeafNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
              DFInternalNode *Parent, int NumOfDim = 0,
              std::vector<Value *> DimLimits = std::vector<Value *>())
       : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode) {}
 
 public:
   static DFLeafNode *
-  Create(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+  Create(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint,
          DFInternalNode *Parent, int NumOfDim = 0,
          std::vector<Value *> DimLimits = std::vector<Value *>()) {
     return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits);
@@ -558,7 +558,7 @@ public:
   //  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
 };
 
-// DFEdge represents a single VISC Dataflow Edge in LLVM.
+// DFEdge represents a single HPVM Dataflow Edge in LLVM.
 //
 // A Dataflow Edge basically consists of
 // 1. Pointer to the dataflow node that is the source of this edge
@@ -634,8 +634,8 @@ DFGraph::DFGraph(DFInternalNode *P) {
   Parent = P;
   // Create dummy entry and exit nodes and add them to the graph
   Entry =
-      DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent);
-  Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent);
+      DFLeafNode::Create(NULL, Parent->getFuncPointer(), hpvm::None, Parent);
+  Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), hpvm::None, Parent);
   addChildDFNode(Entry);
   addChildDFNode(Exit);
 }
@@ -655,7 +655,7 @@ bool DFGraph::isStreaming() {
 }
 
 //===--------------------- DFNode Outlined Functions --------------===//
-DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
+DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint,
                DFInternalNode *_Parent, unsigned _NumOfDim,
                std::vector<Value *> _DimLimits, DFNodeKind _K)
     : II(_II), FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim),
@@ -663,7 +663,7 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
 
   Type *Ty = FuncPointer->getFunctionType()->getReturnType();
 
-  // Allow the return type to be void too, in the hVISC IR. If return type is
+  // Allow the return type to be void too, in the HPVM IR. If return type is
   // void, create an empty struct type and keep that as the return type of the
   // node.
   if (Ty->isVoidTy())
@@ -683,7 +683,7 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
   Level = (_Parent) ? _Parent->getLevel() + 1 : 0;
   Rank = 0;
 
-  Tag = visc::None;
+  Tag = hpvm::None;
   GenFuncs.CPUGenFunc = NULL;
   GenFuncs.GPUGenFunc = NULL;
   GenFuncs.SPIRGenFunc = NULL;
diff --git a/hpvm/include/SupportVISC/VISCHint.h b/hpvm/include/SupportHPVM/HPVMHint.h
similarity index 78%
rename from hpvm/include/SupportVISC/VISCHint.h
rename to hpvm/include/SupportHPVM/HPVMHint.h
index 99266b071843ab0417ea73c6e4533dfa381d52cd..1ef4c6eb3b986328080caa9e99e96f444978c03e 100644
--- a/hpvm/include/SupportVISC/VISCHint.h
+++ b/hpvm/include/SupportHPVM/HPVMHint.h
@@ -1,4 +1,4 @@
-//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===//
+//===------------ HPVMHint.h - Header file for "HPVM Hint API" ------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef VISC_HINT_HEADER
-#define VISC_HINT_HEADER
+#ifndef HPVM_HINT_HEADER
+#define HPVM_HINT_HEADER
 
 /************************** Hint Routines ***************************/
 #ifdef __cplusplus
-namespace visc {
+namespace hpvm {
 #endif
 
 enum Target {
@@ -32,4 +32,4 @@ enum Target {
 }
 #endif
 
-#endif // VISC_HINT_HEADER
+#endif // HPVM_HINT_HEADER
diff --git a/hpvm/include/SupportHPVM/HPVMTimer.h b/hpvm/include/SupportHPVM/HPVMTimer.h
new file mode 100644
index 0000000000000000000000000000000000000000..05b24d41d6d50c61cd38b458676dbf79d28a917f
--- /dev/null
+++ b/hpvm/include/SupportHPVM/HPVMTimer.h
@@ -0,0 +1,151 @@
+//===------------ HPVMTimer.h - Header file for "HPVM Timer API" ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HPVM_TIMER_HEADER
+#define HPVM_TIMER_HEADER
+
+/************************** Timer Routines ***************************/
+extern "C" {
+
+/* A time or duration. */
+//#if _POSIX_VERSION >= 200112L
+typedef unsigned long long hpvm_Timestamp; /* time in microseconds */
+//#else
+//# error "Timestamps not implemented"
+//#endif
+
+enum hpvm_TimerState {
+  hpvm_Timer_STOPPED,
+  hpvm_Timer_RUNNING,
+};
+
+struct hpvm_Timer {
+  enum hpvm_TimerState state;
+  hpvm_Timestamp elapsed; /* Amount of time elapsed so far */
+  hpvm_Timestamp init;    /* Beginning of the current time interval,
+                           * if state is RUNNING.  End of the last
+                           * recorded time interval otherwise.  */
+};
+
+/* Reset a timer.
+ * Use this to initialize a timer or to clear
+ * its elapsed time.  The reset timer is stopped.
+ */
+void hpvm_ResetTimer(struct hpvm_Timer *timer);
+
+/* Start a timer.  The timer is set to RUNNING mode and
+ * time elapsed while the timer is running is added to
+ * the timer.
+ * The timer should not already be running.
+ */
+void hpvm_StartTimer(struct hpvm_Timer *timer);
+
+/* Stop a timer.
+ * This stops adding elapsed time to the timer.
+ * The timer should not already be stopped.
+ */
+void hpvm_StopTimer(struct hpvm_Timer *timer);
+
+/* Get the elapsed time in seconds. */
+double hpvm_GetElapsedTime(struct hpvm_Timer *timer);
+
+/* Execution time is assigned to one of these categories. */
+enum hpvm_TimerID {
+  hpvm_TimerID_NONE = 0,
+  hpvm_TimerID_IO,         /* Time spent in input/output */
+  hpvm_TimerID_KERNEL,     /* Time spent computing on the device,
+                            * recorded asynchronously */
+  hpvm_TimerID_COPY,       /* Time spent synchronously moving data
+                            * to/from device and allocating/freeing
+                            * memory on the device */
+  hpvm_TimerID_DRIVER,     /* Time spent in the host interacting with the
+                            * driver, primarily for recording the time
+                            * spent queueing asynchronous operations */
+  hpvm_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
+  hpvm_TimerID_COMPUTE,    /* Time for all program execution other
+                            * than parsing command line arguments,
+                            * I/O, kernel, and copy */
+  hpvm_TimerID_OVERLAP,    /* Time double-counted in asynchronous and
+                            * host activity: automatically filled in,
+                            * not intended for direct usage */
+  // GPU FUNCTION
+  hpvm_TimerID_INIT_CTX,
+  hpvm_TimerID_CLEAR_CTX,
+  hpvm_TimerID_COPY_SCALAR,
+  hpvm_TimerID_COPY_PTR,
+  hpvm_TimerID_MEM_FREE,
+  hpvm_TimerID_READ_OUTPUT,
+  hpvm_TimerID_SETUP,
+  hpvm_TimerID_MEM_TRACK,
+  hpvm_TimerID_MEM_UNTRACK,
+  hpvm_TimerID_MISC,
+  // LAUNCH FUNCTION
+  hpvm_TimerID_PTHREAD_CREATE,
+  hpvm_TimerID_ARG_PACK,
+  hpvm_TimerID_ARG_UNPACK,
+  hpvm_TimerID_COMPUTATION,
+  hpvm_TimerID_OUTPUT_PACK,
+  hpvm_TimerID_OUTPUT_UNPACK,
+
+  hpvm_TimerID_LAST /* Number of timer IDs */
+};
+
+/* Dynamic list of asynchronously tracked times between events */
+struct hpvm_async_time_marker_list {
+  char *label;               // actually just a pointer to a string
+  enum hpvm_TimerID timerID; /* The ID to which the interval beginning
+                              * with this marker should be attributed */
+  void *marker;
+  // cudaEvent_t marker; 		/* The driver event for this marker */
+  struct hpvm_async_time_marker_list *next;
+};
+
+struct hpvm_SubTimer {
+  char *label;
+  struct hpvm_Timer timer;
+  struct hpvm_SubTimer *next;
+};
+
+struct hpvm_SubTimerList {
+  struct hpvm_SubTimer *current;
+  struct hpvm_SubTimer *subtimer_list;
+};
+
+/* A set of timers for recording execution times. */
+struct hpvm_TimerSet {
+  enum hpvm_TimerID current;
+  struct hpvm_async_time_marker_list *async_markers;
+  hpvm_Timestamp async_begin;
+  hpvm_Timestamp wall_begin;
+  struct hpvm_Timer timers[hpvm_TimerID_LAST];
+  struct hpvm_SubTimerList *sub_timer_list[hpvm_TimerID_LAST];
+};
+
+/* Reset all timers in the set. */
+void hpvm_InitializeTimerSet(struct hpvm_TimerSet *timers);
+
+void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label,
+                      enum hpvm_TimerID hpvm_Category);
+
+/* Select which timer the next interval of time should be accounted
+ * to. The selected timer is started and other timers are stopped.
+ * Using hpvm_TimerID_NONE stops all timers. */
+inline void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers,
+                               enum hpvm_TimerID timer);
+
+void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label,
+                           enum hpvm_TimerID category);
+
+/* Print timer values to standard output. */
+void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers);
+
+/* Release timer resources */
+void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers);
+}
+#endif // HPVM_TIMER_HEADER
diff --git a/hpvm/include/SupportVISC/VISCUtils.h b/hpvm/include/SupportHPVM/HPVMUtils.h
similarity index 84%
rename from hpvm/include/SupportVISC/VISCUtils.h
rename to hpvm/include/SupportHPVM/HPVMUtils.h
index 0efd20b5b5eb57943de1feb6d2afa886c6c48a5c..25b9880180f2cb4590f5b5fcbb3f3f2fbe025f8f 100644
--- a/hpvm/include/SupportVISC/VISCUtils.h
+++ b/hpvm/include/SupportHPVM/HPVMUtils.h
@@ -1,5 +1,5 @@
 //
-//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===//
+//===---- HPVMUtils.h - Header file for "HPVM Utility Functions" ----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,12 +8,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef VISC_UTILS_HEADER
-#define VISC_UTILS_HEADER
+#ifndef HPVM_UTILS_HEADER
+#define HPVM_UTILS_HEADER
 
 #include <assert.h>
 
-#include "SupportVISC/VISCHint.h"
+#include "SupportHPVM/HPVMHint.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
@@ -29,31 +29,31 @@
 
 using namespace llvm;
 
-namespace viscUtils {
+namespace hpvmUtils {
 // Helper Functions
 
-static bool isViscCreateNodeIntrinsic(Instruction *I) {
+static bool isHPVMCreateNodeIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
   return (II->getCalledFunction()->getName())
-      .startswith("llvm.visc.createNode");
+      .startswith("llvm.hpvm.createNode");
 }
 
-static bool isViscCreateNodeCall(Instruction *I) {
+static bool isHPVMCreateNodeCall(Instruction *I) {
   if (!isa<CallInst>(I))
     return false;
   CallInst *CI = cast<CallInst>(I);
   return (CI->getCalledValue()->stripPointerCasts()->getName())
-      .startswith("__visc__createNode");
+      .startswith("__hpvm__createNode");
 }
 
-static bool isViscLaunchCall(Instruction *I) {
+static bool isHPVMLaunchCall(Instruction *I) {
   if (!isa<CallInst>(I))
     return false;
   CallInst *CI = cast<CallInst>(I);
   return (CI->getCalledValue()->stripPointerCasts()->getName())
-      .startswith("__visc__launch");
+      .startswith("__hpvm__launch");
 }
 // Creates a new createNode intrinsic, similar to II but with different
 // associated function F instead
@@ -69,22 +69,22 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F,
 
   ArrayRef<Value *> CreateNodeArgs;
   switch (II->getIntrinsicID()) {
-  case Intrinsic::visc_createNode: {
+  case Intrinsic::hpvm_createNode: {
     CreateNodeArgs = ArrayRef<Value *>(Fp);
     break;
   }
-  case Intrinsic::visc_createNode1D: {
+  case Intrinsic::hpvm_createNode1D: {
     Value *CreateNode1DArgs[] = {Fp, II->getArgOperand(1)};
     CreateNodeArgs = ArrayRef<Value *>(CreateNode1DArgs, 2);
     break;
   }
-  case Intrinsic::visc_createNode2D: {
+  case Intrinsic::hpvm_createNode2D: {
     Value *CreateNode2DArgs[] = {Fp, II->getArgOperand(1),
                                  II->getArgOperand(2)};
     CreateNodeArgs = ArrayRef<Value *>(CreateNode2DArgs, 3);
     break;
   }
-  case Intrinsic::visc_createNode3D: {
+  case Intrinsic::hpvm_createNode3D: {
     Value *CreateNode3DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2),
                                  II->getArgOperand(3)};
     CreateNodeArgs = ArrayRef<Value *>(CreateNode3DArgs, 4);
@@ -101,7 +101,7 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F,
   return CreateNodeII;
 }
 
-// Fix VISC hints for this function
+// Fix HPVM hints for this function
 void fixHintMetadata(Module &M, Function *F, Function *G) {
   Metadata *MD_F = ValueAsMetadata::getIfExists(F);
   MDTuple *MDT_F =
@@ -119,9 +119,9 @@ void fixHintMetadata(Module &M, Function *F, Function *G) {
     }
   };
 
-  FixHint("visc_hint_gpu");
-  FixHint("visc_hint_cpu");
-  FixHint("visc_hint_cpu_gpu");
+  FixHint("hpvm_hint_gpu");
+  FixHint("hpvm_hint_cpu");
+  FixHint("hpvm_hint_cpu_gpu");
 }
 
 // Assuming that the changed function is a node function, it is only used as a
@@ -138,7 +138,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
          ++i) {
       Instruction *I = &*i; // Grab pointer to Instruction
 
-      if (isViscCreateNodeIntrinsic(I)) {
+      if (isHPVMCreateNodeIntrinsic(I)) {
         IntrinsicInst *II = cast<IntrinsicInst>(I);
         // The found createNode is not associated with the changed function
         if (II->getArgOperand(0) != F)
@@ -150,7 +150,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
             createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II);
         II->replaceAllUsesWith(CreateNodeII);
         toBeErased.push_back(II);
-      } else if (isViscCreateNodeCall(I)) {
+      } else if (isHPVMCreateNodeCall(I)) {
         CallInst *CI = cast<CallInst>(I);
         // The found createNode is not associated with the changed function
         if (CI->getArgOperand(1) != F)
@@ -161,7 +161,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
         // Replace use of F with use of G
         CI->setArgOperand(1, G);
         DEBUG(errs() << "Fixed use: " << *CI << "\n");
-      } else if (isViscLaunchCall(I)) {
+      } else if (isHPVMLaunchCall(I)) {
         CallInst *CI = cast<CallInst>(I);
         // The found launch call is not associated with the changed function
         if (CI->getArgOperand(1)->stripPointerCasts() != F)
@@ -370,21 +370,21 @@ Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg,
 //------------------- Helper Functions For Handling Hints -------------------//
 
 // Return true if 1st arg (tag) contains 2nd (target)
-bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
+bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) {
   switch (Tag) {
-  case visc::None:
+  case hpvm::None:
     return false;
-  case visc::CPU_TARGET:
-    if (T == visc::CPU_TARGET)
+  case hpvm::CPU_TARGET:
+    if (T == hpvm::CPU_TARGET)
       return true;
     return false;
-  case visc::GPU_TARGET:
-    if (T == visc::GPU_TARGET)
+  case hpvm::GPU_TARGET:
+    if (T == hpvm::GPU_TARGET)
       return true;
     return false;
-  case visc::CPU_OR_GPU_TARGET:
-    if ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET) ||
-        (T == visc::CPU_OR_GPU_TARGET))
+  case hpvm::CPU_OR_GPU_TARGET:
+    if ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET) ||
+        (T == hpvm::CPU_OR_GPU_TARGET))
       return true;
     return false;
   default:
@@ -392,41 +392,41 @@ bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
   }
 }
 
-bool isSingleTargetTag(visc::Target T) {
-  return ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET));
+bool isSingleTargetTag(hpvm::Target T) {
+  return ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET));
 }
 
 // Add the specified target to the given tag
-visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) {
-  assert(((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)) &&
+hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) {
+  assert(((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET)) &&
          "The target is only allowed to be a single target: CPU, GPU, SPIR, "
          "CUDNN, PROMISE\n");
 
   switch (Tag) {
-  case visc::None:
+  case hpvm::None:
     return T;
-  case visc::CPU_TARGET:
-    if (T == visc::CPU_TARGET)
-      return visc::CPU_TARGET;
-    if (T == visc::GPU_TARGET)
-      return visc::CPU_OR_GPU_TARGET;
+  case hpvm::CPU_TARGET:
+    if (T == hpvm::CPU_TARGET)
+      return hpvm::CPU_TARGET;
+    if (T == hpvm::GPU_TARGET)
+      return hpvm::CPU_OR_GPU_TARGET;
     return T;
-  case visc::GPU_TARGET:
-    if (T == visc::CPU_TARGET)
-      return visc::CPU_OR_GPU_TARGET;
-    if (T == visc::GPU_TARGET)
-      return visc::GPU_TARGET;
+  case hpvm::GPU_TARGET:
+    if (T == hpvm::CPU_TARGET)
+      return hpvm::CPU_OR_GPU_TARGET;
+    if (T == hpvm::GPU_TARGET)
+      return hpvm::GPU_TARGET;
     return T;
-  case visc::CPU_OR_GPU_TARGET:
-    return visc::CPU_OR_GPU_TARGET;
+  case hpvm::CPU_OR_GPU_TARGET:
+    return hpvm::CPU_OR_GPU_TARGET;
   default:
     assert(false && "Unknown Target\n");
   }
   return T;
 }
 
-// This functions add the hint as metadata in visc code
-void addHint(Function *F, visc::Target T) {
+// This function adds the hint as metadata in hpvm code
+void addHint(Function *F, hpvm::Target T) {
   // Get Module
   Module *M = F->getParent();
   DEBUG(errs() << "Set preferred target for " << F->getName() << ": ");
@@ -434,17 +434,17 @@ void addHint(Function *F, visc::Target T) {
   // Based on the hint, get the hint metadata
   NamedMDNode *HintNode;
   switch (T) {
-  case visc::GPU_TARGET:
+  case hpvm::GPU_TARGET:
     DEBUG(errs() << "GPU Target\n");
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu");
     break;
-  case visc::CPU_TARGET:
+  case hpvm::CPU_TARGET:
     DEBUG(errs() << "CPU Target\n");
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu");
     break;
-  case visc::CPU_OR_GPU_TARGET:
+  case hpvm::CPU_OR_GPU_TARGET:
     DEBUG(errs() << "CPU or GPU Target\n");
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu");
     break;
   default:
     llvm_unreachable("Unsupported Target Hint!");
@@ -457,8 +457,8 @@ void addHint(Function *F, visc::Target T) {
   HintNode->addOperand(N);
 }
 
-// This function removes the hint as metadata in visc code
-void removeHint(Function *F, visc::Target T) {
+// This function removes the hint as metadata in hpvm code
+void removeHint(Function *F, hpvm::Target T) {
   // Get Module
   Module *M = F->getParent();
   DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T
@@ -467,14 +467,14 @@ void removeHint(Function *F, visc::Target T) {
   // Based on the hint, get the hint metadata
   NamedMDNode *HintNode;
   switch (T) {
-  case visc::GPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+  case hpvm::GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_gpu");
     break;
-  case visc::CPU_OR_GPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
+  case hpvm::CPU_OR_GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu_gpu");
     break;
-  case visc::CPU_TARGET:
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+  case hpvm::CPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("hpvm_hint_cpu");
     break;
   default:
     llvm_unreachable("Unsupported Target Hint!");
@@ -501,7 +501,7 @@ void removeHint(Function *F, visc::Target T) {
   }
 }
 
-visc::Target getPreferredTarget(Function *F) {
+hpvm::Target getPreferredTarget(Function *F) {
   DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
   Module *M = F->getParent();
 
@@ -517,16 +517,16 @@ visc::Target getPreferredTarget(Function *F) {
     return false;
   };
 
-  if (FoundPrefTarget("visc_hint_cpu"))
-    return visc::CPU_TARGET;
-  if (FoundPrefTarget("visc_hint_gpu"))
-    return visc::GPU_TARGET;
-  if (FoundPrefTarget("visc_hint_cpu_gpu"))
-    return visc::CPU_OR_GPU_TARGET;
+  if (FoundPrefTarget("hpvm_hint_cpu"))
+    return hpvm::CPU_TARGET;
+  if (FoundPrefTarget("hpvm_hint_gpu"))
+    return hpvm::GPU_TARGET;
+  if (FoundPrefTarget("hpvm_hint_cpu_gpu"))
+    return hpvm::CPU_OR_GPU_TARGET;
 
-  return visc::None;
+  return hpvm::None;
 }
 
-} // namespace viscUtils
+} // namespace hpvmUtils
 
-#endif // VISC_UTILS_HEADER
+#endif // HPVM_UTILS_HEADER
diff --git a/hpvm/include/SupportVISC/VISCTimer.h b/hpvm/include/SupportVISC/VISCTimer.h
deleted file mode 100644
index ce3dc8a5e0f7c77ff06fec5857f223ca4f0e142f..0000000000000000000000000000000000000000
--- a/hpvm/include/SupportVISC/VISCTimer.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef VISC_TIMER_HEADER
-#define VISC_TIMER_HEADER
-
-/************************** Timer Routines ***************************/
-extern "C" {
-
-/* A time or duration. */
-//#if _POSIX_VERSION >= 200112L
-typedef unsigned long long visc_Timestamp; /* time in microseconds */
-//#else
-//# error "Timestamps not implemented"
-//#endif
-
-enum visc_TimerState {
-  visc_Timer_STOPPED,
-  visc_Timer_RUNNING,
-};
-
-struct visc_Timer {
-  enum visc_TimerState state;
-  visc_Timestamp elapsed; /* Amount of time elapsed so far */
-  visc_Timestamp init;    /* Beginning of the current time interval,
-                           * if state is RUNNING.  End of the last
-                           * recorded time interfal otherwise.  */
-};
-
-/* Reset a timer.
- * Use this to initialize a timer or to clear
- * its elapsed time.  The reset timer is stopped.
- */
-void visc_ResetTimer(struct visc_Timer *timer);
-
-/* Start a timer.  The timer is set to RUNNING mode and
- * time elapsed while the timer is running is added to
- * the timer.
- * The timer should not already be running.
- */
-void visc_StartTimer(struct visc_Timer *timer);
-
-/* Stop a timer.
- * This stops adding elapsed time to the timer.
- * The timer should not already be stopped.
- */
-void visc_StopTimer(struct visc_Timer *timer);
-
-/* Get the elapsed time in seconds. */
-double visc_GetElapsedTime(struct visc_Timer *timer);
-
-/* Execution time is assigned to one of these categories. */
-enum visc_TimerID {
-  visc_TimerID_NONE = 0,
-  visc_TimerID_IO,         /* Time spent in input/output */
-  visc_TimerID_KERNEL,     /* Time spent computing on the device,
-                            * recorded asynchronously */
-  visc_TimerID_COPY,       /* Time spent synchronously moving data
-                            * to/from device and allocating/freeing
-                            * memory on the device */
-  visc_TimerID_DRIVER,     /* Time spent in the host interacting with the
-                            * driver, primarily for recording the time
-                            * spent queueing asynchronous operations */
-  visc_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
-  visc_TimerID_COMPUTE,    /* Time for all program execution other
-                            * than parsing command line arguments,
-                            * I/O, kernel, and copy */
-  visc_TimerID_OVERLAP,    /* Time double-counted in asynchronous and
-                            * host activity: automatically filled in,
-                            * not intended for direct usage */
-  // GPU FUNCTION
-  visc_TimerID_INIT_CTX,
-  visc_TimerID_CLEAR_CTX,
-  visc_TimerID_COPY_SCALAR,
-  visc_TimerID_COPY_PTR,
-  visc_TimerID_MEM_FREE,
-  visc_TimerID_READ_OUTPUT,
-  visc_TimerID_SETUP,
-  visc_TimerID_MEM_TRACK,
-  visc_TimerID_MEM_UNTRACK,
-  visc_TimerID_MISC,
-  // LAUNCH FUNCTION
-  visc_TimerID_PTHREAD_CREATE,
-  visc_TimerID_ARG_PACK,
-  visc_TimerID_ARG_UNPACK,
-  visc_TimerID_COMPUTATION,
-  visc_TimerID_OUTPUT_PACK,
-  visc_TimerID_OUTPUT_UNPACK,
-
-  visc_TimerID_LAST /* Number of timer IDs */
-};
-
-/* Dynamic list of asynchronously tracked times between events */
-struct visc_async_time_marker_list {
-  char *label;               // actually just a pointer to a string
-  enum visc_TimerID timerID; /* The ID to which the interval beginning
-                              * with this marker should be attributed */
-  void *marker;
-  // cudaEvent_t marker; 		/* The driver event for this marker */
-  struct visc_async_time_marker_list *next;
-};
-
-struct visc_SubTimer {
-  char *label;
-  struct visc_Timer timer;
-  struct visc_SubTimer *next;
-};
-
-struct visc_SubTimerList {
-  struct visc_SubTimer *current;
-  struct visc_SubTimer *subtimer_list;
-};
-
-/* A set of timers for recording execution times. */
-struct visc_TimerSet {
-  enum visc_TimerID current;
-  struct visc_async_time_marker_list *async_markers;
-  visc_Timestamp async_begin;
-  visc_Timestamp wall_begin;
-  struct visc_Timer timers[visc_TimerID_LAST];
-  struct visc_SubTimerList *sub_timer_list[visc_TimerID_LAST];
-};
-
-/* Reset all timers in the set. */
-void visc_InitializeTimerSet(struct visc_TimerSet *timers);
-
-void visc_AddSubTimer(struct visc_TimerSet *timers, char *label,
-                      enum visc_TimerID visc_Category);
-
-/* Select which timer the next interval of time should be accounted
- * to. The selected timer is started and other timers are stopped.
- * Using visc_TimerID_NONE stops all timers. */
-inline void visc_SwitchToTimer(struct visc_TimerSet *timers,
-                               enum visc_TimerID timer);
-
-void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
-                           enum visc_TimerID category);
-
-/* Print timer values to standard output. */
-void visc_PrintTimerSet(struct visc_TimerSet *timers);
-
-/* Release timer resources */
-void visc_DestroyTimerSet(struct visc_TimerSet *timers);
-}
-#endif // VISC_RT_HEADER
diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index 058419f1dc80a8650e7a3b834090a88099741431..be3e6cae3dae775716fc3e2206879e978febddb0 100644
--- a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -10,8 +10,8 @@
 #define DEBUG_TYPE "buildDFG"
 #include "BuildDFG/BuildDFG.h"
 
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
+#include "SupportHPVM/HPVMHint.h"
+#include "SupportHPVM/HPVMUtils.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/ValueSymbolTable.h"
@@ -35,7 +35,7 @@ bool BuildDFG::runOnModule(Module &M) {
 
     for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
       Instruction *I = &*i; // Grab pointer to Instruction
-      if (isViscLaunchIntrinsic(I)) {
+      if (isHPVMLaunchIntrinsic(I)) {
         DEBUG(errs() << "------------ Found launch site --------------\n");
         II = cast<IntrinsicInst>(I);
 
@@ -43,7 +43,7 @@ bool BuildDFG::runOnModule(Module &M) {
 
         // Intrinsic Instruction has been initialized from this point on.
         Function *F = cast<Function>(II->getOperand(0)->stripPointerCasts());
-        Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F));
+        Root = DFInternalNode::Create(II, F, hpvmUtils::getPreferredTarget(F));
         Roots.push_back(Root);
         BuildGraph(Root, F);
 
@@ -118,37 +118,37 @@ void BuildDFG::removeElementFromHandleToDFEdgeMap(Value *V) {
   HandleToDFEdgeMap.erase(V);
 }
 
-// Returns true if instruction I is a visc launch intrinsic, false otherwise
-bool BuildDFG::isViscLaunchIntrinsic(Instruction *I) {
+// Returns true if instruction I is an HPVM launch intrinsic, false otherwise
+bool BuildDFG::isHPVMLaunchIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).equals("llvm.visc.launch");
+  return (II->getCalledFunction()->getName()).equals("llvm.hpvm.launch");
 }
 
-// Returns true if instruction I is a visc graph intrinsic, false otherwise
-bool BuildDFG::isViscGraphIntrinsic(Instruction *I) {
+// Returns true if instruction I is an HPVM graph intrinsic, false otherwise
+bool BuildDFG::isHPVMGraphIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).startswith("llvm.visc.create") ||
-         (II->getCalledFunction()->getName()).startswith("llvm.visc.bind");
+  return (II->getCalledFunction()->getName()).startswith("llvm.hpvm.create") ||
+         (II->getCalledFunction()->getName()).startswith("llvm.hpvm.bind");
 }
 
-// Returns true if instruction I is a visc query intrinsic, false otherwise
-bool BuildDFG::isViscQueryIntrinsic(Instruction *I) {
+// Returns true if instruction I is an HPVM query intrinsic, false otherwise
+bool BuildDFG::isHPVMQueryIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).startswith("llvm.visc.get");
+  return (II->getCalledFunction()->getName()).startswith("llvm.hpvm.get");
 }
 
-// Returns true if instruction I is a visc intrinsic, false otherwise
-bool BuildDFG::isViscIntrinsic(Instruction *I) {
+// Returns true if instruction I is an HPVM intrinsic, false otherwise
+bool BuildDFG::isHPVMIntrinsic(Instruction *I) {
   if (!isa<IntrinsicInst>(I))
     return false;
   IntrinsicInst *II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).startswith("llvm.visc");
+  return (II->getCalledFunction()->getName()).startswith("llvm.hpvm");
 }
 
 // Two types are "congruent" if they are identical, or if they are both
@@ -163,7 +163,7 @@ bool BuildDFG::isTypeCongruent(Type *L, Type *R) {
   return PL->getAddressSpace() == PR->getAddressSpace();
 }
 
-// Handles all the createNodeXX visc intrinsics.
+// Handles all the createNodeXX hpvm intrinsics.
 void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
   bool isInternalNode = false;
 
@@ -173,7 +173,7 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
   // internal node
   for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
     Instruction *I = &*i; // Grab pointer to Instruction
-    if (isViscGraphIntrinsic(I))
+    if (isHPVMGraphIntrinsic(I))
       isInternalNode = true;
   }
 
@@ -196,14 +196,14 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
     // Create Internal DFNode, add it to the map and recursively build its
     // dataflow graph
     DFInternalNode *childDFNode = DFInternalNode::Create(
-        II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+        II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
     BuildGraph(childDFNode, F);
   } else {
     // Create Leaf DFnode and add it to the map.
     DFLeafNode *childDFNode = DFLeafNode::Create(
-        II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+        II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
   }
@@ -336,11 +336,11 @@ void BuildDFG::handleBindOutput(DFInternalNode *N, IntrinsicInst *II) {
 
 void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) {
   DEBUG(errs() << "FUNCTION: " << F->getName() << "\n");
-  // TODO: Place checks for valid visc functions. For example one of the
-  // check can be that any function that contains visc dataflow graph
+  // TODO: Place checks for valid hpvm functions. For example, one of the
+  // checks can be that any function that contains hpvm dataflow graph
   // construction intrinsics should not have other llvm IR statements.
 
-  // Iterate over all the instructions of a function and look for visc
+  // Iterate over all the instructions of a function and look for hpvm
   // intrinsics.
   for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
     Instruction *I = &*i; // Grab pointer to Instruction
@@ -349,25 +349,25 @@ void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) {
       DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": "
                    << II->getCalledFunction()->getName() << "\n");
       switch (II->getIntrinsicID()) {
-      case Intrinsic::visc_createNode:
-      case Intrinsic::visc_createNode1D:
-      case Intrinsic::visc_createNode2D:
-      case Intrinsic::visc_createNode3D:
+      case Intrinsic::hpvm_createNode:
+      case Intrinsic::hpvm_createNode1D:
+      case Intrinsic::hpvm_createNode2D:
+      case Intrinsic::hpvm_createNode3D:
         handleCreateNode(N, II);
         break;
-      case Intrinsic::visc_createEdge:
+      case Intrinsic::hpvm_createEdge:
         handleCreateEdge(N, II);
         break;
-      case Intrinsic::visc_bind_input:
+      case Intrinsic::hpvm_bind_input:
         handleBindInput(N, II);
         break;
-      case Intrinsic::visc_bind_output:
+      case Intrinsic::hpvm_bind_output:
         handleBindOutput(N, II);
         break;
 
       // TODO: Reconsider launch within a dataflow graph (recursion?)
-      case Intrinsic::visc_wait:
-      case Intrinsic::visc_launch:
+      case Intrinsic::hpvm_wait:
+      case Intrinsic::hpvm_launch:
         DEBUG(errs()
               << "Error: Launch/wait intrinsic used within a dataflow graph\n\t"
               << *II << "\n");
@@ -375,7 +375,7 @@ void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) {
 
       default:
         DEBUG(
-            errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t"
+            errs() << "Error: Invalid HPVM Intrinsic inside Internal node!\n\t"
                    << *II << "\n");
         break;
       }
diff --git a/hpvm/lib/Transforms/CMakeLists.txt b/hpvm/lib/Transforms/CMakeLists.txt
index 68724684e56648d307df52624e47ed7393bfd3f9..5c9b8b9fe026ea5612caa124535e02d28d619c53 100644
--- a/hpvm/lib/Transforms/CMakeLists.txt
+++ b/hpvm/lib/Transforms/CMakeLists.txt
@@ -2,5 +2,5 @@ add_subdirectory(BuildDFG)
 add_subdirectory(ClearDFG)
 add_subdirectory(DFG2LLVM_NVPTX)
 add_subdirectory(DFG2LLVM_X86)
-add_subdirectory(GenVISC)
+add_subdirectory(GenHPVM)
 add_subdirectory(LocalMem)
diff --git a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index 6dae9e6977d31a0b62a9fa903966ec10810a2f71..c23043e7829a8947a995f7ad97688091c46cf23d 100644
--- a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -18,7 +18,7 @@
 using namespace llvm;
 using namespace builddfg;
 
-// STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted");
+// STATISTIC(IntrinsicCounter, "Counts number of hpvm intrinsics greeted");
 
 namespace {
 
@@ -101,8 +101,8 @@ bool ClearDFG::runOnModule(Module &M) {
   // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
   // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
-  Function *VI = M.getFunction("llvm.visc.init");
-  assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n");
+  Function *VI = M.getFunction("llvm.hpvm.init");
+  assert(VI->hasOneUse() && "More than one use of llvm.hpvm.init\n");
   for (Value::user_iterator ui = VI->user_begin(), ue = VI->user_end();
        ui != ue; ui++) {
     Instruction *I = dyn_cast<Instruction>(*ui);
@@ -111,8 +111,8 @@ bool ClearDFG::runOnModule(Module &M) {
   VI->replaceAllUsesWith(UndefValue::get(VI->getType()));
   VI->eraseFromParent();
 
-  Function *VC = M.getFunction("llvm.visc.cleanup");
-  assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n");
+  Function *VC = M.getFunction("llvm.hpvm.cleanup");
+  assert(VC->hasOneUse() && "More than one use of llvm.hpvm.cleanup\n");
   for (Value::user_iterator ui = VC->user_begin(), ue = VC->user_end();
        ui != ue; ui++) {
     Instruction *I = dyn_cast<Instruction>(*ui);
diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 8a36e3b8af5c031715d1e341f3ac166501c0a5b9..f582a9ab6a4510b5d403d0709f2a06d0339d5a93 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -15,40 +15,39 @@
 #define SHARED_ADDRSPACE 3
 
 #define DEBUG_TYPE "DFG2LLVM_NVPTX"
+#include "SupportHPVM/DFG2LLVM.h"
+#include "SupportHPVM/HPVMTimer.h"
+#include "SupportHPVM/HPVMUtils.h"
+#include "llvm-c/Core.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm-c/Core.h"
-#include "SupportVISC/VISCTimer.h"
-#include "SupportVISC/DFG2LLVM.h"
-#include "SupportVISC/VISCUtils.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/ToolOutputFile.h"
 #include "llvm/IR/UseListOrder.h"
-
+#include "llvm/Support/ToolOutputFile.h"
 
 #include <sstream>
 
 using namespace llvm;
 using namespace builddfg;
 using namespace dfg2llvm;
-using namespace viscUtils;
+using namespace hpvmUtils;
 
-// VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers"));
+// HPVM Command line option to use timer or not
+static cl::opt<bool> HPVMTimer_NVPTX("hpvm-timers-ptx",
+                                     cl::desc("Enable hpvm timers"));
 
 namespace {
 // Helper class declarations
@@ -57,94 +56,88 @@ namespace {
 // in bytes. Would have preferred to use tuple but support not yet available
 class OutputPtr {
 public:
-  OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
-    : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
+  OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes)
+      : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
 
-  Value* h_ptr;
-  Value* d_ptr;
-  Value* bytes;
+  Value *h_ptr;
+  Value *d_ptr;
+  Value *bytes;
 };
 
 // Class to maintain important kernel info required for generating runtime
 // calls
 class Kernel {
 public:
-  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
-           std::map<unsigned, unsigned>(),
-         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
-           std::map<unsigned, std::pair<Value*, unsigned> >(),
-         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
-         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
-         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
-    : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
-      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
-      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
-
-    assert(gridDim == globalWGSize.size()
-           && "gridDim should be same as the size of vector globalWGSize");
-    assert(blockDim == localWGSize.size()
-           && "blockDim should be same as the size of vector localWGSize");
+  Kernel(
+      Function *_KF, DFLeafNode *_KLeafNode,
+      std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(),
+      std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap =
+          std::map<unsigned, std::pair<Value *, unsigned>>(),
+      std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
+      unsigned _gridDim = 0,
+      std::vector<Value *> _globalWGSize = std::vector<Value *>(),
+      unsigned _blockDim = 0,
+      std::vector<Value *> _localWGSize = std::vector<Value *>())
+      : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
+        sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap),
+        gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
+        localWGSize(_localWGSize) {
+
+    assert(gridDim == globalWGSize.size() &&
+           "gridDim should be same as the size of vector globalWGSize");
+    assert(blockDim == localWGSize.size() &&
+           "blockDim should be same as the size of vector localWGSize");
   }
 
-  Function* KernelFunction;
-  DFLeafNode* KernelLeafNode;
+  Function *KernelFunction;
+  DFLeafNode *KernelLeafNode;
   std::map<unsigned, unsigned> inArgMap;
   // Map for shared memory arguments
-  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
+  std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap;
   // Fields for (potential) allocation node
-  DFLeafNode* AllocationNode;
-  Function* AllocationFunction;
+  DFLeafNode *AllocationNode;
+  Function *AllocationFunction;
   std::map<unsigned, unsigned> allocInArgMap;
 
   std::vector<unsigned> outArgMap;
   unsigned gridDim;
-  std::vector<Value*> globalWGSize;
+  std::vector<Value *> globalWGSize;
   unsigned blockDim;
-  std::vector<Value*> localWGSize;
+  std::vector<Value *> localWGSize;
   std::vector<int> localDimMap;
 
-  std::map<unsigned, unsigned> &getInArgMap() {
-    return inArgMap;
-  }
-  void setInArgMap(std::map<unsigned, unsigned> map) {
-    inArgMap = map;
-  }
+  std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; }
+  void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; }
 
-  std::map<unsigned, std::pair<Value*, unsigned> > &getSharedInArgMap() {
+  std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() {
     return sharedInArgMap;
   }
-  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
+  void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) {
     sharedInArgMap = map;
   }
 
-  std::vector<unsigned> &getOutArgMap() {
-    return outArgMap;
-  }
-  void setOutArgMap(std::vector<unsigned> map) {
-    outArgMap = map;
-  }
+  std::vector<unsigned> &getOutArgMap() { return outArgMap; }
+  void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; }
 
-  void setLocalWGSize(std::vector<Value*> V) {
-    localWGSize = V;
-  }
+  void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; }
 
-  bool hasLocalWG() const {
-    return blockDim != 0;
-  }
+  bool hasLocalWG() const { return blockDim != 0; }
 };
 
 // Helper function declarations
-static bool canBePromoted(Argument* arg, Function* F);
-static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*,
-                                 ValueToValueMapTy&, Instruction*);
-static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&,
-                              Instruction*, const Twine& WGName = "WGSize");
-static std::string getPTXFilename(const Module&);
-static std::string getFilenameFromModule(const Module& M);
+static bool canBePromoted(Argument *arg, Function *F);
+static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&,
+                                 Kernel *, ValueToValueMapTy &, Instruction *);
+static Value *genWorkGroupPtr(Module &M, std::vector<Value *>,
+                              ValueToValueMapTy &, Instruction *,
+                              const Twine &WGName = "WGSize");
+static std::string getPTXFilename(const Module &);
+static std::string getFilenameFromModule(const Module &M);
 static void changeDataLayout(Module &);
 static void changeTargetTriple(Module &);
 static void findReturnInst(Function *, std::vector<ReturnInst *> &);
-static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
+static void findIntrinsicInst(Function *, Intrinsic::ID,
+                              std::vector<IntrinsicInst *> &);
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
 static std::string getAtomicOpName(Intrinsic::ID);
 
@@ -154,7 +147,6 @@ struct DFG2LLVM_NVPTX : public DFG2LLVM {
   DFG2LLVM_NVPTX() : DFG2LLVM(ID) {}
 
 private:
-
 public:
   bool runOnModule(Module &M);
 };
@@ -163,57 +155,60 @@ public:
 class CGT_NVPTX : public CodeGenTraversal {
 
 private:
-  //Member variables
+  // Member variables
   std::unique_ptr<Module> KernelM;
-  DFNode* KernelLaunchNode = NULL;
-  Kernel* kernel;
-
-  // VISC Runtime API
-  FunctionCallee llvm_visc_ocl_launch;
-  FunctionCallee llvm_visc_ocl_wait;
-  FunctionCallee llvm_visc_ocl_initContext;
-  FunctionCallee llvm_visc_ocl_clearContext;
-  FunctionCallee llvm_visc_ocl_argument_shared;
-  FunctionCallee llvm_visc_ocl_argument_scalar;
-  FunctionCallee llvm_visc_ocl_argument_ptr;
-  FunctionCallee llvm_visc_ocl_output_ptr;
-  FunctionCallee llvm_visc_ocl_free;
-  FunctionCallee llvm_visc_ocl_getOutput;
-  FunctionCallee llvm_visc_ocl_executeNode;
-
-  //Functions
+  DFNode *KernelLaunchNode = NULL;
+  Kernel *kernel;
+
+  // HPVM Runtime API
+  FunctionCallee llvm_hpvm_ocl_launch;
+  FunctionCallee llvm_hpvm_ocl_wait;
+  FunctionCallee llvm_hpvm_ocl_initContext;
+  FunctionCallee llvm_hpvm_ocl_clearContext;
+  FunctionCallee llvm_hpvm_ocl_argument_shared;
+  FunctionCallee llvm_hpvm_ocl_argument_scalar;
+  FunctionCallee llvm_hpvm_ocl_argument_ptr;
+  FunctionCallee llvm_hpvm_ocl_output_ptr;
+  FunctionCallee llvm_hpvm_ocl_free;
+  FunctionCallee llvm_hpvm_ocl_getOutput;
+  FunctionCallee llvm_hpvm_ocl_executeNode;
+
+  // Functions
   std::string getKernelsModuleName(Module &M);
-  void fixValueAddrspace(Value* V, unsigned addrspace);
-  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*);
-  Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i);
-  void addCLMetadata(Function* F);
-  Function* transformFunctionToVoid(Function* F);
-  void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
+  void fixValueAddrspace(Value *V, unsigned addrspace);
+  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *,
+                                                  Function *);
+  Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags,
+                               unsigned i);
+  void addCLMetadata(Function *F);
+  Function *transformFunctionToVoid(Function *F);
+  void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName);
 
   // Virtual Functions
   void init() {
-    VISCTimer = VISCTimer_NVPTX;
+    HPVMTimer = HPVMTimer_NVPTX;
     TargetName = "NVPTX";
   }
   void initRuntimeAPI();
-  void codeGen(DFInternalNode* N);
-  void codeGen(DFLeafNode* N);
+  void codeGen(DFInternalNode *N);
+  void codeGen(DFLeafNode *N);
 
 public:
-
   // Constructor
-  CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
+  CGT_NVPTX(Module &_M, BuildDFG &_DFG)
+      : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
     init();
     initRuntimeAPI();
-    errs() << "Old module pointer: " << &_M << "\n";
-    errs() << "New module pointer: " <<  KernelM.get() << "\n";
+    DEBUG(errs() << "Old module pointer: " << &_M << "\n");
+    DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n");
 
-    // Copying instead of creating new, in order to preserve required info (metadata)
-    // Remove functions, global variables and aliases
-    std::vector<GlobalVariable*> GVVect;
+    // Copying instead of creating new, in order to preserve required info
+    // (metadata). Remove functions, global variables and aliases.
+    std::vector<GlobalVariable *> GVVect;
     for (Module::global_iterator mi = KernelM->global_begin(),
-         me = KernelM->global_end(); (mi != me); ++mi) {
-      GlobalVariable* GV = &*mi;
+                                 me = KernelM->global_end();
+         (mi != me); ++mi) {
+      GlobalVariable *GV = &*mi;
       GVVect.push_back(GV);
     }
     for (auto *GV : GVVect) {
@@ -221,10 +216,10 @@ public:
       GV->eraseFromParent();
     }
 
-    std::vector<Function*> FuncVect;
-    for (Module::iterator mi = KernelM->begin(),
-         me = KernelM->end(); (mi != me); ++mi) {
-      Function* F = &*mi;
+    std::vector<Function *> FuncVect;
+    for (Module::iterator mi = KernelM->begin(), me = KernelM->end();
+         (mi != me); ++mi) {
+      Function *F = &*mi;
       FuncVect.push_back(F);
     }
     for (auto *F : FuncVect) {
@@ -232,10 +227,11 @@ public:
       F->eraseFromParent();
     }
 
-    std::vector<GlobalAlias*> GAVect;
+    std::vector<GlobalAlias *> GAVect;
     for (Module::alias_iterator mi = KernelM->alias_begin(),
-         me = KernelM->alias_end(); (mi != me); ++mi) {
-      GlobalAlias* GA = &*mi;
+                                me = KernelM->alias_end();
+         (mi != me); ++mi) {
+      GlobalAlias *GA = &*mi;
       GAVect.push_back(GA);
     }
     for (auto *GA : GAVect) {
@@ -246,73 +242,69 @@ public:
     changeDataLayout(*KernelM);
     changeTargetTriple(*KernelM);
 
-
     DEBUG(errs() << *KernelM);
-
   }
 
   void writeKernelsModule();
 };
 
-// Initialize the VISC runtime API. This makes it easier to insert these calls
+// Initialize the HPVM runtime API. This makes it easier to insert these calls
 void CGT_NVPTX::initRuntimeAPI() {
 
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
   assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot + "/../build/tools/hpvm/projects/visc-rt/visc-rt.bc";
+  Twine runtimeAPI =
+      llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
 
   runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-  if(runtimeModule == nullptr) {
+  if (runtimeModule == nullptr) {
     DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
     assert(false && "couldn't parse runtime");
-  }
-  else
-    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+  } else
+    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
 
   // Get or insert the global declarations for launch/wait functions
-  DECLARE(llvm_visc_ocl_launch);
-  DECLARE(llvm_visc_ocl_wait);
-  DECLARE(llvm_visc_ocl_initContext);
-  DECLARE(llvm_visc_ocl_clearContext);
-  DECLARE(llvm_visc_ocl_argument_shared);
-  DECLARE(llvm_visc_ocl_argument_scalar);
-  DECLARE(llvm_visc_ocl_argument_ptr);
-  DECLARE(llvm_visc_ocl_output_ptr);
-  DECLARE(llvm_visc_ocl_free);
-  DECLARE(llvm_visc_ocl_getOutput);
-  DECLARE(llvm_visc_ocl_executeNode);
+  DECLARE(llvm_hpvm_ocl_launch);
+  DECLARE(llvm_hpvm_ocl_wait);
+  DECLARE(llvm_hpvm_ocl_initContext);
+  DECLARE(llvm_hpvm_ocl_clearContext);
+  DECLARE(llvm_hpvm_ocl_argument_shared);
+  DECLARE(llvm_hpvm_ocl_argument_scalar);
+  DECLARE(llvm_hpvm_ocl_argument_ptr);
+  DECLARE(llvm_hpvm_ocl_output_ptr);
+  DECLARE(llvm_hpvm_ocl_free);
+  DECLARE(llvm_hpvm_ocl_getOutput);
+  DECLARE(llvm_hpvm_ocl_executeNode);
 
   // Get or insert timerAPI functions as well if you plan to use timers
   initTimerAPI();
 
   // Insert init context in main
   DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n");
-  Function* VI = M.getFunction("llvm.visc.init");
-  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  Function *VI = M.getFunction("llvm.hpvm.init");
+  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
 
   InitCall = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(InitCall);
-  switchToTimer(visc_TimerID_INIT_CTX, InitCall);
-  CallInst::Create(llvm_visc_ocl_initContext,
-                   ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)),
-                   "", InitCall);
-  switchToTimer(visc_TimerID_NONE, InitCall);
+  switchToTimer(hpvm_TimerID_INIT_CTX, InitCall);
+  CallInst::Create(llvm_hpvm_ocl_initContext,
+                   ArrayRef<Value *>(getTargetID(M, hpvm::GPU_TARGET)), "",
+                   InitCall);
+  switchToTimer(hpvm_TimerID_NONE, InitCall);
 
-  // Insert print instruction at visc exit
+  // Insert print instruction at hpvm exit
   DEBUG(errs() << "Gen Code to print NVPTX Timer\n");
-  Function* VC = M.getFunction("llvm.visc.cleanup");
+  Function *VC = M.getFunction("llvm.hpvm.cleanup");
   DEBUG(errs() << *VC << "\n");
-  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once");
+  assert(VC->getNumUses() == 1 && "__hpvm__clear should only be used once");
 
   CleanupCall = cast<Instruction>(*VC->user_begin());
   printTimerSet(CleanupCall);
-
-
 }
 
 // Generate Code to call the kernel
@@ -320,36 +312,37 @@ void CGT_NVPTX::initRuntimeAPI() {
 // used to generate a function to associate with this leaf node. The function
 // is responsible for all the memory allocation/transfer and invoking the
 // kernel call on the device
-void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
+void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
+                                   const Twine &FileName) {
   // Check if clone already exists. If it does, it means we have visited this
   // function before.
-//  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+  //  assert(N->getGenFunc() == NULL && "Code already generated for this node");
 
-  assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL &&
+  assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) == NULL &&
          "Code already generated for this node");
 
   // Useful values
-  Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
-  Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
+  Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
+  Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
 
   // If kernel struct has not been initialized with kernel function, then fail
   assert(K != NULL && "No kernel found!!");
 
   DEBUG(errs() << "Generating kernel call code\n");
 
-  Function* F = N->getFuncPointer();
-
+  Function *F = N->getFuncPointer();
 
   // Create of clone of F with no instructions. Only the type is the same as F
   // without the extra arguments.
-  Function* F_X86;
+  Function *F_X86;
 
   // Clone the function, if we are seeing this function for the first time. We
   // only need a clone in terms of type.
   ValueToValueMapTy VMap;
 
   // Create new function with the same type
-  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+  F_X86 =
+      Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
 
   // Loop over the arguments, copying the names of arguments over.
   Function::arg_iterator dest_iterator = F_X86->arg_begin();
@@ -362,26 +355,25 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 
   // Add a basic block to this empty function
   BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
-  ReturnInst* RI = ReturnInst::Create(M.getContext(),
-                                      UndefValue::get(F_X86->getReturnType()), BB);
+  ReturnInst *RI = ReturnInst::Create(
+      M.getContext(), UndefValue::get(F_X86->getReturnType()), BB);
 
   // FIXME: Adding Index and Dim arguments are probably not required except
   // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
   // have those arguments)
 
   // Add Index and Dim arguments except for the root node
-  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+  if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
     F_X86 = addIdxDimArgs(F_X86);
 
   BB = &*F_X86->begin();
   RI = cast<ReturnInst>(BB->getTerminator());
 
-  //Add the generated function info to DFNode
-//  N->setGenFunc(F_X86, visc::CPU_TARGET);
-  N->addGenFunc(F_X86, visc::GPU_TARGET, true);
-  errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node "
-         << N->getFuncPointer()->getName() << "\n";
-
+  // Add the generated function info to DFNode
+  //  N->setGenFunc(F_X86, hpvm::CPU_TARGET);
+  N->addGenFunc(F_X86, hpvm::GPU_TARGET, true);
+  DEBUG(errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node "
+               << N->getFuncPointer()->getName() << "\n");
 
   // Loop over the arguments, to create the VMap
   dest_iterator = F_X86->arg_begin();
@@ -414,51 +406,53 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
       break;
   }
 
-  assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+  assert(C->isDummyNode() == false && "Internal Node only contains dummy
+  nodes!");
 
   Function* CF = C->getFuncPointer();
   */
-  Function* KF = K->KernelLeafNode->getFuncPointer();
+  Function *KF = K->KernelLeafNode->getFuncPointer();
   // Initialize context
-  //DEBUG(errs() << "Initializing context" << "\n");
-  //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
+  // DEBUG(errs() << "Initializing context" << "\n");
+  // CallInst::Create(llvm_hpvm_ocl_initContext, None, "", RI);
 
-  DEBUG(errs() << "Initializing commandQ" << "\n");
+  DEBUG(errs() << "Initializing commandQ"
+               << "\n");
   // Initialize command queue
-  switchToTimer(visc_TimerID_SETUP, InitCall);
-  Value* fileStr = getStringPointer(FileName, InitCall, "Filename");
+  switchToTimer(hpvm_TimerID_SETUP, InitCall);
+  Value *fileStr = getStringPointer(FileName, InitCall, "Filename");
   DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
-  DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n");
-  Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName");
-
-  Value* LaunchInstArgs[] = {fileStr, kernelStr};
-
-  DEBUG(errs() << "Inserting launch call" << "\n");
-  CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch,
-                                         ArrayRef<Value*>(LaunchInstArgs, 2),
-                                         "graph"+KF->getName(),
-                                         InitCall);
+  DEBUG(errs() << "Generating code for kernel - "
+               << K->KernelFunction->getName() << "\n");
+  Value *kernelStr =
+      getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName");
+
+  Value *LaunchInstArgs[] = {fileStr, kernelStr};
+
+  DEBUG(errs() << "Inserting launch call"
+               << "\n");
+  CallInst *NVPTX_Ctx = CallInst::Create(llvm_hpvm_ocl_launch,
+                                         ArrayRef<Value *>(LaunchInstArgs, 2),
+                                         "graph" + KF->getName(), InitCall);
   DEBUG(errs() << *NVPTX_Ctx << "\n");
-  GraphIDAddr = new GlobalVariable(M,
-                                   NVPTX_Ctx->getType(),
-                                   false,
+  GraphIDAddr = new GlobalVariable(M, NVPTX_Ctx->getType(), false,
                                    GlobalValue::CommonLinkage,
                                    Constant::getNullValue(NVPTX_Ctx->getType()),
-                                   "graph"+KF->getName()+".addr");
+                                   "graph" + KF->getName() + ".addr");
   DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
-  StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
+  StoreInst *SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
   DEBUG(errs() << *SI << "\n");
-  switchToTimer(visc_TimerID_NONE, InitCall);
-  switchToTimer(visc_TimerID_SETUP, RI);
-  Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI);
+  switchToTimer(hpvm_TimerID_NONE, InitCall);
+  switchToTimer(hpvm_TimerID_SETUP, RI);
+  Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI);
 
-  // Iterate over the required input edges of the node and use the visc-rt API
+  // Iterate over the required input edges of the node and use the hpvm-rt API
   // to set inputs
-  DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
+  DEBUG(errs() << "Iterate over input edges of node and insert hpvm api\n");
   std::vector<OutputPtr> OutputPointers;
-  // Vector to hold the device memory object that need to be cleared before we release
-  // context
-  std::vector<Value*> DevicePointers;
+  // Vector to hold the device memory objects that need to be cleared before
+  // we release the context
+  std::vector<Value *> DevicePointers;
 
   std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap();
   /*
@@ -470,133 +464,134 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 
   */
 
-  for(auto &InArgMapPair  : kernelInArgMap) {
+  for (auto &InArgMapPair : kernelInArgMap) {
     unsigned i = InArgMapPair.first;
-    Value* inputVal = getArgumentAt(F_X86, InArgMapPair.second);
-    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+    Value *inputVal = getArgumentAt(F_X86, InArgMapPair.second);
+    DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n");
 
     // input value has been obtained.
     // Check if input is a scalar value or a pointer operand
     // For scalar values such as int, float, etc. the size is simply the size of
     // type on target machine, but for pointers, the size of data would be the
     // next integer argument
-    if(inputVal->getType()->isPointerTy()) {
+    if (inputVal->getType()->isPointerTy()) {
 
-      switchToTimer(visc_TimerID_COPY_PTR, RI);
+      switchToTimer(hpvm_TimerID_COPY_PTR, RI);
       // Pointer Input
       // CheckAttribute
-      Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False;
-      Value* isInput = ((hasAttribute(KF, i, Attribute::Out))
-                        && !(hasAttribute(KF, i, Attribute::In)))? False : True;
-
-      Argument* A = getArgumentAt(KF, i);
-      if(isOutput == True) {
+      Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False;
+      Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) &&
+                        !(hasAttribute(KF, i, Attribute::In)))
+                           ? False
+                           : True;
+
+      Argument *A = getArgumentAt(KF, i);
+      if (isOutput == True) {
         DEBUG(errs() << *A << " is an OUTPUT argument\n");
       }
-      if(isInput == True) {
+      if (isInput == True) {
         DEBUG(errs() << *A << " is an INPUT argument\n");
       }
 
-
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
-                             Type::getInt8PtrTy(M.getContext()),
-                             inputVal->getName()+".i8ptr",
-                             RI);
+      Value *inputValI8Ptr = CastInst::CreatePointerCast(
+          inputVal, Type::getInt8PtrTy(M.getContext()),
+          inputVal->getName() + ".i8ptr", RI);
 
       // Assert that the pointer argument size (next argument) is in the map
-      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
-
-      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
-      assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
-             && "Pointer type input must always be followed by size (integer type)");
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               inputSize,
-                               isInput,
-                               isOutput
-                              };
-      Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr,
-                                      ArrayRef<Value*>(setInputArgs, 6), "", RI);
+      assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end());
+
+      Value *inputSize = getArgumentAt(F_X86, kernelInArgMap[i + 1]);
+      assert(
+          inputSize->getType() == Type::getInt64Ty(M.getContext()) &&
+          "Pointer type input must always be followed by size (integer type)");
+      Value *setInputArgs[] = {
+          GraphID,
+          inputValI8Ptr,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
+          inputSize,
+          isInput,
+          isOutput};
+      Value *d_ptr =
+          CallInst::Create(llvm_hpvm_ocl_argument_ptr,
+                           ArrayRef<Value *>(setInputArgs, 6), "", RI);
       DevicePointers.push_back(d_ptr);
       // If this has out attribute, store the returned device pointer in
       // memory to read device memory later
-      if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
-    }
-    else {
-      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+      if (isOutput == True)
+        OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
+    } else {
+      switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
       // Scalar Input
       // Store the scalar value on stack and then pass the pointer to its
       // location
-      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), 0, inputVal->getName()+".ptr", RI);
-      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
-
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
-                             Type::getInt8PtrTy(M.getContext()),
-                             inputVal->getName()+".i8ptr",
-                             RI);
-
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               ConstantExpr::getSizeOf(inputVal->getType())
-                              };
-      CallInst::Create(llvm_visc_ocl_argument_scalar,
-                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      AllocaInst *inputValPtr = new AllocaInst(
+          inputVal->getType(), 0, inputVal->getName() + ".ptr", RI);
+      StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI);
+
+      Value *inputValI8Ptr = CastInst::CreatePointerCast(
+          inputValPtr, Type::getInt8PtrTy(M.getContext()),
+          inputVal->getName() + ".i8ptr", RI);
+
+      Value *setInputArgs[] = {
+          GraphID, inputValI8Ptr,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
+          ConstantExpr::getSizeOf(inputVal->getType())};
+      CallInst::Create(llvm_hpvm_ocl_argument_scalar,
+                       ArrayRef<Value *>(setInputArgs, 4), "", RI);
     }
   }
 
-  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
+  DEBUG(
+      errs() << "Setup shared memory arguments of node and insert hpvm api\n");
 
   // Check to see if all the allocation sizes are constant (determined
   // statically)
   bool constSizes = true;
-  for (auto& e: K->getSharedInArgMap()) {
+  for (auto &e : K->getSharedInArgMap()) {
     constSizes &= isa<Constant>(e.second.first);
   }
 
   // If the sizes are all constant
   if (constSizes) {
-    for (auto& e: K->getSharedInArgMap()) {
+    for (auto &e : K->getSharedInArgMap()) {
       unsigned argNum = e.first;
-      Value* allocSize = e.second.first;
+      Value *allocSize = e.second.first;
 
-      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+      DEBUG(errs() << "\tLocal Memory at " << argNum
+                   << ", size = " << *allocSize << "\n");
 
       if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
         // Shared memory ptr argument - scalar at size position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
 
-        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
+        assert(isa<Constant>(allocSize) &&
+               "Constant shared memory size is expected");
 
-        Value* setInputArgs[] = {GraphID,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 allocSize
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
-      }
-      else {
+        Value *setInputArgs[] = {
+            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            allocSize};
+        CallInst::Create(llvm_hpvm_ocl_argument_shared,
+                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
+      } else {
         // Sharem memory size argument - scalar at address position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0,
-            allocSize->getName()+".sharedMem.ptr", RI);
-        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
-                                Type::getInt8PtrTy(M.getContext()),
-                                allocSize->getName()+".sharedMem.i8ptr",
-                                RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 allocSizeI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 ConstantExpr::getSizeOf(allocSize->getType())
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+        AllocaInst *allocSizePtr =
+            new AllocaInst(allocSize->getType(), 0,
+                           allocSize->getName() + ".sharedMem.ptr", RI);
+        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
+            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
+            allocSize->getName() + ".sharedMem.i8ptr", RI);
+
+        Value *setInputArgs[] = {
+            GraphID, allocSizeI8Ptr,
+            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            ConstantExpr::getSizeOf(allocSize->getType())};
+        CallInst::Create(llvm_hpvm_ocl_argument_scalar,
+                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
       }
     }
   } else {
@@ -617,68 +612,64 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
       ExtractValueInstVec.push_back(EI);
     }
 
-    for (auto& e: K->getSharedInArgMap()) {
+    for (auto &e : K->getSharedInArgMap()) {
       unsigned argNum = e.first;
-      Value* allocSize = ExtractValueInstVec[e.second.second/2];
+      Value *allocSize = ExtractValueInstVec[e.second.second / 2];
 
-      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+      DEBUG(errs() << "\tLocal Memory at " << argNum
+                   << ", size = " << *allocSize << "\n");
 
       if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
         // Shared memory ptr argument - scalar at size position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 allocSize
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
-      }
-      else {
+        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
+
+        Value *setInputArgs[] = {
+            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            allocSize};
+        CallInst::Create(llvm_hpvm_ocl_argument_shared,
+                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
+      } else {
         // Sharem memory size argument - scalar at address position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, 
-            allocSize->getName()+".sharedMem.ptr", RI);
-        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
-                                Type::getInt8PtrTy(M.getContext()),
-                                allocSize->getName()+".sharedMem.i8ptr",
-                                RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 allocSizeI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 ConstantExpr::getSizeOf(allocSize->getType())
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+        AllocaInst *allocSizePtr =
+            new AllocaInst(allocSize->getType(), 0,
+                           allocSize->getName() + ".sharedMem.ptr", RI);
+        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
+            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
+            allocSize->getName() + ".sharedMem.i8ptr", RI);
+
+        Value *setInputArgs[] = {
+            GraphID, allocSizeI8Ptr,
+            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            ConstantExpr::getSizeOf(allocSize->getType())};
+        CallInst::Create(llvm_hpvm_ocl_argument_scalar,
+                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
       }
     }
   }
 
-
-  DEBUG(errs() << "Setup output edges of node and insert visc api\n");
+  DEBUG(errs() << "Setup output edges of node and insert hpvm api\n");
   // Set output if struct is not an empty struct
-  StructType* OutputTy = K->KernelLeafNode->getOutputType();
-  std::vector<Value*> d_Outputs;
-  if(!OutputTy->isEmptyTy()) {
-    switchToTimer(visc_TimerID_COPY_PTR, RI);
+  StructType *OutputTy = K->KernelLeafNode->getOutputType();
+  std::vector<Value *> d_Outputs;
+  if (!OutputTy->isEmptyTy()) {
+    switchToTimer(hpvm_TimerID_COPY_PTR, RI);
     // Not an empty struct
     // Iterate over all elements of the struct and put them in
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
-      Value* setOutputArgs[] = {GraphID,
-                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
-                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
-                               };
-
-      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
-                                            ArrayRef<Value*>(setOutputArgs, 3),
-                                            "d_output."+KF->getName(),
-                                            RI);
+    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
+      unsigned outputIndex = KF->getFunctionType()->getNumParams() + i;
+      Value *setOutputArgs[] = {
+          GraphID,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex),
+          ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+
+      CallInst *d_Output = CallInst::Create(llvm_hpvm_ocl_output_ptr,
+                                            ArrayRef<Value *>(setOutputArgs, 3),
+                                            "d_output." + KF->getName(), RI);
       d_Outputs.push_back(d_Output);
     }
   }
@@ -688,50 +679,41 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   // Allocate size_t[numDims] space on stack. Store the work group sizes and
   // pass it as an argument to ExecNode
 
-  switchToTimer(visc_TimerID_MISC, RI);
+  switchToTimer(hpvm_TimerID_MISC, RI);
   Value *workDim, *LocalWGPtr, *GlobalWGPtr;
   getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
-  switchToTimer(visc_TimerID_KERNEL, RI);
-  Value* ExecNodeArgs[] = {GraphID,
-                           workDim,
-                           LocalWGPtr,
-                           GlobalWGPtr
-                          };
-  CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode,
-                                     ArrayRef<Value*>(ExecNodeArgs, 4),
-                                     "event."+KF->getName(),
-                                     RI);
+  switchToTimer(hpvm_TimerID_KERNEL, RI);
+  Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr};
+  CallInst *Event = CallInst::Create(llvm_hpvm_ocl_executeNode,
+                                     ArrayRef<Value *>(ExecNodeArgs, 4),
+                                     "event." + KF->getName(), RI);
   DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
 
   // Wait for Kernel to Finish
-  CallInst::Create(llvm_visc_ocl_wait,
-                   ArrayRef<Value*>(GraphID),
-                   "",
-                   RI);
+  CallInst::Create(llvm_hpvm_ocl_wait, ArrayRef<Value *>(GraphID), "", RI);
 
-  switchToTimer(visc_TimerID_READ_OUTPUT, RI);
+  switchToTimer(hpvm_TimerID_READ_OUTPUT, RI);
   // Read Output Struct if not empty
-  if(!OutputTy->isEmptyTy()) {
-    std::vector<Value*>h_Outputs;
-    Value* KernelOutput = UndefValue::get(OutputTy);
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      Value* GetOutputArgs[] = {GraphID,
-                                Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                                d_Outputs[i],
-                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
-                               };
-      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
-                                            ArrayRef<Value*>(GetOutputArgs, 4),
-                                            "h_output."+KF->getName()+".addr",
-                                            RI);
+  if (!OutputTy->isEmptyTy()) {
+    std::vector<Value *> h_Outputs;
+    Value *KernelOutput = UndefValue::get(OutputTy);
+    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
+      Value *GetOutputArgs[] = {
+          GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+          d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+      CallInst *h_Output = CallInst::Create(
+          llvm_hpvm_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4),
+          "h_output." + KF->getName() + ".addr", RI);
       // Read each device pointer listed in output struct
       // Load the output struct
-      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
-                     OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
-
-      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
-      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
-                                             KF->getName()+"output", RI);
+      CastInst *BI = BitCastInst::CreatePointerCast(
+          h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr",
+          RI);
+
+      Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement,
+                                             ArrayRef<unsigned>(i),
+                                             KF->getName() + "output", RI);
     }
     OutputMap[K->KernelLeafNode] = KernelOutput;
   }
@@ -746,75 +728,76 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
     DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
     DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
 
-    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
-    CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
+    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr,
+  output.bytes}; CallInst* CI = CallInst::Create(llvm_hpvm_ocl_getOutput,
                                     ArrayRef<Value*>(GetOutputArgs, 4),
                                     "", RI);
   }*/
-  switchToTimer(visc_TimerID_MEM_FREE, RI);
+  switchToTimer(hpvm_TimerID_MEM_FREE, RI);
   // Clear Context and free device memory
-  DEBUG(errs() << "Clearing context" << "\n");
+  DEBUG(errs() << "Clearing context"
+               << "\n");
   // Free Device Memory
-  for(auto d_ptr: DevicePointers) {
-    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI);
+  for (auto d_ptr : DevicePointers) {
+    CallInst::Create(llvm_hpvm_ocl_free, ArrayRef<Value *>(d_ptr), "", RI);
   }
-  switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall);
+  switchToTimer(hpvm_TimerID_CLEAR_CTX, CleanupCall);
   // Clear Context
-  LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall);
-  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall);
-  switchToTimer(visc_TimerID_NONE, CleanupCall);
+  LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall);
+  CallInst::Create(llvm_hpvm_ocl_clearContext, ArrayRef<Value *>(LI), "",
+                   CleanupCall);
+  switchToTimer(hpvm_TimerID_NONE, CleanupCall);
 
-  switchToTimer(visc_TimerID_MISC, RI);
+  switchToTimer(hpvm_TimerID_MISC, RI);
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
   // Generate code for output bindings
   // Get Exit node
-  DFNode* C = N->getChildGraph()->getExit();
+  DFNode *C = N->getChildGraph()->getExit();
   // Get OutputType of this node
-  StructType* OutTy = N->getOutputType();
+  StructType *OutTy = N->getOutputType();
   Value *retVal = UndefValue::get(F_X86->getReturnType());
   // Find the kernel's output arg map, to use instead of the bindings
   std::vector<unsigned> outArgMap = kernel->getOutArgMap();
   // Find all the input edges to exit node
-  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+  for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
     DEBUG(errs() << "Output Edge " << i << "\n");
     // Find the incoming edge at the requested input port
-    DFEdge* E = C->getInDFEdgeAt(i);
+    DFEdge *E = C->getInDFEdgeAt(i);
 
     assert(E && "No Binding for output element!");
     // Find the Source DFNode associated with the incoming edge
-    DFNode* SrcDF = E->getSourceDF();
+    DFNode *SrcDF = E->getSourceDF();
 
-    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
+                 << "\n");
 
     // If Source DFNode is a dummyNode, edge is from parent. Get the
     // argument from argument list of this internal node
-    Value* inputVal;
-    if(SrcDF->isEntryNode()) {
+    Value *inputVal;
+    if (SrcDF->isEntryNode()) {
       inputVal = getArgumentAt(F_X86, i);
-      DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-    }
-    else {
+      DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+    } else {
       // edge is from a internal node
       // Check - code should already be generated for this source dfnode
       // FIXME: Since the 2-level kernel code gen has aspecific structure, we
       // can assume the SrcDF is same as Kernel Leaf node.
       // Use outArgMap to get correct mapping
       SrcDF = K->KernelLeafNode;
-      assert(OutputMap.count(SrcDF)
-             && "Source node call not found. Dependency violation!");
+      assert(OutputMap.count(SrcDF) &&
+             "Source node call not found. Dependency violation!");
 
       // Find Output Value associated with the Source DFNode using OutputMap
-      Value* CI = OutputMap[SrcDF];
+      Value *CI = OutputMap[SrcDF];
 
       // Extract element at source position from this call instruction
       std::vector<unsigned> IndexList;
       // i is the destination of DFEdge E
       // Use the mapping instead of the bindings
-//      IndexList.push_back(E->getSourcePosition());
+      //      IndexList.push_back(E->getSourcePosition());
       IndexList.push_back(outArgMap[i]);
-      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                             "",RI);
+      DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
+      ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
       inputVal = EI;
     }
     std::vector<unsigned> IdxList;
@@ -823,31 +806,33 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   }
 
   DEBUG(errs() << "Extracted all\n");
-  switchToTimer(visc_TimerID_NONE, RI);
+  switchToTimer(hpvm_TimerID_NONE, RI);
   retVal->setName("output");
-  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+  ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal);
   ReplaceInstWithInst(RI, newRI);
 }
 
-
 // Right now, only targeting the one level case. In general, device functions
 // can return values so we don't need to change them
-void CGT_NVPTX::codeGen(DFInternalNode* N) {
-  errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n";
-  if(KernelLaunchNode == NULL)
-    errs () << "No kernel launch node\n";
+void CGT_NVPTX::codeGen(DFInternalNode *N) {
+  DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName()
+               << "\n");
+  if (KernelLaunchNode == NULL)
+    DEBUG(errs() << "No kernel launch node\n");
   else {
-    errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
+    DEBUG(errs() << "KernelLaunchNode: "
+                 << KernelLaunchNode->getFuncPointer()->getName() << "\n");
   }
 
   if (!KernelLaunchNode) {
-    DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
+    DEBUG(errs()
+          << "No code generated (host code for kernel launch complete).\n");
     return;
   }
 
   if (N == KernelLaunchNode) {
     DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
-    //TODO
+    // TODO
 
     // Now the remaining nodes to be visited should be ignored
     KernelLaunchNode = NULL;
@@ -862,7 +847,8 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
     // TODO: Structure assumed: one thread node, one allocation node (at most),
     // TB node
     std::map<unsigned, unsigned> inmapFinal;
-    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
+    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(),
+                                                ie = inmap2.end();
          ib != ie; ++ib) {
       inmapFinal[ib->first] = inmap1[ib->second];
     }
@@ -879,8 +865,9 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
     // 0 ... outmap2.size()-1
     // The limit is the size of outmap2, because this is the number of kernel
     // output arguments for which the mapping matters
-    // For now, it reasonable to assume that all the kernel arguments are returned,
-    // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size()
+    // For now, it is reasonable to assume that all the kernel arguments are
+    // returned, maybe plus some others from other nodes, thus outmap2.size() <=
+    // outmap1.size()
     for (unsigned i = 0; i < outmap2.size(); i++) {
       outmap1[i] = outmap2[outmap1[i]];
     }
@@ -888,15 +875,14 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
 
     // Track the source of local dimlimits for the kernel
     // Dimension limit can either be a constant or an argument of parent
-    // function. Since Internal node would no longer exist, we need to insert the
-    // localWGSize with values from the parent of N.
-    std::vector<Value*> localWGSizeMapped;
+    // function. Since Internal node would no longer exist, we need to insert
+    // the localWGSize with values from the parent of N.
+    std::vector<Value *> localWGSizeMapped;
     for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
       if (isa<Constant>(kernel->localWGSize[i])) {
         // if constant, use as it is
         localWGSizeMapped.push_back(kernel->localWGSize[i]);
-      }
-      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
+      } else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
         // if argument, find the argument location in N. Use InArgMap of N to
         // find the source location in Parent of N. Retrieve the argument from
         // parent to insert in the vector.
@@ -906,46 +892,49 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
         assert(N->getInArgMap().find(argNum) != N->getInArgMap().end());
 
         unsigned parentArgNum = N->getInArgMap()[argNum];
-        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
+        Argument *A =
+            getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
         localWGSizeMapped.push_back(A);
-      }
-      else {
-        assert(false && "LocalWGsize using value which is neither argument nor constant!");
+      } else {
+        assert(
+            false &&
+            "LocalWGsize using value which is neither argument nor constant!");
       }
     }
     // Update localWGSize vector of kernel
     kernel->setLocalWGSize(localWGSizeMapped);
   }
-
 }
 
-void CGT_NVPTX::codeGen(DFLeafNode* N) {
-  errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n";
+void CGT_NVPTX::codeGen(DFLeafNode *N) {
+  DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName()
+               << "\n");
 
   // Skip code generation if it is a dummy node
-  if(N->isDummyNode()) {
+  if (N->isDummyNode()) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
 
   // Skip code generation if it is an allocation node
-  if(N->isAllocationNode()) {
+  if (N->isAllocationNode()) {
     DEBUG(errs() << "Skipping allocation node\n");
     return;
   }
 
   // Generate code only if it has the right hint
-//  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
-//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
-//    return;
-//  }
-  if(!preferredTargetIncludes(N, visc::GPU_TARGET)) {
-    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+  //  if(!checkPreferredTarget(N, hpvm::GPU_TARGET)) {
+  //    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+  //    return;
+  //  }
+  if (!preferredTargetIncludes(N, hpvm::GPU_TARGET)) {
+    DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName()
+                 << "\n");
     return;
   }
 
   // Checking which node is the kernel launch
-  DFNode* PNode = N->getParent();
+  DFNode *PNode = N->getParent();
   int pLevel = PNode->getLevel();
   int pReplFactor = PNode->getNumOfDim();
 
@@ -953,42 +942,40 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // (1) Parent is the top level node i.e., Root of DFG
   //                    OR
   // (2) Parent does not have multiple instances
-  errs() << "pLevel = " << pLevel << "\n";
-  errs() << "pReplFactor = " << pReplFactor << "\n";
+  DEBUG(errs() << "pLevel = " << pLevel << "\n");
+  DEBUG(errs() << "pReplFactor = " << pReplFactor << "\n");
   assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node.");
 
   // Only these options are supported
-  enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy;
-  if(pLevel == 1 || !pReplFactor) {
-    errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n";
+  enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy;
+  if (pLevel == 1 || !pReplFactor) {
+    DEBUG(errs()
+          << "*************** Kernel Gen: 1-Level Hierarchy **************\n");
     SelectedHierarchy = ONE_LEVEL;
     KernelLaunchNode = PNode;
-    kernel = new Kernel(NULL,
-                        N,
-                        N->getInArgMap(),
-                        N->getSharedInArgMap(),
-                        N->getOutArgMap(),
-                        N->getNumOfDim(),
-                        N->getDimLimits());
-  }
-  else {
+    kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(),
+                        N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits());
+  } else {
     // Converting a 2-level DFG to opencl kernel
-    errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n";
-    assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node.");
+    DEBUG(errs()
+          << "*************** Kernel Gen: 2-Level Hierarchy **************\n");
+    assert((pLevel >= 2) &&
+           "Selected node not nested deep enough to be Kernel Node.");
     SelectedHierarchy = TWO_LEVEL;
     KernelLaunchNode = PNode->getParent();
-    assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
+    assert((PNode->getNumOfDim() == N->getNumOfDim()) &&
+           "Dimension number must match");
     // Contains the instructions generating the kernel configuration parameters
-    kernel = new Kernel(NULL,                 // kernel function
-                        N,                    // kernel leaf node
-                        N->getInArgMap(),     // kenel argument mapping
+    kernel = new Kernel(NULL,             // kernel function
+                        N,                // kernel leaf node
+                        N->getInArgMap(), // kernel argument mapping
                         N->getSharedInArgMap(),
-                        N->getOutArgMap(),    // kernel output mapping from the leaf to the interemediate node
-                        PNode->getNumOfDim(), // gridDim
-                        PNode->getDimLimits(),// grid size
-                        N->getNumOfDim(),     // blockDim
-                        N->getDimLimits());   // block size
-
+                        N->getOutArgMap(),     // kernel output mapping from the
+                                               // leaf to the intermediate node
+                        PNode->getNumOfDim(),  // gridDim
+                        PNode->getDimLimits(), // grid size
+                        N->getNumOfDim(),      // blockDim
+                        N->getDimLimits());    // block size
   }
 
   std::vector<Instruction *> IItoRemove;
@@ -1000,58 +987,62 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // Look up if we have visited this function before. If we have, then just
   // get the cloned function pointer from DFNode. Otherwise, create the cloned
   // function and add it to the DFNode GenFunc.
-//  Function *F_nvptx = N->getGenFunc();
-  Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET);
+  //  Function *F_nvptx = N->getGenFunc();
+  Function *F_nvptx = N->getGenFuncForTarget(hpvm::GPU_TARGET);
 
-  assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated");
+  assert(F_nvptx == NULL &&
+         "Error: Visiting a node for which code already generated");
   // Clone the function
   ValueToValueMapTy VMap;
 
-  //F_nvptx->setName(FName+"_nvptx");
+  // F_nvptx->setName(FName+"_nvptx");
 
   Twine FName = F->getName();
   StringRef fStr = FName.getSingleStringRef();
-  Twine newFName = Twine(fStr, "_nvptx"); 
+  Twine newFName = Twine(fStr, "_nvptx");
   F_nvptx = CloneFunction(F, VMap);
   F_nvptx->setName(newFName);
 
-  
   //  errs() << "Old Function Name: " << F->getName() << "\n";
   //  errs() << "New Function Name: " << F_nvptx->getName() << "\n";
 
   F_nvptx->removeFromParent();
 
-
   // Insert the cloned function into the kernels module
   KernelM->getFunctionList().push_back(F_nvptx);
 
-
-  //TODO: Iterate over all the instructions of F_nvptx and identify the
-  //callees and clone them into this module.
+  // TODO: Iterate over all the instructions of F_nvptx and identify the
+  // callees and clone them into this module.
   DEBUG(errs() << *F_nvptx->getType());
   DEBUG(errs() << *F_nvptx);
 
   // Transform  the function to void and remove all target dependent attributes
   // from the function
   F_nvptx = transformFunctionToVoid(F_nvptx);
-  
-  //Add generated function info to DFNode
-//  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
-  N->addGenFunc(F_nvptx, visc::GPU_TARGET, false);
 
-  DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n");
-  F_nvptx->removeAttributes(AttributeList::FunctionIndex, F_nvptx->getAttributes().getFnAttributes());
+  // Add generated function info to DFNode
+  //  N->setGenFunc(F_nvptx, hpvm::GPU_TARGET);
+  N->addGenFunc(F_nvptx, hpvm::GPU_TARGET, false);
+
+  DEBUG(
+      errs()
+      << "Removing all attributes from Kernel Function and adding nounwind\n");
+  F_nvptx->removeAttributes(AttributeList::FunctionIndex,
+                            F_nvptx->getAttributes().getFnAttributes());
   F_nvptx->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
 
-  //FIXME: For now, assume only one allocation node
+  // FIXME: For now, assume only one allocation node
   kernel->AllocationNode = NULL;
 
-  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
+  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(),
+                                       iee = N->indfedge_end();
        ieb != iee; ++ieb) {
     DFNode *SrcDFNode = (*ieb)->getSourceDF();
-    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Found edge from node: "
+                 << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
     DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
-    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
+    DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode()
+                 << "\n");
     if (!SrcDFNode->isDummyNode()) {
       assert(SrcDFNode->isAllocationNode());
       kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
@@ -1066,18 +1057,20 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // If no allocation node was found, SharedMemArgs is empty
   if (kernel->AllocationNode) {
     ValueToValueMapTy VMap;
-    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
-    //F_alloc->removeFromParent();
+    Function *F_alloc =
+        CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
+    // F_alloc->removeFromParent();
     // Insert the cloned function into the kernels module
-    //M.getFunctionList().push_back(F_alloc);
+    // M.getFunctionList().push_back(F_alloc);
 
-    std::vector<IntrinsicInst *> ViscMallocInstVec;
-    findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
+    std::vector<IntrinsicInst *> HPVMMallocInstVec;
+    findIntrinsicInst(F_alloc, Intrinsic::hpvm_malloc, HPVMMallocInstVec);
 
-    for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
-      IntrinsicInst *II = ViscMallocInstVec[i];
-      assert(II->hasOneUse() && "visc_malloc result is used more than once");
-      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
+    for (unsigned i = 0; i < HPVMMallocInstVec.size(); i++) {
+      IntrinsicInst *II = HPVMMallocInstVec[i];
+      assert(II->hasOneUse() && "hpvm_malloc result is used more than once");
+      II->replaceAllUsesWith(
+          ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
       II->eraseFromParent();
     }
     kernel->AllocationFunction = F_alloc;
@@ -1092,15 +1085,19 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         assert(RetStructTy && "Allocation node does not return a struct type");
         unsigned numFields = RetStructTy->getNumElements();
     */
-    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
-    AllocationNodeProperty* APN =
-      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
-    for (auto& AllocPair: APN->getAllocationList()) {
+    std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap =
+        kernel->getSharedInArgMap();
+    AllocationNodeProperty *APN =
+        (AllocationNodeProperty *)kernel->AllocationNode->getProperty(
+            DFNode::Allocation);
+    for (auto &AllocPair : APN->getAllocationList()) {
       unsigned destPos = AllocPair.first->getDestPosition();
       unsigned srcPos = AllocPair.first->getSourcePosition();
       SharedMemArgs.push_back(destPos);
-      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
-      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+      sharedInMap[destPos] =
+          std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
+      sharedInMap[destPos + 1] =
+          std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
     }
     kernel->setSharedInArgMap(sharedInMap);
   }
@@ -1110,12 +1107,14 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // global address space
   unsigned argIndex = 0;
   std::vector<unsigned> GlobalMemArgs;
-  for(Function::arg_iterator ai = F_nvptx->arg_begin(), ae = F_nvptx->arg_end();
-      ai != ae; ++ai) {
-    if (ai->getType()->isPointerTy()) {    
-      // If the arguement is already chosen for shared memory arguemnt list, skip.
-      // Else put it in Global memory arguement list
-      if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) {
+  for (Function::arg_iterator ai = F_nvptx->arg_begin(),
+                              ae = F_nvptx->arg_end();
+       ai != ae; ++ai) {
+    if (ai->getType()->isPointerTy()) {
+      // If the argument is already chosen for shared memory argument list,
+      // skip. Else put it in Global memory argument list
+      if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) ==
+          0) {
         GlobalMemArgs.push_back(argIndex);
       }
     }
@@ -1129,20 +1128,21 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // Optimization: Gloabl memory arguments, which are not modified and whose
   // loads are not dependent on node id of current node, should be moved to
   // constant memory, subject to size of course
-  std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
+  std::vector<unsigned> ConstantMemArgs =
+      globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
 
   F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE);
 
-// Function to replace call instructions to functions in the kernel
+  // Function to replace call instructions to functions in the kernel
   std::map<Function *, Function *> OrgToClonedFuncMap;
   std::vector<Function *> FuncToBeRemoved;
-  auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) {
-    Function* NewFunc;
+  auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) {
+    Function *NewFunc;
     // Check if the called function has already been cloned before.
     auto It = OrgToClonedFuncMap.find(OrgFunc);
-    if(It == OrgToClonedFuncMap.end()) {
+    if (It == OrgToClonedFuncMap.end()) {
       ValueToValueMapTy VMap;
       NewFunc = CloneFunction(OrgFunc, VMap);
       OrgToClonedFuncMap[OrgFunc] = NewFunc;
@@ -1151,43 +1151,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       NewFunc = (*It).second;
     }
     // Replace the calls to this function
-    std::vector<Value*> args;
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+    std::vector<Value *> args;
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
       args.push_back(CI->getArgOperand(i));
     }
-    CallInst* Inst = CallInst::Create(NewFunc, args,
-        OrgFunc->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
+    CallInst *Inst = CallInst::Create(
+        NewFunc, args,
+        OrgFunc->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
     CI->replaceAllUsesWith(Inst);
     IItoRemove.push_back(CI);
     return NewFunc;
   };
 
-
   // Go through all the instructions
-  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e;
+       ++i) {
     Instruction *I = &(*i);
-    // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+    // Leaf nodes should not contain HPVM graph intrinsics or launch
+    assert(!BuildDFG::isHPVMLaunchIntrinsic(I) &&
+           "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isHPVMGraphIntrinsic(I) &&
+           "HPVM graph intrinsic within a leaf dataflow node!");
 
-    if (BuildDFG::isViscIntrinsic(I)) {
-      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-      IntrinsicInst* ArgII;
-      DFNode* ArgDFNode;
+    if (BuildDFG::isHPVMIntrinsic(I)) {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+      IntrinsicInst *ArgII;
+      DFNode *ArgDFNode;
 
-      /************************ Handle VISC Query intrinsics ************************/
+      /************************ Handle HPVM Query intrinsics
+       * ************************/
 
       switch (II->getIntrinsicID()) {
-      /**************************** llvm.visc.getNode() *****************************/
-      case Intrinsic::visc_getNode: {
+      /**************************** llvm.hpvm.getNode()
+       * *****************************/
+      case Intrinsic::hpvm_getNode: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n");
         // add mapping <intrinsic, this node> to the node-specific map
         Leaf_HandleToDFNodeMap[II] = N;
         IItoRemove.push_back(II);
-      }
-      break;
-      /************************* llvm.visc.getParentNode() **************************/
-      case Intrinsic::visc_getParentNode: {
+      } break;
+      /************************* llvm.hpvm.getParentNode()
+       * **************************/
+      case Intrinsic::hpvm_getParentNode: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n");
         // get the parent node of the arg node
         // get argument node
@@ -1200,10 +1205,10 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /*************************** llvm.visc.getNumDims() ***************************/
-      case Intrinsic::visc_getNumDims: {
+      } break;
+      /*************************** llvm.hpvm.getNumDims()
+       * ***************************/
+      case Intrinsic::hpvm_getNumDims: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n");
         // get node from map
         // get the appropriate field
@@ -1211,47 +1216,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         int numOfDim = ArgDFNode->getNumOfDim();
         DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
-        IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext());
-        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+        IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext());
+        ConstantInt *numOfDimConstant =
+            ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
 
         // Replace the result of the intrinsic with the computed value
         II->replaceAllUsesWith(numOfDimConstant);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /*********************** llvm.visc.getNodeInstanceID() ************************/
-      case Intrinsic::visc_getNodeInstanceID_x:
-      case Intrinsic::visc_getNodeInstanceID_y:
-      case Intrinsic::visc_getNodeInstanceID_z: {
-        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n");
+      } break;
+      /*********************** llvm.hpvm.getNodeInstanceID()
+       * ************************/
+      case Intrinsic::hpvm_getNodeInstanceID_x:
+      case Intrinsic::hpvm_getNodeInstanceID_y:
+      case Intrinsic::hpvm_getNodeInstanceID_z: {
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n"
+                     << "\t: " << *II << "\n");
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         assert(ArgDFNode && "Arg node is NULL");
         // A leaf node always has a parent
-        DFNode* ParentDFNode = ArgDFNode->getParent();
+        DFNode *ParentDFNode = ArgDFNode->getParent();
         assert(ParentDFNode && "Parent node of a leaf is NULL");
 
         // Get the number associated with the required dimension
         // FIXME: The order is important!
         // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim = II->getIntrinsicID() -
-                       Intrinsic::visc_getNodeInstanceID_x;
+        uint64_t dim =
+            II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x;
         assert((dim < 3) && "Invalid dimension argument");
         DEBUG(errs() << "\t  dimension = " << dim << "\n");
 
         // Argument of the function to be called
-        ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        //ArrayRef<Value *> Args(DimConstant);
+        ConstantInt *DimConstant =
+            ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        // ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
-        Function * OpenCLFunction;
+        Function *OpenCLFunction;
 
-        FunctionType* FT =
-          FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                            Type::getInt32Ty(KernelM->getContext()),
-                            false);
+        FunctionType *FT =
+            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                              Type::getInt32Ty(KernelM->getContext()), false);
         if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
@@ -1260,838 +1266,860 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           // itself
           DEBUG(errs() << "Substitute with get_global_id()\n");
           DEBUG(errs() << *II << "\n");
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
-          //DEBUG(errs() << "Here inside cond 2\n");
+          // DEBUG(errs() << "Here inside cond 2\n");
           // We are asking for this node's id with respect to its parent
           // this is a local id call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)).getCallee());
-          //DEBUG(errs() << "exiting condition 2\n");
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT))
+                  .getCallee());
+          // DEBUG(errs() << "exiting condition 2\n");
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's id with respect to its
           // parent: this is a group id call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT))
+                  .getCallee());
         } else {
-          errs() << N->getFuncPointer()->getName() << "\n";
-          errs() << N->getParent()->getFuncPointer()->getName() << "\n";
-          errs() << *II << "\n";
+          DEBUG(errs() << N->getFuncPointer()->getName() << "\n");
+          DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n");
+          DEBUG(errs() << *II << "\n");
 
           assert(false && "Unable to translate getNodeInstanceID intrinsic");
         }
 
-        //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n");
-        //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n");
-        //DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
-        //DEBUG(errs() << "Argument: " << Args[0] << "\n");
-        //DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
+        // DEBUG(errs() << "Create call instruction, insert it before the intrinsic\n");
+        // DEBUG(errs() << "Function: " << *OpenCLFunction << "\n");
+        // DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
+        // DEBUG(errs() << "Argument: " << Args[0] << "\n");
+        // DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
         // Create call instruction, insert it before the intrinsic and
         // replace the uses of the previous instruction with the new one
-        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
-        //DEBUG(errs() << "Replace uses\n");
+        CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        // DEBUG(errs() << "Replace uses\n");
         II->replaceAllUsesWith(CI);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /********************** llvm.visc.getNumNodeInstances() ***********************/
-      case Intrinsic::visc_getNumNodeInstances_x:
-      case Intrinsic::visc_getNumNodeInstances_y:
-      case Intrinsic::visc_getNumNodeInstances_z: {
+      } break;
+      /********************** llvm.hpvm.getNumNodeInstances()
+       * ***********************/
+      case Intrinsic::hpvm_getNumNodeInstances_x:
+      case Intrinsic::hpvm_getNumNodeInstances_y:
+      case Intrinsic::hpvm_getNumNodeInstances_z: {
         // TODO: think about whether this is the best way to go there are hw
         // specific registers. therefore it is good to have the intrinsic but
         // then, why do we need to keep that info in the graph?  (only for the
         // kernel configuration during the call)
 
-        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n");
+        DEBUG(errs() << F_nvptx->getName()
+                     << "\t: Handling getNumNodeInstances\n");
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         // A leaf node always has a parent
-        DFNode* ParentDFNode = ArgDFNode->getParent();
+        DFNode *ParentDFNode = ArgDFNode->getParent();
         assert(ParentDFNode && "Parent node of a leaf is NULL");
 
         // Get the number associated with the required dimension
         // FIXME: The order is important!
         // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim = II->getIntrinsicID() -
-                       Intrinsic::visc_getNumNodeInstances_x;
+        uint64_t dim =
+            II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x;
         assert((dim < 3) && "Invalid dimension argument");
         DEBUG(errs() << "\t  dimension = " << dim << "\n");
 
         // Argument of the function to be called
-        ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        //ArrayRef<Value *> Args(DimConstant);
+        ConstantInt *DimConstant =
+            ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        // ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
-        Function * OpenCLFunction;
-        FunctionType* FT =
+        Function *OpenCLFunction;
+        FunctionType *FT =
             FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                              Type::getInt32Ty(KernelM->getContext()),
-                              false);
+                              Type::getInt32Ty(KernelM->getContext()), false);
 
         if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
           // launch, so the instances are global_size (gridDim x blockDim)
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's instances
           // this is a local size (block dim) call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's instances
           // this is a (global_size/local_size) (grid dim) call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT))
+                  .getCallee());
         } else {
           assert(false && "Unable to translate getNumNodeInstances intrinsic");
         }
 
         // Create call instruction, insert it before the intrinsic and
         // replace the uses of the previous instruction with the new one
-        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
         II->replaceAllUsesWith(CI);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      case Intrinsic::visc_barrier:
-      {
+      } break;
+      case Intrinsic::hpvm_barrier: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n");
         DEBUG(errs() << "Substitute with barrier()\n");
         DEBUG(errs() << *II << "\n");
-        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()),
-                                             std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())),
-                                             false);
-        Function* OpenCLFunction = cast<Function>
-                                   ((KernelM->getOrInsertFunction(StringRef("barrier"), FT)).getCallee());
-        CallInst* CI = CallInst::Create(OpenCLFunction,
-                                        ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)),
-                                        "", II);
+        FunctionType *FT = FunctionType::get(
+            Type::getVoidTy(KernelM->getContext()),
+            std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())),
+            false);
+        Function *OpenCLFunction = cast<Function>(
+            (KernelM->getOrInsertFunction(StringRef("barrier"), FT))
+                .getCallee());
+        CallInst *CI =
+            CallInst::Create(OpenCLFunction,
+                             ArrayRef<Value *>(ConstantInt::get(
+                                 Type::getInt32Ty(KernelM->getContext()), 1)),
+                             "", II);
         II->replaceAllUsesWith(CI);
         IItoRemove.push_back(II);
-      }
-      break;
-      case Intrinsic::visc_atomic_add:
-      case Intrinsic::visc_atomic_sub:
-      case Intrinsic::visc_atomic_xchg:
-      case Intrinsic::visc_atomic_min:
-      case Intrinsic::visc_atomic_max:
-      case Intrinsic::visc_atomic_and:
-      case Intrinsic::visc_atomic_or:
-      case Intrinsic::visc_atomic_xor:
-      {
+      } break;
+      case Intrinsic::hpvm_atomic_add:
+      case Intrinsic::hpvm_atomic_sub:
+      case Intrinsic::hpvm_atomic_xchg:
+      case Intrinsic::hpvm_atomic_min:
+      case Intrinsic::hpvm_atomic_max:
+      case Intrinsic::hpvm_atomic_and:
+      case Intrinsic::hpvm_atomic_or:
+      case Intrinsic::hpvm_atomic_xor: {
         DEBUG(errs() << *II << "\n");
         // Only have support for i32 atomic intrinsics
-        assert(II->getType() == Type::getInt32Ty(II->getContext())
-               && "Only support i32 atomic intrinsics for now");
+        assert(II->getType() == Type::getInt32Ty(II->getContext()) &&
+               "Only support i32 atomic intrinsics for now");
         // Substitute with atomicrmw instruction
-        assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics");
-        Value* Ptr = II->getArgOperand(0);
-        Value* Val = II->getArgOperand(1);
-        assert(Ptr->getType()->isPointerTy()
-               && "First argument of supported atomics is expected to be a pointer");
-        PointerType* PtrTy = cast<PointerType>(Ptr->getType());
-        PointerType* TargetTy = Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace());
+        assert(II->getNumArgOperands() == 2 &&
+               "Expecting 2 operands for these atomics");
+        Value *Ptr = II->getArgOperand(0);
+        Value *Val = II->getArgOperand(1);
+        assert(
+            Ptr->getType()->isPointerTy() &&
+            "First argument of supported atomics is expected to be a pointer");
+        PointerType *PtrTy = cast<PointerType>(Ptr->getType());
+        PointerType *TargetTy =
+            Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace());
         if (PtrTy != TargetTy) {
           Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II);
           PtrTy = TargetTy;
         }
 
-			 std::string name;
-			 if(II->getIntrinsicID() == Intrinsic::visc_atomic_add)
-				 name = "atomic_add";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
-				 name = "atomic_sub";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
-				 name = "atomic_xchg";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min)
-				 name = "atomic_min";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max)
-				 name = "atomic_max";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and)
-				 name = "atomic_and";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or)
-				 name = "atomic_or";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
-				 name = "atomic_xor";
-			 Type* paramTypes[] = {PtrTy, Val->getType()};
-			 FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false);	
-			 FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT);				
-
-			 Value* Params[] = {Ptr, Val};
-			 CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II);
-			 DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
-			 II->replaceAllUsesWith(AtomCI);
-			 IItoRemove.push_back(II);
-			}
-			break;
-			default:
-			llvm_unreachable("Unknown VISC Intrinsic!");
-			break;
-			}
-
-		}
-		else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
-			IRBuilder<> Builder(I);
-			Value *Source = MemCpyI->getSource();
-			Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
-			Value *Length = MemCpyI->getOperand(2);
-			DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
-			DEBUG(errs() << "Source: " << *Source << "\n"); 
-			DEBUG(errs() << "Destination: " << *Destination << "\n"); 
-			DEBUG(errs() << "Length: " << *Length << "\n");
-
-			size_t memcpy_length;
-			unsigned int memcpy_count;
-			if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) {
-				if (CI->getBitWidth() <= 64) {
-					memcpy_length = CI->getSExtValue();
-					DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
-					Type *Source_Type = Source->getType()->getPointerElementType();
-					DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
-					memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
-					DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
-					if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) {
-						if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) {
-							Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
-							Value *DestPtrOperand = destGEPI->getPointerOperand();
-							for(int i = 0; i < memcpy_count; ++i) {
-								Constant *increment;
-								LoadInst *newLoadI;
-								StoreInst *newStoreI;
-								// First, need to increment the correct index for both source and dest 
-								// This invluves checking to see how many indeces the GEP has
-								// Assume for now only 1 or 2 are the viable options.
-
-								std::vector<Value*> GEPlIndex;
-								if (sourceGEPI->getNumIndices() == 1) {
-									Value *Index = sourceGEPI->getOperand(1);      
-									increment = ConstantInt::get(Index->getType(), i, false);
-									Value *incAdd = Builder.CreateAdd(Index, increment);
-									DEBUG(errs() << "Add: " << *incAdd << "\n");
-									GEPlIndex.push_back(incAdd);
-									Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex));
-									DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
-									newLoadI = Builder.CreateLoad(newGEPIl);
-									DEBUG(errs() << "Load: " << *newLoadI << "\n");
-								} else { 
-									llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n");
-								}
-
-
-								std::vector<Value*> GEPsIndex;
-								if (destGEPI->getNumIndices() == 1) {
-
-								} else if (destGEPI->getNumIndices() == 2) {
-									Value *Index0 = destGEPI->getOperand(1);      
-									GEPsIndex.push_back(Index0);
-									Value *Index1 = destGEPI->getOperand(2);      
-									increment = ConstantInt::get(Index1->getType(), i, false);
-									Value *incAdd = Builder.CreateAdd(Index1, increment);
-									DEBUG(errs() << "Add: " << *incAdd << "\n");
-									GEPsIndex.push_back(incAdd);
-									Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex));
-									DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
-									newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile());
-									DEBUG(errs() << "Store: " << *newStoreI << "\n");
-								} else {
-									llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n");
-								}
-							}
-							IItoRemove.push_back(sourceGEPI);
-							IItoRemove.push_back(destGEPI);
-							Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
-							Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
-							IItoRemove.push_back(destBitcastI);
-							IItoRemove.push_back(sourceBitcastI);
-							IItoRemove.push_back(MemCpyI);
-						}
-					}
-
-				}
-			} else {
-				llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
-			}
-			//      llvm_unreachable("HERE!");
-		}
-
-		else if(CallInst* CI = dyn_cast<CallInst>(I)) {
-			DEBUG(errs() << "Found a call: " << *CI << "\n");
-			Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
-			if(calleeF->isDeclaration()) {
-				// Add the declaration to kernel module
-				if (calleeF->getName() == "sqrtf") {
-					calleeF->setName(Twine("sqrt"));
-					DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-					DEBUG(errs() << "CI: " << *CI << "\n");
-				} else if (calleeF->getName() == "rsqrtf") {
-					calleeF->setName(Twine("rsqrt"));
-					DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-					DEBUG(errs() << "CI: " << *CI << "\n");
-				}  
-				DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
-				KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
-			}
-			else {
-				// Check if the called function has already been cloned before.
-				Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
-				// Iterate over the new function to see if it calls any other functions
-				// in the module.
-				for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
-					if(auto *Call = dyn_cast<CallInst>(&*i)) {
-						Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
-						CloneAndReplaceCall(Call, CalledFunc);
-					}
-				}
-			}
-			//TODO: how to handle address space qualifiers in load/store
-		}
-
-	}
-	// search for pattern where float is being casted to int and loaded/stored and change it.	
-	DEBUG(errs() << "finding pattern for replacement!\n");
-	for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
-		bool cont = false;
-		bool keepGEPI = false;
-		bool keepGEPI2= false;
-		Instruction *I = &(*i);
-		GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I);
-
-		if (!GEPI) {
-			// did nod find pattern start, continue
-			continue;
-		}
-		// may have found pattern, check
-		DEBUG(errs() << "GEPI " << *GEPI << "\n");
-		// print whatever we want for debug
-		Value* PtrOp = GEPI->getPointerOperand();
-		Type *SrcTy = GEPI->getSourceElementType();
-		unsigned GEPIaddrspace = GEPI->getAddressSpace();
-
-		if (SrcTy->isArrayTy()) 
-			DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n");
-		else
-			DEBUG(errs() << *SrcTy << " is not an array type!\n");
-		// check that source element type is float
-		if (SrcTy->isArrayTy()) {
-			if (!(SrcTy->getArrayElementType()->isFloatTy())) {
-				DEBUG(errs() << "GEPI type is array but not float!\n");
-				continue;
-			}
-		}
-		else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) {
-			DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
-			// does not fit this pattern - no float GEP instruction
-			continue;
-		}
-		// check that addressspace is 1
-		//	  if (GEPIaddrspace != 1) {
-		//			// does not fit this pattern - addrspace of pointer argument is not global
-		//			continue;
-		//		}
-		if (!(GEPI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			//continue;
-			// Keep GEPI around if it has other uses
-			keepGEPI = true;
-		}
-		DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
-
-		// 1st GEPI it has one use
-		//		assert(GEPI->hasOneUse() && "GEPI has a single use");
-
-		// See if it is a bitcast
-		BitCastInst *BitCastI;
-		for (User * U : GEPI->users()) {
-			if(Instruction *ui = dyn_cast<Instruction> (U)) { 
-				DEBUG(errs() << "--" << *ui << "\n");
-				if (isa<BitCastInst>(ui)) {
-					BitCastI = dyn_cast<BitCastInst>(ui);
-					DEBUG(errs() << "---Found bitcast as only use of GEP\n");
-					break;
-				}
-			}
-			DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
-			cont = true;
-		}
-		//		for (Value::user_iterator ui = GEPI->user_begin(),
-		//				ue = GEPI->user_end(); ui!=ue; ++ui) {
-		//        DEBUG(errs() << "--" << *ui << "\n");
-		//			if (isa<BitCastInst>(*ui)) {
-		//				BitCastI = dyn_cast<BitCastInst>(*ui);
-		//        DEBUG(errs() << "Found bitcast as only use of GEP\n");
-		//			}
-		//		}
-
-		if (cont/*!BitCastI*/) {
-			continue; // not in pattern
-		}
-
-		//    DEBUG(errs() << *BitCastI << "\n");
-		// Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP.
-		Value *Op2 = BitCastI->getOperand(0);
-		DEBUG(errs() << "----" << *Op2 << "\n");
-		//		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
-		//		Type *OpTy = cast<Type>(Op2);
-		Type *OpTy = BitCastI->getDestTy();
-		DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
-		//    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n");
-		if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
-			// maybe right syntax is (Type::getInt32Ty)->getPointerTo()
-			continue; // not in pattern
-		}
-
-		DEBUG(errs() << "----Here!\n");
-		// We are in GEP, bitcast.
-
-		// user_iterator, to find the load.
-
-		if (!(BitCastI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			continue;
-		}
-		DEBUG(errs() << "----Bitcast has one use!\n");
-		// it has one use
-		assert(BitCastI->hasOneUse() && "BitCastI has a single use");
-		LoadInst *LoadI;
-		for (User * U : BitCastI->users()) { 
-			if (Instruction *ui = dyn_cast<Instruction> (U)) {
-				DEBUG(errs() << "-----" << *ui << "\n");
-				if (isa<LoadInst>(ui)) {
-					LoadI = dyn_cast<LoadInst>(ui);
-					DEBUG(errs() << "-----Found load as only use of bitcast\n");
-					break;
-				}
-			}
-			DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
-			cont = true;
-		}
-		//		for (Value::user_iterator ui = BitCastI->user_begin(),
-		//				ue = BitCastI->user_end(); ui!=ue; ++ui) {
-		//			if (isa<LoadInst>(*ui)) {
-		//				LoadI = dyn_cast<LoadInst>(*ui);
-		//        errs() << "Found load as only use of bitcast\n";
-		//			}
-		//		}
-
-		if (cont) {
-			continue; // not in pattern
-		}
-
-		DEBUG("HERE!\n");
-		// check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from
-		assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n");
-
-		// Copy user_iterator, to find the store.
-
-		if (!(LoadI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			continue;
-			// TODO: generalize: one load can have more than one store users
-		}
-
-		// it has one use
-		assert(LoadI->hasOneUse() && "LoadI has a single use");
-		Value::user_iterator ui = LoadI->user_begin();
-		// skipped loop, because is has a single use
-		StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
-		if (!StoreI) {
-			continue; // not in pattern
-		}
-
-		// Also check that the store uses the loaded value as the value operand
-		if (StoreI->getValueOperand() != LoadI) {
-			continue;
-		}
-
-		DEBUG(errs() << "-------Found store instruction\n");
-
-		// Look for its bitcast, which is its pointer operand
-		Value *StPtrOp = StoreI->getPointerOperand();
-		DEBUG(errs() << "-------" << *StPtrOp << "\n");
-		BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
-		DEBUG(errs() << "-------" << *BitCastI2 << "\n");
-		if (!BitCastI2) {
-			continue; //not in pattern
-		}
-
-		DEBUG(errs() << "-------- Found Bit Cast of store!\n" );
-		// found bitcast. Look for the second GEP, its from operand.
-		Value *BCFromOp = BitCastI2->getOperand(0);
-		GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
-		DEBUG(errs() << "---------- " << *GEPI2 << "\n");
-		if (!GEPI2) {
-			continue; //not in pattern
-		}
-
-		if (!(GEPI2->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			//continue;
-			// Keep GEPI around if it has other uses
-			keepGEPI2 = true;
-		}
-		DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); 
-
-		Value *PtrOp2 = GEPI2->getPointerOperand();
-
-		// Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above.
-
-		// Assume we found pattern
-		if (!keepGEPI) {  
-			IItoRemove.push_back(GEPI);
-			DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
-		} else {
-			DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
-		}
-		IItoRemove.push_back(BitCastI);
-		DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
-		IItoRemove.push_back(LoadI);
-		DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
-		IItoRemove.push_back(GEPI2);
-		DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
-		IItoRemove.push_back(BitCastI2);
-		DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
-		if (!keepGEPI2) {
-			IItoRemove.push_back(StoreI);
-			DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
-		} else {
-
-			DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n");
-		}
-
-		std::vector<Value*> GEPlIndex;
-		if (GEPI->hasIndices()) {
-			for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
-				Value *Index = dyn_cast<Value>(&*ii);
-				DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
-				GEPlIndex.push_back(Index);
-			}
-		}
-		//    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
-
-		std::vector<Value*> GEPsIndex;
-		if (GEPI2->hasIndices()) {
-			for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
-				Value *Index = dyn_cast<Value>(&*ii);
-				DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
-				GEPsIndex.push_back(Index);
-			}
-		}
-		//    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
-
-
-
-		//    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
-		GetElementPtrInst* newlGEP =
-			GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()),
-					PtrOp, // operand from 1st GEP
-					ArrayRef<Value*>(GEPlIndex),
-					Twine(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newlGEP << "\n");
-		// insert load before GEPI
-		LoadInst *newLoadI =
-			new LoadInst(Type::getFloatTy(M.getContext()),
-					newlGEP, // new GEP
-					Twine(),
-					LoadI->isVolatile(),
-					LoadI->getAlignment(),
-					LoadI->getOrdering(),
-					LoadI->getSyncScopeID(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newLoadI << "\n");
-		// same for GEP for store, for store operand
-		GetElementPtrInst* newsGEP =
-			GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
-					PtrOp2, // operand from 2nd GEP
-					ArrayRef<Value*>(GEPsIndex),
-					Twine(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newsGEP << "\n");
-		// insert store before GEPI
-		StoreInst *newStoreI =
-			new StoreInst(newLoadI,
-					newsGEP, // new GEP
-					StoreI->isVolatile(),
-					StoreI->getAlignment(),
-					StoreI->getOrdering(),
-					StoreI->getSyncScopeID(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newStoreI << "\n");
-
-	}
-
-	// We need to do this explicitly: DCE pass will not remove them because we
-	// have assumed theworst memory behaviour for these function calls
-	// Traverse the vector backwards, otherwise definitions are deleted while
-	// their subsequent uses are still around
-	for (auto *I : reverse(IItoRemove)) {
-		DEBUG(errs() << "Erasing: " << *I << "\n");
-		I->eraseFromParent();
-	}
-
-	// Removed the cloned functions from the parent module into the new module 
-	for(auto *F : FuncToBeRemoved) {
-		F->removeFromParent(); //TODO: MARIA check
-		KernelM->getFunctionList().push_back(F);
-	}
-
-	addCLMetadata(F_nvptx);
-	kernel->KernelFunction = F_nvptx;
-	errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
-	DEBUG(errs() << *KernelM);
-
-	return;
-}
+        std::string name;
+        if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_add)
+          name = "atomic_add";
+        else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_sub)
+          name = "atomic_sub";
+        else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xchg)
+          name = "atomic_xchg";
+        else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_min)
+          name = "atomic_min";
+        else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_max)
+          name = "atomic_max";
+        else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_and)
+          name = "atomic_and";
+        else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_or)
+          name = "atomic_or";
+        else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xor)
+          name = "atomic_xor";
+        Type *paramTypes[] = {PtrTy, Val->getType()};
+        FunctionType *AtomFuncT = FunctionType::get(
+            II->getType(), ArrayRef<Type *>(paramTypes, 2), false);
+        FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT);
+
+        Value *Params[] = {Ptr, Val};
+        CallInst *AtomCI = CallInst::Create(
+            AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II);
+        DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
+        II->replaceAllUsesWith(AtomCI);
+        IItoRemove.push_back(II);
+      } break;
+      default:
+        llvm_unreachable("Unknown HPVM Intrinsic!");
+        break;
+      }
 
-bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
-	errs() << "\nDFG2LLVM_NVPTX PASS\n";
+    } else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
+      IRBuilder<> Builder(I);
+      Value *Source = MemCpyI->getSource();
+      Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
+      Value *Length = MemCpyI->getOperand(2);
+      DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
+      DEBUG(errs() << "Source: " << *Source << "\n");
+      DEBUG(errs() << "Destination: " << *Destination << "\n");
+      DEBUG(errs() << "Length: " << *Length << "\n");
+
+      size_t memcpy_length;
+      unsigned int memcpy_count;
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Length)) {
+        if (CI->getBitWidth() <= 64) {
+          memcpy_length = CI->getSExtValue();
+          DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
+          Type *Source_Type = Source->getType()->getPointerElementType();
+          DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
+          memcpy_count =
+              memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
+          DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
+          if (GetElementPtrInst *sourceGEPI =
+                  dyn_cast<GetElementPtrInst>(Source)) {
+            if (GetElementPtrInst *destGEPI =
+                    dyn_cast<GetElementPtrInst>(Destination)) {
+              Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
+              Value *DestPtrOperand = destGEPI->getPointerOperand();
+              for (int i = 0; i < memcpy_count; ++i) {
+                Constant *increment;
+                LoadInst *newLoadI;
+                StoreInst *newStoreI;
+                // First, need to increment the correct index for both source
+                // and dest. This involves checking to see how many indices the
+                // GEP has. Assume for now only 1 or 2 are the viable options.
+
+                std::vector<Value *> GEPlIndex;
+                if (sourceGEPI->getNumIndices() == 1) {
+                  Value *Index = sourceGEPI->getOperand(1);
+                  increment = ConstantInt::get(Index->getType(), i, false);
+                  Value *incAdd = Builder.CreateAdd(Index, increment);
+                  DEBUG(errs() << "Add: " << *incAdd << "\n");
+                  GEPlIndex.push_back(incAdd);
+                  Value *newGEPIl = Builder.CreateGEP(
+                      SourcePtrOperand, ArrayRef<Value *>(GEPlIndex));
+                  DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
+                  newLoadI = Builder.CreateLoad(newGEPIl);
+                  DEBUG(errs() << "Load: " << *newLoadI << "\n");
+                } else {
+                  llvm_unreachable("Unhandled case where source GEPI has more "
+                                   "than 1 indices!\n");
+                }
+
+                std::vector<Value *> GEPsIndex;
+                if (destGEPI->getNumIndices() == 1) {
+
+                } else if (destGEPI->getNumIndices() == 2) {
+                  Value *Index0 = destGEPI->getOperand(1);
+                  GEPsIndex.push_back(Index0);
+                  Value *Index1 = destGEPI->getOperand(2);
+                  increment = ConstantInt::get(Index1->getType(), i, false);
+                  Value *incAdd = Builder.CreateAdd(Index1, increment);
+                  DEBUG(errs() << "Add: " << *incAdd << "\n");
+                  GEPsIndex.push_back(incAdd);
+                  Value *newGEPIs = Builder.CreateGEP(
+                      DestPtrOperand, ArrayRef<Value *>(GEPsIndex));
+                  DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
+                  newStoreI = Builder.CreateStore(newLoadI, newGEPIs,
+                                                  MemCpyI->isVolatile());
+                  DEBUG(errs() << "Store: " << *newStoreI << "\n");
+                } else {
+                  llvm_unreachable("Unhandled case where dest GEPI has more "
+                                   "than 2 indices!\n");
+                }
+              }
+              IItoRemove.push_back(sourceGEPI);
+              IItoRemove.push_back(destGEPI);
+              Instruction *destBitcastI =
+                  dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
+              Instruction *sourceBitcastI =
+                  dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
+              IItoRemove.push_back(destBitcastI);
+              IItoRemove.push_back(sourceBitcastI);
+              IItoRemove.push_back(MemCpyI);
+            }
+          }
+        }
+      } else {
+        llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
+      }
+      //      llvm_unreachable("HERE!");
+    }
 
-	// Get the BuildDFG Analysis Results:
-	// - Dataflow graph
-	// - Maps from i8* hansles to DFNode and DFEdge
-	BuildDFG &DFG = getAnalysis<BuildDFG>();
+    else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      DEBUG(errs() << "Found a call: " << *CI << "\n");
+      Function *calleeF =
+          cast<Function>(CI->getCalledValue()->stripPointerCasts());
+      if (calleeF->isDeclaration()) {
+        // Add the declaration to kernel module
+        if (calleeF->getName() == "sqrtf") {
+          calleeF->setName(Twine("sqrt"));
+          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
+          DEBUG(errs() << "CI: " << *CI << "\n");
+        } else if (calleeF->getName() == "rsqrtf") {
+          calleeF->setName(Twine("rsqrt"));
+          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
+          DEBUG(errs() << "CI: " << *CI << "\n");
+        }
+        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF
+                     << "\n");
+        KernelM->getOrInsertFunction(calleeF->getName(),
+                                     calleeF->getFunctionType());
+      } else {
+        // Check if the called function has already been cloned before.
+        Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
+        // Iterate over the new function to see if it calls any other functions
+        // in the module.
+        for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc);
+             i != e; ++i) {
+          if (auto *Call = dyn_cast<CallInst>(&*i)) {
+            Function *CalledFunc =
+                cast<Function>(Call->getCalledValue()->stripPointerCasts());
+            CloneAndReplaceCall(Call, CalledFunc);
+          }
+        }
+      }
+      // TODO: how to handle address space qualifiers in load/store
+    }
+  }
+  // search for pattern where float is being casted to int and loaded/stored and
+  // change it.
+  DEBUG(errs() << "finding pattern for replacement!\n");
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e;
+       ++i) {
+    bool cont = false;
+    bool keepGEPI = false;
+    bool keepGEPI2 = false;
+    Instruction *I = &(*i);
+    GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
 
-	// DFInternalNode *Root = DFG.getRoot();
-	std::vector<DFInternalNode*> Roots = DFG.getRoots();
-	//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-	//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+    if (!GEPI) {
+      // did not find pattern start, continue
+      continue;
+    }
+    // may have found pattern, check
+    DEBUG(errs() << "GEPI " << *GEPI << "\n");
+    // print whatever we want for debug
+    Value *PtrOp = GEPI->getPointerOperand();
+    Type *SrcTy = GEPI->getSourceElementType();
+    unsigned GEPIaddrspace = GEPI->getAddressSpace();
+
+    if (SrcTy->isArrayTy())
+      DEBUG(errs() << *SrcTy << " is an array type! "
+                   << *(SrcTy->getArrayElementType()) << "\n");
+    else
+      DEBUG(errs() << *SrcTy << " is not an array type!\n");
+    // check that source element type is float
+    if (SrcTy->isArrayTy()) {
+      if (!(SrcTy->getArrayElementType()->isFloatTy())) {
+        DEBUG(errs() << "GEPI type is array but not float!\n");
+        continue;
+      }
+    } else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) {
+      DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
+      // does not fit this pattern - no float GEP instruction
+      continue;
+    }
+    // check that addressspace is 1
+    //	  if (GEPIaddrspace != 1) {
+    //			// does not fit this pattern - addrspace of pointer argument is not global
+    //			continue;
+    //		}
+    if (!(GEPI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      // continue;
+      // Keep GEPI around if it has other uses
+      keepGEPI = true;
+    }
+    DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
+
+    // 1st GEPI it has one use
+    //		assert(GEPI->hasOneUse() && "GEPI has a single use");
+
+    // See if it is a bitcast
+    BitCastInst *BitCastI;
+    for (User *U : GEPI->users()) {
+      if (Instruction *ui = dyn_cast<Instruction>(U)) {
+        DEBUG(errs() << "--" << *ui << "\n");
+        if (isa<BitCastInst>(ui)) {
+          BitCastI = dyn_cast<BitCastInst>(ui);
+          DEBUG(errs() << "---Found bitcast as only use of GEP\n");
+          break;
+        }
+      }
+      DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
+      cont = true;
+    }
+    //		for (Value::user_iterator ui = GEPI->user_begin(),
+    //				ue = GEPI->user_end(); ui!=ue; ++ui) {
+    //        DEBUG(errs() << "--" << *ui << "\n");
+    //			if (isa<BitCastInst>(*ui)) {
+    //				BitCastI = dyn_cast<BitCastInst>(*ui);
+    //        DEBUG(errs() << "Found bitcast as only use of GEP\n");
+    //			}
+    //		}
+
+    if (cont /*!BitCastI*/) {
+      continue; // not in pattern
+    }
 
-	// Visitor for Code Generation Graph Traversal
-	CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+    //    DEBUG(errs() << *BitCastI << "\n");
+    // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand
+    // has to be the GEP, since this is a use of the GEP.
+    Value *Op2 = BitCastI->getOperand(0);
+    DEBUG(errs() << "----" << *Op2 << "\n");
+    //		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
+    //		Type *OpTy = cast<Type>(Op2);
+    Type *OpTy = BitCastI->getDestTy();
+    DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
+    //    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) <<
+    //    "\n");
+    if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
+      // maybe right syntax is (Type::getInt32Ty)->getPointerTo()
+      continue; // not in pattern
+    }
 
-	// Iterate over all the DFGs and produce code for each one of them
-	for (auto rootNode: Roots) {
-		// Initiate code generation for root DFNode
-		CGTVisitor->visit(rootNode);
-	}
+    DEBUG(errs() << "----Here!\n");
+    // We are in GEP, bitcast.
 
-	CGTVisitor->writeKernelsModule();
+    // user_iterator, to find the load.
 
-	//TODO: Edit module epilogue to remove the VISC intrinsic declarations
-	delete CGTVisitor;
+    if (!(BitCastI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      continue;
+    }
+    DEBUG(errs() << "----Bitcast has one use!\n");
+    // it has one use
+    assert(BitCastI->hasOneUse() && "BitCastI has a single use");
+    LoadInst *LoadI;
+    for (User *U : BitCastI->users()) {
+      if (Instruction *ui = dyn_cast<Instruction>(U)) {
+        DEBUG(errs() << "-----" << *ui << "\n");
+        if (isa<LoadInst>(ui)) {
+          LoadI = dyn_cast<LoadInst>(ui);
+          DEBUG(errs() << "-----Found load as only use of bitcast\n");
+          break;
+        }
+      }
+      DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
+      cont = true;
+    }
+    //		for (Value::user_iterator ui = BitCastI->user_begin(),
+    //				ue = BitCastI->user_end(); ui!=ue; ++ui) {
+    //			if (isa<LoadInst>(*ui)) {
+    //				LoadI = dyn_cast<LoadInst>(*ui);
+    //        errs() << "Found load as only use of bitcast\n";
+    //			}
+    //		}
+
+    if (cont) {
+      continue; // not in pattern
+    }
+
+    DEBUG("HERE!\n");
+    // check that we load from pointer we got from bitcast - assert - the unique
+    // argument must be the use we found it from
+    assert(LoadI->getPointerOperand() == BitCastI &&
+           "Unexpected Load Instruction Operand\n");
+
+    // Copy user_iterator, to find the store.
+
+    if (!(LoadI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      continue;
+      // TODO: generalize: one load can have more than one store users
+    }
+
+    // it has one use
+    assert(LoadI->hasOneUse() && "LoadI has a single use");
+    Value::user_iterator ui = LoadI->user_begin();
+    // skipped loop, because it has a single use
+    StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
+    if (!StoreI) {
+      continue; // not in pattern
+    }
 
-	return true;
+    // Also check that the store uses the loaded value as the value operand
+    if (StoreI->getValueOperand() != LoadI) {
+      continue;
+    }
+
+    DEBUG(errs() << "-------Found store instruction\n");
+
+    // Look for its bitcast, which is its pointer operand
+    Value *StPtrOp = StoreI->getPointerOperand();
+    DEBUG(errs() << "-------" << *StPtrOp << "\n");
+    BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
+    DEBUG(errs() << "-------" << *BitCastI2 << "\n");
+    if (!BitCastI2) {
+      continue; // not in pattern
+    }
+
+    DEBUG(errs() << "-------- Found Bit Cast of store!\n");
+    // found bitcast. Look for the second GEP, its from operand.
+    Value *BCFromOp = BitCastI2->getOperand(0);
+    GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
+    DEBUG(errs() << "---------- " << *GEPI2 << "\n");
+    if (!GEPI2) {
+      continue; // not in pattern
+    }
+
+    if (!(GEPI2->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      // continue;
+      // Keep GEPI around if it has other uses
+      keepGEPI2 = true;
+    }
+    DEBUG(errs() << "---------- Found GEPI of Bitcast!\n");
+
+    Value *PtrOp2 = GEPI2->getPointerOperand();
+
+    // Found GEPI2. TODO: kind of confused as to what checks I need to add here,
+    // let's add them together- all the code for int-float type checks is
+    // already above.
+
+    // Assume we found pattern
+    if (!keepGEPI) {
+      IItoRemove.push_back(GEPI);
+      DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
+    } else {
+      DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
+    }
+    IItoRemove.push_back(BitCastI);
+    DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
+    IItoRemove.push_back(LoadI);
+    DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
+    IItoRemove.push_back(GEPI2);
+    DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
+    IItoRemove.push_back(BitCastI2);
+    DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
+    if (!keepGEPI2) {
+      IItoRemove.push_back(StoreI);
+      DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
+    } else {
+
+      DEBUG(errs() << "Keeping " << *StoreI
+                   << " since it has multiple uses!\n");
+    }
+
+    std::vector<Value *> GEPlIndex;
+    if (GEPI->hasIndices()) {
+      for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
+        Value *Index = dyn_cast<Value>(&*ii);
+        DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
+        GEPlIndex.push_back(Index);
+      }
+    }
+    //    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
+
+    std::vector<Value *> GEPsIndex;
+    if (GEPI2->hasIndices()) {
+      for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
+        Value *Index = dyn_cast<Value>(&*ii);
+        DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
+        GEPsIndex.push_back(Index);
+      }
+    }
+    //    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
+
+    //    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
+    GetElementPtrInst *newlGEP = GetElementPtrInst::Create(
+        GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+        PtrOp,                        // operand from 1st GEP
+        ArrayRef<Value *>(GEPlIndex), Twine(), StoreI);
+    DEBUG(errs() << "Adding: " << *newlGEP << "\n");
+    // insert load before StoreI
+    LoadInst *newLoadI =
+        new LoadInst(Type::getFloatTy(M.getContext()),
+                     newlGEP, // new GEP
+                     Twine(), LoadI->isVolatile(), LoadI->getAlignment(),
+                     LoadI->getOrdering(), LoadI->getSyncScopeID(), StoreI);
+    DEBUG(errs() << "Adding: " << *newLoadI << "\n");
+    // same for GEP for store, for store operand
+    GetElementPtrInst *newsGEP = GetElementPtrInst::Create(
+        GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+        PtrOp2,                        // operand from 2nd GEP
+        ArrayRef<Value *>(GEPsIndex), Twine(), StoreI);
+    DEBUG(errs() << "Adding: " << *newsGEP << "\n");
+    // insert new store before StoreI
+    StoreInst *newStoreI =
+        new StoreInst(newLoadI,
+                      newsGEP, // new GEP
+                      StoreI->isVolatile(), StoreI->getAlignment(),
+                      StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI);
+    DEBUG(errs() << "Adding: " << *newStoreI << "\n");
+  }
+
+  // We need to do this explicitly: DCE pass will not remove them because we
+  // have assumed the worst memory behaviour for these function calls.
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around.
+  for (auto *I : reverse(IItoRemove)) {
+    DEBUG(errs() << "Erasing: " << *I << "\n");
+    I->eraseFromParent();
+  }
+
+  // Move the cloned functions from the parent module into the new module
+  for (auto *F : FuncToBeRemoved) {
+    F->removeFromParent(); // TODO: MARIA check
+    KernelM->getFunctionList().push_back(F);
+  }
+
+  addCLMetadata(F_nvptx);
+  kernel->KernelFunction = F_nvptx;
+  DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName()
+               << "\n");
+  DEBUG(errs() << *KernelM);
+
+  return;
 }
 
-std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
-	/*SmallString<128> currentDir;
-		llvm::sys::fs::current_path(currentDir);
-		std::string fileName = getFilenameFromModule(M);
-		Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-		return output.str().append(".kernels.ll");*/
-	std::string mid = M.getModuleIdentifier();
-	return mid.append(".kernels.ll");
+bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
+  DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n");
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode *> Roots = DFG.getRoots();
+  //    BuildDFG::HandleToDFNode &HandleToDFNodeMap =
+  //    DFG.getHandleToDFNodeMap(); BuildDFG::HandleToDFEdge &HandleToDFEdgeMap
+  //    = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode : Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+  }
+
+  CGTVisitor->writeKernelsModule();
+
+  // TODO: Edit module epilogue to remove the HPVM intrinsic declarations
+  delete CGTVisitor;
+
+  return true;
 }
 
-void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) {
-	assert(isa<PointerType>(V->getType())
-			&& "Value should be of Pointer Type!");
-	PointerType* OldTy = cast<PointerType>(V->getType());
-	PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-	V->mutateType(NewTy);
-	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
-		// Change all uses producing pointer type in same address space to new
-		// addressspace.
-		if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
-			if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-				fixValueAddrspace(*ui, addrspace);
-			}
-		}
-	}
+std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
+  /*SmallString<128> currentDir;
+          llvm::sys::fs::current_path(currentDir);
+          std::string fileName = getFilenameFromModule(M);
+          Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+          return output.str().append(".kernels.ll");*/
+  std::string mid = M.getModuleIdentifier();
+  return mid.append(".kernels.ll");
 }
 
+void CGT_NVPTX::fixValueAddrspace(Value *V, unsigned addrspace) {
+  assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!");
+  PointerType *OldTy = cast<PointerType>(V->getType());
+  PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+  V->mutateType(NewTy);
+  for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
+       ui++) {
+    // Change all uses producing pointer type in same address space to new
+    // address space.
+    if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) {
+      if (PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+        fixValueAddrspace(*ui, addrspace);
+      }
+    }
+  }
+}
 
-std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) {
-	std::vector<unsigned> ConstantMemArgs;
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Argument* arg = &*ai; 
-		std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
-				GlobalMemArgs->end(), arg->getArgNo());
-		// It has to be a global memory argument to be promotable
-		if(pos == GlobalMemArgs->end())
-			continue;
-
-		// Check if it can/should be promoted
-		if(canBePromoted(arg, F)) {
-			errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n";
-			ConstantMemArgs.push_back(arg->getArgNo());
-			GlobalMemArgs->erase(pos);
-		}
-	}
-	return ConstantMemArgs;
+std::vector<unsigned>
+CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs,
+                                     Function *F) {
+  std::vector<unsigned> ConstantMemArgs;
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Argument *arg = &*ai;
+    std::vector<unsigned>::iterator pos = std::find(
+        GlobalMemArgs->begin(), GlobalMemArgs->end(), arg->getArgNo());
+    // It has to be a global memory argument to be promotable
+    if (pos == GlobalMemArgs->end())
+      continue;
+
+    // Check if it can/should be promoted
+    if (canBePromoted(arg, F)) {
+      DEBUG(errs() << "Promoting << " << arg->getName()
+                   << " to constant memory."
+                   << "\n");
+      ConstantMemArgs.push_back(arg->getArgNo());
+      GlobalMemArgs->erase(pos);
+    }
+  }
+  return ConstantMemArgs;
 }
 
-Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
-	unsigned idx = 0;
-	std::vector<Type*> ArgTypes;
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Argument *arg = &*ai;
-		DEBUG(errs() << *arg << "\n");
-		unsigned argno = arg->getArgNo();
-		if ((idx < Args.size()) && (argno == Args[idx])) {
-			fixValueAddrspace(arg, addrspace);
-			idx++;
-		}
-		ArgTypes.push_back(arg->getType());
-	}
-	FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
-
-	//F->mutateType(PTy);
-	Function* newF = cloneFunction(F, newFT, false);
-	replaceNodeFunctionInIR(*F->getParent(), F, newF);
-
-	DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
-	return newF;
+Function *CGT_NVPTX::changeArgAddrspace(Function *F,
+                                        std::vector<unsigned> &Args,
+                                        unsigned addrspace) {
+  unsigned idx = 0;
+  std::vector<Type *> ArgTypes;
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Argument *arg = &*ai;
+    DEBUG(errs() << *arg << "\n");
+    unsigned argno = arg->getArgNo();
+    if ((idx < Args.size()) && (argno == Args[idx])) {
+      fixValueAddrspace(arg, addrspace);
+      idx++;
+    }
+    ArgTypes.push_back(arg->getType());
+  }
+  FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
+
+  // F->mutateType(PTy);
+  Function *newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+
+  DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n");
+  return newF;
 }
 
 /* Add metadata to module KernelM, for OpenCL kernels */
 void CGT_NVPTX::addCLMetadata(Function *F) {
 
-	IRBuilder<> Builder(&*F->begin());
+  IRBuilder<> Builder(&*F->begin());
+
+  SmallVector<Metadata *, 8> KernelMD;
+  KernelMD.push_back(ValueAsMetadata::get(F));
+
+  // TODO: There is additional metadata used by kernel files but we skip them as
+  // they are not mandatory. In future they might be useful to enable
+  // optimizations
+
+  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_kernels =
+      KernelM->getOrInsertNamedMetadata("opencl.kernels");
+  MDN_kernels->addOperand(MDKernelNode);
+
+  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+  // TODO: Replace 1 with the number of the kernel.
+  // Add when support for multiple launches is added
+  KernelMD.push_back(ValueAsMetadata::get(
+      ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)));
+  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_annotations =
+      KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+}
 
-	SmallVector<Metadata*,8> KernelMD;
-	KernelMD.push_back(ValueAsMetadata::get(F));
+void CGT_NVPTX::writeKernelsModule() {
 
-	// TODO: There is additional metadata used by kernel files but we skip them as
-	// they are not mandatory. In future they might be useful to enable
-	// optimizations
+  // In addition to deleting all other functions, we also want to spiff it
+  // up a little bit.  Do this now.
+  legacy::PassManager Passes;
 
-	MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
-	NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
-	MDN_kernels->addOperand(MDKernelNode);
+  DEBUG(errs() << "Writing to File --- ");
+  DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n");
+  std::error_code EC;
+  ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
+  if (EC) {
+    DEBUG(errs() << EC.message() << '\n');
+  }
 
-	KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
-	// TODO: Replace 1 with the number of the kernel.
-	// Add when support for multiple launces is added
-	KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
-	MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
-	NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
-	MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+  Passes.add(createPrintModulePass(Out.os()));
 
+  Passes.run(*KernelM);
+
+  // Declare success.
+  Out.keep();
 }
 
-void CGT_NVPTX::writeKernelsModule() {
+Function *CGT_NVPTX::transformFunctionToVoid(Function *F) {
 
-	// In addition to deleting all other functions, we also want to spiff it
-	// up a little bit.  Do this now.
-	legacy::PassManager Passes;
+  DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
+  // FIXME: Maybe do that using the Node?
+  StructType *FRetTy = dyn_cast<StructType>(F->getReturnType());
+  assert(FRetTy && "Return Type must always be a struct");
 
-	errs() << "Writing to File --- ";
-	errs() << getKernelsModuleName(M).c_str() << "\n";
-	std::error_code EC;
-	ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
-	if (EC) {
-		errs() << EC.message() << '\n';
-	}
+  // Keeps return statements, because we will need to replace them
+  std::vector<ReturnInst *> RItoRemove;
+  findReturnInst(F, RItoRemove);
 
-	Passes.add(
-			createPrintModulePass(Out.os()));
+  std::vector<Type *> RetArgTypes;
+  std::vector<Argument *> RetArgs;
+  std::vector<Argument *> Args;
+  // Check for { } return struct, which means that the function returns void
+  if (FRetTy->isEmptyTy()) {
 
-	Passes.run(*KernelM);
+    DEBUG(errs() << "\tFunction output struct is void\n");
+    DEBUG(errs() << "\tNo parameters added\n");
 
-	// Declare success.
-	Out.keep();
-}
+    // Replacing return statements with others returning void
+    for (auto *RI : RItoRemove) {
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+    }
+    DEBUG(errs() << "\tChanged return statements to return void\n");
+  } else {
+    // The struct has return values, thus needs to be converted to parameter
+
+    // Iterate over all element types of return struct and add arguments to the
+    // function
+    for (unsigned i = 0; i < FRetTy->getNumElements(); i++) {
+      Argument *RetArg =
+          new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+      RetArgs.push_back(RetArg);
+      RetArgTypes.push_back(RetArg->getType());
+      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+    }
 
-Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
-
-	DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
-	// FIXME: Maybe do that using the Node?
-	StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
-	assert(FRetTy && "Return Type must always be a struct");
-
-	// Keeps return statements, because we will need to replace them
-	std::vector<ReturnInst *> RItoRemove;
-	findReturnInst(F, RItoRemove);
-
-	std::vector<Type *> RetArgTypes;
-	std::vector<Argument*> RetArgs;
-	std::vector<Argument*> Args;
-	// Check for { } return struct, which means that the function returns void
-	if (FRetTy->isEmptyTy()) {
-
-		DEBUG(errs() << "\tFunction output struct is void\n");
-		DEBUG(errs() << "\tNo parameters added\n");
-
-		// Replacing return statements with others returning void
-		for (auto *RI : RItoRemove) {
-			ReturnInst::Create((F->getContext()), 0, RI);
-			RI->eraseFromParent();
-		}
-		DEBUG(errs() << "\tChanged return statements to return void\n");
-	}
-	else {
-		// The struct has return values, thus needs to be converted to parameter
-
-		// Iterate over all element types of return struct and add arguments to the
-		// function
-		for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
-			Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
-			RetArgs.push_back(RetArg);
-			RetArgTypes.push_back(RetArg->getType());
-			DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
-		}
-
-		DEBUG(errs() << "\tReplacing Return statements\n");
-		// Replace return statements with extractValue and store instructions
-		for (auto *RI : RItoRemove) {
-			Value* RetVal = RI->getReturnValue();
-			for(unsigned i = 0; i < RetArgs.size(); i++) {
-				ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
-						RetArgs[i]->getName()+".val", RI);
-				new StoreInst(EI, RetArgs[i], RI);
-			}
-			// assert(RetVal && "Return value should not be null at this point");
-			// StructType* RetType = cast<StructType>(RetVal->getType());
-			// assert(RetType && "Return type is not a struct");
-
-			ReturnInst::Create((F->getContext()), 0, RI);
-			RI->eraseFromParent();
-
-		}
-	}
-	DEBUG(errs() << "\tReplaced return statements\n");
-
-	// Create the argument type list with the added argument's type
-	std::vector<Type*> ArgTypes;
-	for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		ArgTypes.push_back(ai->getType());
-	}
-	for(auto *RATy: RetArgTypes) {
-		ArgTypes.push_back(RATy);
-	}
-
-	// Creating Args vector to use in cloning!
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Args.push_back(&*ai);
-	}
-	for(auto *ai : RetArgs) {
-		Args.push_back(ai);
-	}
-
-	// Adding new arguments to the function argument list, would not change the
-	// function type. We need to change the type of this function to reflect the
-	// added arguments
-	Type* VoidRetType = Type::getVoidTy(F->getContext());
-	FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-
-	// Change the function type
-	//F->mutateType(PTy);
-	Function* newF = cloneFunction(F, newFT, false, NULL, &Args);
-	replaceNodeFunctionInIR(*F->getParent(), F, newF);
-	//F->eraseFromParent();
-	return newF;
+    DEBUG(errs() << "\tReplacing Return statements\n");
+    // Replace return statements with extractValue and store instructions
+    for (auto *RI : RItoRemove) {
+      Value *RetVal = RI->getReturnValue();
+      for (unsigned i = 0; i < RetArgs.size(); i++) {
+        ExtractValueInst *EI = ExtractValueInst::Create(
+            RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI);
+        new StoreInst(EI, RetArgs[i], RI);
+      }
+      // assert(RetVal && "Return value should not be null at this point");
+      // StructType* RetType = cast<StructType>(RetVal->getType());
+      // assert(RetType && "Return type is not a struct");
+
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+    }
+  }
+  DEBUG(errs() << "\tReplaced return statements\n");
+
+  // Create the argument type list with the added argument's type
+  std::vector<Type *> ArgTypes;
+  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+       ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+  for (auto *RATy : RetArgTypes) {
+    ArgTypes.push_back(RATy);
+  }
+
+  // Creating Args vector to use in cloning!
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Args.push_back(&*ai);
+  }
+  for (auto *ai : RetArgs) {
+    Args.push_back(ai);
+  }
+
+  // Adding new arguments to the function argument list, would not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments
+  Type *VoidRetType = Type::getVoidTy(F->getContext());
+  FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+  // Change the function type
+  // F->mutateType(PTy);
+  Function *newF = cloneFunction(F, newFT, false, NULL, &Args);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+  // F->eraseFromParent();
+  return newF;
 }
 
 /******************************************************************************
@@ -2102,314 +2130,332 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 // 1. No stores
 // 2. Loads not dependent on getNodeInstanceID itrinsic
 
-static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) {
-	if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
-		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-		return false;
-	}
-	VisitedList->push_back(V);
-	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
-			ui != ue; ++ui) {
-		Instruction* I = dyn_cast<Instruction>(*ui);
-		if(!I) {
-			// if use is not an instruction, then skip it
-			continue;
-		}
-		DEBUG(errs() << "\t" << *I << "\n");
-		if(isa<LoadInst>(I)) {
-			DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
-			DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
-			UseList->push_back(V);
-		}
-		else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
-			// found a store in use chain
-			DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
-			return true;
-		}
-		else if(BuildDFG::isViscIntrinsic(I)) {
-			// If it is an atomic intrinsic, we found a store
-			IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-			assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
-					&& "Only visc atomic intrinsics can have an argument as input");
-			return true;
-		}
-		else {
-			DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
-			if(findLoadStoreUses(I, UseList, VisitedList))
-				return true;
-		}
-	}
-	return false;
+static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList,
+                              std::vector<Value *> *VisitedList) {
+  if (std::find(VisitedList->begin(), VisitedList->end(), V) !=
+      VisitedList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  VisitedList->push_back(V);
+  for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
+       ++ui) {
+    Instruction *I = dyn_cast<Instruction>(*ui);
+    if (!I) {
+      // if use is not an instruction, then skip it
+      continue;
+    }
+    DEBUG(errs() << "\t" << *I << "\n");
+    if (isa<LoadInst>(I)) {
+      DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
+      DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
+      UseList->push_back(V);
+    } else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
+      // found a store in use chain
+      DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
+      return true;
+    } else if (BuildDFG::isHPVMIntrinsic(I)) {
+      // If it is an atomic intrinsic, we found a store
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+      assert(II &&
+             II->getCalledValue()->getName().startswith("llvm.hpvm.atomic") &&
+             "Only hpvm atomic intrinsics can have an argument as input");
+      return true;
+    } else {
+      DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
+      if (findLoadStoreUses(I, UseList, VisitedList))
+        return true;
+    }
+  }
+  return false;
 }
 
-static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) {
-	if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
-		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-		return false;
-	}
-	DependenceList->push_back(V);
-	// If not an instruction, then not dependent on node instance id
-	if(!isa<Instruction>(V) || isa<Constant>(V)) {
-		DEBUG(errs() << "\tStop\n");
-		return false;
-	}
-
-	Instruction* I = cast<Instruction>(V);
-	for(unsigned i = 0; i < I->getNumOperands(); i++) {
-		Value* operand = I->getOperand(i);
-		if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
-			if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
-						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
-						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
-				Value* Node = II->getArgOperand(0);
-				IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
-				assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
-				if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
-					DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
-					return true;
-				}
-			}
-		}
-		if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
-			DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
-			continue;
-		}
-		DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
-		if(isDependentOnNodeInstanceID(operand, DependenceList)) {
-			return true;
-		}
-	}
-	return false;
+static bool isDependentOnNodeInstanceID(Value *V,
+                                        std::vector<Value *> *DependenceList) {
+  if (std::find(DependenceList->begin(), DependenceList->end(), V) !=
+      DependenceList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  DependenceList->push_back(V);
+  // If not an instruction, then not dependent on node instance id
+  if (!isa<Instruction>(V) || isa<Constant>(V)) {
+    DEBUG(errs() << "\tStop\n");
+    return false;
+  }
+
+  Instruction *I = cast<Instruction>(V);
+  for (unsigned i = 0; i < I->getNumOperands(); i++) {
+    Value *operand = I->getOperand(i);
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) {
+      if ((II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_x ||
+           II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_y ||
+           II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_z)) {
+        Value *Node = II->getArgOperand(0);
+        IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node);
+        assert(
+            GN &&
+            "NodeInstanceID operande should be node/parent node intrinsic\n");
+        if (GN->getIntrinsicID() == Intrinsic::hpvm_getNode) {
+          DEBUG(errs() << "\tDependency found on Node instance ID: " << *II
+                       << "\n");
+          return true;
+        }
+      }
+    }
+    if (CmpInst *CI = dyn_cast<CmpInst>(operand)) {
+      DEBUG(errs() << "Found compare instruction: " << *CI
+                   << "\nNot following its dependency list\n");
+      continue;
+    }
+    DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n");
+    if (isDependentOnNodeInstanceID(operand, DependenceList)) {
+      return true;
+    }
+  }
+  return false;
 }
 
 // Function to check if argument arg can be changed to a constant memory pointer
-static bool canBePromoted(Argument* arg, Function* F) {
-	DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
-	std::vector<Value*> UseList;
-	std::vector<Value*> VisitedList;
-	// recursively traverse use chain
-	// if find a store instruction return false, everything fails, cannot be
-	// promoted
-	// if find a load instruction as use, add the GEP instruction to list
-	bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
-	if(foundStore == true)
-		return false;
-	// See that the GEP instructions are not dependent on getNodeInstanceID
-	// intrinsic
-	DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
-	std::vector<Value*>DependenceList;
-	for(auto U: UseList) {
-		if(isDependentOnNodeInstanceID(U, &DependenceList))
-			return false;
-	}
-	DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
-	return true;
+static bool canBePromoted(Argument *arg, Function *F) {
+  DEBUG(errs() << "OPT: Check if Argument " << *arg
+               << " can be changed to constant memory\n");
+  std::vector<Value *> UseList;
+  std::vector<Value *> VisitedList;
+  // recursively traverse use chain
+  // if find a store instruction return false, everything fails, cannot be
+  // promoted
+  // if find a load instruction as use, add the GEP instruction to list
+  bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
+  if (foundStore == true)
+    return false;
+  // See that the GEP instructions are not dependent on getNodeInstanceID
+  // intrinsic
+  DEBUG(errs() << foundStore
+               << "\tNo Store Instruction found. Check dependence on node "
+                  "instance ID\n");
+  std::vector<Value *> DependenceList;
+  for (auto U : UseList) {
+    if (isDependentOnNodeInstanceID(U, &DependenceList))
+      return false;
+  }
+  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
+  return true;
 }
 
-
 // Calculate execute node parameters which include, number of diemnsions for
 // dynamic instances of the kernel, local and global work group sizes.
-static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
-		&GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
-
-	// Assign number of dimenstions a constant value
-	workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
-
-	// If local work group size if null
-	if(!kernel->hasLocalWG()) {
-		LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
-	}
-	else {
-		for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-			if(isa<Argument>(kernel->localWGSize[i]))
-				kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
-		}
-		LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
-	}
-
-	for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
-		if(isa<Argument>(kernel->globalWGSize[i]))
-			kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
-	}
-
-	// For OpenCL, global work group size is the total bumber of instances in each
-	// dimension. So, multiply local and global dim limits.
-	std::vector<Value*> globalWGSizeInsts;
-	if(kernel->hasLocalWG()) {
-		for (unsigned i = 0; i < kernel->gridDim; i++) {
-			BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
-			globalWGSizeInsts.push_back(MulInst);
-		}
-	}
-	else {
-		globalWGSizeInsts = kernel->globalWGSize;
-	}
-	GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
-	DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr,
+                                 Value *&GlobalWGPtr, Kernel *kernel,
+                                 ValueToValueMapTy &VMap, Instruction *IB) {
+
+  // Assign the number of dimensions a constant value
+  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+  // If local work group size is null
+  if (!kernel->hasLocalWG()) {
+    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+  } else {
+    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if (isa<Argument>(kernel->localWGSize[i]))
+        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+    }
+    LocalWGPtr =
+        genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+  }
+
+  for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+    if (isa<Argument>(kernel->globalWGSize[i]))
+      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+  }
+
+  // For OpenCL, global work group size is the total bumber of instances in each
+  // dimension. So, multiply local and global dim limits.
+  std::vector<Value *> globalWGSizeInsts;
+  if (kernel->hasLocalWG()) {
+    for (unsigned i = 0; i < kernel->gridDim; i++) {
+      BinaryOperator *MulInst =
+          BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i],
+                                 kernel->localWGSize[i], "", IB);
+      globalWGSizeInsts.push_back(MulInst);
+    }
+  } else {
+    globalWGSizeInsts = kernel->globalWGSize;
+  }
+  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
 // CodeGen for allocating space for Work Group on stack and returning a pointer
 // to its address
-static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
-	Value* WGPtr;
-	// Get int64_t and or ease of use
-	Type* Int64Ty = Type::getInt64Ty(M.getContext());
-
-	// Work Group type is [#dim x i64]
-	Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
-	// Allocate space of Global work group data on stack and get pointer to
-	// first element.
-	AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
-	WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
-	Value* nextDim = WGPtr;
-	DEBUG(errs() << *WGPtr << "\n");
-
-	// Iterate over the number of dimensions and store the global work group
-	// size in that dimension
-	for(unsigned i=0; i < WGSize.size(); i++) {
-		DEBUG(errs() << *WGSize[i] << "\n");
-		assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-
-		if(WGSize[i]->getType() != Int64Ty) {
-			// If number of dimensions are mentioned in any other integer format,
-			// generate code to extend it to i64. We need to use the mapped value in
-			// the new generated function, hence the use of VMap
-			// FIXME: Why are we changing the kernel WGSize vector here?
-			DEBUG(errs() << "Not i64. Zero extend required.\n");
-			DEBUG(errs() << *WGSize[i] << "\n");
-			CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
-			DEBUG(errs() << "Bitcast done.\n");
-			StoreInst* SI = new StoreInst(CI, nextDim, IB);
-			DEBUG(errs() << "Zero extend done.\n");
-			DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
-		} else {
-			// Store the value representing work group size in ith dimension on
-			// stack
-			StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
-
-			DEBUG(errs() << "\t Work group size: " << *SI << "\n");
-		}
-		if(i+1 < WGSize.size()) {
-			// Move to next dimension
-			GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
-					ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
-					WG->getName()+"."+Twine(i+1),
-					IB);
-			DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
-			nextDim = GEP;
-		}
-	}
-	return WGPtr;
+static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize,
+                              ValueToValueMapTy &VMap, Instruction *IB,
+                              const Twine &WGName) {
+  Value *WGPtr;
+  // Get the i64 type for ease of use
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+
+  // Work Group type is [#dim x i64]
+  Type *WGTy = ArrayType::get(Int64Ty, WGSize.size());
+  // Allocate space for the work group data on stack and get pointer to
+  // first element.
+  AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB);
+  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(),
+                                         WG->getName() + ".0", IB);
+  Value *nextDim = WGPtr;
+  DEBUG(errs() << *WGPtr << "\n");
+
+  // Iterate over the dimensions and store the work group size of each
+  // dimension in its slot
+  for (unsigned i = 0; i < WGSize.size(); i++) {
+    DEBUG(errs() << *WGSize[i] << "\n");
+    assert(WGSize[i]->getType()->isIntegerTy() &&
+           "Dimension not an integer type!");
+
+    if (WGSize[i]->getType() != Int64Ty) {
+      // If the size is given in any other integer type, generate code to cast
+      // it to i64 (isSigned = true below, so narrower values are sign extended
+      // despite the "zero extend" debug text). VMap is not referenced here.
+      // FIXME: Why are we changing the kernel WGSize vector here?
+      DEBUG(errs() << "Not i64. Zero extend required.\n");
+      DEBUG(errs() << *WGSize[i] << "\n");
+      CastInst *CI =
+          BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+      DEBUG(errs() << "Bitcast done.\n");
+      StoreInst *SI = new StoreInst(CI, nextDim, IB);
+      DEBUG(errs() << "Zero extend done.\n");
+      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+    } else {
+      // Store the value representing work group size in ith dimension on
+      // stack
+      StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB);
 
+      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+    }
+    if (i + 1 < WGSize.size()) {
+      // Move to next dimension
+      GetElementPtrInst *GEP = GetElementPtrInst::Create(
+          nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)),
+          WG->getName() + "." + Twine(i + 1), IB);
+      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+      nextDim = GEP;
+    }
+  }
+  return WGPtr;
 }
 
 // Get generated PTX binary name
-static std::string getPTXFilename(const Module& M) {
-	std::string moduleID = M.getModuleIdentifier();
-	moduleID.append(".kernels.cl");
-	return moduleID;
+static std::string getPTXFilename(const Module &M) {
+  std::string moduleID = M.getModuleIdentifier();
+  moduleID.append(".kernels.cl");
+  return moduleID;
 }
 
 // Get the name of the input file from module ID
-static std::string getFilenameFromModule(const Module& M) {
-	std::string moduleID = M.getModuleIdentifier();
-	return moduleID.substr(moduleID.find_last_of("/")+1);
+static std::string getFilenameFromModule(const Module &M) {
+  std::string moduleID = M.getModuleIdentifier();
+  return moduleID.substr(moduleID.find_last_of("/") + 1);
 }
 
 // Changes the data layout of the Module to be compiled with NVPTX backend
 // TODO: Figure out when to call it, probably after duplicating the modules
 static void changeDataLayout(Module &M) {
-	std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
-	std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
+  std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+  std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
 
-	if (TARGET_PTX == 32)
-		M.setDataLayout(StringRef(nvptx32_layoutStr));
-	else if (TARGET_PTX == 64)
-		M.setDataLayout(StringRef(nvptx64_layoutStr));
-	else assert(false && "Invalid PTX target");
+  if (TARGET_PTX == 32)
+    M.setDataLayout(StringRef(nvptx32_layoutStr));
+  else if (TARGET_PTX == 64)
+    M.setDataLayout(StringRef(nvptx64_layoutStr));
+  else
+    assert(false && "Invalid PTX target");
 
-	return;
+  return;
 }
 
 static void changeTargetTriple(Module &M) {
-	std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
-	std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+  std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+  std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
-	if (TARGET_PTX == 32)
-		M.setTargetTriple(StringRef(nvptx32_TargetTriple));
-	else if (TARGET_PTX == 64)
-		M.setTargetTriple(StringRef(nvptx64_TargetTriple));
-	else assert(false && "Invalid PTX target");
+  if (TARGET_PTX == 32)
+    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+  else if (TARGET_PTX == 64)
+    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+  else
+    assert(false && "Invalid PTX target");
 
-	return;
+  return;
 }
 
 // Helper function, populate a vector with all return statements in a function
-static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
-	for (auto &BB : *F) {
-		if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
-			ReturnInstVec.push_back(RI);
-	}	
+static void findReturnInst(Function *F,
+                           std::vector<ReturnInst *> &ReturnInstVec) {
+  for (auto &BB : *F) {
+    if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+      ReturnInstVec.push_back(RI);
+  }
 }
 
-// Helper function, populate a vector with all IntrinsicID intrinsics in a function
-static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
-	for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-		Instruction *I = &(*i);
-		IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-		if (II && II->getIntrinsicID() == IntrinsicID) {
-			IntrinsicInstVec.push_back(II);
-		}
-	}
+// Helper function, populate a vector with all IntrinsicID intrinsics in a
+// function
+static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID,
+                              std::vector<IntrinsicInst *> &IntrinsicInstVec) {
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+    if (II && II->getIntrinsicID() == IntrinsicID) {
+      IntrinsicInstVec.push_back(II);
+    }
+  }
 }
 
-// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op
+// Helper function, returns the atomicrmw op corresponding to an intrinsic
+// atomic op
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
-	switch(ID) {
-		case Intrinsic::visc_atomic_add:
-			return AtomicRMWInst::Add;
-		case Intrinsic::visc_atomic_sub:
-			return AtomicRMWInst::Sub;
-		case Intrinsic::visc_atomic_min:
-			return AtomicRMWInst::Min;
-		case Intrinsic::visc_atomic_max:
-			return AtomicRMWInst::Max;
-		case Intrinsic::visc_atomic_xchg:
-			return AtomicRMWInst::Xchg;
-		case Intrinsic::visc_atomic_and:
-			return AtomicRMWInst::And;
-		case Intrinsic::visc_atomic_or:
-			return AtomicRMWInst::Or;
-		case Intrinsic::visc_atomic_xor:
-			return AtomicRMWInst::Xor;
-		default:
-			llvm_unreachable("Unsupported atomic intrinsic!");
-	};
+  switch (ID) {
+  case Intrinsic::hpvm_atomic_add:
+    return AtomicRMWInst::Add;
+  case Intrinsic::hpvm_atomic_sub:
+    return AtomicRMWInst::Sub;
+  case Intrinsic::hpvm_atomic_min:
+    return AtomicRMWInst::Min;
+  case Intrinsic::hpvm_atomic_max:
+    return AtomicRMWInst::Max;
+  case Intrinsic::hpvm_atomic_xchg:
+    return AtomicRMWInst::Xchg;
+  case Intrinsic::hpvm_atomic_and:
+    return AtomicRMWInst::And;
+  case Intrinsic::hpvm_atomic_or:
+    return AtomicRMWInst::Or;
+  case Intrinsic::hpvm_atomic_xor:
+    return AtomicRMWInst::Xor;
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  };
 }
 
-
 // Helper funtion, returns the OpenCL function name, corresponding to atomic op
 static std::string getAtomicOpName(Intrinsic::ID ID) {
-	switch(ID) {
-		case Intrinsic::visc_atomic_add:
-			return "atom_add";
-		case Intrinsic::visc_atomic_sub:
-			return "atom_sub";
-		case Intrinsic::visc_atomic_min:
-			return "atom_min";
-		case Intrinsic::visc_atomic_max:
-			return "atom_max";
-		case Intrinsic::visc_atomic_xchg:
-			return "atom_xchg";
-		case Intrinsic::visc_atomic_and:
-			return "atom_and";
-		case Intrinsic::visc_atomic_or:
-			return "atom_or";
-		case Intrinsic::visc_atomic_xor:
-			return "atom_xor";
-		default:
-			llvm_unreachable("Unsupported atomic intrinsic!");
-	};
+  switch (ID) {
+  case Intrinsic::hpvm_atomic_add:
+    return "atom_add";
+  case Intrinsic::hpvm_atomic_sub:
+    return "atom_sub";
+  case Intrinsic::hpvm_atomic_min:
+    return "atom_min";
+  case Intrinsic::hpvm_atomic_max:
+    return "atom_max";
+  case Intrinsic::hpvm_atomic_xchg:
+    return "atom_xchg";
+  case Intrinsic::hpvm_atomic_and:
+    return "atom_and";
+  case Intrinsic::hpvm_atomic_or:
+    return "atom_or";
+  case Intrinsic::hpvm_atomic_xor:
+    return "atom_xor";
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  };
 }
 
 } // End of namespace
@@ -2420,4 +2466,3 @@ static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx",
 		false /* does not modify the CFG */,
 		true /* transformation,   *
 					* not just analysis */);
-
diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 6498b46cd9a56ad69df35d4497b463b9dda98c87..21adabf4ebe5999134491f163aa8119d44f84f10 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -8,34 +8,33 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "DFG2LLVM_X86"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
+#include "SupportHPVM/DFG2LLVM.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/SourceMgr.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Constant.h"
-#include "SupportVISC/DFG2LLVM.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 using namespace llvm;
 using namespace builddfg;
 using namespace dfg2llvm;
 
-// VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
+// HPVM Command line option to use timer or not
+static cl::opt<bool> HPVMTimer_X86("hpvm-timers-x86",
+                                   cl::desc("Enable hpvm timers"));
 
 namespace {
 
-
 // DFG2LLVM_X86 - The first implementation.
 struct DFG2LLVM_X86 : public DFG2LLVM {
   static char ID; // Pass identification, replacement for typeid
-  DFG2LLVM_X86() :DFG2LLVM(ID) {}
+  DFG2LLVM_X86() : DFG2LLVM(ID) {}
 
 private:
   // Member variables
@@ -50,58 +49,59 @@ public:
 class CGT_X86 : public CodeGenTraversal {
 
 private:
-  //Member variables
+  // Member variables
 
   FunctionCallee malloc;
-  // VISC Runtime API
-  FunctionCallee llvm_visc_x86_launch;
-  FunctionCallee llvm_visc_x86_wait;
-  FunctionCallee llvm_visc_x86_argument_ptr;
-
-  FunctionCallee llvm_visc_streamLaunch;
-  FunctionCallee llvm_visc_streamPush;
-  FunctionCallee llvm_visc_streamPop;
-  FunctionCallee llvm_visc_streamWait;
-  FunctionCallee llvm_visc_createBindInBuffer;
-  FunctionCallee llvm_visc_createBindOutBuffer;
-  FunctionCallee llvm_visc_createEdgeBuffer;
-  FunctionCallee llvm_visc_createLastInputBuffer;
-  FunctionCallee llvm_visc_createThread;
-  FunctionCallee llvm_visc_bufferPush;
-  FunctionCallee llvm_visc_bufferPop;
-  FunctionCallee llvm_visc_x86_dstack_push;
-  FunctionCallee llvm_visc_x86_dstack_pop;
-  FunctionCallee llvm_visc_x86_getDimLimit;
-  FunctionCallee llvm_visc_x86_getDimInstance;
-  
-  //Functions
-  std::vector<IntrinsicInst*>* getUseList(Value* LI);
-  Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
-  void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
+  // HPVM Runtime API
+  FunctionCallee llvm_hpvm_x86_launch;
+  FunctionCallee llvm_hpvm_x86_wait;
+  FunctionCallee llvm_hpvm_x86_argument_ptr;
+
+  FunctionCallee llvm_hpvm_streamLaunch;
+  FunctionCallee llvm_hpvm_streamPush;
+  FunctionCallee llvm_hpvm_streamPop;
+  FunctionCallee llvm_hpvm_streamWait;
+  FunctionCallee llvm_hpvm_createBindInBuffer;
+  FunctionCallee llvm_hpvm_createBindOutBuffer;
+  FunctionCallee llvm_hpvm_createEdgeBuffer;
+  FunctionCallee llvm_hpvm_createLastInputBuffer;
+  FunctionCallee llvm_hpvm_createThread;
+  FunctionCallee llvm_hpvm_bufferPush;
+  FunctionCallee llvm_hpvm_bufferPop;
+  FunctionCallee llvm_hpvm_x86_dstack_push;
+  FunctionCallee llvm_hpvm_x86_dstack_pop;
+  FunctionCallee llvm_hpvm_x86_getDimLimit;
+  FunctionCallee llvm_hpvm_x86_getDimInstance;
+
+  // Functions
+  std::vector<IntrinsicInst *> *getUseList(Value *LI);
+  Value *addLoop(Instruction *I, Value *limit, const Twine &indexName = "");
+  void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *);
   Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
-  Argument* getArgumentFromEnd(Function* F, unsigned offset);
-  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
-                      Instruction* InsertBefore);
-  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
-                       Instruction* InsertBefore);
-  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
-                       Instruction* InsertBefore);
-  StructType* getArgumentListStructTy(DFNode*);
-  Function* createFunctionFilter(DFNode* C);
-  void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>,
-                      Value*, Value*, Instruction*);
-  Function* createLaunchFunction(DFInternalNode*);
-  
+  Argument *getArgumentFromEnd(Function *F, unsigned offset);
+  Value *getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86,
+                      Instruction *InsertBefore);
+  void invokeChild_X86(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap,
+                       Instruction *InsertBefore);
+  void invokeChild_PTX(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap,
+                       Instruction *InsertBefore);
+  StructType *getArgumentListStructTy(DFNode *);
+  Function *createFunctionFilter(DFNode *C);
+  void startNodeThread(DFNode *, std::vector<Value *>,
+                       DenseMap<DFEdge *, Value *>, Value *, Value *,
+                       Instruction *);
+  Function *createLaunchFunction(DFInternalNode *);
+
   // Virtual Functions
   void init() {
-    VISCTimer = VISCTimer_X86;
+    HPVMTimer = HPVMTimer_X86;
     TargetName = "X86";
   }
   void initRuntimeAPI();
-  void codeGen(DFInternalNode* N);
-  void codeGen(DFLeafNode* N);
-  Function* codeGenStreamPush(DFInternalNode* N);
-  Function* codeGenStreamPop(DFInternalNode* N);
+  void codeGen(DFInternalNode *N);
+  void codeGen(DFLeafNode *N);
+  Function *codeGenStreamPush(DFInternalNode *N);
+  Function *codeGenStreamPop(DFInternalNode *N);
 
 public:
   // Constructor
@@ -110,8 +110,8 @@ public:
     initRuntimeAPI();
   }
 
-  void codeGenLaunch(DFInternalNode* Root);
-  void codeGenLaunchStreaming(DFInternalNode* Root);
+  void codeGenLaunch(DFInternalNode *Root);
+  void codeGenLaunchStreaming(DFInternalNode *Root);
 };
 
 bool DFG2LLVM_X86::runOnModule(Module &M) {
@@ -122,8 +122,8 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   // - Maps from i8* hansles to DFNode and DFEdge
   BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-  //DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode *> Roots = DFG.getRoots();
   // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
   // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
@@ -131,16 +131,17 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   CGT_X86 *CGTVisitor = new CGT_X86(M, DFG);
 
   // Iterate over all the DFGs and produce code for each one of them
-  for (auto &rootNode: Roots) {
+  for (auto &rootNode : Roots) {
     // Initiate code generation for root DFNode
     CGTVisitor->visit(rootNode);
-    // Go ahead and replace the launch intrinsic with pthread call, otherwise return now.
+    // Go ahead and replace the launch intrinsic with pthread call, otherwise
+    // return now.
     // TODO: Later on, we might like to do this in a separate pass, which would
-    // allow us the flexibility to switch between complete static code generation
-    // for DFG or having a customized runtime+scheduler
-    
+    // allow us the flexibility to switch between complete static code
+    // generation for DFG or having a customized runtime+scheduler
+
     // Do streaming code generation if root node is streaming. Usual otherwise
-    if(rootNode->isChildGraphStreaming())
+    if (rootNode->isChildGraphStreaming())
       CGTVisitor->codeGenLaunchStreaming(rootNode);
     else
       CGTVisitor->codeGenLaunch(rootNode);
@@ -150,61 +151,61 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   return true;
 }
 
-// Initialize the VISC runtime API. This makes it easier to insert these calls
+// Initialize the HPVM runtime API. This makes it easier to insert these calls
 void CGT_X86::initRuntimeAPI() {
 
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
   assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot + "/../build/tools/hpvm/projects/visc-rt/visc-rt.bc";
+  Twine runtimeAPI =
+      llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
 
   runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
 
-  if(runtimeModule == nullptr) {
+  if (runtimeModule == nullptr) {
     DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
     assert(false && "couldn't parse runtime");
-  }
-  else
-    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+  } else
+    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
 
   // Get or insert the global declarations for launch/wait functions
-  DECLARE(llvm_visc_x86_launch);
+  DECLARE(llvm_hpvm_x86_launch);
   DECLARE(malloc);
-  DECLARE(llvm_visc_x86_wait);
-  DECLARE(llvm_visc_x86_argument_ptr);
-  DECLARE(llvm_visc_streamLaunch);
-  DECLARE(llvm_visc_streamPush);
-  DECLARE(llvm_visc_streamPop);
-  DECLARE(llvm_visc_streamWait);
-  DECLARE(llvm_visc_createBindInBuffer);
-  DECLARE(llvm_visc_createBindOutBuffer);
-  DECLARE(llvm_visc_createEdgeBuffer);
-  DECLARE(llvm_visc_createLastInputBuffer);
-  DECLARE(llvm_visc_createThread);
-  DECLARE(llvm_visc_bufferPush);
-  DECLARE(llvm_visc_bufferPop);
-  DECLARE(llvm_visc_x86_dstack_push);
-  DECLARE(llvm_visc_x86_dstack_pop);
-  DECLARE(llvm_visc_x86_getDimLimit);
-  DECLARE(llvm_visc_x86_getDimInstance);
+  DECLARE(llvm_hpvm_x86_wait);
+  DECLARE(llvm_hpvm_x86_argument_ptr);
+  DECLARE(llvm_hpvm_streamLaunch);
+  DECLARE(llvm_hpvm_streamPush);
+  DECLARE(llvm_hpvm_streamPop);
+  DECLARE(llvm_hpvm_streamWait);
+  DECLARE(llvm_hpvm_createBindInBuffer);
+  DECLARE(llvm_hpvm_createBindOutBuffer);
+  DECLARE(llvm_hpvm_createEdgeBuffer);
+  DECLARE(llvm_hpvm_createLastInputBuffer);
+  DECLARE(llvm_hpvm_createThread);
+  DECLARE(llvm_hpvm_bufferPush);
+  DECLARE(llvm_hpvm_bufferPop);
+  DECLARE(llvm_hpvm_x86_dstack_push);
+  DECLARE(llvm_hpvm_x86_dstack_pop);
+  DECLARE(llvm_hpvm_x86_getDimLimit);
+  DECLARE(llvm_hpvm_x86_getDimInstance);
 
   // Get or insert timerAPI functions as well if you plan to use timers
   initTimerAPI();
 
   // Insert init context in main
-  Function* VI = M.getFunction("llvm.visc.init");
-  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  Function *VI = M.getFunction("llvm.hpvm.init");
+  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
   DEBUG(errs() << "Inserting x86 timer initialization\n");
-  Instruction* I = cast<Instruction>(*VI->user_begin());
+  Instruction *I = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(I);
-  switchToTimer(visc_TimerID_NONE, I);
-  // Insert print instruction at visc exit
-  Function* VC = M.getFunction("llvm.visc.cleanup");
-  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
+  switchToTimer(hpvm_TimerID_NONE, I);
+  // Insert print instruction at hpvm exit
+  Function *VC = M.getFunction("llvm.hpvm.cleanup");
+  assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once");
 
   DEBUG(errs() << "Inserting x86 timer print\n");
   printTimerSet(I);
@@ -212,12 +213,13 @@ void CGT_X86::initRuntimeAPI() {
 
 /* Returns vector of all wait instructions
  */
-std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
-  std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>();
+std::vector<IntrinsicInst *> *CGT_X86::getUseList(Value *GraphID) {
+  std::vector<IntrinsicInst *> *UseList = new std::vector<IntrinsicInst *>();
   // It must have been loaded from memory somewhere
-  for(Value::user_iterator ui = GraphID->user_begin(),
-      ue = GraphID->user_end(); ui!=ue; ++ui) {
-    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
+  for (Value::user_iterator ui = GraphID->user_begin(),
+                            ue = GraphID->user_end();
+       ui != ue; ++ui) {
+    if (IntrinsicInst *waitI = dyn_cast<IntrinsicInst>(*ui)) {
       UseList->push_back(waitI);
     } else {
       llvm_unreachable("Error: Operation on Graph ID not supported!\n");
@@ -229,14 +231,14 @@ std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
 /* Traverse the function argument list in reverse order to get argument at a
  * distance offset fromt he end of argument list of function F
  */
-Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
-  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0)
-         && "Invalid offset to access arguments!");
+Argument *CGT_X86::getArgumentFromEnd(Function *F, unsigned offset) {
+  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) &&
+         "Invalid offset to access arguments!");
   Function::arg_iterator e = F->arg_end();
   // Last element of argument iterator is dummy. Skip it.
   e--;
-  Argument* arg;
-  for( ; offset != 0; e--) {
+  Argument *arg;
+  for (; offset != 0; e--) {
     offset--;
     arg = &*e;
   }
@@ -254,25 +256,24 @@ Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
  * which loops over bidy if true and goes to end if false
  * (5) Update phi node of body
  */
-void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
-                          Instruction* BodyEnd, Value* TerminationCond) {
-  BasicBlock* Entry = CondBlockStart->getParent();
-  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
-  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
-  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
+void CGT_X86::addWhileLoop(Instruction *CondBlockStart, Instruction *BodyStart,
+                           Instruction *BodyEnd, Value *TerminationCond) {
+  BasicBlock *Entry = CondBlockStart->getParent();
+  BasicBlock *CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
+  BasicBlock *WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
+  BasicBlock *WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
 
   // Replace the terminator instruction of conditional with new conditional
   // branch which goes to while.body if true and branches to while.end otherwise
-  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
+  BranchInst *BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
   ReplaceInstWithInst(CondBlock->getTerminator(), BI);
 
   // While Body should jump to condition block
-  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
+  BranchInst *UnconditionalBranch = BranchInst::Create(CondBlock);
   ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
-
 }
 
-Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
                                           BasicBlock *Body) {
   Module *M = Entry->getParent()->getParent();
   Type *Int64Ty = Type::getInt64Ty(M->getContext());
@@ -282,10 +283,10 @@ Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
   PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
 
   ConstantInt *IConst =
-    ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+      ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
   Instruction *CounterIncr =
-    BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
-                                            "cnt_incr", Body->getTerminator());
+      BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                "cnt_incr", Body->getTerminator());
 
   // Set incoming values for Phi node
   IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
@@ -307,39 +308,40 @@ Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
  * which loops over bidy if true and goes to end if false
  * (5) Update phi node of body
  */
-Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
-  BasicBlock* Entry = I->getParent();
-  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");
+Value *CGT_X86::addLoop(Instruction *I, Value *limit, const Twine &indexName) {
+  BasicBlock *Entry = I->getParent();
+  BasicBlock *ForBody = Entry->splitBasicBlock(I, "for.body");
 
   BasicBlock::iterator i(I);
   ++i;
-  Instruction* NextI = &*i;
+  Instruction *NextI = &*i;
   // Next Instruction should also belong to the same basic block as the basic
   // block will have a terminator instruction
-  assert(NextI->getParent() == ForBody
-         && "Next Instruction should also belong to the same basic block!");
-  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
-
+  assert(NextI->getParent() == ForBody &&
+         "Next Instruction should also belong to the same basic block!");
+  BasicBlock *ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
 
   // Add Phi Node for index variable
-  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
-                                      2, "index."+indexName, I);
+  PHINode *IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), 2,
+                                      "index." + indexName, I);
 
   // Add incoming edge to phi
   IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
                         Entry);
   // Increment index variable
-  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
-                             IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
-                             "index."+indexName+".inc", ForBody->getTerminator());
+  BinaryOperator *IndexInc = BinaryOperator::Create(
+      Instruction::Add, IndexPhi,
+      ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
+      "index." + indexName + ".inc", ForBody->getTerminator());
 
   // Compare index variable with limit
-  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
-                                  limit, "cond."+indexName, ForBody->getTerminator());
+  CmpInst *Cond =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, limit,
+                      "cond." + indexName, ForBody->getTerminator());
 
   // Replace the terminator instruction of for.body with new conditional
   // branch which loops over body if true and branches to for.end otherwise
-  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
+  BranchInst *BI = BranchInst::Create(ForBody, ForEnd, Cond);
   ReplaceInstWithInst(ForBody->getTerminator(), BI);
 
   // Add incoming edge to phi node in body
@@ -351,260 +353,274 @@ Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
 // types, output types and isLastInput buffer type. All the streaming
 // inputs/outputs are converted to i8*, since this is the type of buffer
 // handles.
-StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
-  std::vector<Type*> TyList;
+StructType *CGT_X86::getArgumentListStructTy(DFNode *C) {
+  std::vector<Type *> TyList;
   // Input types
-  Function* CF = C->getFuncPointer();
-  for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
-      ai != ae; ++ai) {
-    if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
+  Function *CF = C->getFuncPointer();
+  for (Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
+       ai != ae; ++ai) {
+    if (C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
       TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
-    else 
+    else
       TyList.push_back(ai->getType());
   }
   // Output Types
-  StructType* OutStructTy = cast<StructType>(CF->getReturnType());
+  StructType *OutStructTy = cast<StructType>(CF->getReturnType());
   for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
     // All outputs of a node are streaming edge
-    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() 
-        && "All output edges of child node have to be streaming");
+    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() &&
+           "All output edges of child node have to be streaming");
     TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
   }
   // isLastInput buffer element
   TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
 
-  StructType* STy = StructType::create(CF->getContext(), TyList,
-                        Twine("struct.thread."+CF->getName()).str(), true);
+  StructType *STy =
+      StructType::create(CF->getContext(), TyList,
+                         Twine("struct.thread." + CF->getName()).str(), true);
   return STy;
-
 }
 
-void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
-                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
-                              Instruction* IB) {
-  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
+void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args,
+                              DenseMap<DFEdge *, Value *> EdgeBufferMap,
+                              Value *isLastInputBuffer, Value *graphID,
+                              Instruction *IB) {
+  DEBUG(errs() << "Starting Pipeline for child node: "
+               << C->getFuncPointer()->getName() << "\n");
   // Create a filter/pipeline function for the child node
-  Function* C_Pipeline = createFunctionFilter(C);
-  Function* CF = C->getFuncPointer();
+  Function *C_Pipeline = createFunctionFilter(C);
+  Function *CF = C->getFuncPointer();
 
   // Get module context and i32 0 constant, as they would be frequently used in
   // this function.
-  LLVMContext& Ctx = IB->getParent()->getContext();
-  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+  LLVMContext &Ctx = IB->getParent()->getContext();
+  Constant *IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
 
   // Marshall arguments
   // Create a packed struct type with inputs of C followed by outputs and then
   // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
   // by i8*
   //
-  StructType* STy = getArgumentListStructTy(C);
+  StructType *STy = getArgumentListStructTy(C);
   // Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
-  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
-                                  C->getFuncPointer()->getName()+".inputs", IB);
-  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
-  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
+  CallInst *CI =
+      CallInst::Create(malloc, ArrayRef<Value *>(ConstantExpr::getSizeOf(STy)),
+                       C->getFuncPointer()->getName() + ".inputs", IB);
+  CastInst *Struct = BitCastInst::CreatePointerCast(
+      CI, STy->getPointerTo(), CI->getName() + ".i8ptr", IB);
+  // AllocaInst* AI = new AllocaInst(STy,
+  // C->getFuncPointer()->getName()+".inputs", IB);
   // Insert elements in the struct
-  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << "Marshall inputs for child node: "
+               << C->getFuncPointer()->getName() << "\n");
   // Marshall Inputs
-  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
+  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
     // Create constant int (i)
-    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
     // Get Element pointer instruction
-    Value* GEPIndices[] = { IntZero, Int_i };
-    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                             ArrayRef<Value*>(GEPIndices, 2),
-                             Struct->getName()+".arg_"+Twine(i),
-                             IB);
-    DFEdge* E = C->getInDFEdgeAt(i);
+    Value *GEPIndices[] = {IntZero, Int_i};
+    GetElementPtrInst *GEP = GetElementPtrInst::Create(
+        nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+        Struct->getName() + ".arg_" + Twine(i), IB);
+    DFEdge *E = C->getInDFEdgeAt(i);
     if (E->getSourceDF()->isEntryNode()) {
       // This is a Bind Input Edge
-      if(E->isStreamingEdge()) {
+      if (E->isStreamingEdge()) {
         // Streaming Bind Input edge. Get buffer corresponding to it
-        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
+        assert(EdgeBufferMap.count(E) &&
+               "No mapping buffer for a Streaming Bind DFEdge!");
         new StoreInst(EdgeBufferMap[E], GEP, IB);
-      }
-      else {
+      } else {
         // Non-streaming Bind edge
         new StoreInst(Args[i], GEP, IB);
       }
-    }
-    else {
-      // This is an edge between siblings. 
+    } else {
+      // This is an edge between siblings.
       // This must be an streaming edge. As it is our assumption that all edges
       // between two nodes in a DFG are streaming.
-      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
+      assert(EdgeBufferMap.count(E) &&
+             "No mapping buffer for a Streaming DFEdge!");
       new StoreInst(EdgeBufferMap[E], GEP, IB);
     }
   }
   unsigned numInputs = CF->getFunctionType()->getNumParams();
   unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
   // Marshall Outputs
-  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
-  for(unsigned i = 0; i < numOutputs; i++ ) {
+  DEBUG(errs() << "Marshall outputs for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  for (unsigned i = 0; i < numOutputs; i++) {
     // Create constant int (i+numInputs)
-    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
+    Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i + numInputs);
     // Get Element pointer instruction
-    Value* GEPIndices[] = { IntZero, Int_i };
-    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                             ArrayRef<Value*>(GEPIndices, 2),
-                             Struct->getName()+".out_"+Twine(i),
-                             IB);
-    DFEdge* E = C->getOutDFEdgeAt(i);
-    assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes");
-    assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!");
+    Value *GEPIndices[] = {IntZero, Int_i};
+    GetElementPtrInst *GEP = GetElementPtrInst::Create(
+        nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+        Struct->getName() + ".out_" + Twine(i), IB);
+    DFEdge *E = C->getOutDFEdgeAt(i);
+    assert(E->isStreamingEdge() &&
+           "Output Edge must be streaming of all nodes");
+    assert(EdgeBufferMap.count(E) &&
+           "No mapping buffer for a Out Streaming DFEdge!");
     new StoreInst(EdgeBufferMap[E], GEP, IB);
   }
   // Marshall last argument. isLastInput buffer
-  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << "Marshall isLastInput for child node: "
+               << C->getFuncPointer()->getName() << "\n");
   // Create constant int (i+numInputs)
-  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
+  Constant *Int_index =
+      ConstantInt::get(Type::getInt32Ty(Ctx), numInputs + numOutputs);
   // Get Element pointer instruction
-  Value* GEPIndices[] = { IntZero, Int_index };
-  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                           ArrayRef<Value*>(GEPIndices, 2),
-                           Struct->getName()+".isLastInput", IB);
+  Value *GEPIndices[] = {IntZero, Int_index};
+  GetElementPtrInst *GEP = GetElementPtrInst::Create(
+      nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+      Struct->getName() + ".isLastInput", IB);
   new StoreInst(isLastInputBuffer, GEP, IB);
 
   // AllocaInst AI points to memory with all the arguments packed
   // Call runtime to create the thread with these arguments
-  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
-// DEBUG(errs() << *llvm_visc_createThread << "\n");
+  DEBUG(errs() << "Start Thread for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  // DEBUG(errs() << *llvm_hpvm_createThread << "\n");
   DEBUG(errs() << *graphID->getType() << "\n");
   DEBUG(errs() << *C_Pipeline->getType() << "\n");
   DEBUG(errs() << *Struct->getType() << "\n");
   // Bitcast AI to i8*
-  CastInst* BI  = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
-  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
-  CallInst::Create(llvm_visc_createThread, ArrayRef<Value*>(CreateThreadArgs, 3), "", IB);
-
+  CastInst *BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx),
+                                                Struct->getName(), IB);
+  Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI};
+  CallInst::Create(llvm_hpvm_createThread,
+                   ArrayRef<Value *>(CreateThreadArgs, 3), "", IB);
 }
 
-Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
+Function *CGT_X86::createLaunchFunction(DFInternalNode *N) {
   DEBUG(errs() << "Generating Streaming Launch Function\n");
   // Get Function associated with Node N
-  Function* NF = N->getFuncPointer();
+  Function *NF = N->getFuncPointer();
 
-  // Map from Streaming edge to buffer 
-  DenseMap<DFEdge*, Value*> EdgeBufferMap;
+  // Map from Streaming edge to buffer
+  DenseMap<DFEdge *, Value *> EdgeBufferMap;
 
   /* Now we have all the necessary global declarations necessary to generate the
-  * Launch function, pointer to which can be passed to pthread utils to execute
-  * DFG. The Launch function has just one input: i8* data.addr
-  * This is the address of the all the input data that needs to be passed to
-  * this function. In our case it contains the input arguments of the Root
-  * function in the correct order.
-  * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
-  * (2) Extract each of inputs from data.addr
-  * (3) create Buffers for all the streaming edges
-  *     - Put buffers in the context
-  * (4) Go over each child node
-  *     - marshall its arguments together (use buffers in place of streaming
-  *       arguments)
-  *     - Start the threads
-  * (5) The return value from Root is stored in memory, pointer to which is
-  * passed to pthread_exit call.
-  */
+   * Launch function, pointer to which can be passed to pthread utils to execute
+   * DFG. The Launch function has two inputs: i8* data.addr and i8* graphID.
+   * data.addr is the address of all the input data that needs to be passed to
+   * this function. In our case it contains the input arguments of the Root
+   * function in the correct order.
+   * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
+   * (2) Extract each of inputs from data.addr
+   * (3) create Buffers for all the streaming edges
+   *     - Put buffers in the context
+   * (4) Go over each child node
+   *     - marshall its arguments together (use buffers in place of streaming
+   *       arguments)
+   *     - Start the threads
+   * (5) The return value from Root is stored in memory, pointer to which is
+   * passed to pthread_exit call.
+   */
   // (1) Create Launch Function of type void (i8* args, i8* GraphID)
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
-  FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()),
-                                  ArrayRef<Type*>(ArgTypes, 2), false);
-  Function* LaunchFunc = Function::Create(LaunchFuncTy,
-                                       NF->getLinkage(),
-                                       NF->getName()+".LaunchFunction",
-                                       &M);
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  Type *ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
+  FunctionType *LaunchFuncTy = FunctionType::get(
+      Type::getVoidTy(NF->getContext()), ArrayRef<Type *>(ArgTypes, 2), false);
+  Function *LaunchFunc = Function::Create(
+      LaunchFuncTy, NF->getLinkage(), NF->getName() + ".LaunchFunction", &M);
   DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Argument* data = &*LaunchFunc->arg_begin();
+  Argument *data = &*LaunchFunc->arg_begin();
   // NOTE-HS: Check correctness with Maria
-  Argument* graphID = &*(LaunchFunc->arg_begin() + 1);
+  Argument *graphID = &*(LaunchFunc->arg_begin() + 1);
   data->setName("data.addr");
   graphID->setName("graphID");
   // Add a basic block to this empty function and a return null statement to it
   DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
-  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
-  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
-                                      BB);
+  BasicBlock *BB =
+      BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
+  ReturnInst *RI = ReturnInst::Create(LaunchFunc->getContext(), BB);
 
   DEBUG(errs() << "Created Empty Launch Function\n");
 
   // (2) Extract each of inputs from data.addr
-  std::vector<Type*> TyList;
+  std::vector<Type *> TyList;
   std::vector<std::string> names;
-  std::vector<Value*> Args;
+  std::vector<Value *> Args;
 
   for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
-      ai != ae; ++ai) {
-    if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) {
+       ai != ae; ++ai) {
+    if (N->getChildGraph()
+            ->getEntry()
+            ->getOutDFEdgeAt(ai->getArgNo())
+            ->isStreamingEdge()) {
       TyList.push_back(i8Ty->getPointerTo());
-      names.push_back(Twine(ai->getName()+"_buffer").str());
+      names.push_back(Twine(ai->getName() + "_buffer").str());
       continue;
     }
     TyList.push_back(ai->getType());
     names.push_back(ai->getName());
   }
   Args = extractElements(data, TyList, names, RI);
-  DEBUG(errs() <<  "Launch function for " << NF->getName() << *LaunchFunc << "\n");
+  DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc
+               << "\n");
   // (3) Create buffers for all the streaming edges
-  for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
-      de = N->getChildGraph()->dfedge_end(); di != de; ++di) {
-    DFEdge* Edge = *di;
+  for (DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
+                                de = N->getChildGraph()->dfedge_end();
+       di != de; ++di) {
+    DFEdge *Edge = *di;
     DEBUG(errs() << *Edge->getType() << "\n");
-    Value* size = ConstantExpr::getSizeOf(Edge->getType());
-    Value* CallArgs[] = {graphID, size};
+    Value *size = ConstantExpr::getSizeOf(Edge->getType());
+    Value *CallArgs[] = {graphID, size};
     if (Edge->isStreamingEdge()) {
-      CallInst* CI;
+      CallInst *CI;
       // Create a buffer call
-      if(Edge->getSourceDF()->isEntryNode()) {
+      if (Edge->getSourceDF()->isEntryNode()) {
         // Bind Input Edge
-        Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()),
-                                  Edge->getSourcePosition());
-        Value* BindInCallArgs[] = {graphID, size, Int_ArgNo};
-        CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3),
-                              "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(),
-                              RI);
-      }
-      else if(Edge->getDestDF()->isExitNode()) {
+        Constant *Int_ArgNo = ConstantInt::get(
+            Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition());
+        Value *BindInCallArgs[] = {graphID, size, Int_ArgNo};
+        CI = CallInst::Create(
+            llvm_hpvm_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3),
+            "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI);
+      } else if (Edge->getDestDF()->isExitNode()) {
         // Bind Output Edge
-        CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2),
-                              "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(),
-                              RI);
-      }
-      else {
+        CI = CallInst::Create(
+            llvm_hpvm_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2),
+            "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI);
+      } else {
         // Streaming Edge
-        CI = CallInst::Create(llvm_visc_createEdgeBuffer,
-                              ArrayRef<Value*>(CallArgs, 2),
-                              Edge->getSourceDF()->getFuncPointer()->getName()+"."
-                              +Edge->getDestDF()->getFuncPointer()->getName(),
-                              RI);
+        CI = CallInst::Create(
+            llvm_hpvm_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2),
+            Edge->getSourceDF()->getFuncPointer()->getName() + "." +
+                Edge->getDestDF()->getFuncPointer()->getName(),
+            RI);
       }
       EdgeBufferMap[Edge] = CI;
     }
   }
   // Create buffer for isLastInput for all the child nodes
-  DFGraph* G = N->getChildGraph();
-  DenseMap<DFNode*, Value*> NodeLastInputMap;
-  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
-    DFNode* child = *ci;
-    if(child->isDummyNode())
+  DFGraph *G = N->getChildGraph();
+  DenseMap<DFNode *, Value *> NodeLastInputMap;
+  for (DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce;
+       ++ci) {
+    DFNode *child = *ci;
+    if (child->isDummyNode())
       continue;
-    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
-    Value* CallArgs[] = {graphID, size};
-    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
-                              "BindIn.isLastInput."+child->getFuncPointer()->getName(),
-                              RI);
+    Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
+    Value *CallArgs[] = {graphID, size};
+    CallInst *CI = CallInst::Create(
+        llvm_hpvm_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2),
+        "BindIn.isLastInput." + child->getFuncPointer()->getName(), RI);
     NodeLastInputMap[child] = CI;
   }
-  DEBUG(errs() <<  "Start Each child node filter\n");
+  DEBUG(errs() << "Start Each child node filter\n");
   // (4) Marshall arguments for each child node and start the thread with its
   //     pipeline funtion
-  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-    DFNode* C = *ci;
+  for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                  ce = N->getChildGraph()->end();
+       ci != ce; ++ci) {
+    DFNode *C = *ci;
     // Skip dummy node call
     if (C->isDummyNode())
       continue;
-    
+
     // Marshall all the arguments for this node into an i8*
     // Pass to the runtime to create the thread
     // Start the thread for child node C
@@ -617,7 +633,6 @@ Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
   return LaunchFunc;
 }
 
-
 /* This fuction does the steps necessary to launch a streaming graph
  * Steps
  * Create Pipeline/Filter function for each node in child graph of Root
@@ -625,167 +640,158 @@ Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
  * Modify each of the instrinsic in host code
  * Launch, Push, Pop, Wait
  */
-void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
-  IntrinsicInst* LI = Root->getInstruction();
-  Function* RootLaunch = createLaunchFunction(Root);
+void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) {
+  IntrinsicInst *LI = Root->getInstruction();
+  Function *RootLaunch = createLaunchFunction(Root);
   // Substitute launch intrinsic main
-  DEBUG(errs() <<  "Substitute launch intrinsic\n");
-  Value* LaunchInstArgs[] = {RootLaunch,
-                             LI->getArgOperand(1)
-                            };
-  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
-                                          ArrayRef<Value*>(LaunchInstArgs,2),
-                                          "graph"+Root->getFuncPointer()->getName(), LI);
+  DEBUG(errs() << "Substitute launch intrinsic\n");
+  Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)};
+  CallInst *LaunchInst = CallInst::Create(
+      llvm_hpvm_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2),
+      "graph" + Root->getFuncPointer()->getName(), LI);
 
   DEBUG(errs() << *LaunchInst << "\n");
   // Replace all wait instructions with x86 specific wait instructions
-  DEBUG(errs() <<  "Substitute wait, push, pop intrinsics\n");
-  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
-  for(unsigned i=0; i < UseList->size(); ++i) {
-    IntrinsicInst* II = UseList->at(i);
-    CallInst* CI;
-    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
-    switch(II->getIntrinsicID()) {
-    case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_streamWait,
-                            ArrayRef<Value*>(LaunchInst),
+  DEBUG(errs() << "Substitute wait, push, pop intrinsics\n");
+  std::vector<IntrinsicInst *> *UseList = getUseList(LI);
+  for (unsigned i = 0; i < UseList->size(); ++i) {
+    IntrinsicInst *II = UseList->at(i);
+    CallInst *CI;
+    Value *PushArgs[] = {LaunchInst, II->getOperand(1)};
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::hpvm_wait:
+      CI = CallInst::Create(llvm_hpvm_streamWait, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
-    case Intrinsic::visc_push:
-      CI = CallInst::Create(llvm_visc_streamPush,
-                            ArrayRef<Value*>(PushArgs, 2),
-                            "");
+    case Intrinsic::hpvm_push:
+      CI = CallInst::Create(llvm_hpvm_streamPush,
+                            ArrayRef<Value *>(PushArgs, 2), "");
       break;
-    case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_streamPop,
-                            ArrayRef<Value*>(LaunchInst),
+    case Intrinsic::hpvm_pop:
+      CI = CallInst::Create(llvm_hpvm_streamPop, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     default:
-      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+      llvm_unreachable(
+          "GraphID is used by an instruction other than wait, push, pop");
     };
     DEBUG(errs() << "Replace:\n\t" << *II << "\n");
     ReplaceInstWithInst(II, CI);
     DEBUG(errs() << "\twith " << *CI << "\n");
   }
-
-
 }
 
-void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
+void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
   // TODO: Place an assert to check if the constant passed by launch intrinsic
   // as the number of arguments to DFG is same as the number of arguments of the
   // root of DFG
   DEBUG(errs() << "Generating Launch Function\n");
   // Get Launch Instruction
-  IntrinsicInst* LI = Root->getInstruction();
-  switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
+  IntrinsicInst *LI = Root->getInstruction();
+  switchToTimer(hpvm_TimerID_PTHREAD_CREATE, LI);
   DEBUG(errs() << "Generating Launch Function\n");
 
   /* Now we have all the necessary global declarations necessary to generate the
-  * Launch function, pointer to which can be passed to pthread utils to execute
-  * DFG. The Launch function has just one input: i8* data.addr
-  * This is the address of the all the input data that needs to be passed to
-  * this function. In our case it contains the input arguments of the Root
-  * function in the correct order.
-  * (1) Create an empty Launch function of type i8*(i8*)
-  * (2) Extract each of inputs from data.addr and pass them as arguments to the
-  * call to Root function
-  * (3) The return value from Root is stored in memory, pointer to which is
-  * passed to pthread_exit call.
-  */
+   * Launch function, pointer to which can be passed to pthread utils to execute
+   * DFG. The Launch function has just one input: i8* data.addr
+   * This is the address of all the input data that needs to be passed to
+   * this function. In our case it contains the input arguments of the Root
+   * function in the correct order.
+   * (1) Create an empty Launch function of type i8*(i8*)
+   * (2) Extract each of inputs from data.addr and pass them as arguments to the
+   * call to Root function
+   * (3) The return value from Root is stored in memory, pointer to which is
+   * passed to pthread_exit call.
+   */
   // Create Launch Function of type i8*(i8*) which calls the root function
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
-                            ArrayRef<Type*>(i8Ty->getPointerTo()),
-                            false);
-  Function* AppFunc = Function::Create(AppFuncTy,
-                                       Root->getFuncPointer()->getLinkage(),
-                                       "LaunchDataflowGraph",
-                                       &M);
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType *AppFuncTy = FunctionType::get(
+      i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
+  Function *AppFunc =
+      Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(),
+                       "LaunchDataflowGraph", &M);
   DEBUG(errs() << "Generating Launch Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Value* data = &*AppFunc->arg_begin();
+  Value *data = &*AppFunc->arg_begin();
   data->setName("data.addr");
   // Add a basic block to this empty function and a return null statement to it
   BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
-  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
-                                      Constant::getNullValue(AppFunc->getReturnType()),
-                                      BB);
-  switchToTimer(visc_TimerID_ARG_UNPACK, RI);
+  ReturnInst *RI =
+      ReturnInst::Create(AppFunc->getContext(),
+                         Constant::getNullValue(AppFunc->getReturnType()), BB);
+  switchToTimer(hpvm_TimerID_ARG_UNPACK, RI);
 
   DEBUG(errs() << "Created Empty Launch Function\n");
   // Find the X86 function generated for Root and
-//  Function* RootF_X86 = Root->getGenFunc();
-  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
+  //  Function* RootF_X86 = Root->getGenFunc();
+  Function *RootF_X86 = Root->getGenFuncForTarget(hpvm::CPU_TARGET);
   assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
-  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+  assert(Root->hasX86GenFuncForTarget(hpvm::CPU_TARGET) &&
          "Error: Generated Function for Root node with no x86 wrapper\n");
 
   // Generate a call to RootF_X86 with null parameters for now
-  std::vector<Value*>Args;
-  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
-    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
+  std::vector<Value *> Args;
+  for (unsigned i = 0; i < RootF_X86->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(
+        Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
   }
-  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);
+  CallInst *CI =
+      CallInst::Create(RootF_X86, Args, RootF_X86->getName() + ".output", RI);
 
   // Extract input data from i8* data.addr and patch them to correct argument of
   // call to RootF_X86. For each argument
-  std::vector<Type*> TyList;
+  std::vector<Type *> TyList;
   std::vector<std::string> names;
-  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
-      ai != ae; ++ai) {
+  for (Function::arg_iterator ai = RootF_X86->arg_begin(),
+                              ae = RootF_X86->arg_end();
+       ai != ae; ++ai) {
     TyList.push_back(ai->getType());
     names.push_back(ai->getName());
   }
-  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
+  std::vector<Value *> elements = extractElements(data, TyList, names, CI);
   // Patch the elements to the call arguments
-  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
+  for (unsigned i = 0; i < CI->getNumArgOperands(); i++)
     CI->setArgOperand(i, elements[i]);
 
   // Add timers around Call to RootF_X86 function
-  switchToTimer(visc_TimerID_COMPUTATION, CI);
-  switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
+  switchToTimer(hpvm_TimerID_COMPUTATION, CI);
+  switchToTimer(hpvm_TimerID_OUTPUT_PACK, RI);
 
-  StructType *RootRetTy = cast<StructType>(RootF_X86->getFunctionType()->getReturnType());
+  StructType *RootRetTy =
+      cast<StructType>(RootF_X86->getFunctionType()->getReturnType());
 
-  // if Root has non empty return 
+  // if Root has non empty return
   if (RootRetTy->getNumElements()) {
     // We can't access the type of the arg struct - build it
-    std::vector<Type*> TyList;
-    for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
-        ai != ae; ++ai) {
+    std::vector<Type *> TyList;
+    for (Function::arg_iterator ai = RootF_X86->arg_begin(),
+                                ae = RootF_X86->arg_end();
+         ai != ae; ++ai) {
       TyList.push_back(ai->getType());
     }
     TyList.push_back(CI->getType());
 
-    StructType* ArgStructTy = StructType::create(M.getContext(),
-                                                 ArrayRef<Type*>(TyList),
-                                 (RootF_X86->getName()+".arg.struct.ty").str(), true);
+    StructType *ArgStructTy = StructType::create(
+        M.getContext(), ArrayRef<Type *>(TyList),
+        (RootF_X86->getName() + ".arg.struct.ty").str(), true);
 
     // Cast the data pointer to the type of the arg struct
-    CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
-                                 ArgStructTy->getPointerTo(),
-                                 "argStructCast.addr",
-                                 RI);
+    CastInst *OutputAddrCast = CastInst::CreatePointerCast(
+        data, ArgStructTy->getPointerTo(), "argStructCast.addr", RI);
 
     // Result struct is the last element of the packed struct passed to launch
     unsigned outStructIdx = ArgStructTy->getNumElements() - 1;
 
-    ConstantInt *IntZero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
-    ConstantInt *IntIdx = ConstantInt::get(Type::getInt32Ty(M.getContext()),
-                                          outStructIdx);
+    ConstantInt *IntZero =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+    ConstantInt *IntIdx =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), outStructIdx);
 
-    Value* GEPIIdxList[] = { IntZero,
-                             IntIdx
-                           };
+    Value *GEPIIdxList[] = {IntZero, IntIdx};
     // Get data pointer to the last element of struct - result field
-    GetElementPtrInst *OutGEPI =
-      GetElementPtrInst::Create(ArgStructTy,
-                                OutputAddrCast,
-                                ArrayRef<Value*>(GEPIIdxList, 2),
-                                CI->getName()+".addr",
-                                RI);
+    GetElementPtrInst *OutGEPI = GetElementPtrInst::Create(
+        ArgStructTy, OutputAddrCast, ArrayRef<Value *>(GEPIIdxList, 2),
+        CI->getName() + ".addr", RI);
     // Store result there
     new StoreInst(CI, OutGEPI, RI);
   } else {
@@ -794,117 +800,111 @@ void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
     // We were casting the data pointer to the result type of Root, and
     // returning result there. This would work at the LLVM level, but not
     // at the C level, thus the rewrite.
-    CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
-                               CI->getType()->getPointerTo(),
-                               CI->getName()+".addr",
-                               RI);
+    CastInst *OutputAddrCast = CastInst::CreatePointerCast(
+        data, CI->getType()->getPointerTo(), CI->getName() + ".addr", RI);
     new StoreInst(CI, OutputAddrCast, RI);
   }
 
-  switchToTimer(visc_TimerID_NONE, RI);
+  switchToTimer(hpvm_TimerID_NONE, RI);
 
   DEBUG(errs() << "Application specific function:\n");
   DEBUG(errs() << *AppFunc << "\n");
 
   // Substitute launch intrinsic main
-  Value* LaunchInstArgs[] = {AppFunc,
-                             LI->getArgOperand(1)
-                            };
-  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
-                                          ArrayRef<Value*>(LaunchInstArgs,2),
-                                          "graph"+Root->getFuncPointer()->getName(), LI);
-  //ReplaceInstWithInst(LI, LaunchInst);
+  Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)};
+  CallInst *LaunchInst = CallInst::Create(
+      llvm_hpvm_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2),
+      "graph" + Root->getFuncPointer()->getName(), LI);
+  // ReplaceInstWithInst(LI, LaunchInst);
 
   DEBUG(errs() << *LaunchInst << "\n");
   // Replace all wait instructions with x86 specific wait instructions
-  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
-  for(unsigned i=0; i < UseList->size(); ++i) {
-    IntrinsicInst* II = UseList->at(i);
-    CallInst* CI;
-    switch(II->getIntrinsicID()) {
-    case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_x86_wait,
-                            ArrayRef<Value*>(LaunchInst),
+  std::vector<IntrinsicInst *> *UseList = getUseList(LI);
+  for (unsigned i = 0; i < UseList->size(); ++i) {
+    IntrinsicInst *II = UseList->at(i);
+    CallInst *CI;
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::hpvm_wait:
+      CI = CallInst::Create(llvm_hpvm_x86_wait, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
-    case Intrinsic::visc_push:
-      CI = CallInst::Create(llvm_visc_bufferPush,
-                            ArrayRef<Value*>(LaunchInst),
+    case Intrinsic::hpvm_push:
+      CI = CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
-    case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_bufferPop,
-                            ArrayRef<Value*>(LaunchInst),
+    case Intrinsic::hpvm_pop:
+      CI = CallInst::Create(llvm_hpvm_bufferPop, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     default:
-      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+      llvm_unreachable(
+          "GraphID is used by an instruction other than wait, push, pop");
     };
     ReplaceInstWithInst(II, CI);
     DEBUG(errs() << *CI << "\n");
   }
-
 }
 
-Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
+Value *CGT_X86::getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86,
+                             Instruction *InsertBefore) {
   // TODO: Assumption is that each input port of a node has just one
   // incoming edge. May change later on.
 
   // Find the incoming edge at the requested input port
-  DFEdge* E = Child->getInDFEdgeAt(i);
+  DFEdge *E = Child->getInDFEdgeAt(i);
   assert(E && "No incoming edge or binding for input element!");
   // Find the Source DFNode associated with the incoming edge
-  DFNode* SrcDF = E->getSourceDF();
+  DFNode *SrcDF = E->getSourceDF();
 
   // If Source DFNode is a dummyNode, edge is from parent. Get the
   // argument from argument list of this internal node
-  Value* inputVal;
-  if(SrcDF->isEntryNode()) {
+  Value *inputVal;
+  if (SrcDF->isEntryNode()) {
     inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
-    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-  }
-  else {
+    DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+  } else {
     // edge is from a sibling
     // Check - code should already be generated for this source dfnode
-    assert(OutputMap.count(SrcDF)
-           && "Source node call not found. Dependency violation!");
+    assert(OutputMap.count(SrcDF) &&
+           "Source node call not found. Dependency violation!");
 
     // Find CallInst associated with the Source DFNode using OutputMap
-    Value* CI = OutputMap[SrcDF];
+    Value *CI = OutputMap[SrcDF];
 
     // Extract element at source position from this call instruction
     std::vector<unsigned> IndexList;
     IndexList.push_back(E->getSourcePosition());
-    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                           "", InsertBefore);
+    DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
+    ExtractValueInst *EI =
+        ExtractValueInst::Create(CI, IndexList, "", InsertBefore);
     inputVal = EI;
   }
   return inputVal;
 }
 
-void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
-                              ValueToValueMapTy &VMap,Instruction* IB) {
-  Function* CF = C->getFuncPointer();
-
-//  Function* CF_X86 = C->getGenFunc();
-  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
-  assert(CF_X86 != NULL
-         && "Found leaf node for which code generation has not happened yet!\n");
-  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
-         "The generated function to be called from x86 backend is not an x86 function\n");
+void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86,
+                              ValueToValueMapTy &VMap, Instruction *IB) {
+  Function *CF = C->getFuncPointer();
+
+  //  Function* CF_X86 = C->getGenFunc();
+  Function *CF_X86 = C->getGenFuncForTarget(hpvm::CPU_TARGET);
+  assert(CF_X86 != NULL &&
+         "Found leaf node for which code generation has not happened yet!\n");
+  assert(C->hasX86GenFuncForTarget(hpvm::CPU_TARGET) &&
+         "The generated function to be called from x86 backend is not an x86 "
+         "function\n");
   DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
 
-  std::vector<Value*> Args;
+  std::vector<Value *> Args;
   // Create argument list to pass to call instruction
   // First find the correct values using the edges
   // The remaing six values are inserted as constants for now.
-  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
     Args.push_back(getInValueAt(C, i, F_X86, IB));
   }
 
-  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
-  for(unsigned j=0; j<6; j++)
+  Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
+  for (unsigned j = 0; j < 6; j++)
     Args.push_back(I64Zero);
 
   errs() << "Gen Function type: " << *CF_X86->getType() << "\n";
@@ -912,9 +912,8 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
   errs() << "Arguments: " << Args.size() << "\n";
 
   // Call the F_X86 function associated with this node
-  CallInst* CI = CallInst::Create(CF_X86, Args,
-                                  CF_X86->getName()+"_output",
-                                  IB);
+  CallInst *CI =
+      CallInst::Create(CF_X86, Args, CF_X86->getName() + "_output", IB);
   DEBUG(errs() << *CI << "\n");
   OutputMap[C] = CI;
 
@@ -922,55 +921,56 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
   // Based on number of dimensions, insert loop instructions
   std::string varNames[3] = {"x", "y", "z"};
   unsigned numArgs = CI->getNumArgOperands();
-  for(unsigned j=0; j < C->getNumOfDim(); j++) {
-    Value* indexLimit = NULL;
+  for (unsigned j = 0; j < C->getNumOfDim(); j++) {
+    Value *indexLimit = NULL;
     // Limit can either be a constant or an arguement of the internal node.
     // In case of constant we can use that constant value directly in the
     // new F_X86 function. In case of an argument, we need to get the mapped
     // value using VMap
-    if(isa<Constant>(C->getDimLimits()[j])) {
+    if (isa<Constant>(C->getDimLimits()[j])) {
       indexLimit = C->getDimLimits()[j];
       DEBUG(errs() << "In Constant case:\n"
-             << "  indexLimit type = " << *indexLimit->getType() << "\n");
-    }
-    else {
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    } else {
       indexLimit = VMap[C->getDimLimits()[j]];
       DEBUG(errs() << "In VMap case:"
-             <<"  indexLimit type = " << *indexLimit->getType() << "\n");
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
     }
     assert(indexLimit && "Invalid dimension limit!");
     // Insert loop
-    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
+    Value *indexVar = addLoop(CI, indexLimit, varNames[j]);
     DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
     // Insert index variable and limit arguments
-    CI->setArgOperand(numArgs-6+j, indexVar);
-    CI->setArgOperand(numArgs-3+j, indexLimit);
+    CI->setArgOperand(numArgs - 6 + j, indexVar);
+    CI->setArgOperand(numArgs - 3 + j, indexLimit);
   }
   // Insert call to runtime to push the dim limits and instanceID on the depth
   // stack
-  Value* args[] = {
-    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
-    CI->getArgOperand(numArgs-3+0), // limitX
-    CI->getArgOperand(numArgs-6+0), // iX
-    CI->getArgOperand(numArgs-3+1), // limitY
-    CI->getArgOperand(numArgs-6+1), // iY
-    CI->getArgOperand(numArgs-3+2), // limitZ
-    CI->getArgOperand(numArgs-6+2)  // iZ
+  Value *args[] = {
+      ConstantInt::get(Type::getInt32Ty(CI->getContext()),
+                       C->getNumOfDim()), // numDim
+      CI->getArgOperand(numArgs - 3 + 0), // limitX
+      CI->getArgOperand(numArgs - 6 + 0), // iX
+      CI->getArgOperand(numArgs - 3 + 1), // limitY
+      CI->getArgOperand(numArgs - 6 + 1), // iY
+      CI->getArgOperand(numArgs - 3 + 2), // limitZ
+      CI->getArgOperand(numArgs - 6 + 2)  // iZ
   };
 
-  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
+  CallInst *Push = CallInst::Create(llvm_hpvm_x86_dstack_push,
+                                    ArrayRef<Value *>(args, 7), "", CI);
   DEBUG(errs() << "Push on stack: " << *Push << "\n");
   // Insert call to runtime to pop the dim limits and instanceID from the depth
   // stack
   BasicBlock::iterator i(CI);
   ++i;
-  Instruction* NextI = &*i;
+  Instruction *NextI = &*i;
   // Next Instruction should also belong to the same basic block as the basic
   // block will have a terminator instruction
-  assert(NextI->getParent() == CI->getParent()
-         && "Next Instruction should also belong to the same basic block!");
+  assert(NextI->getParent() == CI->getParent() &&
+         "Next Instruction should also belong to the same basic block!");
 
-  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
+  CallInst *Pop = CallInst::Create(llvm_hpvm_x86_dstack_pop, None, "", NextI);
   DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
   DEBUG(errs() << *CI->getParent()->getParent());
 }
@@ -991,34 +991,33 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
 // Add runtime API calls to push output for each of the streaming outputs
 // Add loop around the basic block, which exits the loop if isLastInput is false
 
-Function* CGT_X86::createFunctionFilter(DFNode* C) {
-  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");
+Function *CGT_X86::createFunctionFilter(DFNode *C) {
+  DEBUG(errs() << "*********Creating Function filter for "
+               << C->getFuncPointer()->getName() << "*****\n");
 
   /* Create a function with same argument list as child.*/
   DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
   // Get the generated function for child node
-  Function* CF = C->getFuncPointer();
+  Function *CF = C->getFuncPointer();
   // Create Filter Function of type i8*(i8*) which calls the root function
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
-                                ArrayRef<Type*>(i8Ty->getPointerTo()),
-                                false);
-  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
-                          CF->getLinkage(),
-                          CF->getName()+"_Pipeline",
-                          &M);
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType *CF_PipelineTy = FunctionType::get(
+      i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
+  Function *CF_Pipeline = Function::Create(CF_PipelineTy, CF->getLinkage(),
+                                           CF->getName() + "_Pipeline", &M);
   DEBUG(errs() << "Generating Pipeline Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Value* data = &*CF_Pipeline->arg_begin();
+  Value *data = &*CF_Pipeline->arg_begin();
   data->setName("data.addr");
   // Create a new basic block
   DEBUG(errs() << "\tCreate new BB and add a return function\n");
   // Add a basic block to this empty function
-  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
+  BasicBlock *BB =
+      BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
   // Add a return instruction to the basic block
-  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
-                                      UndefValue::get(CF_Pipeline->getReturnType()), BB);
-
+  ReturnInst *RI =
+      ReturnInst::Create(CF_Pipeline->getContext(),
+                         UndefValue::get(CF_Pipeline->getReturnType()), BB);
 
   /* Extract the elements from the aggregate argument to the function.
    * Replace the streaming inputs with i8* types signifying handle to
@@ -1029,25 +1028,24 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
   // These Args will be used when passing arguments to the generated function
   // inside loop, and reading outputs as well.
-  std::vector<Value*> Args;
-  std::vector<Type*> TyList;
+  std::vector<Value *> Args;
+  std::vector<Type *> TyList;
   std::vector<std::string> names;
   // Adding inputs
-  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
-       i != e; ++i) {
-    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
+       ++i) {
+    if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
       TyList.push_back(i8Ty->getPointerTo());
-      names.push_back((Twine(i->getName())+"_buffer").str());
-    }
-    else {
+      names.push_back((Twine(i->getName()) + "_buffer").str());
+    } else {
       TyList.push_back(i->getType());
       names.push_back(i->getName());
     }
   }
   // Adding outputs. FIXME: Since we assume all outputs to be streaming edges,
   // because we get there buffer handles
-  StructType* RetTy = cast<StructType>(CF->getReturnType());
-  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
+  StructType *RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i = 0; i < RetTy->getNumElements(); i++) {
     TyList.push_back(i8Ty->getPointerTo());
     names.push_back("out");
   }
@@ -1056,66 +1054,54 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   TyList.push_back(i8Ty->getPointerTo());
   names.push_back("isLastInput_buffer");
 
-  // Extract the inputs, outputs 
+  // Extract the inputs, outputs
   Args = extractElements(data, TyList, names, RI);
-  for(unsigned i=0; i<Args.size(); i++) {
+  for (unsigned i = 0; i < Args.size(); i++) {
     DEBUG(errs() << *Args[i] << "\n");
   }
 
   // Split the Args vector into, input output and isLastInput
   unsigned numInputs = CF->getFunctionType()->getNumParams();
   unsigned numOutputs = RetTy->getNumElements();
-  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
-  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
-  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
+  std::vector<Value *> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value *> OutputArgs(Args.begin() + numInputs,
+                                  Args.begin() + numInputs + numOutputs);
+  Instruction *isLastInput = cast<Instruction>(Args[Args.size() - 1]);
 
   /* Add runtime API calls to get input for each of the streaming input edges */
-  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the "
+                  "streaming input edges\n");
   // First read the termination condition variable islastInput
-  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
-                                        ArrayRef<Value*>(isLastInput),
-                                        "",
-                                        RI);
-
-  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
-                 Type::getInt64Ty(CF_Pipeline->getContext()),
-                 false,
-                 "isLastInput",
-                 RI);
+  CallInst *isLastInputPop = CallInst::Create(
+      llvm_hpvm_bufferPop, ArrayRef<Value *>(isLastInput), "", RI);
+
+  CastInst *BI = BitCastInst::CreateIntegerCast(
+      isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false,
+      "isLastInput", RI);
   isLastInput = BI;
   // Create a loop termination condition
-  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
-      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
-      RI);
+  CmpInst *Cond = CmpInst::Create(
+      Instruction::ICmp, CmpInst::ICMP_NE, isLastInput,
+      Constant::getNullValue(Type::getInt64Ty(CF->getContext())),
+      "isLastInputNotZero", RI);
 
   // Get input from buffers of all the incoming streaming edges
-  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
-       i != e; ++i) {
-    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
-      CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop,
-                                            ArrayRef<Value*>(InputArgs[i->getArgNo()]),
-                                            "",
-                                            RI);
-      CastInst* BI;
-      if(i->getType()->isPointerTy()) {
-        BI = CastInst::Create(CastInst::IntToPtr,
-                              bufferIn,
-                              i->getType(),
-                              i->getName()+".addr",
-                              RI);
-      }
-      else if(i->getType()->isFloatTy()) {
-        BI = CastInst::CreateFPCast(bufferIn,
-                                    i->getType(),
-                                    i->getName()+".addr",
-                                    RI);
-      }
-      else {
-        BI = CastInst::CreateIntegerCast(bufferIn,
-                                         i->getType(),
-                                         false,
-                                         i->getName()+".addr",
-                                         RI);
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
+       ++i) {
+    if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      CallInst *bufferIn =
+          CallInst::Create(llvm_hpvm_bufferPop,
+                           ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI);
+      CastInst *BI;
+      if (i->getType()->isPointerTy()) {
+        BI = CastInst::Create(CastInst::IntToPtr, bufferIn, i->getType(),
+                              i->getName() + ".addr", RI);
+      } else if (i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn, i->getType(),
+                                    i->getName() + ".addr", RI);
+      } else {
+        BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), false,
+                                         i->getName() + ".addr", RI);
       }
       // Replace the argument in Args vector. We would be using the vector as
       // parameters passed to the call
@@ -1124,46 +1110,40 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   }
   /* Add a call to the generated function of the child node */
   DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
-//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
-//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
-//                                  C->getGenFunc()->getName()+".output", RI);
-  Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
-  DEBUG(errs() << "Type: "
-               << *CGenF->getType()
-               << "\n");
-  CallInst* CI = CallInst::Create(CGenF,
-                                  InputArgs,
-                                  CGenF->getName()+".output",
-                                  RI);
+  //  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+  //  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+  //                                  C->getGenFunc()->getName()+".output", RI);
+  Function *CGenF = C->getGenFuncForTarget(hpvm::CPU_TARGET);
+  DEBUG(errs() << "Type: " << *CGenF->getType() << "\n");
+  CallInst *CI =
+      CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI);
 
   /* Add runtime API calls to push output for each of the streaming outputs */
   // FIXME: Assumption
   // All edges between siblings are streaming edges
-  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n");
-  for (unsigned i=0; i< numOutputs; i++) {
+  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the "
+                  "streaming outputs\n");
+  for (unsigned i = 0; i < numOutputs; i++) {
     // Extract output
-    ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i),
-                           "",RI);
+    ExtractValueInst *EI =
+        ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), "", RI);
     // Convert to i64
-    CastInst* BI;
-    if(EI->getType()->isPointerTy())
-      BI = CastInst::Create(CastInst::PtrToInt,EI,
-                            Type::getInt64Ty(CF_Pipeline->getContext()),
-                            "",
-                            RI);
+    CastInst *BI;
+    if (EI->getType()->isPointerTy())
+      BI =
+          CastInst::Create(CastInst::PtrToInt, EI,
+                           Type::getInt64Ty(CF_Pipeline->getContext()), "", RI);
     else
-      BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()),
-                                       false, "", RI);
+      BI = CastInst::CreateIntegerCast(
+          EI, Type::getInt64Ty(CF_Pipeline->getContext()), false, "", RI);
     // Push to Output buffer
-    Value* bufferOutArgs[] = {OutputArgs[i], BI};
-    CallInst::Create(llvm_visc_bufferPush,
-                                           ArrayRef<Value*>(bufferOutArgs, 2),
-                                           "",
-                                           RI);
+    Value *bufferOutArgs[] = {OutputArgs[i], BI};
+    CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2),
+                     "", RI);
   }
 
-  // Add loop around the basic block, which exits the loop if isLastInput is false
-  // Pointers to keep the created loop structure
+  // Add loop around the basic block, which exits the loop if isLastInput is
+  // false. Pointers to keep the created loop structure
   BasicBlock *EntryBB, *CondBB, *BodyBB;
   Instruction *CondStartI = cast<Instruction>(isLastInputPop);
   Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
@@ -1177,23 +1157,23 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   return CF_Pipeline;
 }
 
-void CGT_X86::codeGen(DFInternalNode* N) {
+void CGT_X86::codeGen(DFInternalNode *N) {
   // Check if N is root node and its graph is streaming. We do not do codeGen
   // for Root in such a case
-  if(N->isRoot() && N->isChildGraphStreaming())
+  if (N->isRoot() && N->isChildGraphStreaming())
     return;
 
   // Check if clone already exists. If it does, it means we have visited this
   // function before and nothing else needs to be done for this leaf node.
-//  if(N->getGenFunc() != NULL)
-//    return;
-  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
-    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
-              " : skipping it\n";
+  //  if(N->getGenFunc() != NULL)
+  //    return;
+  if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) {
+    DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
+                 << " : skipping it\n");
     return;
   }
 
-  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+  assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
          "Error: Visiting a node for which code already generated\n");
 
   // Sort children in topological order before code generation
@@ -1202,14 +1182,15 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   // Only process if all children have a CPU x86 function
   // Otherwise skip to end
   bool codeGen = true;
-  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-    DFNode* C = *ci;
+  for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                  ce = N->getChildGraph()->end();
+       ci != ce; ++ci) {
+    DFNode *C = *ci;
     // Skip dummy node call
     if (C->isDummyNode())
       continue;
 
-    if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
+    if (!(C->hasX86GenFuncForTarget(hpvm::CPU_TARGET))) {
       errs() << "No CPU x86 version for child node "
              << C->getFuncPointer()->getName()
              << "\n  Skip code gen for parent node "
@@ -1219,17 +1200,18 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   }
 
   if (codeGen) {
-    Function* F = N->getFuncPointer();
+    Function *F = N->getFuncPointer();
     // Create of clone of F with no instructions. Only the type is the same as F
     // without the extra arguments.
-    Function* F_X86;
-  
+    Function *F_X86;
+
     // Clone the function, if we are seeing this function for the first time. We
     // only need a clone in terms of type.
     ValueToValueMapTy VMap;
-  
+
     // Create new function with the same type
-    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(),
+                             F->getName(), &M);
 
     // Loop over the arguments, copying the names of arguments over.
     Function::arg_iterator dest_iterator = F_X86->arg_begin();
@@ -1242,20 +1224,20 @@ void CGT_X86::codeGen(DFInternalNode* N) {
 
     // Add a basic block to this empty function
     BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
-    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
-                                        UndefValue::get(F_X86->getReturnType()), BB);
+    ReturnInst *RI = ReturnInst::Create(
+        F_X86->getContext(), UndefValue::get(F_X86->getReturnType()), BB);
 
-    // Add Index and Dim arguments except for the root node and the child graph of
-    // parent node is not streaming
-    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+    // Add Index and Dim arguments, unless this is the root node or the child
+    // graph of its parent node is streaming
+    if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
       F_X86 = addIdxDimArgs(F_X86);
 
     BB = &*F_X86->begin();
     RI = cast<ReturnInst>(BB->getTerminator());
-  
-    //Add generated function info to DFNode
-//    N->setGenFunc(F_X86, visc::CPU_TARGET);
-    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+    // Add generated function info to DFNode
+    //    N->setGenFunc(F_X86, hpvm::CPU_TARGET);
+    N->addGenFunc(F_X86, hpvm::CPU_TARGET, true);
 
     // Loop over the arguments, to create the VMap.
     dest_iterator = F_X86->arg_begin();
@@ -1267,59 +1249,59 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     }
 
     // Iterate over children in topological order
-    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-      DFNode* C = *ci;
+    for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                    ce = N->getChildGraph()->end();
+         ci != ce; ++ci) {
+      DFNode *C = *ci;
       // Skip dummy node call
       if (C->isDummyNode())
         continue;
-  
+
       // Create calls to CPU function of child node
       invokeChild_X86(C, F_X86, VMap, RI);
-  
     }
- 
+
     DEBUG(errs() << "*** Generating epilogue code for the function****\n");
     // Generate code for output bindings
     // Get Exit node
-    DFNode* C = N->getChildGraph()->getExit();
+    DFNode *C = N->getChildGraph()->getExit();
     // Get OutputType of this node
-    StructType* OutTy = N->getOutputType();
+    StructType *OutTy = N->getOutputType();
     Value *retVal = UndefValue::get(F_X86->getReturnType());
     // Find all the input edges to exit node
-    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+    for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
       DEBUG(errs() << "Output Edge " << i << "\n");
       // Find the incoming edge at the requested input port
-      DFEdge* E = C->getInDFEdgeAt(i);
-  
+      DFEdge *E = C->getInDFEdgeAt(i);
+
       assert(E && "No Binding for output element!");
       // Find the Source DFNode associated with the incoming edge
-      DFNode* SrcDF = E->getSourceDF();
-  
-      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
-  
+      DFNode *SrcDF = E->getSourceDF();
+
+      DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
+                   << "\n");
+
       // If Source DFNode is a dummyNode, edge is from parent. Get the
       // argument from argument list of this internal node
-      Value* inputVal;
-      if(SrcDF->isEntryNode()) {
+      Value *inputVal;
+      if (SrcDF->isEntryNode()) {
         inputVal = getArgumentAt(F_X86, i);
-        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-      }
-      else {
+        DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+      } else {
         // edge is from a internal node
         // Check - code should already be generated for this source dfnode
-        assert(OutputMap.count(SrcDF)
-               && "Source node call not found. Dependency violation!");
-  
+        assert(OutputMap.count(SrcDF) &&
+               "Source node call not found. Dependency violation!");
+
         // Find Output Value associated with the Source DFNode using OutputMap
-        Value* CI = OutputMap[SrcDF];
-  
+        Value *CI = OutputMap[SrcDF];
+
         // Extract element at source position from this call instruction
         std::vector<unsigned> IndexList;
         IndexList.push_back(E->getSourcePosition());
-        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                               "",RI);
+        DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI
+                     << "\n");
+        ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
         inputVal = EI;
       }
       std::vector<unsigned> IdxList;
@@ -1328,9 +1310,8 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     }
     DEBUG(errs() << "Extracted all\n");
     retVal->setName("output");
-    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal);
     ReplaceInstWithInst(RI, newRI);
-
   }
 
   //-------------------------------------------------------------------------//
@@ -1339,88 +1320,83 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   // If not, we see which version exists, check that it is in fact an x86
   // function and save it as the CPU_TARGET function
 
-  // TODO: visc_id per node, so we can use this for id for policies
+  // TODO: hpvm_id per node, so we can use this for id for policies
   // For now, use node function name and change it later
-  Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-  Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
+  Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
+  Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
 
-  bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-  bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
+  bool CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET);
+  bool GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET);
 
-	DEBUG(errs() << "Before editing\n");
-  DEBUG(errs() << "Node: " << N->getFuncPointer()->getName()
-                     << " with tag " << N->getTag() << "\n");
-  DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n");
+  DEBUG(errs() << "Before editing\n");
+  DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
+               << N->getTag() << "\n");
+  DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
   DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n");
-  DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n");
+  DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
   DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
 
-
-  if (N->getTag() == visc::None) {
+  if (N->getTag() == hpvm::None) {
     // No code is available for this node. This (usually) means that this
     // node is a node that
     // - from the accelerator backends has been mapped to an intermediate
     // node, and thus they have not produced a genFunc
-    // - a child node had no CPU hint, thus no code gen for CPU could 
+    // - a child node had no CPU hint, thus no code gen for CPU could
     // take place
     DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node "
-           << N->getFuncPointer()->getName() << "\n");
-  } else if (viscUtils::isSingleTargetTag(N->getTag())) {
+                 << N->getFuncPointer()->getName() << "\n");
+  } else if (hpvmUtils::isSingleTargetTag(N->getTag())) {
     // There is a single version for this node according to code gen hints.
     // Therefore, we do not need to check the policy, we simply use the
     // available implementation, whichever target it is for.
 
     // Sanity check - to be removed TODO
     switch (N->getTag()) {
-      case visc::CPU_TARGET:
-        assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
-        assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
-        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-        break;
-      case visc::GPU_TARGET:
-        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
-        assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
-        break;
-      default:
-        assert(false && "Unreachable: we checked that tag was single target!\n");
-        break;
+    case hpvm::CPU_TARGET:
+      assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) && "");
+      assert(N->hasX86GenFuncForTarget(hpvm::CPU_TARGET) && "");
+      assert(!(N->getGenFuncForTarget(hpvm::GPU_TARGET)) && "");
+      assert(!(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET)) && "");
+      break;
+    case hpvm::GPU_TARGET:
+      assert(!(N->getGenFuncForTarget(hpvm::CPU_TARGET)) && "");
+      assert(!(N->hasX86GenFuncForTarget(hpvm::CPU_TARGET)) && "");
+      assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) && "");
+      assert(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET) && "");
+      break;
+    default:
+      assert(false && "Unreachable: we checked that tag was single target!\n");
+      break;
     }
-		
-    N->addGenFunc(N->getGenFuncForTarget(N->getTag()),
-				visc::CPU_TARGET,
-				true);
-		N->removeGenFuncForTarget(visc::GPU_TARGET);
-		N->setTag(visc::CPU_TARGET);
-
-		// Sanity checks - to be removed TODO
-		CF = N->getGenFuncForTarget(visc::CPU_TARGET);
-		GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-
-		CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
-		GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-
-		DEBUG(errs() << "After editing\n");
-		DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() 
-                        << " with tag " << N->getTag() << "\n");
-		DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n");
-		DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n");
-		DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n");
-		DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
-
-   } 
-   else {
-    assert(false && "Multiple tags unsupported!");
-   }
 
+    N->addGenFunc(N->getGenFuncForTarget(N->getTag()), hpvm::CPU_TARGET, true);
+    N->removeGenFuncForTarget(hpvm::GPU_TARGET);
+    N->setTag(hpvm::CPU_TARGET);
+
+    // Sanity checks - to be removed TODO
+    CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
+    GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
+
+    CFx86 = N->hasX86GenFuncForTarget(hpvm::CPU_TARGET);
+    GFx86 = N->hasX86GenFuncForTarget(hpvm::GPU_TARGET);
+
+    DEBUG(errs() << "After editing\n");
+    DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
+                 << N->getTag() << "\n");
+    DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
+    DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n");
+    DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
+    DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
+
+  } else {
+    assert(false && "Multiple tags unsupported!");
+  }
 }
 
 // Code generation for leaf nodes
-void CGT_X86::codeGen(DFLeafNode* N) {
+void CGT_X86::codeGen(DFLeafNode *N) {
   // Skip code generation if it is a dummy node
-  if(N->isDummyNode()) {
+  if (N->isDummyNode()) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
@@ -1437,31 +1413,32 @@ void CGT_X86::codeGen(DFLeafNode* N) {
 
   // Check if clone already exists. If it does, it means we have visited this
   // function before and nothing else needs to be done for this leaf node.
-//  if(N->getGenFunc() != NULL)
-//    return;
+  //  if(N->getGenFunc() != NULL)
+  //    return;
 
-  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
-    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
-              " : skipping it\n";
+  if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
+           << " : skipping it\n";
 
     switch (N->getTag()) {
-       case visc::GPU_TARGET:
-         // A leaf node should not have an x86 function for GPU
-         // by design of DFG2LLVM_NVPTX backend
-         assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "Leaf node not expected to have GPU GenFunc");
-         break;
-       default:
-         break;
+    case hpvm::GPU_TARGET:
+      // A leaf node should not have an x86 function for GPU
+      // by design of DFG2LLVM_NVPTX backend
+      assert(!(N->hasX86GenFuncForTarget(hpvm::GPU_TARGET)) &&
+             "Leaf node not expected to have GPU GenFunc");
+      break;
+    default:
+      break;
     }
 
     return;
   }
 
-  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+  assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
          "Error: Visiting a node for which code already generated\n");
 
   std::vector<IntrinsicInst *> IItoRemove;
-  std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
+  std::vector<std::pair<IntrinsicInst *, Value *>> IItoReplace;
   BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
 
   // Get the function associated woth the dataflow node
@@ -1477,41 +1454,39 @@ void CGT_X86::codeGen(DFLeafNode* N) {
 
   // Add the new argument to the argument list. Add arguments only if the cild
   // graph of parent node is not streaming
-  if(!N->getParent()->isChildGraphStreaming())
+  if (!N->getParent()->isChildGraphStreaming())
     F_X86 = addIdxDimArgs(F_X86);
 
   // Add generated function info to DFNode
-//  N->setGenFunc(F_X86, visc::CPU_TARGET);
-  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+  //  N->setGenFunc(F_X86, hpvm::CPU_TARGET);
+  N->addGenFunc(F_X86, hpvm::CPU_TARGET, true);
 
   // Go through the arguments, and any pointer arguments with in attribute need
   // to have x86_argument_ptr call to get the x86 ptr of the argument
   // Insert these calls in a new BB which would dominate all other BBs
   // Create new BB
-  BasicBlock* EntryBB = &*F_X86->begin();
-  BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
-  BranchInst* Terminator = BranchInst::Create(EntryBB, BB);
+  BasicBlock *EntryBB = &*F_X86->begin();
+  BasicBlock *BB =
+      BasicBlock::Create(M.getContext(), "getHPVMPtrArgs", F_X86, EntryBB);
+  BranchInst *Terminator = BranchInst::Create(EntryBB, BB);
   // Insert calls
-  for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
-        ai != ae; ++ai) {
-    if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) {
-      assert(ai->getType()->isPointerTy()
-          && "Only pointer arguments can have visc in/out attributes ");
+  for (Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
+       ai != ae; ++ai) {
+    if (F_X86->getAttributes().hasAttribute(ai->getArgNo() + 1,
+                                            Attribute::In)) {
+      assert(ai->getType()->isPointerTy() &&
+             "Only pointer arguments can have hpvm in/out attributes ");
       Function::arg_iterator aiNext = ai;
       ++aiNext;
-      Argument* size = &*aiNext;
-      assert(size->getType() == Type::getInt64Ty(M.getContext())
-          && "Next argument after a pointer should be an i64 type");
-      CastInst* BI = BitCastInst::CreatePointerCast(&*ai,
-                                                    Type::getInt8PtrTy(M.getContext()),
-                                                    ai->getName()+".i8ptr",
-                                                    Terminator);
-      Value* ArgPtrCallArgs[] = {BI, size};
-      CallInst::Create(llvm_visc_x86_argument_ptr,
-                                              ArrayRef<Value*>(ArgPtrCallArgs, 2),
-                                              "",
-                                              Terminator);
-
+      Argument *size = &*aiNext;
+      assert(size->getType() == Type::getInt64Ty(M.getContext()) &&
+             "Next argument after a pointer should be an i64 type");
+      CastInst *BI = BitCastInst::CreatePointerCast(
+          &*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr",
+          Terminator);
+      Value *ArgPtrCallArgs[] = {BI, size};
+      CallInst::Create(llvm_hpvm_x86_argument_ptr,
+                       ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator);
     }
   }
   errs() << *BB << "\n";
@@ -1520,28 +1495,30 @@ void CGT_X86::codeGen(DFLeafNode* N) {
   for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
     Instruction *I = &(*i);
     DEBUG(errs() << *I << "\n");
-    // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+    // Leaf nodes should not contain HPVM graph intrinsics or launch
+    assert(!BuildDFG::isHPVMLaunchIntrinsic(I) &&
+           "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isHPVMGraphIntrinsic(I) &&
+           "HPVM graph intrinsic within a leaf dataflow node!");
 
-    if (BuildDFG::isViscQueryIntrinsic(I)) {
-      IntrinsicInst* II = cast<IntrinsicInst>(I);
-      IntrinsicInst* ArgII;
-      DFNode* ArgDFNode;
+    if (BuildDFG::isHPVMQueryIntrinsic(I)) {
+      IntrinsicInst *II = cast<IntrinsicInst>(I);
+      IntrinsicInst *ArgII;
+      DFNode *ArgDFNode;
 
       /***********************************************************************
-      *                        Handle VISC Query intrinsics                  *
-      ***********************************************************************/
+       *                        Handle HPVM Query intrinsics                  *
+       ***********************************************************************/
       switch (II->getIntrinsicID()) {
-      /**************************** llvm.visc.getNode() *******************/
-      case Intrinsic::visc_getNode: {
+      /**************************** llvm.hpvm.getNode() *******************/
+      case Intrinsic::hpvm_getNode: {
         // add mapping <intrinsic, this node> to the node-specific map
         Leaf_HandleToDFNodeMap[II] = N;
         IItoRemove.push_back(II);
         break;
       }
-      /************************* llvm.visc.getParentNode() ****************/
-      case Intrinsic::visc_getParentNode: {
+      /************************* llvm.hpvm.getParentNode() ****************/
+      case Intrinsic::hpvm_getParentNode: {
         // get the parent node of the arg node
         // get argument node
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
@@ -1554,69 +1531,70 @@ void CGT_X86::codeGen(DFLeafNode* N) {
         IItoRemove.push_back(II);
         break;
       }
-      /*************************** llvm.visc.getNumDims() *****************/
-      case Intrinsic::visc_getNumDims: {
+      /*************************** llvm.hpvm.getNumDims() *****************/
+      case Intrinsic::hpvm_getNumDims: {
         // get node from map
         // get the appropriate field
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim();
-        IntegerType* IntTy = Type::getInt32Ty(M.getContext());
-        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+        IntegerType *IntTy = Type::getInt32Ty(M.getContext());
+        ConstantInt *numOfDimConstant =
+            ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
 
         II->replaceAllUsesWith(numOfDimConstant);
         IItoRemove.push_back(II);
         break;
       }
-      /*********************** llvm.visc.getNodeInstanceID() **************/
-      case Intrinsic::visc_getNodeInstanceID_x:
-      case Intrinsic::visc_getNodeInstanceID_y:
-      case Intrinsic::visc_getNodeInstanceID_z: {
+      /*********************** llvm.hpvm.getNodeInstanceID() **************/
+      case Intrinsic::hpvm_getNodeInstanceID_x:
+      case Intrinsic::hpvm_getNodeInstanceID_y:
+      case Intrinsic::hpvm_getNodeInstanceID_z: {
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
 
         // The dfnode argument should be an ancestor of this leaf node or
         // the leaf node itself
         int parentLevel = N->getAncestorHops(ArgDFNode);
-        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
-               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+        assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) &&
+               "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
 
         // Get specified dimension
         // (dim = 0) => x
         // (dim = 1) => y
         // (dim = 2) => z
-        int dim = (int) (II->getIntrinsicID() -
-                         Intrinsic::visc_getNodeInstanceID_x);
-        assert((dim >= 0) && (dim < 3)
-               && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!");
+        int dim =
+            (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x);
+        assert((dim >= 0) && (dim < 3) &&
+               "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic "
+               "ID!");
 
         // For immediate ancestor, use the extra argument introduced in
         // F_X86
         int numParamsF = F->getFunctionType()->getNumParams();
         int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
-        assert((numParamsF_X86 - numParamsF == 6)
-               && "Difference of arguments between function and its clone is not 6!");
+        assert(
+            (numParamsF_X86 - numParamsF == 6) &&
+            "Difference of arguments between function and its clone is not 6!");
 
-        if(parentLevel == 0) {
+        if (parentLevel == 0) {
           // Case when the query is for this node itself
-          unsigned offset = 3 + (3-dim);
+          unsigned offset = 3 + (3 - dim);
           // Traverse argument list of F_X86 in reverse order to find the
           // correct index or dim argument.
-          Argument* indexVal = getArgumentFromEnd(F_X86, offset);
+          Argument *indexVal = getArgumentFromEnd(F_X86, offset);
           assert(indexVal && "Index argument not found. Invalid offset!");
 
           DEBUG(errs() << *II << " replaced with " << *indexVal << "\n");
 
           II->replaceAllUsesWith(indexVal);
           IItoRemove.push_back(II);
-        }
-        else {
+        } else {
           // Case when query is for an ancestor
-          Value* args[] = {
-            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
-            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
-          };
-          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance,
-                                          ArrayRef<Value*>(args, 2),
+          Value *args[] = {
+              ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+              ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
+          CallInst *CI = CallInst::Create(llvm_hpvm_x86_getDimInstance,
+                                          ArrayRef<Value *>(args, 2),
                                           "nodeInstanceID", II);
           DEBUG(errs() << *II << " replaced with " << *CI << "\n");
           II->replaceAllUsesWith(CI);
@@ -1624,10 +1602,10 @@ void CGT_X86::codeGen(DFLeafNode* N) {
         }
         break;
       }
-      /********************** llvm.visc.getNumNodeInstances() *************/
-      case Intrinsic::visc_getNumNodeInstances_x:
-      case Intrinsic::visc_getNumNodeInstances_y:
-      case Intrinsic::visc_getNumNodeInstances_z: {
+      /********************** llvm.hpvm.getNumNodeInstances() *************/
+      case Intrinsic::hpvm_getNumNodeInstances_x:
+      case Intrinsic::hpvm_getNumNodeInstances_y:
+      case Intrinsic::hpvm_getNumNodeInstances_z: {
 
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
@@ -1635,46 +1613,46 @@ void CGT_X86::codeGen(DFLeafNode* N) {
         // The dfnode argument should be an ancestor of this leaf node or
         // the leaf node itself
         int parentLevel = N->getAncestorHops(ArgDFNode);
-        assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N )
-               && "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
+        assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) &&
+               "Invalid DFNode argument to getNodeInstanceID_[xyz]!");
 
         // Get specified dimension
         // (dim = 0) => x
         // (dim = 1) => y
         // (dim = 2) => z
-        int dim = (int) (II->getIntrinsicID() -
-                         Intrinsic::visc_getNumNodeInstances_x);
-        assert((dim >= 0) && (dim < 3)
-               && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!");
+        int dim =
+            (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x);
+        assert((dim >= 0) && (dim < 3) &&
+               "Invalid dimension for getNumNodeInstances_[xyz]. Check "
+               "Intrinsic ID!");
 
         // For immediate ancestor, use the extra argument introduced in
         // F_X86
         int numParamsF = F->getFunctionType()->getNumParams();
         int numParamsF_X86 = F_X86->getFunctionType()->getNumParams();
-        assert((numParamsF_X86 - numParamsF == 6)
-               && "Difference of arguments between function and its clone is not 6!");
+        assert(
+            (numParamsF_X86 - numParamsF == 6) &&
+            "Difference of arguments between function and its clone is not 6!");
 
-        if(parentLevel == 0) {
+        if (parentLevel == 0) {
           // Case when the query is for this node itself
           unsigned offset = 3 - dim;
           // Traverse argument list of F_X86 in reverse order to find the
           // correct index or dim argument.
-          Argument* limitVal = getArgumentFromEnd(F_X86, offset);
+          Argument *limitVal = getArgumentFromEnd(F_X86, offset);
           assert(limitVal && "Limit argument not found. Invalid offset!");
 
-          DEBUG(errs() << *II << " replaced with " <<  *limitVal << "\n");
+          DEBUG(errs() << *II << " replaced with " << *limitVal << "\n");
 
           II->replaceAllUsesWith(limitVal);
           IItoRemove.push_back(II);
-        }
-        else {
+        } else {
           // Case when query is from the ancestor
-          Value* args[] = {
-            ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
-            ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)
-          };
-          CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit,
-                                          ArrayRef<Value*>(args, 2),
+          Value *args[] = {
+              ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
+              ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
+          CallInst *CI = CallInst::Create(llvm_hpvm_x86_getDimLimit,
+                                          ArrayRef<Value *>(args, 2),
                                           "numNodeInstances", II);
           DEBUG(errs() << *II << " replaced with " << *CI << "\n");
           II->replaceAllUsesWith(CI);
@@ -1684,19 +1662,16 @@ void CGT_X86::codeGen(DFLeafNode* N) {
         break;
       }
       default:
-        DEBUG(errs() << "Found unknown intrinsic with ID = " <<
-              II->getIntrinsicID() << "\n");
-        assert(false && "Unknown VISC Intrinsic!");
+        DEBUG(errs() << "Found unknown intrinsic with ID = "
+                     << II->getIntrinsicID() << "\n");
+        assert(false && "Unknown HPVM Intrinsic!");
         break;
       }
 
     } else {
     }
-
   }
 
-
-
   // Remove them in reverse order
   for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
        i != IItoRemove.end(); ++i) {
@@ -1710,8 +1685,7 @@ void CGT_X86::codeGen(DFLeafNode* N) {
 } // End of namespace
 
 char DFG2LLVM_X86::ID = 0;
-static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86",
-                                    "Dataflow Graph to LLVM for X86 backend",
-                                    false /* does not modify the CFG */,
-                                    true /* transformation, not just analysis */);
-
+static RegisterPass<DFG2LLVM_X86>
+    X("dfg2llvm-x86", "Dataflow Graph to LLVM for X86 backend",
+      false /* does not modify the CFG */,
+      true /* transformation, not just analysis */);
diff --git a/hpvm/lib/Transforms/GenVISC/CMakeLists.txt b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
similarity index 74%
rename from hpvm/lib/Transforms/GenVISC/CMakeLists.txt
rename to hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
index ed087f63b4933a33792d7cd773acdf8fab1ac8e3..967766e7058c1ef8bcc1414afb7ff0087e3ce188 100644
--- a/hpvm/lib/Transforms/GenVISC/CMakeLists.txt
+++ b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
@@ -2,9 +2,9 @@ if(WIN32 OR CYGWIN)
   set(LLVM_LINK_COMPONENTS Core Support)
 endif()
 
-add_llvm_library( LLVMGenVISC
+add_llvm_library( LLVMGenHPVM
   MODULE
-  GenVISC.cpp
+  GenHPVM.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..738b39905b885aa42bc861e3a19c3bdf9c65668e
--- /dev/null
+++ b/hpvm/lib/Transforms/GenHPVM/GenHPVM.cpp
@@ -0,0 +1,894 @@
+//=== GenHPVM.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "genhpvm"
+#include "GenHPVM/GenHPVM.h"
+
+#include "SupportHPVM/HPVMHint.h"
+#include "SupportHPVM/HPVMUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#define TIMER(X)                                                               \
+  do {                                                                         \
+    if (HPVMTimer) {                                                           \
+      X;                                                                       \
+    }                                                                          \
+  } while (0)
+
+using namespace llvm;
+using namespace hpvmUtils;
+
+// HPVM Command line option to use timer or not
+static cl::opt<bool> HPVMTimer("hpvm-timers-gen",
+                               cl::desc("Enable GenHPVM timer"));
+
+namespace genhpvm {
+
+// Helper Functions
+
+static inline ConstantInt *getTimerID(Module &, enum hpvm_TimerID);
+static Function *transformReturnTypeToStruct(Function *F);
+static Type *getReturnTypeFromReturnInst(Function *F);
+
+// Defines isHPVMCall_<callName>(I): true iff I is a call to __hpvm__<callName>
+#define IS_HPVM_CALL(callName)                                                 \
+  static bool isHPVMCall_##callName(Instruction *I) {                          \
+    if (!isa<CallInst>(I))                                                     \
+      return false;                                                            \
+    CallInst *CI = cast<CallInst>(I);                                          \
+    return (CI->getCalledValue()->stripPointerCasts()->getName())              \
+        .equals("__hpvm__" #callName);                                         \
+  }
+
+static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID,
+                                     std::vector<Instruction *> *Erase) {
+  // I must be a call; an equivalent call to IntrinsicID is inserted before it
+  assert(isa<CallInst>(I) && "Expecting CallInst");
+  CallInst *CI = cast<CallInst>(I);
+  DEBUG(errs() << "Found call: " << *CI << "\n");
+
+  // Find the correct intrinsic call
+  Module *M = CI->getParent()->getParent()->getParent();
+  Function *F;
+  std::vector<Type *> ArgTypes;
+  std::vector<Value *> args;
+  if (Intrinsic::isOverloaded(IntrinsicID)) {
+    // This is an overloaded intrinsic. The types must exactly match. Get the
+    // argument types
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+      ArgTypes.push_back(CI->getArgOperand(i)->getType());
+      args.push_back(CI->getArgOperand(i));
+    }
+    F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes);
+    DEBUG(errs() << *F << "\n");
+  } else { // Non-overloaded intrinsic
+    F = Intrinsic::getDeclaration(M, IntrinsicID);
+    FunctionType *FTy = F->getFunctionType();
+    DEBUG(errs() << *F << "\n");
+
+    // Create argument list
+    assert(CI->getNumArgOperands() == FTy->getNumParams() &&
+           "Number of arguments of call do not match with Intrinsic");
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+      Value *V = CI->getArgOperand(i);
+      // Either the type should match or both should be of pointer type
+      assert((V->getType() == FTy->getParamType(i) ||
+              (V->getType()->isPointerTy() &&
+               FTy->getParamType(i)->isPointerTy())) &&
+             "Dummy function call argument does not match with Intrinsic "
+             "argument!");
+      // If the types do not match, then both must be pointer type and pointer
+      // cast needs to be performed
+      if (V->getType() != FTy->getParamType(i)) {
+        V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
+      }
+      args.push_back(V);
+    }
+  }
+  // Insert call instruction
+  CallInst *Inst = CallInst::Create(
+      F, args, F->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
+
+  DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
+
+  CI->replaceAllUsesWith(Inst);
+  // The original call is not deleted here; when the caller passes an Erase
+  // vector, the call is appended to it (presumably for deferred removal)
+  if (Erase != NULL)
+    Erase->push_back(CI);
+}
+
+IS_HPVM_CALL(launch) /* Exists but not required */
+IS_HPVM_CALL(edge)   /* Exists but not required */
+IS_HPVM_CALL(createNodeND)
+// IS_HPVM_CALL(createNode)
+// IS_HPVM_CALL(createNode1D)
+// IS_HPVM_CALL(createNode2D)
+// IS_HPVM_CALL(createNode3D)
+IS_HPVM_CALL(bindIn)
+IS_HPVM_CALL(bindOut)
+IS_HPVM_CALL(push)
+IS_HPVM_CALL(pop)
+IS_HPVM_CALL(getNode)
+IS_HPVM_CALL(getParentNode)
+IS_HPVM_CALL(barrier)
+IS_HPVM_CALL(malloc)
+IS_HPVM_CALL(return )
+IS_HPVM_CALL(getNodeInstanceID_x)
+IS_HPVM_CALL(getNodeInstanceID_y)
+IS_HPVM_CALL(getNodeInstanceID_z)
+IS_HPVM_CALL(getNumNodeInstances_x)
+IS_HPVM_CALL(getNumNodeInstances_y)
+IS_HPVM_CALL(getNumNodeInstances_z)
+// Atomics
+IS_HPVM_CALL(atomic_cmpxchg)
+IS_HPVM_CALL(atomic_add)
+IS_HPVM_CALL(atomic_sub)
+IS_HPVM_CALL(atomic_xchg)
+IS_HPVM_CALL(atomic_inc)
+IS_HPVM_CALL(atomic_dec)
+IS_HPVM_CALL(atomic_min)
+IS_HPVM_CALL(atomic_max)
+IS_HPVM_CALL(atomic_umin)
+IS_HPVM_CALL(atomic_umax)
+IS_HPVM_CALL(atomic_and)
+IS_HPVM_CALL(atomic_or)
+IS_HPVM_CALL(atomic_xor)
+// Misc Fn
+IS_HPVM_CALL(floor)
+IS_HPVM_CALL(rsqrt)
+IS_HPVM_CALL(sqrt)
+IS_HPVM_CALL(sin)
+IS_HPVM_CALL(cos)
+
+IS_HPVM_CALL(init)
+IS_HPVM_CALL(cleanup)
+IS_HPVM_CALL(wait)
+IS_HPVM_CALL(trackMemory)
+IS_HPVM_CALL(untrackMemory)
+IS_HPVM_CALL(requestMemory)
+IS_HPVM_CALL(attributes)
+IS_HPVM_CALL(hint)
+
+// Return V's value, zero-extended; V must be a ConstantInt (asserted below)
+static unsigned getNumericValue(Value *V) {
+  assert(
+      isa<ConstantInt>(V) &&
+      "Value indicating the number of arguments should be a constant integer");
+  return cast<ConstantInt>(V)->getZExtValue();
+}
+
+// Lower __hpvm__return(N, v1, ..., vN): pack v1..vN into a new packed struct
+// via insertvalue instructions placed before CI and return the final value.
+// Operand 0 must be the constant N, and at least one value must follow it.
+static Value *genCodeForReturn(CallInst *CI) {
+  LLVMContext &Ctx = CI->getContext();
+  assert(isHPVMCall_return(CI) && "__hpvm__return instruction expected!");
+
+  // Operand 0 is the value count; at least one value operand must follow it
+  assert(CI->getNumArgOperands() > 1 &&
+         "Too few arguments for __hpvm__return call!\n");
+  unsigned numRetVals = getNumericValue(CI->getArgOperand(0));
+  assert(CI->getNumArgOperands() - 1 == numRetVals &&
+         "Return value count mismatch in __hpvm__return call!\n");
+  DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n");
+
+  std::vector<Type *> ArgTypes;
+  for (unsigned i = 1; i < CI->getNumArgOperands(); i++)
+    ArgTypes.push_back(CI->getArgOperand(i)->getType());
+  // Copy the name out now; a stored Twine would reference dead temporaries
+  Function *ParentF = CI->getParent()->getParent();
+  std::string outTyName = ("struct.out." + ParentF->getName()).str();
+  StructType *RetTy = StructType::create(Ctx, ArgTypes, outTyName, true);
+
+  InsertValueInst *IV = InsertValueInst::Create(
+      UndefValue::get(RetTy), CI->getArgOperand(1), 0, "returnStruct", CI);
+  DEBUG(errs() << "Code generation for return:\n");
+  DEBUG(errs() << *IV << "\n");
+
+  for (unsigned i = 2; i < CI->getNumArgOperands(); i++) {
+    IV = InsertValueInst::Create(IV, CI->getArgOperand(i), i - 1, IV->getName(),
+                                 CI);
+    DEBUG(errs() << *IV << "\n");
+  }
+
+  return IV;
+}
+
+// Consume the __hpvm__attributes dummy call CI found in F.
+// Operands are read as: <#in>, that many "in" values, <#out>, that many "out"
+// values. Every listed value must be a formal argument of a function; the
+// matching parameter of F is tagged with Attribute::In / Attribute::Out.
+static void handleHPVMAttributes(Function *F, CallInst *CI) {
+  DEBUG(errs() << "Kernel before adding In/Out HPVM attributes:\n"
+               << *F << "\n");
+
+  // Tag the parameters named by operands [first, first + count) with `kind`.
+  auto tagOperands = [&](unsigned first, unsigned count,
+                         Attribute::AttrKind kind) {
+    for (unsigned idx = first; idx < first + count; ++idx) {
+      Value *operand = CI->getArgOperand(idx);
+      Argument *formal = dyn_cast<Argument>(operand);
+      if (!formal) {
+        DEBUG(errs() << "Invalid argument to __hpvm__attribute: " << *operand
+                     << "\n");
+        llvm_unreachable(
+            "Only pointer arguments can be passed to __hpvm__attributes call");
+      }
+      // Attribute indices are 1-based for parameters.
+      F->addAttribute(1 + formal->getArgNo(), kind);
+    }
+  };
+
+  // Parse the dummy function call here: first the "in" group...
+  unsigned cursor = 0;
+  assert(CI->getNumArgOperands() > cursor &&
+         "Too few arguments for __hpvm__attributes call!");
+  const unsigned inCount = getNumericValue(CI->getArgOperand(cursor));
+  DEBUG(errs() << "\tNum of in pointers = " << inCount << "\n");
+  tagOperands(cursor + 1, inCount, Attribute::In);
+
+  // ...then the "out" group, which starts right after the last "in" value.
+  cursor += 1 + inCount;
+  assert(CI->getNumArgOperands() > cursor &&
+         "Too few arguments for __hpvm__attributes call!");
+  const unsigned outCount = getNumericValue(CI->getArgOperand(cursor));
+  DEBUG(errs() << "\tNum of out Pointers = " << outCount << "\n");
+  tagOperands(cursor + 1, outCount, Attribute::Out);
+
+  DEBUG(errs() << "Kernel after adding In/Out HPVM attributes:\n"
+               << *F << "\n");
+}
+
+// Public Functions of GenHPVM pass
+//
+// Pass entry point. Walks every function of M and rewrites the __hpvm__*
+// dummy calls:
+//  - most calls are swapped one-for-one via ReplaceCallWithIntrinsic;
+//  - launch, createNodeND, edge, bindIn and bindOut get hand-built intrinsic
+//    calls (integer flags become true/false constants, node functions are
+//    cast to i8*);
+//  - hint and attributes calls are consumed as annotations and erased;
+//  - __hpvm__return is turned into a `ret` of a struct value.
+// A function that contained bindOut or __hpvm__return calls is afterwards
+// cloned with a struct return type and substituted for the original.
+// initializeTimerSet/switchToTimer are invoked at the __hpvm__init call site
+// and printTimerSet at the __hpvm__cleanup call site.
+//
+// The module must contain a call to __hpvm__init and to __hpvm__cleanup, and
+// the LLVM_SRC_ROOT environment variable must locate the hpvm-rt bitcode;
+// otherwise the pass aborts with a fatal error.
+//
+// Returns true, because the pass rewrites the IR of M.
+bool GenHPVM::runOnModule(Module &M) {
+  DEBUG(errs() << "\nGENHPVM PASS\n");
+  this->M = &M;
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  // Checked with a fatal error rather than an assert: with asserts compiled
+  // out, a missing variable would otherwise be dereferenced below.
+  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  if (LLVM_SRC_ROOT == NULL)
+    report_fatal_error("Define LLVM_SRC_ROOT environment variable!");
+
+  // Plain strings on purpose: a Twine only points at its operands and must
+  // not be stored across statements.
+  std::string llvmSrcRoot = LLVM_SRC_ROOT;
+  std::string runtimeAPI =
+      llvmSrcRoot + "/../build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
+  DEBUG(errs() << llvmSrcRoot << "\n");
+
+  std::unique_ptr<Module> runtimeModule =
+      parseIRFile(runtimeAPI, Err, M.getContext());
+
+  if (!runtimeModule) {
+    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
+    report_fatal_error("couldn't parse runtime");
+  } else
+    DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
+
+  // Declare the timer runtime functions in M, using the signatures found in
+  // the runtime module.
+  llvm_hpvm_initializeTimerSet = M.getOrInsertFunction(
+      "llvm_hpvm_initializeTimerSet",
+      runtimeModule->getFunction("llvm_hpvm_initializeTimerSet")
+          ->getFunctionType());
+  // DEBUG(errs() << *llvm_hpvm_initializeTimerSet);
+
+  llvm_hpvm_switchToTimer = M.getOrInsertFunction(
+      "llvm_hpvm_switchToTimer",
+      runtimeModule->getFunction("llvm_hpvm_switchToTimer")->getFunctionType());
+  // DEBUG(errs() << *llvm_hpvm_switchToTimer);
+
+  llvm_hpvm_printTimerSet = M.getOrInsertFunction(
+      "llvm_hpvm_printTimerSet",
+      runtimeModule->getFunction("llvm_hpvm_printTimerSet")->getFunctionType());
+  // DEBUG(errs() << *llvm_hpvm_printTimerSet);
+
+  // Insert init context in main
+  DEBUG(errs() << "Locate __hpvm__init()\n");
+  Function *VI = M.getFunction("__hpvm__init");
+  // A missing declaration or use would be dereferenced just below.
+  if (!VI || VI->use_empty())
+    report_fatal_error("__hpvm__init is not called in this module");
+  assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
+  Instruction *I = cast<Instruction>(*VI->user_begin());
+
+  DEBUG(errs() << "Initialize Timer Set\n");
+  initializeTimerSet(I);
+  switchToTimer(hpvm_TimerID_NONE, I);
+
+  // Insert print instruction at hpvm exit
+  DEBUG(errs() << "Locate __hpvm__cleanup()\n");
+  Function *VC = M.getFunction("__hpvm__cleanup");
+  if (!VC || VC->use_empty())
+    report_fatal_error("__hpvm__cleanup is not called in this module");
+  assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once");
+  I = cast<Instruction>(*VC->user_begin());
+  printTimerSet(I);
+
+  DEBUG(errs() << "-------- Searching for launch sites ----------\n");
+
+  std::vector<Instruction *> toBeErased;
+  std::vector<Function *> functions;
+
+  // Snapshot the function list first: the loop below adds cloned functions to
+  // the module.
+  for (auto &F : M)
+    functions.push_back(&F);
+
+  // Iterate over all functions in the module
+  for (Function *f : functions) {
+    DEBUG(errs() << "Function: " << f->getName() << "\n");
+
+    // List with the required additions in the function's return type
+    std::vector<Type *> FRetTypes;
+
+    // Records why (if at all) this function's return type must be rewritten.
+    // bindOut and __hpvm__return are mutually exclusive within one function.
+    enum mutateTypeCause {
+      mtc_None,
+      mtc_BIND,
+      mtc_RETURN,
+      mtc_NUM_CAUSES
+    } bind;
+    bind = mutateTypeCause::mtc_None;
+
+    // Iterate over all the instructions in this function
+    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e; ++i) {
+      Instruction *I = &*i; // Grab pointer to Instruction
+      // If not a call instruction, move to next instruction
+      if (!isa<CallInst>(I))
+        continue;
+
+      CallInst *CI = cast<CallInst>(I);
+      LLVMContext &Ctx = CI->getContext();
+
+      if (isHPVMCall_init(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_init, &toBeErased);
+      }
+      if (isHPVMCall_cleanup(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_cleanup, &toBeErased);
+      }
+      if (isHPVMCall_wait(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_wait, &toBeErased);
+      }
+      if (isHPVMCall_trackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_trackMemory, &toBeErased);
+      }
+      if (isHPVMCall_untrackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_untrackMemory, &toBeErased);
+      }
+      if (isHPVMCall_requestMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_requestMemory, &toBeErased);
+      }
+      if (isHPVMCall_hint(I)) {
+        assert(isa<ConstantInt>(CI->getArgOperand(0)) &&
+               "Argument to hint must be constant integer!");
+        ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0));
+
+        hpvm::Target t = (hpvm::Target)hint->getZExtValue();
+        addHint(CI->getParent()->getParent(), t);
+        DEBUG(errs() << "Found hpvm hint call: " << *CI << "\n");
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_launch(I)) {
+        Function *LaunchF =
+            Intrinsic::getDeclaration(&M, Intrinsic::hpvm_launch);
+        DEBUG(errs() << *LaunchF << "\n");
+        // Get i8* cast to function pointer
+        Function *graphFunc = cast<Function>(CI->getArgOperand(1));
+        graphFunc = transformReturnTypeToStruct(graphFunc);
+        Constant *F =
+            ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+        assert(
+            F &&
+            "Function invoked by HPVM launch has to be define and constant.");
+
+        // dyn_cast so that the assert below can actually observe a failure.
+        ConstantInt *Op = dyn_cast<ConstantInt>(CI->getArgOperand(0));
+        assert(Op && "HPVM launch's streaming argument is a constant value.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+
+        auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType());
+        assert(ArgTy && "HPVM launch argument should be pointer type.");
+        Value *Arg = CI->getArgOperand(2);
+        // The intrinsic takes the packed argument struct as i8*.
+        if (!ArgTy->getElementType()->isIntegerTy(8))
+          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2),
+                                               Type::getInt8PtrTy(Ctx), "", CI);
+        // Note the operand order: the dummy call is (isStream, graph, args),
+        // the intrinsic call is built as (graph, args, isStream).
+        Value *LaunchArgs[] = {F, Arg, isStreaming};
+        CallInst *LaunchInst = CallInst::Create(
+            LaunchF, ArrayRef<Value *>(LaunchArgs, 3), "graphID", CI);
+        DEBUG(errs() << "Found hpvm launch call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
+        CI->replaceAllUsesWith(LaunchInst);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_push(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_push, &toBeErased);
+      }
+      if (isHPVMCall_pop(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_pop, &toBeErased);
+      }
+      if (isHPVMCall_createNodeND(I)) {
+        assert(CI->getNumArgOperands() > 0 &&
+               "Too few arguments for __hpvm__createNodeND call");
+        unsigned numDims = getNumericValue(CI->getArgOperand(0));
+        // We need as many dimension arguments as there are dimensions
+        // (operand 0 is the count, operand 1 the node function).
+        assert(CI->getNumArgOperands() - 2 == numDims &&
+               "Too few arguments for __hpvm_createNodeND call!\n");
+
+        Function *CreateNodeF;
+        switch (numDims) {
+        case 0:
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode);
+          break;
+        case 1:
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode1D);
+          break;
+        case 2:
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode2D);
+          break;
+        case 3:
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createNode3D);
+          break;
+        default:
+          llvm_unreachable("Unsupported number of dimensions\n");
+          break;
+        }
+        DEBUG(errs() << *CreateNodeF << "\n");
+        DEBUG(errs() << *I << "\n");
+        DEBUG(errs() << "in " << I->getParent()->getParent()->getName()
+                     << "\n");
+
+        // Get i8* cast to function pointer
+        Function *graphFunc = cast<Function>(CI->getArgOperand(1));
+        graphFunc = transformReturnTypeToStruct(graphFunc);
+        Constant *F =
+            ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+
+        CallInst *CreateNodeInst;
+        switch (numDims) {
+        case 0:
+          CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F),
+                                            graphFunc->getName() + ".node", CI);
+          break;
+        case 1: {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)};
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2),
+              graphFunc->getName() + ".node", CI);
+        } break;
+        case 2: {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 3, expected to be i64\n");
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
+                                     CI->getArgOperand(3)};
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3),
+              graphFunc->getName() + ".node", CI);
+        } break;
+        case 3: {
+          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 2, expected to be i64\n");
+          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 3, expected to be i64\n");
+          assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) &&
+                 "CreateNodeND dimension argument, 4, expected to be i64\n");
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
+                                     CI->getArgOperand(3),
+                                     CI->getArgOperand(4)};
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4),
+              graphFunc->getName() + ".node", CI);
+        } break;
+        default:
+          llvm_unreachable(
+              "Impossible path: number of dimensions is 0, 1, 2, 3\n");
+          break;
+        }
+
+        DEBUG(errs() << "Found hpvm createNode call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n");
+        CI->replaceAllUsesWith(CreateNodeInst);
+        toBeErased.push_back(CI);
+      }
+
+      if (isHPVMCall_edge(I)) {
+        Function *EdgeF =
+            Intrinsic::getDeclaration(&M, Intrinsic::hpvm_createEdge);
+        DEBUG(errs() << *EdgeF << "\n");
+        // Operand 2 (edge type) and operand 5 (streaming) must be constants;
+        // they are converted to true/false constants for the intrinsic.
+        ConstantInt *Op = dyn_cast<ConstantInt>(CI->getArgOperand(5));
+        ConstantInt *EdgeTypeOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+        assert(Op && EdgeTypeOp &&
+               "Arguments of CreateEdge are not constant integers.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *isAllToAll = EdgeTypeOp->isZero() ? ConstantInt::getFalse(Ctx)
+                                                 : ConstantInt::getTrue(Ctx);
+        Value *EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                             isAllToAll,           CI->getArgOperand(3),
+                             CI->getArgOperand(4), isStreaming};
+        CallInst *EdgeInst = CallInst::Create(
+            EdgeF, ArrayRef<Value *>(EdgeArgs, 6), "output", CI);
+        DEBUG(errs() << "Found hpvm edge call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n");
+        CI->replaceAllUsesWith(EdgeInst);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_bindIn(I)) {
+        Function *BindInF =
+            Intrinsic::getDeclaration(&M, Intrinsic::hpvm_bind_input);
+        DEBUG(errs() << *BindInF << "\n");
+        // Check if this is a streaming bind or not
+        ConstantInt *Op = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+        assert(Op && "Streaming argument for bind in intrinsic should be a "
+                     "constant integer.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), isStreaming};
+        CallInst *BindInInst =
+            CallInst::Create(BindInF, ArrayRef<Value *>(BindInArgs, 4), "", CI);
+        DEBUG(errs() << "Found hpvm bindIn call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n");
+        CI->replaceAllUsesWith(BindInInst);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_bindOut(I)) {
+        Function *BindOutF =
+            Intrinsic::getDeclaration(&M, Intrinsic::hpvm_bind_output);
+        DEBUG(errs() << *BindOutF << "\n");
+        // Check if this is a streaming bind or not
+        ConstantInt *Op = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+        assert(Op && "Streaming argument for bind out intrinsic should be a "
+                     "constant integer.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                                CI->getArgOperand(2), isStreaming};
+        CallInst *BindOutInst = CallInst::Create(
+            BindOutF, ArrayRef<Value *>(BindOutArgs, 4), "", CI);
+        DEBUG(errs() << "Found hpvm bindOut call: " << *CI << "\n");
+        DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n");
+
+        DEBUG(errs() << "Fixing the return type of the function\n");
+        // FIXME: What if the child node function has not been visited already.
+        // i.e., it's return type has not been fixed.
+        Function *F = I->getParent()->getParent();
+        DEBUG(errs() << F->getName() << "\n";);
+        // Operand 0 has already been rewritten into a createNode intrinsic
+        // call by the createNodeND handling above.
+        IntrinsicInst *NodeIntrinsic =
+            dyn_cast<IntrinsicInst>(CI->getArgOperand(0));
+        assert(NodeIntrinsic &&
+               "Instruction value in bind out is not a create node intrinsic.");
+        DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
+        assert(
+            (NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode1D ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode2D ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::hpvm_createNode3D) &&
+            "Instruction value in bind out is not a create node intrinsic.");
+        Function *ChildF = cast<Function>(
+            NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
+        DEBUG(errs() << ChildF->getName() << "\n";);
+        int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
+        int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
+        StructType *ChildReturnTy = cast<StructType>(ChildF->getReturnType());
+
+        Type *ReturnType = F->getReturnType();
+        DEBUG(errs() << *ReturnType << "\n";);
+        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) &&
+               "Return type should either be a struct or void type!");
+
+        // Inserting past the end of the vector is undefined behaviour, so the
+        // destination position may at most append to what was collected so
+        // far.
+        assert(destpos >= 0 && (size_t)destpos <= FRetTypes.size() &&
+               "bindOut destination position is out of range");
+        FRetTypes.insert(FRetTypes.begin() + destpos,
+                         ChildReturnTy->getElementType(srcpos));
+        assert(((bind == mutateTypeCause::mtc_BIND) ||
+                (bind == mutateTypeCause::mtc_None)) &&
+               "Both bind_out and hpvm_return detected");
+        bind = mutateTypeCause::mtc_BIND;
+
+        CI->replaceAllUsesWith(BindOutInst);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_attributes(I)) {
+        Function *F = CI->getParent()->getParent();
+        handleHPVMAttributes(F, CI);
+        toBeErased.push_back(CI);
+      }
+      if (isHPVMCall_getNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNode, &toBeErased);
+      }
+      if (isHPVMCall_getParentNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getParentNode, &toBeErased);
+      }
+      if (isHPVMCall_barrier(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_barrier, &toBeErased);
+      }
+      if (isHPVMCall_malloc(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_malloc, &toBeErased);
+      }
+      if (isHPVMCall_return(I)) {
+        DEBUG(errs() << "Function before hpvm return processing\n"
+                     << *I->getParent()->getParent() << "\n");
+        // The operands to this call are the values to be returned by the node
+        Value *ReturnVal = genCodeForReturn(CI);
+        DEBUG(errs() << *ReturnVal << "\n");
+        Type *ReturnType = ReturnVal->getType();
+        assert(isa<StructType>(ReturnType) &&
+               "Return type should be a struct type!");
+
+        assert(((bind == mutateTypeCause::mtc_RETURN) ||
+                (bind == mutateTypeCause::mtc_None)) &&
+               "Both bind_out and hpvm_return detected");
+
+        if (bind == mutateTypeCause::mtc_None) {
+          // If this is None, this is the first __hpvm__return
+          // instruction we have come upon. Place the return type of the
+          // function in the return type vector
+          bind = mutateTypeCause::mtc_RETURN;
+          StructType *ReturnStructTy = cast<StructType>(ReturnType);
+          for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++)
+            FRetTypes.push_back(ReturnStructTy->getElementType(i));
+        } else { // bind == mutateTypeCause::mtc_RETURN
+          // This is not the first __hpvm__return
+          // instruction we have come upon.
+          // Check that the return types are the same
+          // NOTE(review): this compares the whole struct type with the first
+          // recorded *element* type, and genCodeForReturn creates a new named
+          // struct type on every call, so this looks unsatisfiable for a
+          // second __hpvm__return in one function - confirm the intent.
+          assert((ReturnType == FRetTypes[0]) &&
+                 "Multiple returns with mismatching types");
+        }
+
+        ReturnInst *RetInst = ReturnInst::Create(Ctx, ReturnVal);
+        DEBUG(errs() << "Found hpvm return call: " << *CI << "\n");
+        Instruction *oldReturn = CI->getParent()->getTerminator();
+        assert(isa<ReturnInst>(oldReturn) &&
+               "Expecting a return to be the terminator of this BB!");
+        DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
+        DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
+        // CI->replaceAllUsesWith(RetInst);
+        toBeErased.push_back(CI);
+        ReplaceInstWithInst(oldReturn, RetInst);
+        DEBUG(errs() << "Function after hpvm return processing\n"
+                     << *I->getParent()->getParent() << "\n");
+      }
+
+      if (isHPVMCall_getNodeInstanceID_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_x,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNodeInstanceID_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_y,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNodeInstanceID_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNodeInstanceID_z,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNumNodeInstances_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_x,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNumNodeInstances_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_y,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_getNumNodeInstances_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_getNumNodeInstances_z,
+                                 &toBeErased);
+      }
+      if (isHPVMCall_atomic_add(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_add, &toBeErased);
+      }
+      if (isHPVMCall_atomic_sub(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_sub, &toBeErased);
+      }
+      if (isHPVMCall_atomic_xchg(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_xchg, &toBeErased);
+      }
+      if (isHPVMCall_atomic_min(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_min, &toBeErased);
+      }
+      if (isHPVMCall_atomic_max(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_max, &toBeErased);
+      }
+      if (isHPVMCall_atomic_and(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_and, &toBeErased);
+      }
+      if (isHPVMCall_atomic_or(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_or, &toBeErased);
+      }
+      if (isHPVMCall_atomic_xor(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::hpvm_atomic_xor, &toBeErased);
+      }
+      if (isHPVMCall_sin(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased);
+      }
+      if (isHPVMCall_cos(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased);
+      }
+    }
+
+    // Erase the __hpvm__node calls. Erasure is deferred to here so that the
+    // instruction iterator above is never invalidated; instructions are
+    // removed last-collected first.
+    DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
+    for (auto I : toBeErased) {
+      DEBUG(errs() << *I << "\n");
+    }
+    while (!toBeErased.empty()) {
+      Instruction *I = toBeErased.back();
+      DEBUG(errs() << "\tErasing " << *I << "\n");
+      I->eraseFromParent();
+      toBeErased.pop_back();
+    }
+
+    if (bind == mutateTypeCause::mtc_BIND ||
+        bind == mutateTypeCause::mtc_RETURN) {
+      DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
+      // Argument type list.
+      std::vector<Type *> FArgTypes;
+      for (Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
+           ai != ae; ++ai) {
+        FArgTypes.push_back(ai->getType());
+      }
+
+      // Find new return type of function
+      Type *NewReturnTy;
+      if (bind == mutateTypeCause::mtc_BIND) {
+
+        std::vector<Type *> TyList;
+        for (unsigned i = 0; i < FRetTypes.size(); i++)
+          TyList.push_back(FRetTypes[i]);
+
+        NewReturnTy =
+            StructType::create(f->getContext(), TyList,
+                               Twine("struct.out." + f->getName()).str(), true);
+      } else {
+        // The `ret` instructions were already rewritten above; reuse the type
+        // of the value they return.
+        NewReturnTy = getReturnTypeFromReturnInst(f);
+        assert(NewReturnTy && NewReturnTy->isStructTy() &&
+               "Expecting a struct type!");
+      }
+
+      FunctionType *FTy =
+          FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
+
+      // Change the function type
+      Function *newF = cloneFunction(f, FTy, false);
+      DEBUG(errs() << *newF << "\n");
+
+      if (bind == mutateTypeCause::mtc_BIND) {
+        // This is certainly an internal node, and hence just one BB with one
+        // return terminator instruction. Change return statement
+        ReturnInst *RI =
+            cast<ReturnInst>(newF->getEntryBlock().getTerminator());
+        ReturnInst *newRI = ReturnInst::Create(newF->getContext(),
+                                               UndefValue::get(NewReturnTy));
+        ReplaceInstWithInst(RI, newRI);
+      }
+      if (bind == mutateTypeCause::mtc_RETURN) {
+        // Nothing
+      }
+      replaceNodeFunctionInIR(*f->getParent(), f, newF);
+      DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
+    }
+  }
+  // The pass manager must be told the IR changed so that it does not keep
+  // analyses computed on the old module.
+  return true;
+}
+
+// Emit a constant, internal-linkage global holding the null-terminated string
+// S (named Name) and, before instruction IB, a GEP with indices {0, 0} into
+// it (named Name + "Ptr"). The GEP is what gets returned.
+Value *GenHPVM::getStringPointer(const Twine &S, Instruction *IB,
+                                 const Twine &Name) {
+  LLVMContext &Ctx = M->getContext();
+
+  // The string data itself, with a trailing null character.
+  Constant *Init = ConstantDataArray::getString(Ctx, S.str(), true);
+  GlobalVariable *StrGV = new GlobalVariable(
+      *M, Init->getType(), true, GlobalValue::InternalLinkage, Init, Name);
+
+  // Indices {0, 0}: step through the global, then to its first character.
+  Value *Idx0 = ConstantInt::get(Type::getInt64Ty(Ctx), 0);
+  Value *Indices[] = {Idx0, Idx0};
+  return GetElementPtrInst::Create(nullptr, StrGV,
+                                   ArrayRef<Value *>(Indices, 2), Name + "Ptr",
+                                   IB);
+}
+
+// Emit the timer-set setup before InsertBefore: a global i8* named
+// "hpvmTimerSet_GenHPVM" (stored in the TimerSet member), a call to
+// llvm_hpvm_initializeTimerSet, and a store of the call's result into that
+// global. Every emitting statement is wrapped in TIMER.
+void GenHPVM::initializeTimerSet(Instruction *InsertBefore) {
+  // Initialised so that nothing indeterminate is left behind if TIMER does
+  // not execute the assignments below.
+  Value *TimerSetAddr = nullptr;
+  StoreInst *SI = nullptr;
+  TIMER(TimerSet = new GlobalVariable(
+            *M, Type::getInt8PtrTy(M->getContext()), false,
+            GlobalValue::CommonLinkage,
+            Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
+            "hpvmTimerSet_GenHPVM"));
+  // The debug dumps dereference objects that are only created inside TIMER,
+  // so they are placed under the same guard as the statements creating them.
+  // NOTE(review): the TIMER definition is outside this chunk; the pre-rename
+  // version of this file makes it conditional on a command-line flag.
+  TIMER(DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet
+                     << "\n"));
+  // DEBUG(errs() << "Inserting call to: " << *llvm_hpvm_initializeTimerSet <<
+  // "\n");
+
+  TIMER(TimerSetAddr = CallInst::Create(llvm_hpvm_initializeTimerSet, None, "",
+                                        InsertBefore));
+  TIMER(DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n"));
+  TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
+  TIMER(DEBUG(errs() << "Store Timer Address in Global variable: " << *SI
+                     << "\n"));
+}
+
+// Emit, before InsertBefore, a call to llvm_hpvm_switchToTimer with the timer
+// set global and the constant id of `timer`. The call creation is wrapped in
+// TIMER.
+void GenHPVM::switchToTimer(enum hpvm_TimerID timer,
+                            Instruction *InsertBefore) {
+  Value *CallOperands[] = {TimerSet, getTimerID(*M, timer)};
+  ArrayRef<Value *> OperandList(CallOperands, 2);
+  TIMER(CallInst::Create(llvm_hpvm_switchToTimer, OperandList, "",
+                         InsertBefore));
+}
+
+// Emit, before InsertBefore, a call to llvm_hpvm_printTimerSet passing the
+// timer set global and a pointer to the string "GenHPVM_Timer". Both the
+// string creation and the call are wrapped in TIMER.
+void GenHPVM::printTimerSet(Instruction *InsertBefore) {
+  // Initialised because it is copied into printArgs below even when TIMER
+  // does not execute the assignment; copying an indeterminate pointer is
+  // undefined behaviour.
+  Value *TimerName = nullptr;
+  TIMER(TimerName = getStringPointer("GenHPVM_Timer", InsertBefore));
+  Value *printArgs[] = {TimerSet, TimerName};
+  TIMER(CallInst::Create(llvm_hpvm_printTimerSet,
+                         ArrayRef<Value *>(printArgs, 2), "", InsertBefore));
+}
+
+// Wrap the timer id `timer` as a 32-bit integer constant in M's context.
+static inline ConstantInt *getTimerID(Module &M, enum hpvm_TimerID timer) {
+  IntegerType *Int32Ty = Type::getInt32Ty(M.getContext());
+  return ConstantInt::get(Int32Ty, timer);
+}
+
+// Make sure F returns a struct.
+//  - F already returns a struct: F is returned untouched.
+//  - F returns void: F is cloned with a new empty packed struct type
+//    ("emptyStruct") as return type, every `ret` of the clone is replaced by a
+//    `ret` of undef of that type, the clone is substituted for F through
+//    replaceNodeFunctionInIR, and the clone is returned.
+// Any other return type is rejected by an assertion.
+static Function *transformReturnTypeToStruct(Function *F) {
+  DEBUG(errs() << "Transforming return type of function to Struct: "
+               << F->getName() << "\n");
+
+  Type *OldRetTy = F->getReturnType();
+  if (isa<StructType>(OldRetTy)) {
+    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": "
+                 << *OldRetTy << "\n");
+    return F;
+  }
+
+  assert(OldRetTy->isVoidTy() &&
+         "Unhandled case - Only void return type handled\n");
+
+  // The parameter list is carried over unchanged.
+  std::vector<Type *> ParamTys;
+  for (const Argument &Param : F->args())
+    ParamTys.push_back(Param.getType());
+
+  LLVMContext &Ctx = F->getContext();
+  StructType *EmptyTy = StructType::create(Ctx, None, "emptyStruct", true);
+  FunctionType *NewFnTy = FunctionType::get(EmptyTy, ParamTys, F->isVarArg());
+
+  // cloneFunction reports the return instructions of the clone.
+  SmallVector<ReturnInst *, 8> ClonedRets;
+  Function *Clone = cloneFunction(F, NewFnTy, false, &ClonedRets);
+
+  // Replace ret void instruction with ret %EmptyTy undef
+  for (ReturnInst *OldRet : ClonedRets) {
+    DEBUG(errs() << "Found return inst: " << *OldRet << "\n");
+    ReturnInst *StructRet =
+        ReturnInst::Create(Clone->getContext(), UndefValue::get(EmptyTy));
+    ReplaceInstWithInst(OldRet, StructRet);
+  }
+
+  replaceNodeFunctionInIR(*F->getParent(), F, Clone);
+  return Clone;
+}
+
+// Scan F's basic blocks in order and return the type of the value returned by
+// the first block terminator that is a value-returning `ret`.
+// `ret void` terminators carry no value and are skipped.
+// Returns nullptr when F has no value-returning `ret` at all, instead of
+// running off the end of the function.
+static Type *getReturnTypeFromReturnInst(Function *F) {
+  for (BasicBlock &BB : *F) {
+    ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+    if (!RI)
+      continue;
+    // getReturnValue() is null for `ret void`.
+    Value *RetVal = RI->getReturnValue();
+    if (!RetVal)
+      continue;
+    DEBUG(errs() << "Return type value: " << *RetVal->getType() << "\n");
+    return RetVal->getType();
+  }
+  return nullptr;
+}
+
+char genhpvm::GenHPVM::ID = 0; // value is irrelevant: LLVM identifies a pass by the address of ID
+static RegisterPass<genhpvm::GenHPVM>
+    X("genhpvm", // command-line argument that selects this pass
+      "Pass to generate HPVM IR from LLVM IR (with dummy function calls)",
+      false, false); // CFGOnly = false, is_analysis = false
+
+} // End of namespace genhpvm
diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.exports b/hpvm/lib/Transforms/GenHPVM/GenHPVM.exports
similarity index 100%
rename from hpvm/lib/Transforms/GenVISC/GenVISC.exports
rename to hpvm/lib/Transforms/GenHPVM/GenHPVM.exports
diff --git a/hpvm/lib/Transforms/GenVISC/LLVMBuild.txt b/hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt
similarity index 88%
rename from hpvm/lib/Transforms/GenVISC/LLVMBuild.txt
rename to hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt
index 9266b2c5972984a179beba227946964182761239..94ef73ac07ca5c1ff23a05e404b0ea1f751ef36c 100644
--- a/hpvm/lib/Transforms/GenVISC/LLVMBuild.txt
+++ b/hpvm/lib/Transforms/GenHPVM/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Transforms/GenVISC/LLVMBuild.txt -------------------*- Conf -*--===;
+;===- ./lib/Transforms/GenHPVM/LLVMBuild.txt -------------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
@@ -17,5 +17,5 @@
 
 [component_0]
 type = Library
-name = GenVISC
+name = GenHPVM
 parent = Transforms
diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
deleted file mode 100644
index cc505415396b4a0441d5a5bfe0cf58adc945b9f8..0000000000000000000000000000000000000000
--- a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ /dev/null
@@ -1,866 +0,0 @@
-//=== GenVISC.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "genvisc"
-#include "GenVISC/GenVISC.h"
-
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "SupportVISC/VISCUtils.h"
-
-
-#define TIMER(X) do { if (VISCTimer) { X; } } while (0)
-
-using namespace llvm;
-using namespace viscUtils;
-
-
-// VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer"));
-
-namespace genvisc {
-
-// Helper Functions
-
-static inline ConstantInt* getTimerID(Module&, enum visc_TimerID);
-static Function* transformReturnTypeToStruct(Function* F);
-static Type* getReturnTypeFromReturnInst(Function* F);
-
-// Check if the dummy function call is a __visc__node call
-#define IS_VISC_CALL(callName) \
-  static bool isVISCCall_##callName(Instruction* I) { \
-    if(!isa<CallInst>(I)) \
-      return false; \
-    CallInst* CI = cast<CallInst>(I); \
-    return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \
-  }
-
-static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) {
-  // Check if the instruction is Call Instruction
-  assert(isa<CallInst>(I) && "Expecting CallInst");
-  CallInst* CI = cast<CallInst>(I);
-  DEBUG(errs() << "Found call: " << *CI << "\n");
-
-  // Find the correct intrinsic call
-  Module* M = CI->getParent()->getParent()->getParent();
-  Function* F;
-  std::vector<Type*> ArgTypes;
-  std::vector<Value*> args;
-  if(Intrinsic::isOverloaded(IntrinsicID)) {
-    // This is an overloaded intrinsic. The types must exactly match. Get the
-    // argument types
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
-      ArgTypes.push_back(CI->getArgOperand(i)->getType());
-      args.push_back(CI->getArgOperand(i));
-    }
-    F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes);
-    DEBUG(errs() << *F << "\n");
-  }
-  else { // Non-overloaded intrinsic
-    F = Intrinsic::getDeclaration(M, IntrinsicID);
-    FunctionType* FTy = F->getFunctionType();
-    DEBUG(errs() << *F << "\n");
-
-    // Create argument list
-    assert(CI->getNumArgOperands() == FTy->getNumParams()
-        && "Number of arguments of call do not match with Intrinsic");
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
-      Value* V = CI->getArgOperand(i);
-      // Either the type should match or both should be of pointer type
-      assert((V->getType() == FTy->getParamType(i) ||
-          (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
-          && "Dummy function call argument does not match with Intrinsic argument!");
-      // If the types do not match, then both must be pointer type and pointer
-      // cast needs to be performed
-      if(V->getType() != FTy->getParamType(i)) {
-        V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
-      }
-      args.push_back(V);
-    }
-  }
-  // Insert call instruction
-  CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
-
-  DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
-
-  CI->replaceAllUsesWith(Inst);
-  // If the previous instruction needs to be erased, insert it in the vector
-  // Erased
-  if(Erase != NULL)
-    Erase->push_back(CI);
-}
-
-IS_VISC_CALL(launch) /* Exists but not required */
-IS_VISC_CALL(edge) /* Exists but not required */
-IS_VISC_CALL(createNodeND)
-//IS_VISC_CALL(createNode)
-//IS_VISC_CALL(createNode1D)
-//IS_VISC_CALL(createNode2D)
-//IS_VISC_CALL(createNode3D)
-IS_VISC_CALL(bindIn)
-IS_VISC_CALL(bindOut)
-IS_VISC_CALL(push)
-IS_VISC_CALL(pop)
-IS_VISC_CALL(getNode)
-IS_VISC_CALL(getParentNode)
-IS_VISC_CALL(barrier)
-IS_VISC_CALL(malloc)
-IS_VISC_CALL(return)
-IS_VISC_CALL(getNodeInstanceID_x)
-IS_VISC_CALL(getNodeInstanceID_y)
-IS_VISC_CALL(getNodeInstanceID_z)
-IS_VISC_CALL(getNumNodeInstances_x)
-IS_VISC_CALL(getNumNodeInstances_y)
-IS_VISC_CALL(getNumNodeInstances_z)
-// Atomics
-IS_VISC_CALL(atomic_cmpxchg)
-IS_VISC_CALL(atomic_add)
-IS_VISC_CALL(atomic_sub)
-IS_VISC_CALL(atomic_xchg)
-IS_VISC_CALL(atomic_inc)
-IS_VISC_CALL(atomic_dec)
-IS_VISC_CALL(atomic_min)
-IS_VISC_CALL(atomic_max)
-IS_VISC_CALL(atomic_umin)
-IS_VISC_CALL(atomic_umax)
-IS_VISC_CALL(atomic_and)
-IS_VISC_CALL(atomic_or)
-IS_VISC_CALL(atomic_xor)
-// Misc Fn
-IS_VISC_CALL(floor)
-IS_VISC_CALL(rsqrt)
-IS_VISC_CALL(sqrt)
-IS_VISC_CALL(sin)
-IS_VISC_CALL(cos)
-
-
-IS_VISC_CALL(init)
-IS_VISC_CALL(cleanup)
-IS_VISC_CALL(wait)
-IS_VISC_CALL(trackMemory)
-IS_VISC_CALL(untrackMemory)
-IS_VISC_CALL(requestMemory)
-IS_VISC_CALL(attributes)
-IS_VISC_CALL(hint)
-
-// Return the constant integer represented by value V
-static unsigned getNumericValue(Value* V) {
-  assert(isa<ConstantInt>(V)
-         && "Value indicating the number of arguments should be a constant integer");
-  return cast<ConstantInt>(V)->getZExtValue();
-}
-
-// Take the __visc__return instruction and generate code for combining the
-// values being returned into a struct and returning it.
-// The first operand is the number of returned values
-static Value* genCodeForReturn(CallInst* CI) {
-  LLVMContext& Ctx = CI->getContext();
-  assert(isVISCCall_return(CI)
-      && "__visc__return instruction expected!");
-
-  // Parse the dummy function call here
-  assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n");
-  unsigned numRetVals = getNumericValue(CI->getArgOperand(0));
-
-  assert(CI->getNumArgOperands()-1 == numRetVals &&
-         "Too few arguments for __visc_return call!\n");
-  DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n");
-
-  std::vector<Type*> ArgTypes;
-  for(unsigned i=1; i < CI->getNumArgOperands(); i++) {
-    ArgTypes.push_back(CI->getArgOperand(i)->getType());
-  }
-  Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName();
-  StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true);
-
-  InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy),
-                                                CI->getArgOperand(1),
-                                                0,
-                                                "returnStruct",
-                                                CI);
-  DEBUG(errs() << "Code generation for return:\n");
-  DEBUG(errs() << *IV << "\n");
-
-  for(unsigned i=2; i < CI->getNumArgOperands(); i++) {
-    IV = InsertValueInst::Create(IV,
-                                 CI->getArgOperand(i),
-                                 i-1,
-                                 IV->getName(),
-                                 CI);
-    DEBUG(errs() << *IV << "\n");
-  }
-  
-  return IV;
-}
-
-// Analyse the attribute call for this function. Add the in and out
-// attributes to pointer parameters.
-static void handleVISCAttributes(Function* F, CallInst* CI) {
-  DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n");
-  // Parse the dummy function call here
-  unsigned offset = 0;
-  // Find number of In pointers
-  assert(CI->getNumArgOperands() > offset
-         && "Too few arguments for __visc__attributes call!");
-  unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset));
-  DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n");
-
-  for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) {
-    Value* V = CI->getArgOperand(i);
-    if(Argument* arg = dyn_cast<Argument>(V)) {
-      F->addAttribute(1+arg->getArgNo(), Attribute::In);
-    }
-    else {
-      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
-      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
-    }
-  }
-  // Find number of Out Pointers
-  offset += 1 + numInPtrs;
-  assert(CI->getNumArgOperands() > offset
-         && "Too few arguments for __visc__attributes call!");
-  unsigned numOutPtrs = getNumericValue(CI->getOperand(offset));
-  DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n");
-  for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) {
-    Value* V = CI->getArgOperand(i);
-    if(Argument* arg = dyn_cast<Argument>(V)) {
-      F->addAttribute(1+arg->getArgNo(), Attribute::Out);
-    }
-    else {
-      errs() << "Invalid argument to __visc__attribute: " << *V << "\n";
-      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
-    }
-  }
-  DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n");
-}
-
-// Public Functions of GenVISC pass
-bool GenVISC::runOnModule(Module &M) {
-  errs() << "\nGENVISC PASS\n";
-  this->M = &M;
-
-  // Load Runtime API Module
-  SMDiagnostic Err;
-
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
-  assert(LLVM_SRC_ROOT != NULL &&
-         "Define LLVM_SRC_ROOT environment variable!");
-
-  Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot + "/../build/tools/hpvm/projects/visc-rt/visc-rt.bc";
-  errs() << llvmSrcRoot << "\n";
-
-  std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-
-  if(runtimeModule == NULL) {
-    DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
-    assert(false && "couldn't parse runtime");
-  }
-  else
-    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
-
-  llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet",
-                                 runtimeModule->getFunction("llvm_visc_initializeTimerSet")->getFunctionType());
-  //DEBUG(errs() << *llvm_visc_initializeTimerSet);
-
-  llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer",
-                            runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType());
- // DEBUG(errs() << *llvm_visc_switchToTimer);
-
-  llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet",
-                            runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType());
-  //DEBUG(errs() << *llvm_visc_printTimerSet);
-
-  // Insert init context in main
-  DEBUG(errs() << "Locate __visc__init()\n");
-  Function* VI = M.getFunction("__visc__init");
-  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
-  Instruction* I = cast<Instruction>(*VI->user_begin());
-
-  DEBUG(errs() << "Initialize Timer Set\n");
-  initializeTimerSet(I);
-  switchToTimer(visc_TimerID_NONE, I);
-
-  // Insert print instruction at visc exit
-  DEBUG(errs() << "Locate __visc__cleanup()\n");
-  Function* VC = M.getFunction("__visc__cleanup");
-  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
-  I = cast<Instruction>(*VC->user_begin());
-  printTimerSet(I);
-
-
-  DEBUG(errs() << "-------- Searching for launch sites ----------\n");
-
-  std::vector<Instruction*> toBeErased;
-  std::vector<Function*> functions;
-
-  for (auto &F : M) 
-    functions.push_back(&F);
-
-  // Iterate over all functions in the module
-  for (Function *f : functions) {
-    DEBUG(errs() << "Function: " << f->getName() << "\n");
-
-    // List with the required additions in the function's return type
-    std::vector<Type*> FRetTypes;
-
-    enum mutateTypeCause {
-      mtc_None,
-      mtc_BIND,
-      mtc_RETURN,
-      mtc_NUM_CAUSES
-    } bind;
-    bind = mutateTypeCause::mtc_None;
-
-    // Iterate over all the instructions in this function
-    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) {
-      Instruction* I = &*i; // Grab pointer to Instruction
-      // If not a call instruction, move to next instruction
-      if(!isa<CallInst>(I))
-        continue;
-
-      CallInst* CI = cast<CallInst>(I);
-      LLVMContext& Ctx = CI->getContext();
-
-      if(isVISCCall_init(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased);
-      }
-      if(isVISCCall_cleanup(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased);
-      }
-      if(isVISCCall_wait(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased);
-      }
-      if(isVISCCall_trackMemory(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased);
-      }
-      if(isVISCCall_untrackMemory(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased);
-      }
-      if(isVISCCall_requestMemory(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased);
-      }
-      if(isVISCCall_hint(I)) {
-        assert(isa<ConstantInt>(CI->getArgOperand(0))
-               && "Argument to hint must be constant integer!");
-        ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0));
-
-        visc::Target t = (visc::Target) hint->getZExtValue();
-        addHint(CI->getParent()->getParent(), t);
-        DEBUG(errs() << "Found visc hint call: " << *CI << "\n");
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_launch(I)) {
-        Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
-        DEBUG(errs() << *LaunchF << "\n");
-        // Get i8* cast to function pointer
-        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
-        graphFunc = transformReturnTypeToStruct(graphFunc);
-        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
-	assert(F && "Function invoked by VISC launch has to be define and constant.");
-
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0));
-	assert(Op && "VISC launch's streaming argument is a constant value.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        
-        auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType());
-        assert(ArgTy && "VISC launch argument should be pointer type.");
-        Value *Arg = CI->getArgOperand(2);
-        if(!ArgTy->getElementType()->isIntegerTy(8))
-          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), Type::getInt8PtrTy(Ctx), "", CI);
-        Value* LaunchArgs[] = {F, Arg, isStreaming};
-        CallInst* LaunchInst = CallInst::Create(LaunchF,
-                                                ArrayRef<Value*>(LaunchArgs, 3),
-                                                "graphID", CI);
-        DEBUG(errs() << "Found visc launch call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
-        CI->replaceAllUsesWith(LaunchInst);
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_push(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased);
-      }
-      if(isVISCCall_pop(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased);
-      }
-      if(isVISCCall_createNodeND(I)) {
-        assert(CI->getNumArgOperands() > 0 &&
-               "Too few arguments for __visc__createNodeND call");
-        unsigned numDims = getNumericValue(CI->getArgOperand(0));
-        // We need as meny dimension argments are there are dimensions
-        assert(CI->getNumArgOperands()-2 == numDims &&
-              "Too few arguments for __visc_createNodeND call!\n");
-
-        Function* CreateNodeF;
-        switch (numDims) {
-        case 0:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
-          break;
-        case 1:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
-          break;
-        case 2:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
-          break;
-        case 3:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
-          break;
-        default:
-          llvm_unreachable("Unsupported number of dimensions\n");
-          break;
-        }
-        DEBUG(errs() << *CreateNodeF << "\n");
-        DEBUG(errs() << *I << "\n");
-        DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n");
-
-        // Get i8* cast to function pointer
-        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
-        graphFunc = transformReturnTypeToStruct(graphFunc);
-        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
-
-        CallInst* CreateNodeInst;
-        switch (numDims) {
-        case 0:
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(F),
-                                            graphFunc->getName()+".node", CI);
-          break;
-        case 1:
-          {
-          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 2, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 2),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
-        case 2:
-          {
-          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 2, expected to be i64\n");
-          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 3, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F,
-                                     CI->getArgOperand(2),
-                                     CI->getArgOperand(3)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 3),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
-        case 3:
-          {
-          assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 2, expected to be i64\n");
-          assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 3, expected to be i64\n");
-          assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) &&
-                 "CreateNodeND dimension argument, 4, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F,
-                                     CI->getArgOperand(2),
-                                     CI->getArgOperand(3),
-                                     CI->getArgOperand(4)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 4),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
-        default:
-          llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n");
-          break;
-        }
-
-        DEBUG(errs() << "Found visc createNode call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n");
-        CI->replaceAllUsesWith(CreateNodeInst);
-        toBeErased.push_back(CI);
-      }
-
-      if(isVISCCall_edge(I)) {
-        Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
-        DEBUG(errs() << *EdgeF << "\n");
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5));
-        ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2));
-	assert(Op && EdgeTypeOp && "Arguments of CreateEdge are not constant integers.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx)
-                                                : ConstantInt::getTrue(Ctx);
-        Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                             isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4),
-                             isStreaming
-                            };
-        CallInst* EdgeInst = CallInst::Create(EdgeF,
-                                              ArrayRef<Value*>(EdgeArgs, 6),
-                                              "output", CI);
-        DEBUG(errs() << "Found visc edge call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n");
-        CI->replaceAllUsesWith(EdgeInst);
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_bindIn(I)) {
-        Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
-        DEBUG(errs() << *BindInF << "\n");
-        // Check if this is a streaming bind or not
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
-	assert(Op && "Streaming argument for bind in intrinsic should be a constant integer.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                               CI->getArgOperand(2), isStreaming
-                              };
-        CallInst* BindInInst = CallInst::Create(BindInF,
-                                                ArrayRef<Value*>(BindInArgs, 4),
-                                                "", CI);
-        DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n");
-        CI->replaceAllUsesWith(BindInInst);
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_bindOut(I)) {
-        Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
-        DEBUG(errs() << *BindOutF << "\n");
-        // Check if this is a streaming bind or not
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
-	assert(Op && "Streaming argument for bind out intrinsic should be a constant integer.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                                CI->getArgOperand(2), isStreaming
-                               };
-        CallInst* BindOutInst = CallInst::Create(BindOutF,
-                                ArrayRef<Value*>(BindOutArgs, 4),
-                                "", CI);
-        DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n");
-
-        DEBUG(errs() << "Fixing the return type of the function\n");
-        // FIXME: What if the child node function has not been visited already.
-        // i.e., it's return type has not been fixed.
-        Function* F = I->getParent()->getParent();
-        DEBUG(errs() << F->getName() << "\n";);
-        IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0));
-	assert(NodeIntrinsic && "Instruction value in bind out is not a create node intrinsic.");
-        DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
-	assert((NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) &&
-		"Instruction value in bind out is not a create node intrinsic.");
-        Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
-        DEBUG(errs() << ChildF->getName() << "\n";);
-        int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
-        int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
-        StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType());
-
-        Type* ReturnType = F->getReturnType();
-        DEBUG(errs() << *ReturnType << "\n";);
-        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType))
-            && "Return type should either be a struct or void type!");
-
-        FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos));
-        assert(((bind == mutateTypeCause::mtc_BIND) ||
-                (bind == mutateTypeCause::mtc_None)) &&
-                "Both bind_out and visc_return detected");
-        bind = mutateTypeCause::mtc_BIND;
-
-        CI->replaceAllUsesWith(BindOutInst);
-        toBeErased.push_back(CI);
-      }
-      if(isVISCCall_attributes(I)) {
-        Function* F = CI->getParent()->getParent();
-        handleVISCAttributes(F, CI);
-        toBeErased.push_back(CI);
-      }
-      if (isVISCCall_getNode(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNode, &toBeErased);
-      }
-      if (isVISCCall_getParentNode(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getParentNode, &toBeErased);
-      }
-      if (isVISCCall_barrier(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_barrier, &toBeErased);
-      }
-      if (isVISCCall_malloc(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased);
-      }
-      if (isVISCCall_return(I)) {
-        DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n");
-        // The operands to this call are the values to be returned by the node
-        Value* ReturnVal = genCodeForReturn(CI);
-        DEBUG(errs() << *ReturnVal << "\n");
-        Type* ReturnType = ReturnVal->getType();
-        assert(isa<StructType>(ReturnType)
-               && "Return type should be a struct type!");
-
-        assert(((bind == mutateTypeCause::mtc_RETURN) ||
-                (bind == mutateTypeCause::mtc_None)) &&
-                "Both bind_out and visc_return detected");
-
-        if (bind == mutateTypeCause::mtc_None) {
-          // If this is None, this is the first __visc__return
-          // instruction we have come upon. Place the return type of the
-          // function in the return type vector
-          bind = mutateTypeCause::mtc_RETURN;
-          StructType* ReturnStructTy = cast<StructType>(ReturnType);
-          for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++)
-            FRetTypes.push_back(ReturnStructTy->getElementType(i));
-        } else { // bind == mutateTypeCause::mtc_RETURN
-          // This is not the first __visc__return
-          // instruction we have come upon. 
-          // Check that the return types are the same
-          assert((ReturnType == FRetTypes[0])
-                 && "Multiple returns with mismatching types");
-        }
-
-        ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal);
-        DEBUG(errs() << "Found visc return call: " << *CI << "\n");
-        Instruction* oldReturn = CI->getParent()->getTerminator();
-        assert(isa<ReturnInst>(oldReturn)
-                && "Expecting a return to be the terminator of this BB!");
-        DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
-        DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
-        //CI->replaceAllUsesWith(RetInst);
-        toBeErased.push_back(CI);
-        ReplaceInstWithInst(oldReturn, RetInst);
-        DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n");
-      }
-
-      if (isVISCCall_getNodeInstanceID_x(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased);
-      }
-      if (isVISCCall_getNodeInstanceID_y(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased);
-      }
-      if (isVISCCall_getNodeInstanceID_z(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased);
-      }
-      if (isVISCCall_getNumNodeInstances_x(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased);
-      }
-      if (isVISCCall_getNumNodeInstances_y(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased);
-      }
-      if (isVISCCall_getNumNodeInstances_z(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased);
-      }
-      if (isVISCCall_atomic_add(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased);
-      }
-      if (isVISCCall_atomic_sub(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_sub, &toBeErased);
-      }
-      if (isVISCCall_atomic_xchg(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xchg, &toBeErased);
-      }
-      if (isVISCCall_atomic_min(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_min, &toBeErased);
-      }
-      if (isVISCCall_atomic_max(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_max, &toBeErased);
-      }
-      if (isVISCCall_atomic_and(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_and, &toBeErased);
-      }
-      if (isVISCCall_atomic_or(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_or, &toBeErased);
-      }
-      if (isVISCCall_atomic_xor(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased);
-      }
-      if (isVISCCall_sin(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased);
-      }
-      if (isVISCCall_cos(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased);
-      }
-    }
-
-    // Erase the __visc__node calls
-    DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
-    for(auto I: toBeErased) {
-      DEBUG(errs() << *I << "\n");
-    }
-    while(!toBeErased.empty()) {
-      Instruction* I = toBeErased.back(); 
-      DEBUG(errs() << "\tErasing " << *I << "\n");
-      I->eraseFromParent();
-      toBeErased.pop_back(); 
-    }
-
-    if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) {
-        DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
-        // Argument type list.
-        std::vector<Type*> FArgTypes;
-        for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
-            ai != ae; ++ai) {
-          FArgTypes.push_back(ai->getType());
-        }
-
-        // Find new return type of function
-        Type* NewReturnTy;
-        if(bind == mutateTypeCause::mtc_BIND) {
-
-          std::vector<Type*> TyList;
-          for (unsigned i = 0; i < FRetTypes.size(); i++)
-            TyList.push_back(FRetTypes[i]);
-
-          NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true);
-        }
-        else {
-          NewReturnTy = getReturnTypeFromReturnInst(f);
-          assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
-        }
-
-        FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
-
-        // Change the function type
-        Function* newF = cloneFunction(f, FTy, false);
-        DEBUG(errs() << *newF << "\n");
-
-        if (bind == mutateTypeCause::mtc_BIND) {
-          // This is certainly an internal node, and hence just one BB with one
-          // return terminator instruction. Change return statement
-          ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator());
-          ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy));
-          ReplaceInstWithInst(RI, newRI);        
-        }
-        if (bind == mutateTypeCause::mtc_RETURN) {
-          // Nothing
-        }
-        replaceNodeFunctionInIR(*f->getParent(), f, newF);
-        DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
-    }
-
-
-  }
-  return false; //TODO: What does returning "false" mean?
-}
-
-// Generate Code for declaring a constant string [L x i8] and return a pointer
-// to the start of it.
-Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
-  Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true);
-  Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true,
-                                      GlobalValue::InternalLinkage, SConstant, Name);
-  Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
-  Value* GEPArgs[] = {Zero, Zero};
-  GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal,
-                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
-  return SPtr;
-}
-
-void GenVISC::initializeTimerSet(Instruction* InsertBefore) {
-  Value* TimerSetAddr;
-  StoreInst* SI;
-  TIMER(TimerSet = new GlobalVariable(*M,
-                                      Type::getInt8PtrTy(M->getContext()),
-                                      false,
-                                      GlobalValue::CommonLinkage,
-                                      Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
-                                      "viscTimerSet_GenVISC"));
-  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n");
-  //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n");
-
-  TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
-                                        None,
-                                        "",
-                                        InsertBefore));
-  DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n");
-  TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
-  DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n");
-}
-
-void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) {
-  Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)};
-  TIMER(CallInst::Create(llvm_visc_switchToTimer,
-                         ArrayRef<Value*>(switchArgs, 2),
-                         "",
-                         InsertBefore));
-}
-
-void GenVISC::printTimerSet(Instruction* InsertBefore) {
-  Value* TimerName;
-  TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore));
-  Value* printArgs[] = {TimerSet, TimerName};
-  TIMER(CallInst::Create(llvm_visc_printTimerSet,
-                         ArrayRef<Value*>(printArgs, 2),
-                         "",
-                         InsertBefore));
-}
-
-static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
-  return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
-}
-
-static Function* transformReturnTypeToStruct(Function* F) {
-  // Currently only works for void return types
-  DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
-
-  if (isa<StructType>(F->getReturnType())) {
-    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n");
-    return F;
-  }
-
-  assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n");
-
-  // Create the argument type list with added argument types
-  std::vector<Type*> ArgTypes;
-  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    ArgTypes.push_back(ai->getType());
-  }
-  
-  StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true);
-  FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg());
-  
-  SmallVector<ReturnInst*, 8> Returns;
-  Function* newF = cloneFunction(F, FTy, false, &Returns);
-  // Replace ret void instruction with ret %RetTy undef
-  for(auto &RI: Returns) {
-    DEBUG(errs() << "Found return inst: "<< *RI << "\n");
-    ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy));
-    ReplaceInstWithInst(RI, newRI);
-  }
-
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-  return newF;
-}
-
-static Type* getReturnTypeFromReturnInst(Function* F) {
-  for(BasicBlock &BB: *F) {
-    if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-      DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() << "\n");
-      return RI->getReturnValue()->getType();
-    }
-  }
-}
-
-
-char genvisc::GenVISC::ID = 0;
-static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false);
-
-} // End of namespace genvisc
-
-
diff --git a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
index 7bd66b62c6c8cda589fe3e6c1e3711893aceaffb..fc33ebee71123d89c5f931901dd213c82a401941 100644
--- a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
+++ b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "LocalMem"
-#include "SupportVISC/DFG2LLVM.h"
+#include "SupportHPVM/DFG2LLVM.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
@@ -134,7 +134,7 @@ void AT_OCL::codeGen(DFLeafNode *N) {
 // Return pointer to property if this leaf node matches the conditions for being
 // an allocation node. Conditions
 // 1. No incoming memory pointer. No in/out attribute on a pointer argument
-// 2. Uses visc malloc intrinsic to allocate memory
+// 2. Uses hpvm malloc intrinsic to allocate memory
 // 3. Sends it out
 // 2. (TODO:) Whether the allocated pointer escapes the parent node
 AllocationNodeProperty *isAllocationNode(DFLeafNode *N) {
@@ -148,18 +148,18 @@ AllocationNodeProperty *isAllocationNode(DFLeafNode *N) {
 
   Function *F = N->getFuncPointer();
 
-  // Allocation node must use visc malloc intrinsic
-  bool usesVISCMalloc = false;
+  // Allocation node must use hpvm malloc intrinsic
+  bool usesHPVMMalloc = false;
   for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) {
     Instruction *I = &*i;
     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      if (II->getIntrinsicID() == Intrinsic::visc_malloc) {
-        usesVISCMalloc = true;
+      if (II->getIntrinsicID() == Intrinsic::hpvm_malloc) {
+        usesHPVMMalloc = true;
         break;
       }
     }
   }
-  if (!usesVISCMalloc)
+  if (!usesHPVMMalloc)
     return NULL;
 
   // TODO: Check if allocated pointer leaves parent node
@@ -197,20 +197,20 @@ AllocationNodeProperty *isAllocationNode(DFLeafNode *N) {
     assert(OutValues[i]->getType()->isPointerTy() &&
            "Expected outgoing edge to be of pointer type");
     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(OutValues[i])) {
-      if (II->getIntrinsicID() == Intrinsic::visc_malloc) {
+      if (II->getIntrinsicID() == Intrinsic::hpvm_malloc) {
         // Sanity check: Size passed to malloc intrinsic is same as the value
         // going into the next outgoing edge
-        DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n");
+        DEBUG(errs() << "HPVM malloc size: " << *II->getArgOperand(0) << "\n");
         DEBUG(errs() << "Out edge value: " << *OutValues[i + 1] << "\n");
         assert(II->getArgOperand(0) == OutValues[i + 1] &&
-               "Sanity Check Failed: VISC Malloc size argument != next "
+               "Sanity Check Failed: HPVM Malloc size argument != next "
                "outgoing edge");
         ANP->insertAllocation(N->getOutDFEdgeAt(i), II->getArgOperand(0));
         i = i + 2;
         continue;
       }
     }
-    llvm_unreachable("Expecting visc malloc intrinsic instruction!");
+    llvm_unreachable("Expecting hpvm malloc intrinsic instruction!");
   }
   return ANP;
 }
diff --git a/hpvm/llvm_installer/llvm_installer.sh b/hpvm/llvm_installer/llvm_installer.sh
index d7fcda4ac4de8c129e47cfce65264097e040d228..e072d042b79a1a3caf8003794a89b5cee2dca67a 100755
--- a/hpvm/llvm_installer/llvm_installer.sh
+++ b/hpvm/llvm_installer/llvm_installer.sh
@@ -179,10 +179,10 @@ echo make -j$NUM_THREADS
 make -j$NUM_THREADS
 #make install
 
-#echo Building HPVM runtime
-#HPVM_RT_DIR=$HPVM_DIR/projects/visc-rt
-#cd $HPVM_RT_DIR
-#make
+# echo Building HPVM runtime
+# HPVM_RT_DIR=$HPVM_DIR/projects/hpvm-rt
+# cd $HPVM_RT_DIR
+# make
 
 #cp -r $CURRENT_DIR/projects $HPVM_DIR/
 #make -j$NUM_THREADS
diff --git a/hpvm/llvm_patches/apply_patch.sh b/hpvm/llvm_patches/apply_patch.sh
index ea86575207a4aa7b4ca138b604f7423943924b22..289e5c11e319aa16262952d2d079f986c2e987b8 100644
--- a/hpvm/llvm_patches/apply_patch.sh
+++ b/hpvm/llvm_patches/apply_patch.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 ### File Copies
-cp include/IR/IntrinsicsVISC.td  ${LLVM_SRC_ROOT}/include/llvm/IR/IntrinsicsVISC.td
+cp include/IR/IntrinsicsHPVM.td  ${LLVM_SRC_ROOT}/include/llvm/IR/IntrinsicsHPVM.td
 
 
 ## Header File Patches
diff --git a/hpvm/llvm_patches/include/IR/Attributes.td b/hpvm/llvm_patches/include/IR/Attributes.td
index b644cdb30bbd590a8b8c238bfde15e4b451e8ea3..c6ff8ef3c6c962f5444d718ff5a7e16ce392a522 100644
--- a/hpvm/llvm_patches/include/IR/Attributes.td
+++ b/hpvm/llvm_patches/include/IR/Attributes.td
@@ -151,7 +151,7 @@ def ShadowCallStack : EnumAttr<"shadowcallstack">;
 /// Sign extended before/after call.
 def SExt : EnumAttr<"signext">;
 
-/// VISC Attributes
+/// HPVM Attributes
 /// Pointer to read only memory
 def In : EnumAttr<"in">;
 
diff --git a/hpvm/llvm_patches/include/IR/Intrinsics.td b/hpvm/llvm_patches/include/IR/Intrinsics.td
index 2f79964a2e381c6d4ec22a5bc3c80a9d411f9fb0..2e3f34eb1a8408371a0b516089dd970adfe9223c 100644
--- a/hpvm/llvm_patches/include/IR/Intrinsics.td
+++ b/hpvm/llvm_patches/include/IR/Intrinsics.td
@@ -1249,4 +1249,4 @@ include "llvm/IR/IntrinsicsBPF.td"
 include "llvm/IR/IntrinsicsSystemZ.td"
 include "llvm/IR/IntrinsicsWebAssembly.td"
 include "llvm/IR/IntrinsicsRISCV.td"
-include "llvm/IR/IntrinsicsVISC.td"
+include "llvm/IR/IntrinsicsHPVM.td"
diff --git a/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td b/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td
new file mode 100644
index 0000000000000000000000000000000000000000..410e9c8d3345e67df9614e0d518e5e596a4368e1
--- /dev/null
+++ b/hpvm/llvm_patches/include/IR/IntrinsicsHPVM.td
@@ -0,0 +1,208 @@
+//===- IntrinsicsHPVM.td - Defines HPVM intrinsics ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the HPVM-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "hpvm" in {
+  /* All intrinsics start with "llvm.hpvm."
+   * As we do not want the compiler to mess with these intrinsics, we assume
+   * worst memory behavior for all these intrinsics.
+   */
+
+  /* Initialization intrinsic -
+   * void llvm.hpvm.init();
+   */
+  def int_hpvm_init : Intrinsic<[], [], []>;
+
+  /* Launch intrinsic - with streaming argument
+   * i8* llvm.hpvm.launch(i8*, ArgList*, i1);
+   */
+  def int_hpvm_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
+                                  llvm_ptr_ty, llvm_i1_ty], []>;
+
+  /* Push intrinsic - push data on streaming pipeline
+   * void llvm.hpvm.push(i8*, ArgList*);
+   */
+  def int_hpvm_push : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>;
+
+  /* Pop intrinsic - pop data from streaming pipeline
+   * i8* llvm.hpvm.pop(i8*);
+   */
+  def int_hpvm_pop : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
+
+  /* Cleanup intrinsic -
+   * void llvm.hpvm.cleanup();
+   */
+  def int_hpvm_cleanup : Intrinsic<[], [], []>;
+
+  /* Wait intrinsic -
+   * void llvm.hpvm.wait(graphID*);
+   */
+  def int_hpvm_wait : Intrinsic<[], [llvm_ptr_ty], []>;
+
+  /* Track memory intrinsic -
+   * void llvm.hpvm.trackMemory(i8*, i64);
+   */
+  def int_hpvm_trackMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
+
+  /* Untrack memory intrinsic -
+   * void llvm.hpvm.untrackMemory(i8*);
+   */
+  def int_hpvm_untrackMemory : Intrinsic<[], [llvm_ptr_ty], []>;
+
+  /* Request memory intrinsic -
+   * void llvm.hpvm.requestMemory(i8*, i64);
+   */
+  def int_hpvm_requestMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
+
+  /* Create Node intrinsic -
+   * i8* llvm.hpvm.createNode(function*);
+   */
+  def int_hpvm_createNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
+
+  /* Create Node 1D array intrinsic -
+   * i8* llvm.hpvm.createNode1D(function*, i64);
+   */
+  def int_hpvm_createNode1D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
+                                        llvm_i64_ty], []>;
+
+  /* Create Node 2D array intrinsic -
+   * i8* llvm.hpvm.createNode2D(function*, i64, i64);
+   */
+  def int_hpvm_createNode2D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
+                                        llvm_i64_ty, llvm_i64_ty], []>;
+
+  /* Create Node 3D array intrinsic -
+   * i8* llvm.hpvm.createNode3D(function*, i64, i64, i64);
+   */
+  def int_hpvm_createNode3D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
+                                        llvm_i64_ty, llvm_i64_ty, llvm_i64_ty],
+                                        []>;
+
+  /* Create dataflow edge intrinsic -
+   * i8* llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1);
+   */
+  def int_hpvm_createEdge : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty,
+                                      llvm_i1_ty, llvm_i32_ty, llvm_i32_ty,
+                                      llvm_i1_ty],
+                                      []>;
+
+  /* Create bind input intrinsic -
+   * void llvm.hpvm.bind.input(i8*, i32, i32, i1);
+   */
+  def int_hpvm_bind_input : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty,
+                                      llvm_i32_ty, llvm_i1_ty], []>;
+
+  /* Create bind output intrinsic -
+   * void llvm.hpvm.bind.output(i8*, i32, i32, i1);
+   */
+  def int_hpvm_bind_output : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty,
+                                       llvm_i32_ty, llvm_i1_ty], []>;
+
+  /* Find associated dataflow node intrinsic -
+   * i8* llvm.hpvm.getNode();
+   */
+  def int_hpvm_getNode : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
+
+  /* Find parent dataflow node intrinsic -
+   * i8* llvm.hpvm.getParentNode(i8*);
+   */
+  def int_hpvm_getParentNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>;
+
+  /* Find the number of dimensions of a dataflow node intrinsic -
+   * i32 llvm.hpvm.getNumDims(i8*);
+   */
+  def int_hpvm_getNumDims : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>;
+
+  /* Find the unique identifier of a dataflow node (with respect to its parent
+   * node) in the specified dimension intrinsic -
+   */
+
+  /* i64 llvm.hpvm.getNodeInstanceID.[xyz](i8*);
+   */
+  def int_hpvm_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                               [IntrNoMem]>;
+
+  def int_hpvm_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                               [IntrNoMem]>;
+
+  def int_hpvm_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                               [IntrNoMem]>;
+
+  /* Find the number of instances of a dataflow node in the specified dimension
+   * intrinsic -
+   */
+
+  /* i64 llvm.hpvm.getNumNodeInstances.[xyz](i8*);
+   */
+  def int_hpvm_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                                 [IntrNoMem]>;
+
+  def int_hpvm_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                                 [IntrNoMem]>;
+
+  def int_hpvm_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
+                                                 [IntrNoMem]>;
+
+  /* Local Barrier
+   * void llvm.hpvm.barrier();
+   */
+  def int_hpvm_barrier : Intrinsic<[], [], []>;
+
+  /* Memory allocation inside the graph
+   * i8* llvm.hpvm.malloc(i64);
+   */
+  def int_hpvm_malloc : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], []>;
+
+  /* Find the vector length supported by target architecture
+   * intrinsic -
+   * i32 llvm.hpvm.getVectorLength();
+   */
+  def int_hpvm_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>;
+
+  /* ============ Atomic intrinsics ============= */
+  // Atomic arithmetic operations
+
+  /* i32 llvm.hpvm.atomic.add(i32*, i32)*/
+  def int_hpvm_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.sub(i32*, i32)*/
+  def int_hpvm_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.xchg(i32*, i32)*/
+  def int_hpvm_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.min(i32*, i32)*/
+  def int_hpvm_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.max(i32*, i32)*/
+  def int_hpvm_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  // Atomic bitwise operations
+
+  /* i32 llvm.hpvm.atomic.and(i32*, i32)*/
+  def int_hpvm_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.or(i32*, i32)*/
+  def int_hpvm_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.hpvm.atomic.xor(i32*, i32)*/
+  def int_hpvm_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+}
diff --git a/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td b/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td
deleted file mode 100644
index d5330175d86c9576394c9363a4ba30fd651f19e8..0000000000000000000000000000000000000000
--- a/hpvm/llvm_patches/include/IR/IntrinsicsVISC.td
+++ /dev/null
@@ -1,208 +0,0 @@
-//===- IntrinsicsVISC.td - Defines VISC intrinsics ---------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines all of the VISC-specific intrinsics.
-//
-//===----------------------------------------------------------------------===//
-
-let TargetPrefix = "visc" in {
-  /* All intrinsics start with "llvm.visc."
-   * As we do not want the compiler to mess with these intrinsics, we assume
-   * worst memory behavior for all these intrinsics.
-   */
-
-  /* Initialization intrinsic -
-   * i8* llvm.visc.setup(function*);
-   */
-  def int_visc_init : Intrinsic<[], [], []>;
-
-  /* Launch intrinsic - with streaming argument
-   * i8* llvm.visc.launch(i8*, ArgList*, i1);
-   */
-  def int_visc_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
-                                  llvm_ptr_ty, llvm_i1_ty], []>;
-
-  /* Push intrinsic - push data on streaming pipeline
-   * void llvm.visc.push(i8*, ArgList*);
-   */
-  def int_visc_push : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>;
-
-  /* Pop intrinsic - pop data from streaming pipeline
-   * i8* llvm.visc.pop(i8*);
-   */
-  def int_visc_pop : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
-
-  /* Cleanup intrinsic -
-   * void llvm.visc.cleanup(i8*);
-   */
-  def int_visc_cleanup : Intrinsic<[], [], []>;
-
-  /* Wait intrinsic -
-   * void llvm.visc.wait(graphID*);
-   */
-  def int_visc_wait : Intrinsic<[], [llvm_ptr_ty], []>;
-
-  /* Track memory intrinsic -
-   * void llvm.visc.trackMemory(i8*, i64);
-   */
-  def int_visc_trackMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
-
-  /* Track memory intrinsic -
-   * void llvm.visc.untrackMemory(i8*);
-   */
-  def int_visc_untrackMemory : Intrinsic<[], [llvm_ptr_ty], []>;
-
-  /* Request memory intrinsic -
-   * void llvm.visc.requestMemory(i8*, i64);
-   */
-  def int_visc_requestMemory : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>;
-
-  /* Create Node intrinsic -
-   * i8* llvm.visc.createNode(function*);
-   */
-  def int_visc_createNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
-
-  /* Create Node 1D array intrinsic -
-   * i8* llvm.visc.createNode1D(function*, i64);
-   */
-  def int_visc_createNode1D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
-                                        llvm_i64_ty], []>;
-
-  /* Create Node 2D array intrinsic -
-   * i8* llvm.visc.createNode2D(function*, i64, i64);
-   */
-  def int_visc_createNode2D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
-                                        llvm_i64_ty, llvm_i64_ty], []>;
-
-  /* Create Node 3D array intrinsic -
-   * i8* llvm.visc.createNode2D(function*, i64, i64, i64);
-   */
-  def int_visc_createNode3D : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
-                                        llvm_i64_ty, llvm_i64_ty, llvm_i64_ty],
-                                        []>;
-
-  /* Create dataflow edge intrinsic -
-   * i8* llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1);
-   */
-  def int_visc_createEdge : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty,
-                                      llvm_i1_ty, llvm_i32_ty, llvm_i32_ty,
-                                      llvm_i1_ty],
-                                      []>;
-
-  /* Create bind input intrinsic -
-   * void llvm.visc.bind.input(i8*, i32, i32);
-   */
-  def int_visc_bind_input : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty,
-                                      llvm_i32_ty, llvm_i1_ty], []>;
-
-  /* Create bind output intrinsic -
-   * void llvm.visc.bind.output(i8*, i32, i32);
-   */
-  def int_visc_bind_output : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty,
-                                       llvm_i32_ty, llvm_i1_ty], []>;
-
-  /* Find associated dataflow node intrinsic -
-   * i8* llvm.visc.getNode();
-   */
-  def int_visc_getNode : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
-
-  /* Find parent dataflow node intrinsic -
-   * i8* llvm.visc.getParentNode(i8*);
-   */
-  def int_visc_getParentNode : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>;
-
-  /* Find the number of dimensions of a dataflow node intrinsic -
-   * i32 llvm.visc.getNumDims(i8*);
-   */
-  def int_visc_getNumDims : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>;
-
-  /* Find the unique indentifier of a dataflow node (with respect to his parent
-   * node) in the specified dimension intrinsic -
-   */
-
-  /* i64 llvm.visc.getNodeInstanceID.[xyz](i8*);
-   */
-  def int_visc_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                               [IntrNoMem]>;
-
-  def int_visc_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                               [IntrNoMem]>;
-
-  def int_visc_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                               [IntrNoMem]>;
-
-  /* Find the number of instances of a dataflow node in the specified dimension
-   * intrinsic -
-   */
-
-  /* i64 llvm.visc.getNumNodeInstances.[xyz](i8*);
-   */
-  def int_visc_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                                 [IntrNoMem]>;
-
-  def int_visc_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                                 [IntrNoMem]>;
-
-  def int_visc_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
-                                                 [IntrNoMem]>;
-
-  /* Local Barrier
-   * void llvm.visc.barrier();
-   */
-  def int_visc_barrier : Intrinsic<[], [], []>;
-
-  /* Memory allocation inside the graph
-   * i8* llvm.visc.malloc();
-   */
-  def int_visc_malloc : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty], []>;
-
-  /* Find the vector length supported by target architecture
-   * intrinsic -
-   * i32 llvm.visc.getVectorLength();
-   */
-  def int_visc_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>;
-
-  /* ============ Atomic intrinsics ============= */
-  // Atomic arithmetic operations
-
-  /* i32 llvm.visc.atomic.add(i32*, i32)*/
-  def int_visc_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.sub(i32*, i32)*/
-  def int_visc_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.xchg(i32*, i32)*/
-  def int_visc_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.min(i32*, i32)*/
-  def int_visc_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.maxi32*, i32)*/
-  def int_visc_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  // Atomic bitwise operations
-
-  /* i32 llvm.visc.atomic.and(i32*, i32)*/
-  def int_visc_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.or(i32*, i32)*/
-  def int_visc_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-  /* i32 llvm.visc.atomic.xor(i32*, i32)*/
-  def int_visc_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
-                                    []>;
-
-}
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
index a924405a2cac85ccd2e5e903a1ee1abb52774566..2c54392f8020ac7334117f1343214d085dbd6b84 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
+++ b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
@@ -855,7 +855,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(bit);
   KEYWORD(varFlags);
 
-  // VISC parameter attributes
+  // HPVM parameter attributes
   KEYWORD(in);
   KEYWORD(out);
   KEYWORD(inout);
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
index f5ce44e2a920405f7e3790fcb1d9eb7fba28d636..7446ff1e32dd79a18fd678446af56e6d193468ad 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
+++ b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
@@ -1470,7 +1470,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
     case lltok::kw_swiftself:
     case lltok::kw_immarg:
 
-    // VISC Parameter only attributes
+    // HPVM Parameter only attributes
     case lltok::kw_in:
     case lltok::kw_out:
     case lltok::kw_inout:
@@ -1808,7 +1808,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
       B.addAttribute(Attribute::ImmArg);
       break;
 
-    // VISC parameter attributes
+    // HPVM parameter attributes
     case lltok::kw_in:
       B.addAttribute(Attribute::In);
       break;
@@ -1927,7 +1927,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
     case lltok::kw_swiftself:
     case lltok::kw_immarg:
 
-    // VISC Parameter only attributes
+    // HPVM Parameter only attributes
     case lltok::kw_in:
     case lltok::kw_out:
     case lltok::kw_inout:
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLToken.h b/hpvm/llvm_patches/lib/AsmParser/LLToken.h
index 7f9816965b2a21ae3d23873ca789a22481b575fa..cb0479b41c3b9e68d9697cd9d8adce4c80fa5c25 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLToken.h
+++ b/hpvm/llvm_patches/lib/AsmParser/LLToken.h
@@ -351,7 +351,7 @@ enum Kind {
   kw_insertvalue,
   kw_blockaddress,
 
-  // VISC parameter attributes
+  // HPVM parameter attributes
   kw_in,
   kw_out,
   kw_inout,
diff --git a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
index 7eb289d5872713ef826174b1e691c6440d4dd43e..a1e64472850911013250976312a8dd7d8b879c98 100644
--- a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1395,7 +1395,7 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
   case Attribute::NoFree:
     return 1ULL << 63;
 
-    // VISC Attributes
+    // HPVM Attributes
   case Attribute::In:
     return 3ULL << 0;
   case Attribute::Out:
diff --git a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
index 55e7415efbea2b37d85f20b1d123ce9a80efe67e..fd671c397583fad6ec8a9998635705417f59eed1 100644
--- a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -773,7 +773,7 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
   case Attribute::SanitizeMemTag:
     return bitc::ATTR_KIND_SANITIZE_MEMTAG;
 
-  // VISC Attributes
+  // HPVM Attributes
   case Attribute::In:
     return bitc::ATTR_KIND_IN;
   case Attribute::Out:
diff --git a/hpvm/llvm_patches/lib/IR/Attributes.cpp b/hpvm/llvm_patches/lib/IR/Attributes.cpp
index 3cc95b3102fdf6c7062fffe1f9486cfa094bba9b..29c47a9e1107524278dcc57c188b320821ba7d86 100644
--- a/hpvm/llvm_patches/lib/IR/Attributes.cpp
+++ b/hpvm/llvm_patches/lib/IR/Attributes.cpp
@@ -404,7 +404,7 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
   if (hasAttribute(Attribute::ImmArg))
     return "immarg";
 
-  // VISC attributes for arguments
+  // HPVM attributes for arguments
   if (hasAttribute(Attribute::In))
     return "in";
   if (hasAttribute(Attribute::Out))
diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt b/hpvm/projects/hpvm-rt/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..be7f69c4bfa7623c093bd5e913af1de3dbcf951c
--- /dev/null
+++ b/hpvm/projects/hpvm-rt/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_definitions(-DNUM_CORES=8)
+
+SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang)
+SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++)
+
+add_llvm_library(hpvm-rt.ll hpvm-rt.cpp
+
+  DEPENDS
+  clang
+  llvm-dis
+  )
+
+
+target_compile_options(hpvm-rt.ll PUBLIC -flto )
+target_compile_options(hpvm-rt.ll PUBLIC -std=c++11)
+
+add_custom_target(hpvm-rt.cpp.o ALL
+  COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libhpvm-rt.ll.a
+  COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc
+  COMMAND  ${CMAKE_BINARY_DIR}/bin/llvm-dis  ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc)
+
+add_dependencies(hpvm-rt.cpp.o   hpvm-rt.ll)
diff --git a/hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt b/hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7069470a1a6f8b1a49eea2824f27204ebdf3fb26
--- /dev/null
+++ b/hpvm/projects/hpvm-rt/deviceStatusSwitchIntervals.txt
@@ -0,0 +1,2 @@
+10
+10 15 10 16 15 30 15 25 20 15
diff --git a/hpvm/projects/hpvm-rt/device_abstraction.h b/hpvm/projects/hpvm-rt/device_abstraction.h
new file mode 100644
index 0000000000000000000000000000000000000000..4948502ce8ae47cbb7e37c1372fcd81813486e15
--- /dev/null
+++ b/hpvm/projects/hpvm-rt/device_abstraction.h
@@ -0,0 +1,105 @@
+#ifndef __DEVICE_ABSTRACTION__
+#define __DEVICE_ABSTRACTION__
+
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <thread>
+#include <time.h>
+#include <vector>
+
+// Bounds used when random intervals are generated: each value is drawn from
+// [MIN_INTERVAL, MAX_INTERVAL) and at most NUM_INTERVALS values are produced.
+#define MIN_INTERVAL 2
+#define MAX_INTERVAL 8
+#define NUM_INTERVALS 10
+
+// Device status variable: true if the device is available for use
+volatile bool deviceStatus = true;
+// Intervals (in seconds) at which to change the device status
+std::vector<unsigned> Intervals;
+
+// Set to true when program execution ends and so we can end the device
+// simulation
+volatile bool executionEnd = false;
+
+// Fill Intervals from the interval file, or generate the file if it is empty.
+//
+// The file holds a count followed by that many whitespace-separated values.
+// When the count is missing or zero, between 1 and NUM_INTERVALS random
+// values are generated, stored in Intervals and written back in the same
+// format. If the file cannot be opened a message is printed and Intervals is
+// left untouched.
+void initializeDeviceStatusIntervals() {
+
+  unsigned sz = 0;
+  unsigned tmp = 0;
+
+  // NOTE(review): absolute, machine-specific path; the file must exist there.
+  const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/hpvm-rt/"
+                   "deviceStatusSwitchIntervals.txt";
+  std::ifstream infile;
+  infile.open(fn);
+  if (!infile.is_open()) {
+    std::cout << "Failed to open " << fn << " for reading\n";
+    return;
+  }
+  infile >> sz;
+
+  if (sz) {
+    // We have data. Read them into the vector
+    for (unsigned i = 0; i < sz; i++) {
+      // Stop at the first missing or malformed value instead of pushing a
+      // stale copy of the previous one.
+      if (!(infile >> tmp))
+        break;
+      Intervals.push_back(tmp);
+    }
+    infile.close();
+  } else {
+    // We have no data. Create random data and write them into the file
+    infile.close();
+    std::ofstream outfile;
+    outfile.open(fn);
+    if (!outfile.is_open()) {
+      std::cout << "Failed to open " << fn << " for writing\n";
+      return;
+    }
+    sz = 1 + rand() % NUM_INTERVALS;
+    // Separate every number with whitespace so the file can be read back by
+    // the branch above.
+    outfile << sz << "\n";
+    for (unsigned i = 0; i < sz; i++) {
+      Intervals.push_back(MIN_INTERVAL +
+                          rand() % (MAX_INTERVAL - MIN_INTERVAL));
+      outfile << Intervals[i] << " ";
+    }
+    outfile << "\n";
+    outfile.close();
+  }
+
+  return;
+}
+
+// Toggle deviceStatus each time the next interval elapses, cycling through
+// Intervals, until executionEnd becomes true.
+void updateDeviceStatus() {
+
+  // With no intervals there is nothing to index or take a modulus by.
+  if (Intervals.empty()) {
+    std::cout << "No device status intervals available\n";
+    return;
+  }
+
+  unsigned i = 0;
+  while (!executionEnd) {
+    std::this_thread::sleep_for(std::chrono::seconds(Intervals[i]));
+    deviceStatus = !deviceStatus;
+    std::cout << "Changed device status to " << deviceStatus << "\n";
+    i = (i + 1) % Intervals.size();
+  }
+}
+
+#endif // __DEVICE_ABSTRACTION__
diff --git a/hpvm/projects/visc-rt/visc-rt.cpp b/hpvm/projects/hpvm-rt/hpvm-rt.cpp
similarity index 82%
rename from hpvm/projects/visc-rt/visc-rt.cpp
rename to hpvm/projects/hpvm-rt/hpvm-rt.cpp
index df5b1b80f7ae71ca49f461a50f36f81064028ef9..cb3206ef500f7223a0463598f7b35d0b182f5f5f 100644
--- a/hpvm/projects/visc-rt/visc-rt.cpp
+++ b/hpvm/projects/hpvm-rt/hpvm-rt.cpp
@@ -13,7 +13,7 @@
 #if _POSIX_VERSION >= 200112L
 #include <sys/time.h>
 #endif
-#include "visc-rt.h"
+#include "hpvm-rt.h"
 
 #ifndef DEBUG_BUILD
 #define DEBUG(s)                                                               \
@@ -58,7 +58,7 @@ vector<DFGDepth> DStack;
 pthread_mutex_t ocl_mtx;
 
 #define NUM_TESTS 1
-visc_TimerSet kernel_timer;
+hpvm_TimerSet kernel_timer;
 
 static inline void checkErr(cl_int err, cl_int success, const char *name) {
   if (err != success) {
@@ -70,7 +70,7 @@ static inline void checkErr(cl_int err, cl_int success, const char *name) {
 
 /************************* Depth Stack Routines ***************************/
 
-void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
+void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
                                uint64_t limitY, uint64_t iY, uint64_t limitZ,
                                uint64_t iZ) {
   DEBUG(cout << "Pushing node information on stack:\n");
@@ -84,7 +84,7 @@ void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_x86_dstack_pop() {
+void llvm_hpvm_x86_dstack_pop() {
   DEBUG(cout << "Popping from depth stack\n");
   pthread_mutex_lock(&ocl_mtx);
   DStack.pop_back();
@@ -92,7 +92,7 @@ void llvm_visc_x86_dstack_pop() {
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
+uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim) {
   DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level
              << flush << "\n");
   pthread_mutex_lock(&ocl_mtx);
@@ -104,7 +104,7 @@ uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
   return result;
 }
 
-uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
+uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim) {
   DEBUG(cout << "Request instance id for dim " << dim << " of ancestor "
              << level << flush << "\n");
   pthread_mutex_lock(&ocl_mtx);
@@ -118,7 +118,7 @@ uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
 
 /********************** Memory Tracking Routines **************************/
 
-void llvm_visc_track_mem(void *ptr, size_t size) {
+void llvm_hpvm_track_mem(void *ptr, size_t size) {
   DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE != NULL) {
@@ -130,7 +130,7 @@ void llvm_visc_track_mem(void *ptr, size_t size) {
   DEBUG(MTracker.print());
 }
 
-void llvm_visc_untrack_mem(void *ptr) {
+void llvm_hpvm_untrack_mem(void *ptr) {
   DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE == NULL) {
@@ -145,7 +145,7 @@ void llvm_visc_untrack_mem(void *ptr) {
   DEBUG(MTracker.print());
 }
 
-static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
+static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size,
                                        DFNodeContext_OCL *Context, bool isInput,
                                        bool isOutput) {
   pthread_mutex_lock(&ocl_mtx);
@@ -183,7 +183,7 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
   else
     clFlags = CL_MEM_READ_ONLY;
 
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY);
   // pthread_mutex_lock(&ocl_mtx);
   cl_mem d_input =
       clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
@@ -199,7 +199,7 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
     checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device");
   }
 
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
   DEBUG(cout << " done\n");
   MTE->update(MemTrackerEntry::DEVICE, (void *)d_input, Context);
   DEBUG(cout << "Updated Table\n");
@@ -208,11 +208,11 @@ static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
   return d_input;
 }
 
-void *llvm_visc_x86_argument_ptr(void *ptr, size_t size) {
-  return llvm_visc_request_mem(ptr, size);
+void *llvm_hpvm_x86_argument_ptr(void *ptr, size_t size) { // thin wrapper
+  return llvm_hpvm_request_mem(ptr, size); // forwards ptr and size unchanged
 }
 
-void *llvm_visc_request_mem(void *ptr, size_t size) {
+void *llvm_hpvm_request_mem(void *ptr, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
@@ -233,13 +233,13 @@ void *llvm_visc_request_mem(void *ptr, size_t size) {
   DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush
              << "\n");
   DEBUG(cout << "\tCopying ...");
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY);
   // pthread_mutex_lock(&ocl_mtx);
   cl_int errcode = clEnqueueReadBuffer(
       ((DFNodeContext_OCL *)MTE->getContext())->clCommandQue,
       (cl_mem)MTE->getAddress(), CL_TRUE, 0, size, ptr, 0, NULL, NULL);
   // pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
   DEBUG(cout << " done\n");
   checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output");
   DEBUG(cout << "Free mem object on device\n");
@@ -253,25 +253,25 @@ void *llvm_visc_request_mem(void *ptr, size_t size) {
 
 /*************************** Timer Routines **********************************/
 
-static int is_async(enum visc_TimerID timer) {
-  return (timer == visc_TimerID_KERNEL) || (timer == visc_TimerID_COPY_ASYNC);
+static int is_async(enum hpvm_TimerID timer) { // 1 for KERNEL or COPY_ASYNC
+  return (timer == hpvm_TimerID_KERNEL) || (timer == hpvm_TimerID_COPY_ASYNC);
 }
 
-static int is_blocking(enum visc_TimerID timer) {
-  return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE);
+static int is_blocking(enum hpvm_TimerID timer) { // 1 for COPY or NONE, else 0
+  return (timer == hpvm_TimerID_COPY) || (timer == hpvm_TimerID_NONE);
 }
 
-#define INVALID_TIMERID visc_TimerID_LAST
+#define INVALID_TIMERID hpvm_TimerID_LAST // timerID compared against to detect a marker with no recorded event
 
-static int asyncs_outstanding(struct visc_TimerSet *timers) {
+static int asyncs_outstanding(struct hpvm_TimerSet *timers) { // nonzero iff the first marker exists and is not INVALID_TIMERID
   return (timers->async_markers != NULL) &&
          (timers->async_markers->timerID != INVALID_TIMERID);
 }
 
-static struct visc_async_time_marker_list *
-get_last_async(struct visc_TimerSet *timers) {
+static struct hpvm_async_time_marker_list *
+get_last_async(struct hpvm_TimerSet *timers) {
   /* Find the last event recorded thus far */
-  struct visc_async_time_marker_list *last_event = timers->async_markers;
+  struct hpvm_async_time_marker_list *last_event = timers->async_markers;
   if (last_event != NULL && last_event->timerID != INVALID_TIMERID) {
     while (last_event->next != NULL &&
            last_event->next->timerID != INVALID_TIMERID)
@@ -281,17 +281,17 @@ get_last_async(struct visc_TimerSet *timers) {
     return NULL;
 }
 
-static void insert_marker(struct visc_TimerSet *tset, enum visc_TimerID timer) {
+static void insert_marker(struct hpvm_TimerSet *tset, enum hpvm_TimerID timer) {
   cl_int ciErrNum = CL_SUCCESS;
-  struct visc_async_time_marker_list **new_event = &(tset->async_markers);
+  struct hpvm_async_time_marker_list **new_event = &(tset->async_markers);
 
   while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
     new_event = &((*new_event)->next);
   }
 
   if (*new_event == NULL) {
-    *new_event = (struct visc_async_time_marker_list *)malloc(
-        sizeof(struct visc_async_time_marker_list));
+    *new_event = (struct hpvm_async_time_marker_list *)malloc(
+        sizeof(struct hpvm_async_time_marker_list));
     (*new_event)->marker = calloc(1, sizeof(cl_event));
     /*
     // I don't think this is needed at all. I believe clEnqueueMarker 'creates'
@@ -322,18 +322,18 @@ Event Status!\n");
   }
 }
 
-static void insert_submarker(struct visc_TimerSet *tset, char *label,
-                             enum visc_TimerID timer) {
+static void insert_submarker(struct hpvm_TimerSet *tset, char *label,
+                             enum hpvm_TimerID timer) {
   cl_int ciErrNum = CL_SUCCESS;
-  struct visc_async_time_marker_list **new_event = &(tset->async_markers);
+  struct hpvm_async_time_marker_list **new_event = &(tset->async_markers);
 
   while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
     new_event = &((*new_event)->next);
   }
 
   if (*new_event == NULL) {
-    *new_event = (struct visc_async_time_marker_list *)malloc(
-        sizeof(struct visc_async_time_marker_list));
+    *new_event = (struct hpvm_async_time_marker_list *)malloc(
+        sizeof(struct hpvm_async_time_marker_list));
     (*new_event)->marker = calloc(1, sizeof(cl_event));
     /*
 #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
@@ -364,10 +364,10 @@ Event Status!\n");
 }
 
 /* Assumes that all recorded events have completed */
-static visc_Timestamp record_async_times(struct visc_TimerSet *tset) {
-  struct visc_async_time_marker_list *next_interval = NULL;
-  struct visc_async_time_marker_list *last_marker = get_last_async(tset);
-  visc_Timestamp total_async_time = 0;
+static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) {
+  struct hpvm_async_time_marker_list *next_interval = NULL;
+  struct hpvm_async_time_marker_list *last_marker = get_last_async(tset);
+  hpvm_Timestamp total_async_time = 0;
 
   for (next_interval = tset->async_markers; next_interval != last_marker;
        next_interval = next_interval->next) {
@@ -389,11 +389,11 @@ static visc_Timestamp record_async_times(struct visc_TimerSet *tset) {
               ciErrNum);
     }
 
-    visc_Timestamp interval =
-        (visc_Timestamp)(((double)(command_end - command_start)));
+    hpvm_Timestamp interval =
+        (hpvm_Timestamp)(((double)(command_end - command_start)));
     tset->timers[next_interval->timerID].elapsed += interval;
     if (next_interval->label != NULL) {
-      struct visc_SubTimer *subtimer =
+      struct hpvm_SubTimer *subtimer =
           tset->sub_timer_list[next_interval->timerID]->subtimer_list;
       while (subtimer != NULL) {
         if (strcmp(subtimer->label, next_interval->label) == 0) {
@@ -413,8 +413,8 @@ static visc_Timestamp record_async_times(struct visc_TimerSet *tset) {
   return total_async_time;
 }
 
-static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start,
-                            visc_Timestamp end) {
+static void accumulate_time(hpvm_Timestamp *accum, hpvm_Timestamp start,
+                            hpvm_Timestamp end) {
 #if _POSIX_VERSION >= 200112L
   *accum += end - start;
 #else
@@ -423,33 +423,33 @@ static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start,
 }
 
 #if _POSIX_VERSION >= 200112L
-static visc_Timestamp get_time() {
+static hpvm_Timestamp get_time() { // current CLOCK_MONOTONIC reading as one integer timestamp
   struct timespec tv;
   clock_gettime(CLOCK_MONOTONIC, &tv);
-  return (visc_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec);
+  return (hpvm_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec); // presumably nanoseconds, assuming BILLION is 1e9 -- confirm
 }
 #else
 #error "no supported time libraries are available on this platform"
 #endif
 
-void visc_ResetTimer(struct visc_Timer *timer) {
-  timer->state = visc_Timer_STOPPED;
+void hpvm_ResetTimer(struct hpvm_Timer *timer) { // marks the timer stopped and zeroes its elapsed time
+  timer->state = hpvm_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   timer->elapsed = 0;
 #else
-#error "visc_ResetTimer: not implemented for this system"
+#error "hpvm_ResetTimer: not implemented for this system"
 #endif
 }
 
-void visc_StartTimer(struct visc_Timer *timer) {
-  if (timer->state != visc_Timer_STOPPED) {
+void hpvm_StartTimer(struct hpvm_Timer *timer) {
+  if (timer->state != hpvm_Timer_STOPPED) {
     // FIXME: Removing warning statement to avoid printing this error
     // fputs("Ignoring attempt to start a running timer\n", stderr);
     return;
   }
 
-  timer->state = visc_Timer_RUNNING;
+  timer->state = hpvm_Timer_RUNNING;
 
 #if _POSIX_VERSION >= 200112L
   {
@@ -458,19 +458,19 @@ void visc_StartTimer(struct visc_Timer *timer) {
     timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-#error "visc_StartTimer: not implemented for this system"
+#error "hpvm_StartTimer: not implemented for this system"
 #endif
 }
 
-void visc_StartTimerAndSubTimer(struct visc_Timer *timer,
-                                struct visc_Timer *subtimer) {
+void hpvm_StartTimerAndSubTimer(struct hpvm_Timer *timer,
+                                struct hpvm_Timer *subtimer) {
 
   unsigned int numNotStopped = 0x3; // 11
-  if (timer->state != visc_Timer_STOPPED) {
+  if (timer->state != hpvm_Timer_STOPPED) {
     fputs("Warning: Timer was not stopped\n", stderr);
     numNotStopped &= 0x1; // Zero out 2^1
   }
-  if (subtimer->state != visc_Timer_STOPPED) {
+  if (subtimer->state != hpvm_Timer_STOPPED) {
     fputs("Warning: Subtimer was not stopped\n", stderr);
     numNotStopped &= 0x2; // Zero out 2^0
   }
@@ -479,8 +479,8 @@ void visc_StartTimerAndSubTimer(struct visc_Timer *timer,
     return;
   }
 
-  timer->state = visc_Timer_RUNNING;
-  subtimer->state = visc_Timer_RUNNING;
+  timer->state = hpvm_Timer_RUNNING;
+  subtimer->state = hpvm_Timer_RUNNING;
 
 #if _POSIX_VERSION >= 200112L
   {
@@ -496,19 +496,19 @@ void visc_StartTimerAndSubTimer(struct visc_Timer *timer,
     }
   }
 #else
-#error "visc_StartTimer: not implemented for this system"
+#error "hpvm_StartTimer: not implemented for this system"
 #endif
 }
 
-void visc_StopTimer(struct visc_Timer *timer) {
-  visc_Timestamp fini;
+void hpvm_StopTimer(struct hpvm_Timer *timer) {
+  hpvm_Timestamp fini;
 
-  if (timer->state != visc_Timer_RUNNING) {
+  if (timer->state != hpvm_Timer_RUNNING) {
     // fputs("Ignoring attempt to stop a stopped timer\n", stderr);
     return;
   }
 
-  timer->state = visc_Timer_STOPPED;
+  timer->state = hpvm_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   {
@@ -517,24 +517,24 @@ void visc_StopTimer(struct visc_Timer *timer) {
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-#error "visc_StopTimer: not implemented for this system"
+#error "hpvm_StopTimer: not implemented for this system"
 #endif
 
   accumulate_time(&timer->elapsed, timer->init, fini);
   timer->init = fini;
 }
 
-void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
-                               struct visc_Timer *subtimer) {
+void hpvm_StopTimerAndSubTimer(struct hpvm_Timer *timer,
+                               struct hpvm_Timer *subtimer) {
 
-  visc_Timestamp fini;
+  hpvm_Timestamp fini;
 
   unsigned int numNotRunning = 0x3; // 11
-  if (timer->state != visc_Timer_RUNNING) {
+  if (timer->state != hpvm_Timer_RUNNING) {
     fputs("Warning: Timer was not running\n", stderr);
     numNotRunning &= 0x1; // Zero out 2^1
   }
-  if (subtimer->state != visc_Timer_RUNNING) {
+  if (subtimer->state != hpvm_Timer_RUNNING) {
     fputs("Warning: Subtimer was not running\n", stderr);
     numNotRunning &= 0x2; // Zero out 2^0
   }
@@ -543,8 +543,8 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
     return;
   }
 
-  timer->state = visc_Timer_STOPPED;
-  subtimer->state = visc_Timer_STOPPED;
+  timer->state = hpvm_Timer_STOPPED;
+  subtimer->state = hpvm_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   {
@@ -553,7 +553,7 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-#error "visc_StopTimer: not implemented for this system"
+#error "hpvm_StopTimer: not implemented for this system"
 #endif
 
   if (numNotRunning & 0x2) {
@@ -568,59 +568,59 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
 }
 
 /* Get the elapsed time in seconds. */
-double visc_GetElapsedTime(struct visc_Timer *timer) {
+double hpvm_GetElapsedTime(struct hpvm_Timer *timer) { // returns timer->elapsed divided by 1e9
   double ret;
 
-  if (timer->state != visc_Timer_STOPPED) {
+  if (timer->state != hpvm_Timer_STOPPED) { // only warns on stderr; the value is still returned
     fputs("Elapsed time from a running timer is inaccurate\n", stderr);
   }
 
 #if _POSIX_VERSION >= 200112L
   ret = timer->elapsed / 1e9;
 #else
-#error "visc_GetElapsedTime: not implemented for this system"
+#error "hpvm_GetElapsedTime: not implemented for this system"
 #endif
   return ret;
 }
 
-void visc_InitializeTimerSet(struct visc_TimerSet *timers) {
+void hpvm_InitializeTimerSet(struct hpvm_TimerSet *timers) { // records wall-clock start and resets every timer slot
   int n;
 
   timers->wall_begin = get_time();
-  timers->current = visc_TimerID_NONE;
+  timers->current = hpvm_TimerID_NONE; // no timer is selected initially
 
   timers->async_markers = NULL;
 
-  for (n = 0; n < visc_TimerID_LAST; n++) {
-    visc_ResetTimer(&timers->timers[n]);
+  for (n = 0; n < hpvm_TimerID_LAST; n++) { // one timer and one sub-timer list per timer ID
+    hpvm_ResetTimer(&timers->timers[n]);
     timers->sub_timer_list[n] = NULL;
   }
 }
 
-void visc_AddSubTimer(struct visc_TimerSet *timers, char *label,
-                      enum visc_TimerID visc_Category) {
+void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label,
+                      enum hpvm_TimerID hpvm_Category) {
 
-  struct visc_SubTimer *subtimer =
-      (struct visc_SubTimer *)malloc(sizeof(struct visc_SubTimer));
+  struct hpvm_SubTimer *subtimer =
+      (struct hpvm_SubTimer *)malloc(sizeof(struct hpvm_SubTimer));
 
   int len = strlen(label);
 
   subtimer->label = (char *)malloc(sizeof(char) * (len + 1));
   sprintf(subtimer->label, "%s", label);
 
-  visc_ResetTimer(&subtimer->timer);
+  hpvm_ResetTimer(&subtimer->timer);
   subtimer->next = NULL;
 
-  struct visc_SubTimerList *subtimerlist =
-      timers->sub_timer_list[visc_Category];
+  struct hpvm_SubTimerList *subtimerlist =
+      timers->sub_timer_list[hpvm_Category];
   if (subtimerlist == NULL) {
     subtimerlist =
-        (struct visc_SubTimerList *)calloc(1, sizeof(struct visc_SubTimerList));
+        (struct hpvm_SubTimerList *)calloc(1, sizeof(struct hpvm_SubTimerList));
     subtimerlist->subtimer_list = subtimer;
-    timers->sub_timer_list[visc_Category] = subtimerlist;
+    timers->sub_timer_list[hpvm_Category] = subtimerlist;
   } else {
     // Append to list
-    struct visc_SubTimer *element = subtimerlist->subtimer_list;
+    struct hpvm_SubTimer *element = subtimerlist->subtimer_list;
     while (element->next != NULL) {
       element = element->next;
     }
@@ -628,37 +628,37 @@ void visc_AddSubTimer(struct visc_TimerSet *timers, char *label,
   }
 }
 
-void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
+void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) {
   // cerr << "Switch to timer: " << timer << flush << "\n";
   /* Stop the currently running timer */
-  if (timers->current != visc_TimerID_NONE) {
-    struct visc_SubTimerList *subtimerlist =
+  if (timers->current != hpvm_TimerID_NONE) {
+    struct hpvm_SubTimerList *subtimerlist =
         timers->sub_timer_list[timers->current];
-    struct visc_SubTimer *currSubTimer =
+    struct hpvm_SubTimer *currSubTimer =
         (subtimerlist != NULL) ? subtimerlist->current : NULL;
 
     if (!is_async(timers->current)) {
       if (timers->current != timer) {
         if (currSubTimer != NULL) {
-          visc_StopTimerAndSubTimer(&timers->timers[timers->current],
+          hpvm_StopTimerAndSubTimer(&timers->timers[timers->current],
                                     &currSubTimer->timer);
         } else {
-          visc_StopTimer(&timers->timers[timers->current]);
+          hpvm_StopTimer(&timers->timers[timers->current]);
         }
       } else {
         if (currSubTimer != NULL) {
-          visc_StopTimer(&currSubTimer->timer);
+          hpvm_StopTimer(&currSubTimer->timer);
         }
       }
     } else {
       insert_marker(timers, timer);
       if (!is_async(timer)) { // if switching to async too, keep driver going
-        visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
+        hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]);
       }
     }
   }
 
-  visc_Timestamp currentTime = get_time();
+  hpvm_Timestamp currentTime = get_time();
 
   /* The only cases we check for asynchronous task completion is
    * when an overlapping CPU operation completes, or the next
@@ -666,7 +666,7 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
   if (asyncs_outstanding(timers) &&
       (!is_async(timers->current) || is_blocking(timer))) {
 
-    struct visc_async_time_marker_list *last_event = get_last_async(timers);
+    struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
     /* CL_COMPLETE if completed */
 
     cl_int ciErrNum = CL_SUCCESS;
@@ -686,7 +686,7 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
 
       // timer to switch to is COPY or NONE
       if (async_done != CL_COMPLETE) {
-        accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
+        accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed),
                         timers->async_begin, currentTime);
       }
 
@@ -696,14 +696,14 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
         fprintf(stderr, "Error Waiting for Events!\n");
       }
 
-      visc_Timestamp total_async_time = record_async_times(timers);
+      hpvm_Timestamp total_async_time = record_async_times(timers);
 
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
       if (async_done == CL_COMPLETE) {
         // fprintf(stderr, "Async_done: total_async_type = %lld\n",
         // total_async_time);
-        timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
+        timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time;
       }
 
     } else
@@ -713,15 +713,15 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
         if (async_done == CL_COMPLETE) {
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      timers->timers[visc_TimerID_OVERLAP].elapsed +=
+      timers->timers[hpvm_TimerID_OVERLAP].elapsed +=
           record_async_times(timers);
     }
   }
 
   /* Start the new timer */
-  if (timer != visc_TimerID_NONE) {
+  if (timer != hpvm_TimerID_NONE) {
     if (!is_async(timer)) {
-      visc_StartTimer(&timers->timers[timer]);
+      hpvm_StartTimer(&timers->timers[timer]);
     } else {
       // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
       if (!asyncs_outstanding(timers)) {
@@ -735,48 +735,48 @@ void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
          * so we can rename that marker as the beginning of this async
          * operation */
 
-        struct visc_async_time_marker_list *last_event = get_last_async(timers);
+        struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
         last_event->label = NULL;
         last_event->timerID = timer;
       }
       if (!is_async(timers->current)) {
-        visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
+        hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]);
       }
     }
   }
   timers->current = timer;
 }
 
-void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
-                           enum visc_TimerID category) {
-  struct visc_SubTimerList *subtimerlist =
+void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label,
+                           enum hpvm_TimerID category) {
+  struct hpvm_SubTimerList *subtimerlist =
       timers->sub_timer_list[timers->current];
-  struct visc_SubTimer *curr =
+  struct hpvm_SubTimer *curr =
       (subtimerlist != NULL) ? subtimerlist->current : NULL;
 
-  if (timers->current != visc_TimerID_NONE) {
+  if (timers->current != hpvm_TimerID_NONE) {
     if (!is_async(timers->current)) {
       if (timers->current != category) {
         if (curr != NULL) {
-          visc_StopTimerAndSubTimer(&timers->timers[timers->current],
+          hpvm_StopTimerAndSubTimer(&timers->timers[timers->current],
                                     &curr->timer);
         } else {
-          visc_StopTimer(&timers->timers[timers->current]);
+          hpvm_StopTimer(&timers->timers[timers->current]);
         }
       } else {
         if (curr != NULL) {
-          visc_StopTimer(&curr->timer);
+          hpvm_StopTimer(&curr->timer);
         }
       }
     } else {
       insert_submarker(timers, label, category);
       if (!is_async(category)) { // if switching to async too, keep driver going
-        visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
+        hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]);
       }
     }
   }
 
-  visc_Timestamp currentTime = get_time();
+  hpvm_Timestamp currentTime = get_time();
 
   /* The only cases we check for asynchronous task completion is
    * when an overlapping CPU operation completes, or the next
@@ -784,7 +784,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
   if (asyncs_outstanding(timers) &&
       (!is_async(timers->current) || is_blocking(category))) {
 
-    struct visc_async_time_marker_list *last_event = get_last_async(timers);
+    struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
     /* CL_COMPLETE if completed */
 
     cl_int ciErrNum = CL_SUCCESS;
@@ -808,7 +808,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
       // because everything is being stopped to wait for synchronization it
       // seems that the extra sync wall time isn't being recorded anywhere
       if (async_done != CL_COMPLETE)
-        accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
+        accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed),
                         timers->async_begin, currentTime);
 
       /* Wait on async operation completion */
@@ -816,7 +816,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
       if (ciErrNum != CL_SUCCESS) {
         fprintf(stderr, "Error Waiting for Events!\n");
       }
-      visc_Timestamp total_async_time = record_async_times(timers);
+      hpvm_Timestamp total_async_time = record_async_times(timers);
 
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
@@ -824,7 +824,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
       // into OVERLAP the immediately preceding EventSynchronize theoretically
       // didn't have any effect since it was already completed.
       if (async_done == CL_COMPLETE /*cudaSuccess*/)
-        timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
+        timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time;
 
     } else
         /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
@@ -833,14 +833,14 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
         if (async_done == CL_COMPLETE /*cudaSuccess*/) {
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      timers->timers[visc_TimerID_OVERLAP].elapsed +=
+      timers->timers[hpvm_TimerID_OVERLAP].elapsed +=
           record_async_times(timers);
     }
     // else, this isn't blocking, so just check the next time around
   }
 
   subtimerlist = timers->sub_timer_list[category];
-  struct visc_SubTimer *subtimer = NULL;
+  struct hpvm_SubTimer *subtimer = NULL;
 
   if (label != NULL) {
     subtimer = subtimerlist->subtimer_list;
@@ -854,18 +854,18 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
   }
 
   /* Start the new timer */
-  if (category != visc_TimerID_NONE) {
+  if (category != hpvm_TimerID_NONE) {
     if (!is_async(category)) {
       if (subtimerlist != NULL) {
         subtimerlist->current = subtimer;
       }
 
       if (category != timers->current && subtimer != NULL) {
-        visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
+        hpvm_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
       } else if (subtimer != NULL) {
-        visc_StartTimer(&subtimer->timer);
+        hpvm_StartTimer(&subtimer->timer);
       } else {
-        visc_StartTimer(&timers->timers[category]);
+        hpvm_StartTimer(&timers->timers[category]);
       }
     } else {
       if (subtimerlist != NULL) {
@@ -883,7 +883,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
          * so we can rename that marker as the beginning of this async
          * operation */
 
-        struct visc_async_time_marker_list *last_event = get_last_async(timers);
+        struct hpvm_async_time_marker_list *last_event = get_last_async(timers);
         last_event->timerID = category;
         last_event->label = label;
       } // else, marker for switchToThis was already inserted
@@ -891,7 +891,7 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
       // toSwitchto is already asynchronous, but if current/prev state is async
       // too, then DRIVER is already running
       if (!is_async(timers->current)) {
-        visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
+        hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]);
       }
     }
   }
@@ -899,11 +899,11 @@ void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
   timers->current = category;
 }
 
-void visc_PrintTimerSet(struct visc_TimerSet *timers) {
-  visc_Timestamp wall_end = get_time();
+void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) {
+  hpvm_Timestamp wall_end = get_time();
 
-  struct visc_Timer *t = timers->timers;
-  struct visc_SubTimer *sub = NULL;
+  struct hpvm_Timer *t = timers->timers;
+  struct hpvm_SubTimer *sub = NULL;
 
   int maxSubLength;
 
@@ -920,13 +920,13 @@ void visc_PrintTimerSet(struct visc_TimerSet *timers) {
   const int maxCategoryLength = 20;
 
   int i;
-  for (i = 1; i < visc_TimerID_LAST;
+  for (i = 1; i < hpvm_TimerID_LAST;
        ++i) { // exclude NONE and OVRELAP from this format
-    if (visc_GetElapsedTime(&t[i]) != 0 || true) {
+    if (hpvm_GetElapsedTime(&t[i]) != 0 || true) {
 
       // Print Category Timer
       printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1],
-             visc_GetElapsedTime(&t[i]));
+             hpvm_GetElapsedTime(&t[i]));
 
       if (timers->sub_timer_list[i] != NULL) {
         sub = timers->sub_timer_list[i]->subtimer_list;
@@ -949,24 +949,24 @@ void visc_PrintTimerSet(struct visc_TimerSet *timers) {
         // Print SubTimers
         while (sub != NULL) {
           printf(" -%-*s: %.9f\n", maxSubLength, sub->label,
-                 visc_GetElapsedTime(&sub->timer));
+                 hpvm_GetElapsedTime(&sub->timer));
           sub = sub->next;
         }
       }
     }
   }
 
-  if (visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0)
+  if (hpvm_GetElapsedTime(&t[hpvm_TimerID_OVERLAP]) != 0)
     printf("CPU/Kernel Overlap: %.9f\n",
-           visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]));
+           hpvm_GetElapsedTime(&t[hpvm_TimerID_OVERLAP]));
 
   float walltime = (wall_end - timers->wall_begin) / 1e9;
   printf("Timer Wall Time: %.9f\n", walltime);
 }
 
-void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
+void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) {
   /* clean up all of the async event markers */
-  struct visc_async_time_marker_list *event = timers->async_markers;
+  struct hpvm_async_time_marker_list *event = timers->async_markers;
   while (event != NULL) {
 
     cl_int ciErrNum = CL_SUCCESS;
@@ -981,7 +981,7 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
     }
 
     free((event)->marker);
-    struct visc_async_time_marker_list *next = ((event)->next);
+    struct hpvm_async_time_marker_list *next = ((event)->next);
 
     free(event);
 
@@ -990,10 +990,10 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
   }
 
   int i = 0;
-  for (i = 0; i < visc_TimerID_LAST; ++i) {
+  for (i = 0; i < hpvm_TimerID_LAST; ++i) {
     if (timers->sub_timer_list[i] != NULL) {
-      struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
-      struct visc_SubTimer *prev = NULL;
+      struct hpvm_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
+      struct hpvm_SubTimer *prev = NULL;
       while (subtimer != NULL) {
         free(subtimer->label);
         prev = subtimer;
@@ -1009,7 +1009,7 @@ void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
 #define BUFFER_SIZE 1
 
 // Launch API for a streaming dataflow graph
-void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) {
+void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) {
   DFNodeContext_X86 *Context =
       (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86));
 
@@ -1031,7 +1031,7 @@ void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) {
 }
 
 // Push API for a streaming dataflow graph
-void llvm_visc_streamPush(void *graphID, void *args) {
+void llvm_hpvm_streamPush(void *graphID, void *args) {
   DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args
              << flush << "\n");
   DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
@@ -1044,17 +1044,17 @@ void llvm_visc_streamPush(void *graphID, void *args) {
       if (Ctx->BindInSourcePort->at(j) == i) {
         // Push to all bind buffers connected to parent node at this port
         // DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
-        llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element);
+        llvm_hpvm_bufferPush(Ctx->BindInputBuffers->at(j), element);
       }
     }
   }
   // Push 0 in isLastInput buffers of all child nodes
   for (CircularBuffer<uint64_t> *buffer : *(Ctx->isLastInputBuffers))
-    llvm_visc_bufferPush(buffer, 0);
+    llvm_hpvm_bufferPush(buffer, 0);
 }
 
 // Pop API for a streaming dataflow graph
-void *llvm_visc_streamPop(void *graphID) {
+void *llvm_hpvm_streamPop(void *graphID) {
   DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n");
   DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   unsigned totalBytes = 0;
@@ -1063,7 +1063,7 @@ void *llvm_visc_streamPop(void *graphID) {
   void *output = malloc(totalBytes);
   unsigned offset = 0;
   for (unsigned i = 0; i < Ctx->BindOutputBuffers->size(); i++) {
-    uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i));
+    uint64_t element = llvm_hpvm_bufferPop(Ctx->BindOutputBuffers->at(i));
     // DEBUG(cout << "\tPopped Value " << element << " from buffer\n");
     memcpy((char *)output + offset, &element, Ctx->BindOutSizes->at(i));
     offset += Ctx->BindOutSizes->at(i);
@@ -1072,24 +1072,24 @@ void *llvm_visc_streamPop(void *graphID) {
 }
 
 // Wait API for a streaming dataflow graph
-void llvm_visc_streamWait(void *graphID) {
+void llvm_hpvm_streamWait(void *graphID) {
   DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n");
   DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   // Push garbage to all other input buffers
   for (unsigned i = 0; i < Ctx->BindInputBuffers->size(); i++) {
     uint64_t element = 0;
     // DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
-    llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element);
+    llvm_hpvm_bufferPush(Ctx->BindInputBuffers->at(i), element);
   }
   // Push 1 in isLastInput buffers of all child nodes
   for (unsigned i = 0; i < Ctx->isLastInputBuffers->size(); i++)
-    llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1);
+    llvm_hpvm_bufferPush(Ctx->isLastInputBuffers->at(i), 1);
 
-  llvm_visc_freeThreads(graphID);
+  llvm_hpvm_freeThreads(graphID);
 }
 
 // Create a buffer and return the bufferID
-void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size,
+void *llvm_hpvm_createBindInBuffer(void *graphID, uint64_t size,
                                    unsigned inArgPort) {
   DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID
              << ", Size: " << size << flush << "\n");
@@ -1104,7 +1104,7 @@ void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size,
   return bufferID;
 }
 
-void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) {
+void *llvm_hpvm_createBindOutBuffer(void *graphID, uint64_t size) {
   DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID
              << ", Size: " << size << flush << "\n");
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
@@ -1116,7 +1116,7 @@ void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) {
   Context->BindOutSizes->push_back(size);
   return bufferID;
 }
-void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) {
+void *llvm_hpvm_createEdgeBuffer(void *graphID, uint64_t size) {
   DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size
              << flush << "\n");
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
@@ -1129,7 +1129,7 @@ void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) {
   return bufferID;
 }
 
-void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) {
+void *llvm_hpvm_createLastInputBuffer(void *graphID, uint64_t size) {
   DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID
              << ", Size: " << size << flush << "\n");
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
@@ -1142,7 +1142,7 @@ void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) {
 }
 
 // Free buffers
-void llvm_visc_freeBuffers(void *graphID) {
+void llvm_hpvm_freeBuffers(void *graphID) {
   DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n");
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
   for (CircularBuffer<uint64_t> *bufferID : *(Context->BindInputBuffers))
@@ -1156,19 +1156,19 @@ void llvm_visc_freeBuffers(void *graphID) {
 }
 
 // Pop an element from the buffer
-uint64_t llvm_visc_bufferPop(void *bufferID) {
+uint64_t llvm_hpvm_bufferPop(void *bufferID) {
   CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID;
   return buffer->pop();
 }
 
 // Push an element into the buffer
-void llvm_visc_bufferPush(void *bufferID, uint64_t element) {
+void llvm_hpvm_bufferPush(void *bufferID, uint64_t element) {
   CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID;
   buffer->push(element);
 }
 
 // Create a thread
-void llvm_visc_createThread(void *graphID, void *(*Func)(void *),
+void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *),
                             void *arguments) {
   DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func
              << ", Args: " << arguments << flush << "\n");
@@ -1182,7 +1182,7 @@ void llvm_visc_createThread(void *graphID, void *(*Func)(void *),
 }
 
 // Wait for thread to finish
-void llvm_visc_freeThreads(void *graphID) {
+void llvm_hpvm_freeThreads(void *graphID) {
   DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n");
   DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   for (pthread_t thread : *(Ctx->threads))
@@ -1191,7 +1191,7 @@ void llvm_visc_freeThreads(void *graphID) {
 
 /************************ OPENCL & PTHREAD API ********************************/
 
-void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) {
+void *llvm_hpvm_x86_launch(void *(*rootFunc)(void *), void *arguments) {
   DFNodeContext_X86 *Context =
       (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86));
   // int err;
@@ -1202,7 +1202,7 @@ void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) {
   return Context;
 }
 
-void llvm_visc_x86_wait(void *graphID) {
+void llvm_hpvm_x86_wait(void *graphID) {
   DEBUG(cout << "Waiting for pthread to finish ...\n");
   // DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
   // pthread_join(Context->threadID, NULL);
@@ -1210,9 +1210,9 @@ void llvm_visc_x86_wait(void *graphID) {
   DEBUG(cout << "\t... pthread Done!\n");
 }
 
-void *llvm_visc_ocl_initContext(enum visc::Target T) {
+void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) {
   pthread_mutex_lock(&ocl_mtx);
-  DEBUG(std::string Target = T == visc::GPU_TARGET ? "GPU" : "SPIR");
+  DEBUG(std::string Target = T == hpvm::GPU_TARGET ? "GPU" : "SPIR");
   DEBUG(cout << "Initializing Context for " << Target << " device\n");
   cl_uint numPlatforms;
   cl_int errcode;
@@ -1249,10 +1249,10 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) {
   // assert(numPlatforms >= 2 && "Expecting two OpenCL platforms");
   // Choose second one which is X86 AVX
   cl_context_properties properties[] = {
-      CL_CONTEXT_PLATFORM, (long)platforms[T == visc::GPU_TARGET ? 0 : 1], 0};
+      CL_CONTEXT_PLATFORM, (long)platforms[T == hpvm::GPU_TARGET ? 0 : 1], 0};
   globalOCLContext = clCreateContextFromType(
       properties,
-      T == visc::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL,
+      T == hpvm::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL,
       NULL, &errcode);
   // get the list of OCL devices associated with context
   size_t dataBytes;
@@ -1264,7 +1264,7 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) {
   errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes,
                               clDevices, NULL);
   checkErr(errcode, CL_SUCCESS, "Failure to get context info");
-  if (false && T == visc::SPIR_TARGET) {
+  if (false && T == hpvm::SPIR_TARGET) {
     cl_device_partition_property props[4];
     props[0] = CL_DEVICE_PARTITION_BY_COUNTS;
     props[1] = NUM_CORES;
@@ -1290,13 +1290,13 @@ void *llvm_visc_ocl_initContext(enum visc::Target T) {
   checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
 
   DEBUG(cout << "Initialize Kernel Timer\n");
-  visc_InitializeTimerSet(&kernel_timer);
+  hpvm_InitializeTimerSet(&kernel_timer);
 
   pthread_mutex_unlock(&ocl_mtx);
   return globalOCLContext;
 }
 
-void llvm_visc_ocl_clearContext(void *graphID) {
+void llvm_hpvm_ocl_clearContext(void *graphID) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Clear Context\n");
   DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
@@ -1309,12 +1309,12 @@ void llvm_visc_ocl_clearContext(void *graphID) {
   // DEBUG(cout << "Released context at: " << globalOCLContext);
   free(Context);
   DEBUG(cout << "Done with OCL kernel\n");
-  cout << "Printing VISC Timer: KernelTimer\n";
-  visc_PrintTimerSet(&kernel_timer);
+  cout << "Printing HPVM Timer: KernelTimer\n";
+  hpvm_PrintTimerSet(&kernel_timer);
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
+void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Shared Memory Input:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
@@ -1329,7 +1329,7 @@ void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_ocl_argument_scalar(void *graphID, void *input, int arg_index,
+void llvm_hpvm_ocl_argument_scalar(void *graphID, void *input, int arg_index,
                                    size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Scalar Input:");
@@ -1345,7 +1345,7 @@ void llvm_visc_ocl_argument_scalar(void *graphID, void *input, int arg_index,
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index,
+void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index,
                                  size_t size, bool isInput, bool isOutput) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Pointer Input:");
@@ -1359,7 +1359,7 @@ void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index,
 
   pthread_mutex_unlock(&ocl_mtx);
   // Check with runtime the location of this memory
-  cl_mem d_input = (cl_mem)llvm_visc_ocl_request_mem(input, size, Context,
+  cl_mem d_input = (cl_mem)llvm_hpvm_ocl_request_mem(input, size, Context,
                                                      isInput, isOutput);
 
   pthread_mutex_lock(&ocl_mtx);
@@ -1374,7 +1374,7 @@ void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index,
   return d_input;
 }
 
-void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
+void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set device memory for Output Struct:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
@@ -1396,13 +1396,13 @@ void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
   return d_output;
 }
 
-void llvm_visc_ocl_free(void *ptr) {
+void llvm_hpvm_ocl_free(void *ptr) {
   // DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n");
   // cl_mem d_ptr = (cl_mem) ptr;
   // clReleaseMemObject(d_ptr);
 }
 
-void *llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output,
+void *llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output,
                               size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Get Output:\n");
@@ -1421,7 +1421,7 @@ void *llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output,
   return h_output;
 }
 
-void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim,
+void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim,
                                 const size_t *localWorkSize,
                                 const size_t *globalWorkSize) {
   pthread_mutex_lock(&ocl_mtx);
@@ -1467,7 +1467,7 @@ void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim,
   // pthread_mutex_lock(&ocl_mtx);
   clFinish(Context->clCommandQue);
   // pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COMPUTATION);
   // for(int i=0 ;i < NUM_TESTS; i++) {
   // cout << "Iteration = " << i << flush << "\n";
   // pthread_mutex_lock(&ocl_mtx);
@@ -1480,7 +1480,7 @@ void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim,
   // pthread_mutex_lock(&ocl_mtx);
   clFinish(Context->clCommandQue);
   // pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE);
 
   pthread_mutex_unlock(&ocl_mtx);
   return event;
@@ -1529,7 +1529,7 @@ static char *LoadProgSource(const char *Filename, size_t *szFinalLength) {
   return cSourceString;
 }
 
-void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) {
+void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Launch OCL Kernel\n");
   // Initialize OpenCL
@@ -1599,7 +1599,7 @@ void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) {
   return Context;
 }
 
-void llvm_visc_ocl_wait(void *graphID) {
+void llvm_hpvm_ocl_wait(void *graphID) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Wait\n");
   DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
@@ -1609,27 +1609,27 @@ void llvm_visc_ocl_wait(void *graphID) {
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID timer) {
+void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID timer) {
   // cout << "Switching to timer " << timer << flush << "\n";
   pthread_mutex_lock(&ocl_mtx);
-  // visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer);
+  // hpvm_SwitchToTimer((hpvm_TimerSet*)(*timerSet), timer);
   pthread_mutex_unlock(&ocl_mtx);
 }
-void llvm_visc_printTimerSet(void **timerSet, char *timerName) {
+void llvm_hpvm_printTimerSet(void **timerSet, char *timerName) {
   pthread_mutex_lock(&ocl_mtx);
-  cout << "Printing VISC Timer: ";
+  cout << "Printing HPVM Timer: ";
   if (timerName != NULL)
     cout << timerName << flush << "\n";
   else
     cout << "Anonymous\n";
-  visc_PrintTimerSet((visc_TimerSet *)(*timerSet));
+  hpvm_PrintTimerSet((hpvm_TimerSet *)(*timerSet));
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void *llvm_visc_initializeTimerSet() {
+void *llvm_hpvm_initializeTimerSet() {
   pthread_mutex_lock(&ocl_mtx);
-  visc_TimerSet *TS = (visc_TimerSet *)malloc(sizeof(visc_TimerSet));
-  visc_InitializeTimerSet(TS);
+  hpvm_TimerSet *TS = (hpvm_TimerSet *)malloc(sizeof(hpvm_TimerSet));
+  hpvm_InitializeTimerSet(TS);
   pthread_mutex_unlock(&ocl_mtx);
   return TS;
 }
diff --git a/hpvm/projects/visc-rt/visc-rt.h b/hpvm/projects/hpvm-rt/hpvm-rt.h
similarity index 74%
rename from hpvm/projects/visc-rt/visc-rt.h
rename to hpvm/projects/hpvm-rt/hpvm-rt.h
index d9d946f1da14245f8cde426e7b5ea92f791537f5..519b467c9047fbbdeea3a4610bedda3a77c36fe2 100644
--- a/hpvm/projects/visc-rt/visc-rt.h
+++ b/hpvm/projects/hpvm-rt/hpvm-rt.h
@@ -2,8 +2,8 @@
  *
  * (c) 2010 The Board of Trustees of the University of Illinois.
  */
-#ifndef VISC_RT_HEADER
-#define VISC_RT_HEADER
+#ifndef HPVM_RT_HEADER
+#define HPVM_RT_HEADER
 
 #include <ctime>
 #include <iostream>
@@ -13,8 +13,8 @@
 #include <vector>
 //#include <condition_variable>
 
-#include "../../include/SupportVISC/VISCHint.h"
-#include "../../include/SupportVISC/VISCTimer.h"
+#include "../../include/SupportHPVM/HPVMHint.h"
+#include "../../include/SupportHPVM/HPVMTimer.h"
 
 #ifndef DEBUG_BUILD
 #define DEBUG(s)                                                               \
@@ -64,12 +64,12 @@ public:
   unsigned getNumDim() const { return numDim; }
 };
 
-void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0,
+void llvm_hpvm_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0,
                                uint64_t limitY = 0, uint64_t iY = 0,
                                uint64_t limitZ = 0, uint64_t iZ = 0);
-void llvm_visc_x86_dstack_pop();
-uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim);
-uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim);
+void llvm_hpvm_x86_dstack_pop();
+uint64_t llvm_hpvm_x86_getDimLimit(unsigned level, unsigned dim);
+uint64_t llvm_hpvm_x86_getDimInstance(unsigned level, unsigned dim);
 
 /********************* Memory Tracker **********************************/
 class MemTrackerEntry {
@@ -143,32 +143,32 @@ public:
   }
 };
 
-void llvm_visc_track_mem(void *, size_t);
-void llvm_visc_untrack_mem(void *);
-void *llvm_visc_request_mem(void *, size_t);
+void llvm_hpvm_track_mem(void *, size_t);
+void llvm_hpvm_untrack_mem(void *);
+void *llvm_hpvm_request_mem(void *, size_t);
 
 /*********************** OPENCL & PTHREAD API **************************/
-void *llvm_visc_x86_launch(void *(void *), void *);
-void llvm_visc_x86_wait(void *);
-void *llvm_visc_ocl_initContext(enum visc::Target);
-
-void *llvm_visc_x86_argument_ptr(void *, size_t);
-
-void llvm_visc_ocl_clearContext(void *);
-void llvm_visc_ocl_argument_shared(void *, int, size_t);
-void llvm_visc_ocl_argument_scalar(void *, void *, int, size_t);
-void *llvm_visc_ocl_argument_ptr(void *, void *, int, size_t, bool, bool);
-void *llvm_visc_ocl_output_ptr(void *, int, size_t);
-void llvm_visc_ocl_free(void *);
-void *llvm_visc_ocl_getOutput(void *, void *, void *, size_t);
-void *llvm_visc_ocl_executeNode(void *, unsigned, const size_t *,
+void *llvm_hpvm_x86_launch(void *(void *), void *);
+void llvm_hpvm_x86_wait(void *);
+void *llvm_hpvm_ocl_initContext(enum hpvm::Target);
+
+void *llvm_hpvm_x86_argument_ptr(void *, size_t);
+
+void llvm_hpvm_ocl_clearContext(void *);
+void llvm_hpvm_ocl_argument_shared(void *, int, size_t);
+void llvm_hpvm_ocl_argument_scalar(void *, void *, int, size_t);
+void *llvm_hpvm_ocl_argument_ptr(void *, void *, int, size_t, bool, bool);
+void *llvm_hpvm_ocl_output_ptr(void *, int, size_t);
+void llvm_hpvm_ocl_free(void *);
+void *llvm_hpvm_ocl_getOutput(void *, void *, void *, size_t);
+void *llvm_hpvm_ocl_executeNode(void *, unsigned, const size_t *,
                                 const size_t *);
-void *llvm_visc_ocl_launch(const char *, const char *);
-void llvm_visc_ocl_wait(void *);
+void *llvm_hpvm_ocl_launch(const char *, const char *);
+void llvm_hpvm_ocl_wait(void *);
 
-void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID);
-void llvm_visc_printTimerSet(void **timerSet, char *timerName = NULL);
-void *llvm_visc_initializeTimerSet();
+void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID);
+void llvm_hpvm_printTimerSet(void **timerSet, char *timerName = NULL);
+void *llvm_hpvm_initializeTimerSet();
 }
 
 /*************************** Pipeline API ******************************/
@@ -249,30 +249,30 @@ template <class ElementType> ElementType CircularBuffer<ElementType>::pop() {
 
 extern "C" {
 // Functions to push and pop values from pipeline buffers
-uint64_t llvm_visc_bufferPop(void *);
-void llvm_visc_bufferPush(void *, uint64_t);
+uint64_t llvm_hpvm_bufferPop(void *);
+void llvm_hpvm_bufferPush(void *, uint64_t);
 
 // Functions to create and destroy buffers
-void *llvm_visc_createBindInBuffer(void *, uint64_t, unsigned);
-void *llvm_visc_createBindOutBuffer(void *, uint64_t);
-void *llvm_visc_createEdgeBuffer(void *, uint64_t);
-void *llvm_visc_createLastInputBuffer(void *, uint64_t);
+void *llvm_hpvm_createBindInBuffer(void *, uint64_t, unsigned);
+void *llvm_hpvm_createBindOutBuffer(void *, uint64_t);
+void *llvm_hpvm_createEdgeBuffer(void *, uint64_t);
+void *llvm_hpvm_createLastInputBuffer(void *, uint64_t);
 
-void llvm_visc_freeBuffers(void *);
+void llvm_hpvm_freeBuffers(void *);
 
 // Functions to create and destroy threads
-void llvm_visc_createThread(void *graphID, void *(*Func)(void *), void *);
-void llvm_visc_freeThreads(void *);
+void llvm_hpvm_createThread(void *graphID, void *(*Func)(void *), void *);
+void llvm_hpvm_freeThreads(void *);
 
 // Launch API for a streaming graph.
 // Arguments:
 // (1) Launch Function: void* (void*, void*)
 // (2) Push Function:   void (void*, std::vector<uint64_t>**, unsgined)
 // (3) Pop Function:    void* (std::vector<uint64_t>**, unsigned)
-void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *);
-void llvm_visc_streamPush(void *graphID, void *args);
-void *llvm_visc_streamPop(void *graphID);
-void llvm_visc_streamWait(void *graphID);
+void *llvm_hpvm_streamLaunch(void (*LaunchFunc)(void *, void *), void *);
+void llvm_hpvm_streamPush(void *graphID, void *args);
+void *llvm_hpvm_streamPop(void *graphID);
+void llvm_hpvm_streamWait(void *graphID);
 }
 
-#endif // VISC_RT_HEADER
+#endif // HPVM_RT_HEADER
diff --git a/hpvm/projects/hpvm-rt/makefile b/hpvm/projects/hpvm-rt/makefile
new file mode 100644
index 0000000000000000000000000000000000000000..927e26e254a2b2f980fed8efd8858935e9f3cbdf
--- /dev/null
+++ b/hpvm/projects/hpvm-rt/makefile
@@ -0,0 +1,29 @@
+#LLVM_SRC_ROOT = 
+LLVM_BUILD_ROOT = ${LLVM_SRC_ROOT}/../build/
+
+CUDA_INC_PATH = /software/cuda-9.1/include/CL/
+
+
+ifeq ($(NUM_CORES),)
+  NUM_CORES=1
+endif
+
+CPP_FLAGS = -I$(LLVM_SRC_ROOT)/include -I$(LLVM_BUILD_ROOT)/include -I$(CUDA_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+TARGET:=hpvm-rt
+
+LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang
+LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++
+
+OPTS =
+
+ifeq ($(DEBUG),1)
+  OPTS+=-DDEBUG_BUILD
+endif
+
+all: $(TARGET:%=%.ll)
+
+$(TARGET:%=%.ll):%.ll:%.cpp %.h
+	$(LLVM_CXX) -DNUM_CORES=$(NUM_CORES) -O3 -S -emit-llvm $(CPP_FLAGS) $(OPTS) $< -o $@
+
+clean :
+	rm -f $(TARGET).ll
diff --git a/hpvm/projects/hpvm-rt/policy.h b/hpvm/projects/hpvm-rt/policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..d50e65868b376bfbcc3d4bd00d4919db677722b8
--- /dev/null
+++ b/hpvm/projects/hpvm-rt/policy.h
@@ -0,0 +1,108 @@
+#ifndef __POLICY__
+#define __POLICY__
+
+#include "device_abstraction.h"
+#include <string>
+
+/************************* Policies *************************************/
+class Policy {
+public:
+  virtual int getVersion(const char *, int64_t) = 0;
+  virtual ~Policy(){};
+};
+
+class ConstPolicy : public Policy {
+public:
+  ConstPolicy(int deviceID) : deviceID(deviceID) {}
+
+  int getVersion(const char *, int64_t) override { return deviceID; }
+
+private:
+  int deviceID;
+};
+
+class NodePolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    std::string s(name);
+    // std::string NodeNames[1] = {
+    // "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" };
+    std::string NodeNames[] = {
+        "WrapperGaussianSmoothing_cloned",
+        "WrapperlaplacianEstimate_cloned",
+        "WrapperComputeZeroCrossings_cloned",
+        "WrapperComputeGradient_cloned",
+        "WrapperComputeMaxGradient_cloned",
+        "WrapperRejectZeroCrossings_cloned",
+    };
+    // if (!s.compare(NodeNames[4])) {
+    //  std::cout << s << ": CPU" << "\n";
+    //  return 0;
+    //}
+    return 2;
+  }
+};
+
+class IterationPolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    if ((it % 10 == 0) || (it % 10 == 1))
+      return 0;
+    else
+      return 2;
+  }
+};
+
+class DeviceStatusPolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    if (deviceStatus) {
+      // std::cout << "Returning GPU\n";
+      return 2;
+    } else {
+      // std::cout << "Returning CPU\n";
+      return 0;
+    }
+  }
+};
+
+/* ------------------------------------------------------------------------- */
+// Added for the CFAR interactive policy demo.
+
+// Policy whose device choice is read from stdin by a background thread.
+class InteractivePolicy : public Policy {
+private:
+  // User-selected device: 0 for CPU, 1 for GPU (larger inputs fall back to 1)
+  unsigned int userTargetDeviceChoice;
+  // Stop flag for the input thread. NOTE(review): unsynchronized; use atomic?
+  bool end;
+  // Background thread that runs updateUserTargetChoice()
+  std::thread userTargetDeviceChoiceThread;
+  // Thread body: prompt on stdout, then read the next choice from stdin
+  void updateUserTargetChoice() {
+    while (!end && std::cin) { // a failed stdin would otherwise spin forever
+      std::cout << "Select target device (0 for CPU, 1 for GPU): ";
+      std::cin >> userTargetDeviceChoice;
+      if (userTargetDeviceChoice > 1) {
+        std::cout << "Invalid target device. Selecting GPU instead.\n";
+        userTargetDeviceChoice = 1;
+      }
+    }
+  }
+public:
+  // Inherited method, required for every policy object
+  virtual int getVersion(const char *name, int64_t it) override {
+    return userTargetDeviceChoice;
+  }
+
+  InteractivePolicy() {
+    userTargetDeviceChoice = 1;
+    end = false;
+    userTargetDeviceChoiceThread =
+        std::thread(&InteractivePolicy::updateUserTargetChoice, this);
+  }
+
+  ~InteractivePolicy() {
+    end = true;
+    userTargetDeviceChoiceThread.join();
+  }
+};
+
+#endif // __POLICY__
diff --git a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp
index c61573a13abcc09b1db10d86e141264a8b1c1760..50a7e6848350ef99c96f56cb5ac6d2d75308f398 100644
--- a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp
+++ b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp
@@ -14,14 +14,14 @@
 
 #include "CBackend.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Config/config.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/Host.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/Config/config.h"
 #include "llvm/Transforms/Utils.h"
 
 #include <algorithm>
@@ -29,14 +29,13 @@
 
 #include <iostream>
 
-
 //#include "PHINodePass.h"
 
-//Jackson Korba 9/29/14
+// Jackson Korba 9/29/14
 #ifndef DEBUG_TYPE
 #define DEBUG_TYPE ""
 #endif
-//End Modification
+// End Modification
 
 #define DEBUG(x) x
 // Some ms header decided to define setjmp as _setjmp, undo this for this file
@@ -53,7 +52,8 @@ extern "C" void LLVMInitializeCBackendTarget() {
 
 char CWriter::ID = 0;
 
-// extra (invalid) Ops tags for tracking unary ops as a special case of the available binary ops
+// extra (invalid) Ops tags for tracking unary ops as a special case of the
+// available binary ops
 enum UnaryOps {
   BinaryNeg = Instruction::OtherOpsEnd + 1,
   BinaryNot,
@@ -62,19 +62,16 @@ enum UnaryOps {
 static bool isEmptyType(Type *Ty) {
   if (StructType *STy = dyn_cast<StructType>(Ty))
     return STy->getNumElements() == 0 ||
-      std::all_of(STy->element_begin(), STy->element_end(), [](Type *T){ return isEmptyType(T); });
+           std::all_of(STy->element_begin(), STy->element_end(),
+                       [](Type *T) { return isEmptyType(T); });
   if (VectorType *VTy = dyn_cast<VectorType>(Ty))
-    return VTy->getNumElements() == 0 ||
-      isEmptyType(VTy->getElementType());
+    return VTy->getNumElements() == 0 || isEmptyType(VTy->getElementType());
   if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
-    return ATy->getNumElements() == 0 ||
-      isEmptyType(ATy->getElementType());
+    return ATy->getNumElements() == 0 || isEmptyType(ATy->getElementType());
   return Ty->isVoidTy();
 }
 
-bool CWriter::isEmptyType(Type *Ty) const {
-  return ::isEmptyType(Ty);
-}
+bool CWriter::isEmptyType(Type *Ty) const { return ::isEmptyType(Ty); }
 
 /// isAddressExposed - Return true if the specified value's name needs to
 /// have its address taken in order to get a C value of the correct type.
@@ -108,10 +105,9 @@ bool CWriter::isInlinableInst(Instruction &I) const {
   }
   // Must be an expression, must be used exactly once.  If it is dead, we
   // emit it inline where it would go.
-  if (isEmptyType(I.getType()) || !I.hasOneUse() ||
-      I.isTerminator() || isa<CallInst>(I) || isa<PHINode>(I) ||
-      isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) ||
-      isa<InsertValueInst>(I))
+  if (isEmptyType(I.getType()) || !I.hasOneUse() || I.isTerminator() ||
+      isa<CallInst>(I) || isa<PHINode>(I) || isa<LoadInst>(I) ||
+      isa<VAArgInst>(I) || isa<InsertElementInst>(I) || isa<InsertValueInst>(I))
     // Don't inline a load across a store or other bad things!
     return false;
 
@@ -133,17 +129,18 @@ bool CWriter::isInlinableInst(Instruction &I) const {
 AllocaInst *CWriter::isDirectAlloca(Value *V) const {
   DEBUG(errs() << "Checking if " << *V << " is a direct alloca!\n");
   AllocaInst *AI = dyn_cast<AllocaInst>(V);
-  if (!AI) return 0;
+  if (!AI)
+    return 0;
   // Modification to inline fixed size array alloca!
   if (AI->isArrayAllocation())
-    return AI;   // FIXME: we can also inline fixed size array allocas!
+    return AI; // FIXME: we can also inline fixed size array allocas!
   if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock())
     return 0;
   return AI;
 }
 
 // isInlineAsm - Check if the instruction is a call to an inline asm chunk.
-bool CWriter::isInlineAsm(Instruction& I) const {
+bool CWriter::isInlineAsm(Instruction &I) const {
   if (CallInst *CI = dyn_cast<CallInst>(&I))
     return isa<InlineAsm>(CI->getCalledValue());
   return false;
@@ -161,18 +158,19 @@ bool CWriter::runOnFunction(Function &F) {
   PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
   // Adding Scalar Evolution Pass for loop induction variable
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-  //Adding Dominator Tree Pass
+  // Adding Dominator Tree Pass
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   // Adding Assumption Cache
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   // Adding IVUsers Pass for loop recongnition
   //  IU = &getAnalysis<IVUsersWrapperPass>().getIU();
 
-  BasicBlock* entry = &(F.getEntryBlock());
-  for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry); BI!=BE; ++BI) { 
+  BasicBlock *entry = &(F.getEntryBlock());
+  for (df_iterator<BasicBlock *> BI = df_begin(entry), BE = df_end(entry);
+       BI != BE; ++BI) {
     BasicBlock *BB = *BI;
     if (Loop *L = LI->getLoopFor(&*BB)) {
-      if(simplifyLoop(L, DT, LI, SE, AC, nullptr, /*true*/false)) {
+      if (simplifyLoop(L, DT, LI, SE, AC, nullptr, /*true*/ false)) {
         DEBUG(errs() << "Simplified loop!\n" << *L << "\n");
       }
     }
@@ -180,7 +178,6 @@ bool CWriter::runOnFunction(Function &F) {
   // Get rid of intrinsics we can't handle.
   lowerIntrinsics(F);
 
-
   printFunction(F);
 
   LI = NULL;
@@ -196,15 +193,15 @@ static std::string CBEMangle(const std::string &S) {
       Result += S[i];
     } else {
       Result += '_';
-      Result += 'A'+(S[i]&15);
-      Result += 'A'+((S[i]>>4)&15);
+      Result += 'A' + (S[i] & 15);
+      Result += 'A' + ((S[i] >> 4) & 15);
       Result += '_';
     }
   return Result;
 }
 
-raw_ostream &
-CWriter::printTypeString(raw_ostream &Out, Type *Ty, bool isSigned) {
+raw_ostream &CWriter::printTypeString(raw_ostream &Out, Type *Ty,
+                                      bool isSigned) {
   if (StructType *ST = dyn_cast<StructType>(Ty)) {
     assert(!isEmptyType(ST));
     TypedefDeclTypes.insert(Ty);
@@ -224,46 +221,51 @@ CWriter::printTypeString(raw_ostream &Out, Type *Ty, bool isSigned) {
   }
 
   switch (Ty->getTypeID()) {
-    case Type::VoidTyID:   return Out << "void";
-    case Type::IntegerTyID: {
-                              unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
-                              if (NumBits == 1)
-                                return Out << "bool";
-                              else {
-                                assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
-                                return Out << (isSigned?"i":"u") << NumBits;
-                              }
-                            }
-    case Type::FloatTyID:    return Out << "f32";
-    case Type::DoubleTyID:   return Out << "f64";
-    case Type::X86_FP80TyID: return Out << "f80";
-    case Type::PPC_FP128TyID:
-    case Type::FP128TyID:    return Out << "f128";
-
-    case Type::X86_MMXTyID:
-                             return Out << (isSigned ? "i32y2" : "u32y2");
-
-    case Type::VectorTyID: {
-                             TypedefDeclTypes.insert(Ty);
-                             VectorType *VTy = cast<VectorType>(Ty);
-                             assert(VTy->getNumElements() != 0);
-                             printTypeString(Out, VTy->getElementType(), isSigned);
-                             return Out << "x" << VTy->getNumElements();
-                           }
-
-    case Type::ArrayTyID: {
-                            TypedefDeclTypes.insert(Ty);
-                            ArrayType *ATy = cast<ArrayType>(Ty);
-                            assert(ATy->getNumElements() != 0);
-                            printTypeString(Out, ATy->getElementType(), isSigned);
-                            return Out << "a" << ATy->getNumElements();
-                          }
+  case Type::VoidTyID:
+    return Out << "void";
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1)
+      return Out << "bool";
+    else {
+      assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
+      return Out << (isSigned ? "i" : "u") << NumBits;
+    }
+  }
+  case Type::FloatTyID:
+    return Out << "f32";
+  case Type::DoubleTyID:
+    return Out << "f64";
+  case Type::X86_FP80TyID:
+    return Out << "f80";
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID:
+    return Out << "f128";
+
+  case Type::X86_MMXTyID:
+    return Out << (isSigned ? "i32y2" : "u32y2");
+
+  case Type::VectorTyID: {
+    TypedefDeclTypes.insert(Ty);
+    VectorType *VTy = cast<VectorType>(Ty);
+    assert(VTy->getNumElements() != 0);
+    printTypeString(Out, VTy->getElementType(), isSigned);
+    return Out << "x" << VTy->getNumElements();
+  }
 
-    default:
+  case Type::ArrayTyID: {
+    TypedefDeclTypes.insert(Ty);
+    ArrayType *ATy = cast<ArrayType>(Ty);
+    assert(ATy->getNumElements() != 0);
+    printTypeString(Out, ATy->getElementType(), isSigned);
+    return Out << "a" << ATy->getNumElements();
+  }
+
+  default:
 #ifndef NDEBUG
-                          errs() << "Unknown primitive type: " << *Ty << "\n";
+    errs() << "Unknown primitive type: " << *Ty << "\n";
 #endif
-                          llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
@@ -278,8 +280,9 @@ std::string CWriter::getStructName(StructType *ST) {
   return "struct l_unnamed_" + utostr(id);
 }
 
-std::string CWriter::getFunctionName(FunctionType *FT,
-    std::pair<AttributeList, CallingConv::ID> PAL) {
+std::string
+CWriter::getFunctionName(FunctionType *FT,
+                         std::pair<AttributeList, CallingConv::ID> PAL) {
   unsigned &id = UnnamedFunctionIDs[std::make_pair(FT, PAL)];
   if (id == 0)
     id = ++NextFunctionNumber;
@@ -293,7 +296,8 @@ std::string CWriter::getArrayName(ArrayType *AT) {
   // value semantics (avoiding the array "decay").
   assert(!isEmptyType(AT));
   printTypeName(ArrayInnards, AT->getElementType(), false);
-  return "struct l_array_" + utostr(AT->getNumElements()) + '_' + CBEMangle(ArrayInnards.str());
+  return "struct l_array_" + utostr(AT->getNumElements()) + '_' +
+         CBEMangle(ArrayInnards.str());
 }
 
 std::string CWriter::getVectorName(VectorType *VT, bool Aligned) {
@@ -304,95 +308,125 @@ std::string CWriter::getVectorName(VectorType *VT, bool Aligned) {
   //    if (Aligned)
   //      Out << "__MSALIGN__(" << TD->getABITypeAlignment(VT) << ") ";
   printTypeName(VectorInnards, VT->getElementType(), false);
-  return "struct l_vector_" + utostr(VT->getNumElements()) + '_' + CBEMangle(VectorInnards.str());
+  return "struct l_vector_" + utostr(VT->getNumElements()) + '_' +
+         CBEMangle(VectorInnards.str());
 }
 
-
 static const std::string getCmpPredicateName(CmpInst::Predicate P) {
   switch (P) {
-    case FCmpInst::FCMP_FALSE: return "0";
-    case FCmpInst::FCMP_OEQ: return "oeq";
-    case FCmpInst::FCMP_OGT: return "ogt";
-    case FCmpInst::FCMP_OGE: return "oge";
-    case FCmpInst::FCMP_OLT: return "olt";
-    case FCmpInst::FCMP_OLE: return "ole";
-    case FCmpInst::FCMP_ONE: return "one";
-    case FCmpInst::FCMP_ORD: return "ord";
-    case FCmpInst::FCMP_UNO: return "uno";
-    case FCmpInst::FCMP_UEQ: return "ueq";
-    case FCmpInst::FCMP_UGT: return "ugt";
-    case FCmpInst::FCMP_UGE: return "uge";
-    case FCmpInst::FCMP_ULT: return "ult";
-    case FCmpInst::FCMP_ULE: return "ule";
-    case FCmpInst::FCMP_UNE: return "une";
-    case FCmpInst::FCMP_TRUE: return "1";
-    case ICmpInst::ICMP_EQ:  return "eq";
-    case ICmpInst::ICMP_NE:  return "ne";
-    case ICmpInst::ICMP_ULE: return "ule";
-    case ICmpInst::ICMP_SLE: return "sle";
-    case ICmpInst::ICMP_UGE: return "uge";
-    case ICmpInst::ICMP_SGE: return "sge";
-    case ICmpInst::ICMP_ULT: return "ult";
-    case ICmpInst::ICMP_SLT: return "slt";
-    case ICmpInst::ICMP_UGT: return "ugt";
-    case ICmpInst::ICMP_SGT: return "sgt";
-    default:
+  case FCmpInst::FCMP_FALSE:
+    return "0";
+  case FCmpInst::FCMP_OEQ:
+    return "oeq";
+  case FCmpInst::FCMP_OGT:
+    return "ogt";
+  case FCmpInst::FCMP_OGE:
+    return "oge";
+  case FCmpInst::FCMP_OLT:
+    return "olt";
+  case FCmpInst::FCMP_OLE:
+    return "ole";
+  case FCmpInst::FCMP_ONE:
+    return "one";
+  case FCmpInst::FCMP_ORD:
+    return "ord";
+  case FCmpInst::FCMP_UNO:
+    return "uno";
+  case FCmpInst::FCMP_UEQ:
+    return "ueq";
+  case FCmpInst::FCMP_UGT:
+    return "ugt";
+  case FCmpInst::FCMP_UGE:
+    return "uge";
+  case FCmpInst::FCMP_ULT:
+    return "ult";
+  case FCmpInst::FCMP_ULE:
+    return "ule";
+  case FCmpInst::FCMP_UNE:
+    return "une";
+  case FCmpInst::FCMP_TRUE:
+    return "1";
+  case ICmpInst::ICMP_EQ:
+    return "eq";
+  case ICmpInst::ICMP_NE:
+    return "ne";
+  case ICmpInst::ICMP_ULE:
+    return "ule";
+  case ICmpInst::ICMP_SLE:
+    return "sle";
+  case ICmpInst::ICMP_UGE:
+    return "uge";
+  case ICmpInst::ICMP_SGE:
+    return "sge";
+  case ICmpInst::ICMP_ULT:
+    return "ult";
+  case ICmpInst::ICMP_SLT:
+    return "slt";
+  case ICmpInst::ICMP_UGT:
+    return "ugt";
+  case ICmpInst::ICMP_SGT:
+    return "sgt";
+  default:
 #ifndef NDEBUG
-                             errs() << "Invalid icmp predicate!" << P;
+    errs() << "Invalid icmp predicate!" << P;
 #endif
-                             llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
-
-raw_ostream &
-CWriter::printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned) {
+raw_ostream &CWriter::printSimpleType(raw_ostream &Out, Type *Ty,
+                                      bool isSigned) {
   assert((Ty->isSingleValueType() || Ty->isVoidTy()) &&
-      "Invalid type for printSimpleType");
+         "Invalid type for printSimpleType");
   switch (Ty->getTypeID()) {
-    case Type::VoidTyID:   return Out << "void";
-    case Type::IntegerTyID: {
-                              unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
-                              if (NumBits == 1)
-                                return Out << "bool";
-                              else if (NumBits <= 8)
-                                return Out << (isSigned?"char":"uchar");
-                              else if (NumBits <= 16)
-                                return Out << (isSigned?"short":"ushort");
-                              else if (NumBits <= 32)
-                                return Out << (isSigned?"int":"uint"); // !!FIX ME
-                              else if (NumBits <= 64)
-                                return Out << (isSigned?"long":"ulong");
-                              else {
-                                assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
-                                return Out << (isSigned?"int128_t":"uint128_t");
-                              }
-                            }
-    case Type::FloatTyID:  return Out << "float";
-    case Type::DoubleTyID: return Out << "double";
-                           // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
-                           // present matches host 'long double'.
-    case Type::X86_FP80TyID:
-    case Type::PPC_FP128TyID:
-    case Type::FP128TyID:  return Out << "long double";
-
-    case Type::X86_MMXTyID:
-                           return Out << (isSigned?"int":"uint") << " __attribute__((vector_size(8)))";
-
-    default:
+  case Type::VoidTyID:
+    return Out << "void";
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1)
+      return Out << "bool";
+    else if (NumBits <= 8)
+      return Out << (isSigned ? "char" : "uchar");
+    else if (NumBits <= 16)
+      return Out << (isSigned ? "short" : "ushort");
+    else if (NumBits <= 32)
+      return Out << (isSigned ? "int" : "uint"); // !!FIX ME
+    else if (NumBits <= 64)
+      return Out << (isSigned ? "long" : "ulong");
+    else {
+      assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
+      return Out << (isSigned ? "int128_t" : "uint128_t");
+    }
+  }
+  case Type::FloatTyID:
+    return Out << "float";
+  case Type::DoubleTyID:
+    return Out << "double";
+    // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
+    // present matches host 'long double'.
+  case Type::X86_FP80TyID:
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID:
+    return Out << "long double";
+
+  case Type::X86_MMXTyID:
+    return Out << (isSigned ? "int" : "uint")
+               << " __attribute__((vector_size(8)))";
+
+  default:
 #ifndef NDEBUG
-                           errs() << "Unknown primitive type: " << *Ty << "\n";
+    errs() << "Unknown primitive type: " << *Ty << "\n";
 #endif
-                           llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
 // Pass the Type* and the variable name and this prints out the variable
 // declaration.
 //
-raw_ostream &CWriter::printTypeName(raw_ostream &Out, Type *Ty,
-    bool isSigned,
-    std::pair<AttributeList, CallingConv::ID> PAL) {
+raw_ostream &
+CWriter::printTypeName(raw_ostream &Out, Type *Ty, bool isSigned,
+                       std::pair<AttributeList, CallingConv::ID> PAL) {
 
   if (Ty->isSingleValueType() || Ty->isVoidTy()) {
     if (!Ty->isPointerTy() && !Ty->isVectorTy())
@@ -403,39 +437,40 @@ raw_ostream &CWriter::printTypeName(raw_ostream &Out, Type *Ty,
     return Out << "void";
 
   switch (Ty->getTypeID()) {
-    case Type::FunctionTyID: {
-                               FunctionType *FTy = cast<FunctionType>(Ty);
-                               return Out << getFunctionName(FTy, PAL);
-                             }
-    case Type::StructTyID: {
-                             TypedefDeclTypes.insert(Ty);
-                             return Out << getStructName(cast<StructType>(Ty));
-                           }
-
-    case Type::PointerTyID: {
-                              Type *ElTy = Ty->getPointerElementType();
-                              return printTypeName(Out, ElTy, false) << '*';
-                            }
-
-    case Type::ArrayTyID: {
-                            TypedefDeclTypes.insert(Ty);
-                            return Out << getArrayName(cast<ArrayType>(Ty));
-                          }
-
-    case Type::VectorTyID: {
-                             TypedefDeclTypes.insert(Ty);
-                             return Out << getVectorName(cast<VectorType>(Ty), true);
-                           }
+  case Type::FunctionTyID: {
+    FunctionType *FTy = cast<FunctionType>(Ty);
+    return Out << getFunctionName(FTy, PAL);
+  }
+  case Type::StructTyID: {
+    TypedefDeclTypes.insert(Ty);
+    return Out << getStructName(cast<StructType>(Ty));
+  }
 
-    default:
+  case Type::PointerTyID: {
+    Type *ElTy = Ty->getPointerElementType();
+    return printTypeName(Out, ElTy, false) << '*';
+  }
+
+  case Type::ArrayTyID: {
+    TypedefDeclTypes.insert(Ty);
+    return Out << getArrayName(cast<ArrayType>(Ty));
+  }
+
+  case Type::VectorTyID: {
+    TypedefDeclTypes.insert(Ty);
+    return Out << getVectorName(cast<VectorType>(Ty), true);
+  }
+
+  default:
 #ifndef NDEBUG
-                           errs() << "Unexpected type: " << *Ty << "\n";
+    errs() << "Unexpected type: " << *Ty << "\n";
 #endif
-                           llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
-raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty, bool isSigned) {
+raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty,
+                                             bool isSigned) {
   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
     // MSVC doesn't handle __declspec(align) on parameters,
     // but we specify it for Vector (hoping the compiler will vectorize it)
@@ -446,13 +481,15 @@ raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty, bool is
   return printTypeName(Out, Ty, isSigned);
 }
 
-raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out, StructType *STy) {
+raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out,
+                                             StructType *STy) {
   if (STy->isPacked())
     Out << "#ifdef _MSC_VER\n#pragma pack(push, 1)\n#endif\n";
   Out << getStructName(STy) << " {\n";
   unsigned Idx = 0;
   for (StructType::element_iterator I = STy->element_begin(),
-      E = STy->element_end(); I != E; ++I, Idx++) {
+                                    E = STy->element_end();
+       I != E; ++I, Idx++) {
     Out << "  ";
     bool empty = isEmptyType(*I);
     if (empty)
@@ -472,21 +509,23 @@ raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out, StructType *STy)
   return Out;
 }
 
-raw_ostream &CWriter::printFunctionDeclaration(raw_ostream &Out, FunctionType *Ty,
-    std::pair<AttributeList, CallingConv::ID> PAL){
+raw_ostream &CWriter::printFunctionDeclaration(
+    raw_ostream &Out, FunctionType *Ty,
+    std::pair<AttributeList, CallingConv::ID> PAL) {
 
   Out << "typedef ";
   printFunctionProto(Out, Ty, PAL, getFunctionName(Ty, PAL), NULL, false);
   return Out << ";\n";
 }
 
-raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
-    std::pair<AttributeList, CallingConv::ID> Attrs,
-    const std::string &Name,
-    Function::arg_iterator ArgList,
-    bool isKernel) {
+raw_ostream &
+CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
+                            std::pair<AttributeList, CallingConv::ID> Attrs,
+                            const std::string &Name,
+                            Function::arg_iterator ArgList, bool isKernel) {
 
-  // NOTE: AttributeSet is replaced by 'AttributeList' at function level in LLVM-9
+  // NOTE: AttributeSet is replaced by 'AttributeList' at function level in
+  // LLVM-9
   AttributeList &PAL = Attrs.first;
 
   if (PAL.hasAttribute(AttributeList::FunctionIndex, Attribute::NoReturn))
@@ -497,7 +536,7 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
 
   // Should this function actually return a struct by-value?
   bool isStructReturn = PAL.hasAttribute(1, Attribute::StructRet) ||
-    PAL.hasAttribute(2, Attribute::StructRet);
+                        PAL.hasAttribute(2, Attribute::StructRet);
   // Get the return type for the function.
   Type *RetTy;
   if (!isStructReturn)
@@ -507,24 +546,25 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
     RetTy = cast<PointerType>(FTy->getParamType(0))->getElementType();
   }
   printTypeName(Out, RetTy,
-      /*isSigned=*/PAL.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt));
+                /*isSigned=*/
+                PAL.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt));
 
   Out << "/* Processing Function: " << Name << ": " << Attrs.second << "*/\n";
   switch (Attrs.second) {
-    case CallingConv::C:
-      break;
-    case CallingConv::X86_StdCall:
-      Out << " __stdcall";
-      break;
-    case CallingConv::X86_FastCall:
-      Out << " __fastcall";
-      break;
-    case CallingConv::X86_ThisCall:
-      Out << " __thiscall";
-      break;
-    default:
-      //    assert(0 && "Encountered Unhandled Calling Convention");
-      break;
+  case CallingConv::C:
+    break;
+  case CallingConv::X86_StdCall:
+    Out << " __stdcall";
+    break;
+  case CallingConv::X86_FastCall:
+    Out << " __fastcall";
+    break;
+  case CallingConv::X86_ThisCall:
+    Out << " __thiscall";
+    break;
+  default:
+    //    assert(0 && "Encountered Unhandled Calling Convention");
+    break;
   }
   Out << ' ' << Name << '(';
 
@@ -532,7 +572,8 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
   bool PrintedArg = false;
   FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end();
 
-  //Function::arg_iterator ArgName = ArgList ? ArgList->begin() : Function::arg_iterator();
+  // Function::arg_iterator ArgName = ArgList ? ArgList->begin() :
+  // Function::arg_iterator();
   // NOTE: ArgumentLists not supported in LLVM-9
   Function::arg_iterator ArgName = ArgList ? ArgList : Function::arg_iterator();
 
@@ -543,8 +584,10 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
     assert(I != E && "Invalid struct return function!");
     ++I;
     ++Idx;
-    // CHECK: very confused as to how next loop starts from first Function Param?
-    if (ArgList) ++ArgName;
+    // CHECK: very confused as to how next loop starts from first Function
+    // Param?
+    if (ArgList)
+      ++ArgName;
   }
 
   for (; I != E; ++I) {
@@ -559,26 +602,26 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
     if (PointerType *PTy = dyn_cast<PointerType>(ArgTy)) {
       unsigned AddrSpace = PTy->getAddressSpace();
       DEBUG(errs() << "AddrSpace for " << Idx << " = " << AddrSpace << "\n");
-      switch(AddrSpace) {
-        case GLOBAL_ADDRSPACE:
-          Out << "__global ";
-          break;
-        case SHARED_ADDRSPACE:
-          Out << "__local ";
-          break;
-        case CONSTANT_ADDRSPACE:
-          Out << "__constant ";
-          break;
-        case PRIVATE_ADDRSPACE:
-          Out << "__private ";
-          break;
-        default:
-          break;
+      switch (AddrSpace) {
+      case GLOBAL_ADDRSPACE:
+        Out << "__global ";
+        break;
+      case SHARED_ADDRSPACE:
+        Out << "__local ";
+        break;
+      case CONSTANT_ADDRSPACE:
+        Out << "__constant ";
+        break;
+      case PRIVATE_ADDRSPACE:
+        Out << "__private ";
+        break;
+      default:
+        break;
       }
     }
 
     printTypeNameUnaligned(Out, ArgTy,
-        /*isSigned=*/PAL.hasAttribute(Idx, Attribute::SExt));
+                           /*isSigned=*/PAL.hasAttribute(Idx, Attribute::SExt));
     PrintedArg = true;
     bool noalias = false;
     if (PAL.hasAttribute(Idx, Attribute::NoAlias)) {
@@ -587,15 +630,16 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
     ++Idx;
     if (ArgList) {
 
-      Out << ' ' << (noalias ? " restrict " : "")  << GetValueName(&*ArgName);
+      Out << ' ' << (noalias ? " restrict " : "") << GetValueName(&*ArgName);
       ++ArgName;
     }
   }
 
   if (FTy->isVarArg()) {
     if (!PrintedArg) {
-      Out << "int"; //dummy argument for empty vaarg functs
-      if (ArgList) Out << " vararg_dummy_arg";
+      Out << "int"; // dummy argument for empty vararg functions
+      if (ArgList)
+        Out << " vararg_dummy_arg";
     }
     Out << ", ...";
   } else if (!PrintedArg) {
@@ -615,16 +659,20 @@ raw_ostream &CWriter::printArrayDeclaration(raw_ostream &Out, ArrayType *ATy) {
   return Out;
 }
 
-raw_ostream &CWriter::printVectorDeclaration(raw_ostream &Out, VectorType *VTy) {
+raw_ostream &CWriter::printVectorDeclaration(raw_ostream &Out,
+                                             VectorType *VTy) {
   assert(!isEmptyType(VTy));
   // Vectors are printed like arrays
   Out << getVectorName(VTy, false) << " {\n  ";
   printTypeName(Out, VTy->getElementType());
-  Out << " vector[" << utostr(VTy->getNumElements()) << "];\n} __attribute__((aligned(" << TD->getABITypeAlignment(VTy) << ")));\n";
+  Out << " vector[" << utostr(VTy->getNumElements())
+      << "];\n} __attribute__((aligned(" << TD->getABITypeAlignment(VTy)
+      << ")));\n";
   return Out;
 }
 
-void CWriter::printConstantArray(ConstantArray *CPA, enum OperandContext Context) {
+void CWriter::printConstantArray(ConstantArray *CPA,
+                                 enum OperandContext Context) {
   printConstant(cast<Constant>(CPA->getOperand(0)), Context);
   for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) {
     Out << ", ";
@@ -632,7 +680,8 @@ void CWriter::printConstantArray(ConstantArray *CPA, enum OperandContext Context
   }
 }
 
-void CWriter::printConstantVector(ConstantVector *CP, enum OperandContext Context) {
+void CWriter::printConstantVector(ConstantVector *CP,
+                                  enum OperandContext Context) {
   printConstant(cast<Constant>(CP->getOperand(0)), Context);
   for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) {
     Out << ", ";
@@ -640,7 +689,8 @@ void CWriter::printConstantVector(ConstantVector *CP, enum OperandContext Contex
   }
 }
 
-void CWriter::printConstantDataSequential(ConstantDataSequential *CDS, enum OperandContext Context) {
+void CWriter::printConstantDataSequential(ConstantDataSequential *CDS,
+                                          enum OperandContext Context) {
   printConstant(CDS->getElementAsConstant(0), Context);
   for (unsigned i = 1, e = CDS->getNumElements(); i != e; ++i) {
     Out << ", ";
@@ -652,8 +702,10 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) {
   // As a special case, print the array as a string if it is an array of
   // ubytes or an array of sbytes with positive values.
   ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(C);
-  if (!CDS || !CDS->isCString()) return false;
-  if (Context != ContextStatic) return false; // TODO
+  if (!CDS || !CDS->isCString())
+    return false;
+  if (Context != ContextStatic)
+    return false; // TODO
 
   Out << "{ \"";
   // Keep track of whether the last number was a hexadecimal escape.
@@ -680,19 +732,34 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) {
     } else {
       LastWasHex = false;
       switch (C) {
-        case '\n': Out << "\\n"; break;
-        case '\t': Out << "\\t"; break;
-        case '\r': Out << "\\r"; break;
-        case '\v': Out << "\\v"; break;
-        case '\a': Out << "\\a"; break;
-        case '\"': Out << "\\\""; break;
-        case '\'': Out << "\\\'"; break;
-        default:
-                   Out << "\\x";
-                   Out << (char)(( C/16  < 10) ? ( C/16 +'0') : ( C/16 -10+'A'));
-                   Out << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
-                   LastWasHex = true;
-                   break;
+      case '\n':
+        Out << "\\n";
+        break;
+      case '\t':
+        Out << "\\t";
+        break;
+      case '\r':
+        Out << "\\r";
+        break;
+      case '\v':
+        Out << "\\v";
+        break;
+      case '\a':
+        Out << "\\a";
+        break;
+      case '\"':
+        Out << "\\\"";
+        break;
+      case '\'':
+        Out << "\\\'";
+        break;
+      default:
+        Out << "\\x";
+        Out << (char)((C / 16 < 10) ? (C / 16 + '0') : (C / 16 - 10 + 'A'));
+        Out << (char)(((C & 15) < 10) ? ((C & 15) + '0')
+                                      : ((C & 15) - 10 + 'A'));
+        LastWasHex = true;
+        break;
       }
     }
   }
@@ -700,7 +767,6 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) {
   return true;
 }
 
-
 // isFPCSafeToPrint - Returns true if we may assume that CFP may be written out
 // textually as a double (rather than as a reference to a stack-allocated
 // variable). We decide this by converting CFP to a string and back into a
@@ -711,7 +777,7 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) {
 //
 
 // TODO copied from CppBackend, new code should use raw_ostream
-static inline std::string ftostr(const APFloat& V) {
+static inline std::string ftostr(const APFloat &V) {
   std::string Buf;
   if (&V.getSemantics() == &APFloat::IEEEdouble()) {
     raw_string_ostream(Buf) << V.convertToDouble();
@@ -729,14 +795,13 @@ static bool isFPCSafeToPrint(const ConstantFP *CFP) {
   if (CFP->getType() != Type::getFloatTy(CFP->getContext()) &&
       CFP->getType() != Type::getDoubleTy(CFP->getContext()))
     return false;
-  APFloat APF = APFloat(CFP->getValueAPF());  // copy
+  APFloat APF = APFloat(CFP->getValueAPF()); // copy
   if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
     APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &ignored);
 #if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
   char Buffer[100];
   sprintf(Buffer, "%a", APF.convertToDouble());
-  if (!strncmp(Buffer, "0x", 2) ||
-      !strncmp(Buffer, "-0x", 3) ||
+  if (!strncmp(Buffer, "0x", 2) || !strncmp(Buffer, "-0x", 3) ||
       !strncmp(Buffer, "+0x", 3))
     return APF.bitwiseIsEqual(APFloat(atof(Buffer)));
   return false;
@@ -763,211 +828,249 @@ static bool isFPCSafeToPrint(const ConstantFP *CFP) {
 void CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) {
   // Print the destination type cast
   switch (opc) {
-    case Instruction::UIToFP:
-    case Instruction::SIToFP:
-    case Instruction::IntToPtr:
-    case Instruction::Trunc:
-    case Instruction::BitCast:
-    case Instruction::FPExt:
-    case Instruction::FPTrunc: // For these the DstTy sign doesn't matter
-      Out << '(';
-      printTypeName(Out, DstTy);
-      Out << ')';
-      break;
-    case Instruction::ZExt:
-    case Instruction::PtrToInt:
-    case Instruction::FPToUI: // For these, make sure we get an unsigned dest
-      Out << '(';
-      printSimpleType(Out, DstTy, false);
-      Out << ')';
-      break;
-    case Instruction::SExt:
-    case Instruction::FPToSI: // For these, make sure we get a signed dest
-      Out << '(';
-      printSimpleType(Out, DstTy, true);
-      Out << ')';
-      break;
-    default:
-      llvm_unreachable("Invalid cast opcode");
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::IntToPtr:
+  case Instruction::Trunc:
+  case Instruction::BitCast:
+  case Instruction::FPExt:
+  case Instruction::FPTrunc: // DstTy signedness is irrelevant; plain type name
+    Out << '(';
+    printTypeName(Out, DstTy);
+    Out << ')';
+    break;
+  case Instruction::ZExt:
+  case Instruction::PtrToInt:
+  case Instruction::FPToUI: // For these, make sure we get an unsigned dest
+    Out << '(';
+    printSimpleType(Out, DstTy, false); // false: unsigned spelling of DstTy
+    Out << ')';
+    break;
+  case Instruction::SExt:
+  case Instruction::FPToSI: // For these, make sure we get a signed dest
+    Out << '(';
+    printSimpleType(Out, DstTy, true); // true: signed spelling of DstTy
+    Out << ')';
+    break;
+  default:
+    llvm_unreachable("Invalid cast opcode"); // opc is none of the casts above
   }
 
   // Print the source type cast
   switch (opc) {
-    case Instruction::UIToFP:
-    case Instruction::ZExt:
-      Out << '(';
-      printSimpleType(Out, SrcTy, false);
-      Out << ')';
-      break;
-    case Instruction::SIToFP:
-    case Instruction::SExt:
-      Out << '(';
-      printSimpleType(Out, SrcTy, true);
-      Out << ')';
-      break;
-    case Instruction::IntToPtr:
-    case Instruction::PtrToInt:
-      // Avoid "cast to pointer from integer of different size" warnings
-      Out << "(uintptr_t)";
-      break;
-    case Instruction::Trunc:
-    case Instruction::BitCast:
-    case Instruction::FPExt:
-    case Instruction::FPTrunc:
-    case Instruction::FPToSI:
-    case Instruction::FPToUI:
-      break; // These don't need a source cast.
-    default:
-      llvm_unreachable("Invalid cast opcode");
+  case Instruction::UIToFP:
+  case Instruction::ZExt: // Unsigned conversions: view the source as unsigned
+    Out << '(';
+    printSimpleType(Out, SrcTy, false);
+    Out << ')';
+    break;
+  case Instruction::SIToFP:
+  case Instruction::SExt: // Signed conversions: view the source as signed
+    Out << '(';
+    printSimpleType(Out, SrcTy, true);
+    Out << ')';
+    break;
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+    // Avoid "cast to pointer from integer of different size" warnings
+    Out << "(uintptr_t)";
+    break;
+  case Instruction::Trunc:
+  case Instruction::BitCast:
+  case Instruction::FPExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPToSI:
+  case Instruction::FPToUI:
+    break; // These don't need a source cast.
+  default:
+    llvm_unreachable("Invalid cast opcode"); // opc is none of the casts above
   }
 }
 
 // printConstant - The LLVM Constant to C Constant converter.
 void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
-    assert(CE->getType()->isIntegerTy() || CE->getType()->isFloatingPointTy() || CE->getType()->isPointerTy()); // TODO: VectorType are valid here, but not supported
+    assert(CE->getType()->isIntegerTy() || CE->getType()->isFloatingPointTy() ||
+           CE->getType()->isPointerTy()); // TODO: VectorType are valid here,
+                                          // but not supported
     GetElementPtrInst *GEPI;
     switch (CE->getOpcode()) {
-      case Instruction::Trunc:
-      case Instruction::ZExt:
-      case Instruction::SExt:
-      case Instruction::FPTrunc:
-      case Instruction::FPExt:
-      case Instruction::UIToFP:
-      case Instruction::SIToFP:
-      case Instruction::FPToUI:
-      case Instruction::FPToSI:
-      case Instruction::PtrToInt:
-      case Instruction::IntToPtr:
-      case Instruction::BitCast:
-        Out << "(";
-        printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType());
-        if (CE->getOpcode() == Instruction::SExt &&
-            CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) {
-          // Make sure we really sext from bool here by subtracting from 0
-          Out << "0-";
-        }
-        printConstant(CE->getOperand(0), ContextCasted);
-        if (CE->getType() == Type::getInt1Ty(CPV->getContext()) &&
-            (CE->getOpcode() == Instruction::Trunc ||
-             CE->getOpcode() == Instruction::FPToUI ||
-             CE->getOpcode() == Instruction::FPToSI ||
-             CE->getOpcode() == Instruction::PtrToInt)) {
-          // Make sure we really truncate to bool here by anding with 1
-          Out << "&1u";
-        }
-        Out << ')';
-        return;
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      Out << "(";
+      printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType());
+      if (CE->getOpcode() == Instruction::SExt &&
+          CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) {
+        // Make sure we really sext from bool here by subtracting from 0
+        Out << "0-";
+      }
+      printConstant(CE->getOperand(0), ContextCasted);
+      if (CE->getType() == Type::getInt1Ty(CPV->getContext()) &&
+          (CE->getOpcode() == Instruction::Trunc ||
+           CE->getOpcode() == Instruction::FPToUI ||
+           CE->getOpcode() == Instruction::FPToSI ||
+           CE->getOpcode() == Instruction::PtrToInt)) {
+        // Make sure we really truncate to bool here by anding with 1
+        Out << "&1u";
+      }
+      Out << ')';
+      return;
 
-      case Instruction::GetElementPtr:
-        Out << "(";
-        DEBUG(errs() << "\n----------\nCE: " << *CE << "\n");
-        GEPI = dyn_cast<GetElementPtrInst>(CE->getAsInstruction());
-        DEBUG(errs() << "GEPI: " << *GEPI << "\n");
-        printGEPExpression(CE->getOperand(0), gep_type_begin(CPV), gep_type_end(CPV), CE->getOperand(0)->getType()->isArrayTy(), GEPI);
-        delete(GEPI);
-        DEBUG(errs() << "Deleted GEPI!\n");
-        Out << ")";
-        return;
-      case Instruction::Select:
-        Out << '(';
-        printConstant(CE->getOperand(0), ContextCasted);
-        Out << '?';
-        printConstant(CE->getOperand(1), ContextNormal);
-        Out << ':';
-        printConstant(CE->getOperand(2), ContextNormal);
-        Out << ')';
-        return;
+    case Instruction::GetElementPtr:
+      Out << "(";
+      DEBUG(errs() << "\n----------\nCE: " << *CE << "\n");
+      GEPI = dyn_cast<GetElementPtrInst>(CE->getAsInstruction());
+      DEBUG(errs() << "GEPI: " << *GEPI << "\n");
+      printGEPExpression(CE->getOperand(0), gep_type_begin(CPV),
+                         gep_type_end(CPV),
+                         CE->getOperand(0)->getType()->isArrayTy(), GEPI);
+      delete (GEPI);
+      DEBUG(errs() << "Deleted GEPI!\n");
+      Out << ")";
+      return;
+    case Instruction::Select:
+      Out << '(';
+      printConstant(CE->getOperand(0), ContextCasted);
+      Out << '?';
+      printConstant(CE->getOperand(1), ContextNormal);
+      Out << ':';
+      printConstant(CE->getOperand(2), ContextNormal);
+      Out << ')';
+      return;
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr: {
+      Out << '(';
+      bool NeedsClosingParens = printConstExprCast(CE);
+      printConstantWithCast(CE->getOperand(0), CE->getOpcode());
+      switch (CE->getOpcode()) {
       case Instruction::Add:
       case Instruction::FAdd:
+        Out << " + ";
+        break;
       case Instruction::Sub:
       case Instruction::FSub:
+        Out << " - ";
+        break;
       case Instruction::Mul:
       case Instruction::FMul:
-      case Instruction::SDiv:
-      case Instruction::UDiv:
-      case Instruction::FDiv:
+        Out << " * ";
+        break;
       case Instruction::URem:
       case Instruction::SRem:
       case Instruction::FRem:
+        Out << " % ";
+        break;
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::FDiv:
+        Out << " / ";
+        break;
       case Instruction::And:
+        Out << " & ";
+        break;
       case Instruction::Or:
+        Out << " | ";
+        break;
       case Instruction::Xor:
-      case Instruction::ICmp:
+        Out << " ^ ";
+        break;
       case Instruction::Shl:
+        Out << " << ";
+        break;
       case Instruction::LShr:
       case Instruction::AShr:
-        {
-          Out << '(';
-          bool NeedsClosingParens = printConstExprCast(CE);
-          printConstantWithCast(CE->getOperand(0), CE->getOpcode());
-          switch (CE->getOpcode()) {
-            case Instruction::Add:
-            case Instruction::FAdd: Out << " + "; break;
-            case Instruction::Sub:
-            case Instruction::FSub: Out << " - "; break;
-            case Instruction::Mul:
-            case Instruction::FMul: Out << " * "; break;
-            case Instruction::URem:
-            case Instruction::SRem:
-            case Instruction::FRem: Out << " % "; break;
-            case Instruction::UDiv:
-            case Instruction::SDiv:
-            case Instruction::FDiv: Out << " / "; break;
-            case Instruction::And: Out << " & "; break;
-            case Instruction::Or:  Out << " | "; break;
-            case Instruction::Xor: Out << " ^ "; break;
-            case Instruction::Shl: Out << " << "; break;
-            case Instruction::LShr:
-            case Instruction::AShr: Out << " >> "; break;
-            case Instruction::ICmp:
-                                    switch (CE->getPredicate()) {
-                                      case ICmpInst::ICMP_EQ: Out << " == "; break;
-                                      case ICmpInst::ICMP_NE: Out << " != "; break;
-                                      case ICmpInst::ICMP_SLT:
-                                      case ICmpInst::ICMP_ULT: Out << " < "; break;
-                                      case ICmpInst::ICMP_SLE:
-                                      case ICmpInst::ICMP_ULE: Out << " <= "; break;
-                                      case ICmpInst::ICMP_SGT:
-                                      case ICmpInst::ICMP_UGT: Out << " > "; break;
-                                      case ICmpInst::ICMP_SGE:
-                                      case ICmpInst::ICMP_UGE: Out << " >= "; break;
-                                      default: llvm_unreachable("Illegal ICmp predicate");
-                                    }
-                                    break;
-            default: llvm_unreachable("Illegal opcode here!");
-          }
-          printConstantWithCast(CE->getOperand(1), CE->getOpcode());
-          if (NeedsClosingParens)
-            Out << "))";
-          Out << ')';
-          return;
+        Out << " >> ";
+        break;
+      case Instruction::ICmp:
+        switch (CE->getPredicate()) {
+        case ICmpInst::ICMP_EQ:
+          Out << " == ";
+          break;
+        case ICmpInst::ICMP_NE:
+          Out << " != ";
+          break;
+        case ICmpInst::ICMP_SLT:
+        case ICmpInst::ICMP_ULT:
+          Out << " < ";
+          break;
+        case ICmpInst::ICMP_SLE:
+        case ICmpInst::ICMP_ULE:
+          Out << " <= ";
+          break;
+        case ICmpInst::ICMP_SGT:
+        case ICmpInst::ICMP_UGT:
+          Out << " > ";
+          break;
+        case ICmpInst::ICMP_SGE:
+        case ICmpInst::ICMP_UGE:
+          Out << " >= ";
+          break;
+        default:
+          llvm_unreachable("Illegal ICmp predicate");
         }
-      case Instruction::FCmp: {
-                                Out << '(';
-                                bool NeedsClosingParens = printConstExprCast(CE);
-                                if (CE->getPredicate() == FCmpInst::FCMP_FALSE)
-                                  Out << "0";
-                                else if (CE->getPredicate() == FCmpInst::FCMP_TRUE)
-                                  Out << "1";
-                                else {
-                                  Out << "llvm_fcmp_" << getCmpPredicateName((CmpInst::Predicate)CE->getPredicate()) << "(";
-                                  printConstant(CE->getOperand(0), ContextCasted);
-                                  Out << ", ";
-                                  printConstant(CE->getOperand(1), ContextCasted);
-                                  Out << ")";
-                                }
-                                if (NeedsClosingParens)
-                                  Out << "))";
-                                Out << ')';
-                                return;
-                              }
+        break;
       default:
+        llvm_unreachable("Illegal opcode here!");
+      }
+      printConstantWithCast(CE->getOperand(1), CE->getOpcode());
+      if (NeedsClosingParens)
+        Out << "))";
+      Out << ')';
+      return;
+    }
+    case Instruction::FCmp: {
+      Out << '(';
+      bool NeedsClosingParens = printConstExprCast(CE);
+      if (CE->getPredicate() == FCmpInst::FCMP_FALSE)
+        Out << "0";
+      else if (CE->getPredicate() == FCmpInst::FCMP_TRUE)
+        Out << "1";
+      else {
+        Out << "llvm_fcmp_"
+            << getCmpPredicateName((CmpInst::Predicate)CE->getPredicate())
+            << "(";
+        printConstant(CE->getOperand(0), ContextCasted);
+        Out << ", ";
+        printConstant(CE->getOperand(1), ContextCasted);
+        Out << ")";
+      }
+      if (NeedsClosingParens)
+        Out << "))";
+      Out << ')';
+      return;
+    }
+    default:
 #ifndef NDEBUG
-                              errs() << "CWriter Error: Unhandled constant expression: "
-                                << *CE << "\n";
+      errs() << "CWriter Error: Unhandled constant expression: " << *CE << "\n";
 #endif
-                              llvm_unreachable(0);
+      llvm_unreachable(0);
     }
   } else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType()) {
     if (CPV->getType()->isVectorTy()) {
@@ -984,7 +1087,8 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
       Constant *Zero = Constant::getNullValue(VT->getElementType());
       unsigned NumElts = VT->getNumElements();
       for (unsigned i = 0; i != NumElts; ++i) {
-        if (i) Out << ", ";
+        if (i)
+          Out << ", ";
         printConstant(Zero, ContextCasted);
       }
       Out << ")";
@@ -998,9 +1102,10 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
   }
 
   if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
-    Type* Ty = CI->getType();
+    Type *Ty = CI->getType();
     unsigned ActiveBits = CI->getValue().getMinSignedBits();
-    DEBUG(errs() << "Here: " << *CI << ", " << *Ty << ", " << ActiveBits << "\n");
+    DEBUG(errs() << "Here: " << *CI << ", " << *Ty << ", " << ActiveBits
+                 << "\n");
     Out << CI->getSExtValue();
     //    if (Ty == Type::getInt1Ty(CPV->getContext())) {
     //      Out << (CI->getZExtValue() ? '1' : '0');
@@ -1013,7 +1118,8 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
     //      Out << CI->getSExtValue(); // most likely a shorter representation
     ////      if (ActiveBits >= 32)
     ////        Out << ")";
-    //    } else if (Ty->getPrimitiveSizeInBits() < 32 && Context == ContextNormal) {
+    //    } else if (Ty->getPrimitiveSizeInBits() < 32 &&
+    //               Context == ContextNormal) {
     //      Out << "((";
     //      printSimpleType(Out, Ty, false) << ')';
     //      if (CI->isMinValue(true))
@@ -1030,248 +1136,266 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
     ////      const APInt &V = CI->getValue();
     ////      const APInt &Vlo = V.getLoBits(64);
     ////      const APInt &Vhi = V.getHiBits(64);
-    ////      Out << (Context == ContextStatic ? "UINT128_C" : "llvm_ctor_u128");
-    ////      Out << "(UINT64_C(" << Vhi.getZExtValue() << "), UINT64_C(" << Vlo.getZExtValue() << "))";
+    ////      Out << (Context == ContextStatic ? "UINT128_C" : "llvm_ctor_u128");
+    ////      Out << "(UINT64_C(" << Vhi.getZExtValue() << "), UINT64_C("
+    ////          << Vlo.getZExtValue() << "))";
     //    }
     return;
   }
 
   switch (CPV->getType()->getTypeID()) {
-    case Type::FloatTyID:
-    case Type::DoubleTyID:
-    case Type::X86_FP80TyID:
-    case Type::PPC_FP128TyID:
-    case Type::FP128TyID: {
-                            ConstantFP *FPC = cast<ConstantFP>(CPV);
-                            std::map<const ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC);
-                            if (I != FPConstantMap.end()) {
-                              // Because of FP precision problems we must load from a stack allocated
-                              // value that holds the value in hex.
-                              Out << "(*(" << (FPC->getType() == Type::getFloatTy(CPV->getContext()) ?
-                                      "float" :
-                                      FPC->getType() == Type::getDoubleTy(CPV->getContext()) ?
-                                      "double" :
-                                      "long double")
-                                << "*)&FPConstant" << I->second << ')';
-                            } else {
-                              double V;
-                              if (FPC->getType() == Type::getFloatTy(CPV->getContext()))
-                                V = FPC->getValueAPF().convertToFloat();
-                              else if (FPC->getType() == Type::getDoubleTy(CPV->getContext()))
-                                V = FPC->getValueAPF().convertToDouble();
-                              else {
-                                // Long double.  Convert the number to double, discarding precision.
-                                // This is not awesome, but it at least makes the CBE output somewhat
-                                // useful.
-                                APFloat Tmp = FPC->getValueAPF();
-                                bool LosesInfo;
-                                Tmp.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &LosesInfo);
-                                V = Tmp.convertToDouble();
-                              }
-
-                              if (std::isnan(V)) {
-                                // The value is NaN
-
-                                // FIXME the actual NaN bits should be emitted.
-                                // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN,
-                                // it's 0x7ff4.
-                                const unsigned long QuietNaN = 0x7ff8UL;
-                                //const unsigned long SignalNaN = 0x7ff4UL;
-
-                                // We need to grab the first part of the FP #
-                                char Buffer[100];
-
-                                uint64_t ll = DoubleToBits(V);
-                                sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
-
-                                std::string Num(&Buffer[0], &Buffer[6]);
-                                unsigned long Val = strtoul(Num.c_str(), 0, 16);
-
-                                if (FPC->getType() == Type::getFloatTy(FPC->getContext()))
-                                  Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\""
-                                    << Buffer << "\") /*nan*/ ";
-                                else
-                                  Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\""
-                                    << Buffer << "\") /*nan*/ ";
-                              } else if (std::isinf(V)) {
-                                // The value is Inf
-                                if (V < 0) Out << '-';
-                                Out << "LLVM_INF" <<
-                                  (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F" : "")
-                                  << " /*inf*/ ";
-                              } else {
-                                std::string Num;
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  case Type::X86_FP80TyID:
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID: {
+    ConstantFP *FPC = cast<ConstantFP>(CPV);
+    std::map<const ConstantFP *, unsigned>::iterator I =
+        FPConstantMap.find(FPC);
+    if (I != FPConstantMap.end()) {
+      // Because of FP precision problems we must load from a stack allocated
+      // value that holds the value in hex.
+      Out << "(*("
+          << (FPC->getType() == Type::getFloatTy(CPV->getContext())
+                  ? "float"
+                  : FPC->getType() == Type::getDoubleTy(CPV->getContext())
+                        ? "double"
+                        : "long double")
+          << "*)&FPConstant" << I->second << ')';
+    } else {
+      double V;
+      if (FPC->getType() == Type::getFloatTy(CPV->getContext()))
+        V = FPC->getValueAPF().convertToFloat();
+      else if (FPC->getType() == Type::getDoubleTy(CPV->getContext()))
+        V = FPC->getValueAPF().convertToDouble();
+      else {
+        // Long double.  Convert the number to double, discarding precision.
+        // This is not awesome, but it at least makes the CBE output somewhat
+        // useful.
+        APFloat Tmp = FPC->getValueAPF();
+        bool LosesInfo;
+        Tmp.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &LosesInfo);
+        V = Tmp.convertToDouble();
+      }
+
+      if (std::isnan(V)) {
+        // The value is NaN
+
+        // FIXME the actual NaN bits should be emitted.
+        // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN,
+        // it's 0x7ff4.
+        const unsigned long QuietNaN = 0x7ff8UL;
+        // const unsigned long SignalNaN = 0x7ff4UL;
+
+        // We need to grab the first part of the FP #
+        char Buffer[100];
+
+        uint64_t ll = DoubleToBits(V);
+        sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
+
+        std::string Num(&Buffer[0], &Buffer[6]);
+        unsigned long Val = strtoul(Num.c_str(), 0, 16);
+
+        if (FPC->getType() == Type::getFloatTy(FPC->getContext()))
+          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\"" << Buffer
+              << "\") /*nan*/ ";
+        else
+          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\"" << Buffer
+              << "\") /*nan*/ ";
+      } else if (std::isinf(V)) {
+        // The value is Inf
+        if (V < 0)
+          Out << '-';
+        Out << "LLVM_INF"
+            << (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F"
+                                                                      : "")
+            << " /*inf*/ ";
+      } else {
+        std::string Num;
 #if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
-                                // Print out the constant as a floating point number.
-                                char Buffer[100];
-                                sprintf(Buffer, "%a", V);
-                                Num = Buffer;
+        // Print out the constant as a floating point number.
+        char Buffer[100];
+        sprintf(Buffer, "%a", V);
+        Num = Buffer;
 #else
-                                Num = ftostr(FPC->getValueAPF());
+        Num = ftostr(FPC->getValueAPF());
 #endif
-                                Out << Num;
-                              }
-                            }
-                            break;
-                          }
-
-    case Type::ArrayTyID: {
-                            if (printConstantString(CPV, Context)) break;
-                            ArrayType *AT = cast<ArrayType>(CPV->getType());
-                            assert(AT->getNumElements() != 0 && !isEmptyType(AT));
-                            if (Context != ContextStatic) {
-                              CtorDeclTypes.insert(AT);
-                              Out << "llvm_ctor_";
-                              printTypeString(Out, AT, false);
-                              Out << "(";
-                              Context = ContextCasted;
-                            } else {
-                              Out << "{ { "; // Arrays are wrapped in struct types.
-                            }
-                            if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) {
-                              printConstantArray(CA, Context);
-                            } else if (ConstantDataSequential *CDS =
-                                dyn_cast<ConstantDataSequential>(CPV)) {
-                              printConstantDataSequential(CDS, Context);
-                            } else {
-                              assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-                              Constant *CZ = Constant::getNullValue(AT->getElementType());
-                              printConstant(CZ, Context);
-                              for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) {
-                                Out << ", ";
-                                printConstant(CZ, Context);
-                              }
-                            }
-                            Out << (Context == ContextStatic ? " } }" : ")"); // Arrays are wrapped in struct types.
-                            break;
-                          }
-
-    case Type::VectorTyID: {
-                             VectorType *VT = cast<VectorType>(CPV->getType());
-                             assert(VT->getNumElements() != 0 && !isEmptyType(VT));
-                             if (Context != ContextStatic) {
-                               CtorDeclTypes.insert(VT);
-                               Out << "llvm_ctor_";
-                               printTypeString(Out, VT, false);
-                               Out << "(";
-                               Context = ContextCasted;
-                             } else {
-                               Out << "{ ";
-                             }
-                             if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
-                               printConstantVector(CV, Context);
-                             } else if (ConstantDataSequential *CDS =
-                                 dyn_cast<ConstantDataSequential>(CPV)) {
-                               printConstantDataSequential(CDS, Context);
-                             } else {
-                               assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-                               Constant *CZ = Constant::getNullValue(VT->getElementType());
-                               printConstant(CZ, Context);
-                               for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
-                                 Out << ", ";
-                                 printConstant(CZ, Context);
-                               }
-                             }
-                             Out << (Context == ContextStatic ? " }" : ")");
-                             break;
-                           }
-
-    case Type::StructTyID: {
-                             StructType *ST = cast<StructType>(CPV->getType());
-                             assert(!isEmptyType(ST));
-                             if (Context != ContextStatic) {
-                               CtorDeclTypes.insert(ST);
-                               Out << "llvm_ctor_";
-                               printTypeString(Out, ST, false);
-                               Out << "(";
-                               Context = ContextCasted;
-                             } else {
-                               Out << "{ ";
-                             }
-
-                             if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
-                               bool printed = false;
-                               for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
-                                 Type *ElTy = ST->getElementType(i);
-                                 if (isEmptyType(ElTy)) continue;
-                                 if (printed) Out << ", ";
-                                 printConstant(Constant::getNullValue(ElTy), Context);
-                                 printed = true;
-                               }
-                               assert(printed);
-                             } else {
-                               bool printed = false;
-                               for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
-                                 Constant *C = cast<Constant>(CPV->getOperand(i));
-                                 if (isEmptyType(C->getType())) continue;
-                                 if (printed) Out << ", ";
-                                 printConstant(C, Context);
-                                 printed = true;
-                               }
-                               assert(printed);
-                             }
-                             Out << (Context == ContextStatic ? " }" : ")");
-                             break;
-                           }
-
-    case Type::PointerTyID:
-                           if (isa<ConstantPointerNull>(CPV)) {
-                             Out << "((";
-                             printTypeName(Out, CPV->getType()); // sign doesn't matter
-                             Out << ")/*NULL*/0)";
-                             break;
-                           } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) {
-                             writeOperand(GV);
-                             break;
-                           }
-                           // FALL THROUGH
-    default:
+        Out << Num;
+      }
+    }
+    break;
+  }
+
+  case Type::ArrayTyID: {
+    if (printConstantString(CPV, Context))
+      break;
+    ArrayType *AT = cast<ArrayType>(CPV->getType());
+    assert(AT->getNumElements() != 0 && !isEmptyType(AT));
+    if (Context != ContextStatic) {
+      CtorDeclTypes.insert(AT);
+      Out << "llvm_ctor_";
+      printTypeString(Out, AT, false);
+      Out << "(";
+      Context = ContextCasted;
+    } else {
+      Out << "{ { "; // Arrays are wrapped in struct types.
+    }
+    if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) {
+      printConstantArray(CA, Context);
+    } else if (ConstantDataSequential *CDS =
+                   dyn_cast<ConstantDataSequential>(CPV)) {
+      printConstantDataSequential(CDS, Context);
+    } else {
+      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
+      Constant *CZ = Constant::getNullValue(AT->getElementType());
+      printConstant(CZ, Context);
+      for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) {
+        Out << ", ";
+        printConstant(CZ, Context);
+      }
+    }
+    Out << (Context == ContextStatic
+                ? " } }"
+                : ")"); // Arrays are wrapped in struct types.
+    break;
+  }
+
+  case Type::VectorTyID: {
+    VectorType *VT = cast<VectorType>(CPV->getType());
+    assert(VT->getNumElements() != 0 && !isEmptyType(VT));
+    if (Context != ContextStatic) {
+      CtorDeclTypes.insert(VT);
+      Out << "llvm_ctor_";
+      printTypeString(Out, VT, false);
+      Out << "(";
+      Context = ContextCasted;
+    } else {
+      Out << "{ ";
+    }
+    if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
+      printConstantVector(CV, Context);
+    } else if (ConstantDataSequential *CDS =
+                   dyn_cast<ConstantDataSequential>(CPV)) {
+      printConstantDataSequential(CDS, Context);
+    } else {
+      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
+      Constant *CZ = Constant::getNullValue(VT->getElementType());
+      printConstant(CZ, Context);
+      for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
+        Out << ", ";
+        printConstant(CZ, Context);
+      }
+    }
+    Out << (Context == ContextStatic ? " }" : ")");
+    break;
+  }
+
+  case Type::StructTyID: {
+    StructType *ST = cast<StructType>(CPV->getType());
+    assert(!isEmptyType(ST));
+    if (Context != ContextStatic) {
+      CtorDeclTypes.insert(ST);
+      Out << "llvm_ctor_";
+      printTypeString(Out, ST, false);
+      Out << "(";
+      Context = ContextCasted;
+    } else {
+      Out << "{ ";
+    }
+
+    if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
+      bool printed = false;
+      for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+        Type *ElTy = ST->getElementType(i);
+        if (isEmptyType(ElTy))
+          continue;
+        if (printed)
+          Out << ", ";
+        printConstant(Constant::getNullValue(ElTy), Context);
+        printed = true;
+      }
+      assert(printed);
+    } else {
+      bool printed = false;
+      for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
+        Constant *C = cast<Constant>(CPV->getOperand(i));
+        if (isEmptyType(C->getType()))
+          continue;
+        if (printed)
+          Out << ", ";
+        printConstant(C, Context);
+        printed = true;
+      }
+      assert(printed);
+    }
+    Out << (Context == ContextStatic ? " }" : ")");
+    break;
+  }
+
+  case Type::PointerTyID:
+    if (isa<ConstantPointerNull>(CPV)) {
+      Out << "((";
+      printTypeName(Out, CPV->getType()); // sign doesn't matter
+      Out << ")/*NULL*/0)";
+      break;
+    } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) {
+      writeOperand(GV);
+      break;
+    }
+    // FALL THROUGH
+  default:
 #ifndef NDEBUG
-                           errs() << "Unknown constant type: " << *CPV << "\n";
+    errs() << "Unknown constant type: " << *CPV << "\n";
 #endif
-                           llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
 // Some constant expressions need to be casted back to the original types
 // because their operands were casted to the expected type. This function takes
 // care of detecting that case and printing the cast for the ConstantExpr.
-bool CWriter::printConstExprCast(ConstantExpr* CE) {
+bool CWriter::printConstExprCast(ConstantExpr *CE) {
   bool NeedsExplicitCast = false;
   Type *Ty = CE->getOperand(0)->getType();
   bool TypeIsSigned = false;
   switch (CE->getOpcode()) {
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Mul:
-      // We need to cast integer arithmetic so that it is always performed
-      // as unsigned, to avoid undefined behavior on overflow.
-    case Instruction::LShr:
-    case Instruction::URem:
-    case Instruction::UDiv: NeedsExplicitCast = true; break;
-    case Instruction::AShr:
-    case Instruction::SRem:
-    case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break;
-    case Instruction::SExt:
-                            Ty = CE->getType();
-                            NeedsExplicitCast = true;
-                            TypeIsSigned = true;
-                            break;
-    case Instruction::ZExt:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::FPExt:
-    case Instruction::UIToFP:
-    case Instruction::SIToFP:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::BitCast:
-                            Ty = CE->getType();
-                            NeedsExplicitCast = true;
-                            break;
-    default: break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // We need to cast integer arithmetic so that it is always performed
+    // as unsigned, to avoid undefined behavior on overflow.
+  case Instruction::LShr:
+  case Instruction::URem:
+  case Instruction::UDiv:
+    NeedsExplicitCast = true;
+    break;
+  case Instruction::AShr:
+  case Instruction::SRem:
+  case Instruction::SDiv:
+    NeedsExplicitCast = true;
+    TypeIsSigned = true;
+    break;
+  case Instruction::SExt:
+    Ty = CE->getType();
+    NeedsExplicitCast = true;
+    TypeIsSigned = true;
+    break;
+  case Instruction::ZExt:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    Ty = CE->getType();
+    NeedsExplicitCast = true;
+    break;
+  default:
+    break;
   }
   if (NeedsExplicitCast) {
     Out << "((";
@@ -1284,11 +1408,13 @@ bool CWriter::printConstExprCast(ConstantExpr* CE) {
 //  Print a constant assuming that it is the operand for a given Opcode. The
 //  opcodes that care about sign need to cast their operands to the expected
 //  type before the operation proceeds. This function does the casting.
-void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
+void CWriter::printConstantWithCast(Constant *CPV, unsigned Opcode) {
 
   // Extract the operand's type, we'll need it.
-  Type* OpTy = CPV->getType();
-  assert(OpTy->isIntegerTy() || OpTy->isFloatingPointTy()); // TODO: VectorType are valid here, but not supported
+  Type *OpTy = CPV->getType();
+  assert(OpTy->isIntegerTy() ||
+         OpTy->isFloatingPointTy()); // TODO: VectorType are valid here, but not
+                                     // supported
 
   // Indicate whether to do the cast or not.
   bool shouldCast;
@@ -1331,8 +1457,7 @@ std::string CWriter::GetValueName(Value *Operand) {
   std::string VarName;
   VarName.reserve(Name.capacity());
 
-  for (std::string::iterator I = Name.begin(), E = Name.end();
-      I != E; ++I) {
+  for (std::string::iterator I = Name.begin(), E = Name.end(); I != E; ++I) {
     unsigned char ch = *I;
 
     if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
@@ -1356,7 +1481,7 @@ void CWriter::writeInstComputationInline(Instruction &I) {
   unsigned mask = 0;
   Type *Ty = I.getType();
   if (Ty->isIntegerTy()) {
-    IntegerType *ITy = static_cast<IntegerType*>(Ty);
+    IntegerType *ITy = static_cast<IntegerType *>(Ty);
     if (!ITy->isPowerOf2ByteWidth())
       mask = ITy->getBitMask();
   }
@@ -1374,20 +1499,21 @@ void CWriter::writeInstComputationInline(Instruction &I) {
     Out << ")&" << mask << ")";
 }
 
-
-void CWriter::writeOperandInternal(Value *Operand, enum OperandContext Context) {
-  DEBUG(errs() << "In write operand internal: " << *Operand << "\n"); 
+void CWriter::writeOperandInternal(Value *Operand,
+                                   enum OperandContext Context) {
+  DEBUG(errs() << "In write operand internal: " << *Operand << "\n");
   if (Instruction *I = dyn_cast<Instruction>(Operand))
     // Should we inline this instruction to build a tree?
     if (isInlinableInst(*I) && !isDirectAlloca(I)) {
-      DEBUG(errs() << "isInlinableInst & NOT isDirectAlloca\n" << "\n");
+      DEBUG(errs() << "isInlinableInst & NOT isDirectAlloca\n"
+                   << "\n");
       Out << '(';
       writeInstComputationInline(*I);
       Out << ')';
       return;
     }
 
-  Constant* CPV = dyn_cast<Constant>(Operand);
+  Constant *CPV = dyn_cast<Constant>(Operand);
 
   if (CPV && !isa<GlobalValue>(CPV))
     printConstant(CPV, Context);
@@ -1395,12 +1521,14 @@ void CWriter::writeOperandInternal(Value *Operand, enum OperandContext Context)
     Out << GetValueName(Operand);
 }
 
-void CWriter::writeOperand(Value *Operand, enum OperandContext Context, bool arrayAccess) {
-  DEBUG(errs() << "In write operand: " << *Operand << "; ArrayAccess = " << arrayAccess << "\n");
+void CWriter::writeOperand(Value *Operand, enum OperandContext Context,
+                           bool arrayAccess) {
+  DEBUG(errs() << "In write operand: " << *Operand
+               << "; ArrayAccess = " << arrayAccess << "\n");
   bool isAddressImplicit = isAddressExposed(Operand);
   if (isAddressImplicit && !arrayAccess) {
     DEBUG(errs() << "isAddressImplicit & NOT arrayAccess!\n");
-    Out << "(&";  // Global variables are referenced as their addresses by llvm
+    Out << "(&"; // Global variables are referenced as their addresses by llvm
   }
   writeOperandInternal(Operand, Context);
 
@@ -1429,26 +1557,27 @@ void CWriter::writeOperandDeref(Value *Operand) {
 bool CWriter::writeInstructionCast(Instruction &I) {
   Type *Ty = I.getOperand(0)->getType();
   switch (I.getOpcode()) {
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Mul:
-      // We need to cast integer arithmetic so that it is always performed
-      // as unsigned, to avoid undefined behavior on overflow.
-    case Instruction::LShr:
-    case Instruction::URem:
-    case Instruction::UDiv:
-      Out << "((";
-      printSimpleType(Out, Ty, false);
-      Out << ")(";
-      return true;
-    case Instruction::AShr:
-    case Instruction::SRem:
-    case Instruction::SDiv:
-      Out << "((";
-      printSimpleType(Out, Ty, true);
-      Out << ")(";
-      return true;
-    default: break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // We need to cast integer arithmetic so that it is always performed
+    // as unsigned, to avoid undefined behavior on overflow.
+  case Instruction::LShr:
+  case Instruction::URem:
+  case Instruction::UDiv: // all six opcodes above share this cast prefix
+    Out << "(("; // emits "((<type>)(" and leaves two parentheses open
+    printSimpleType(Out, Ty, false); // NOTE(review): false presumably = unsigned
+    Out << ")(";
+    return true; // a cast prefix was written
+  case Instruction::AShr:
+  case Instruction::SRem:
+  case Instruction::SDiv: // same prefix shape as above, other type flag
+    Out << "((";
+    printSimpleType(Out, Ty, true); // NOTE(review): true presumably = signed
+    Out << ")(";
+    return true; // a cast prefix was written
+  default:
+    break; // any other opcode: nothing is printed, false is returned below
   }
   return false;
 }
@@ -1456,7 +1585,8 @@ bool CWriter::writeInstructionCast(Instruction &I) {
 // Write the operand with a cast to another type based on the Opcode being used.
 // This will be used in cases where an instruction has specific type
 // requirements (usually signedness) for its operands.
-void CWriter::opcodeNeedsCast(unsigned Opcode,
+void CWriter::opcodeNeedsCast(
+    unsigned Opcode,
     // Indicate whether to do the cast or not.
     bool &shouldCast,
     // Indicate whether the cast should be to a signed type or not.
@@ -1466,33 +1596,33 @@ void CWriter::opcodeNeedsCast(unsigned Opcode,
   // the new type to which the operand should be casted by setting the value
   // of OpTy. If we change OpTy, also set shouldCast to true.
   switch (Opcode) {
-    default:
-      // for most instructions, it doesn't matter
-      shouldCast = false;
-      castIsSigned = false;
-      break;
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Mul:
-      // We need to cast integer arithmetic so that it is always performed
-      // as unsigned, to avoid undefined behavior on overflow.
-    case Instruction::LShr:
-    case Instruction::UDiv:
-    case Instruction::URem: // Cast to unsigned first
-      shouldCast = true;
-      castIsSigned = false;
-      break;
-    case Instruction::GetElementPtr:
-    case Instruction::AShr:
-    case Instruction::SDiv:
-    case Instruction::SRem: // Cast to signed first
-      shouldCast = true;
-      castIsSigned = true;
-      break;
+  default:
+    // for most instructions, it doesn't matter
+    shouldCast = false;
+    castIsSigned = false;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // We need to cast integer arithmetic so that it is always performed
+    // as unsigned, to avoid undefined behavior on overflow.
+  case Instruction::LShr:
+  case Instruction::UDiv:
+  case Instruction::URem: // Cast to unsigned first
+    shouldCast = true;
+    castIsSigned = false;
+    break;
+  case Instruction::GetElementPtr:
+  case Instruction::AShr:
+  case Instruction::SDiv:
+  case Instruction::SRem: // Cast to signed first
+    shouldCast = true;
+    castIsSigned = true;
+    break;
   }
 }
 
-void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
+void CWriter::writeOperandWithCast(Value *Operand, unsigned Opcode) {
   DEBUG(errs() << "Here: " << *Operand << "\n");
   // Write out the casted operand if we should, otherwise just write the
   // operand.
@@ -1510,12 +1640,12 @@ void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
   //    writeOperand(Operand, ContextCasted);
   //    Out << ")";
   //  } else
-  writeOperand(Operand, ContextNormal/*ContextCasted*/);
+  writeOperand(Operand, ContextNormal /*ContextCasted*/);
 }
 
 // Write the operand with a cast to another type based on the icmp predicate
 // being used.
-void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) {
+void CWriter::writeOperandWithCast(Value *Operand, ICmpInst &Cmp) {
   // This has to do a cast to ensure the operand has the right signedness.
   // Also, if the operand is a pointer, we make sure to cast to an integer when
   // doing the comparison both for signedness and so that the C compiler doesn't
@@ -1534,7 +1664,7 @@ void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) {
   bool castIsSigned = Cmp.isSigned();
 
   // If the operand was a pointer, convert to a large integer type.
-  Type* OpTy = Operand->getType();
+  Type *OpTy = Operand->getType();
   if (OpTy->isPointerTy())
     OpTy = TD->getIntPtrType(Operand->getContext());
 
@@ -1548,61 +1678,64 @@ void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) {
 // generateCompilerSpecificCode - This is where we add conditional compilation
 // directives to cater to specific compilers as need be.
 //
-static void generateCompilerSpecificCode(raw_ostream& Out,
-    const DataLayout *TD) {
+static void generateCompilerSpecificCode(raw_ostream &Out,
+                                         const DataLayout *TD) {
   // Alloca is hard to get, and we don't want to include stdlib.h here.
   Out << "/* get a declaration for alloca */\n"
-    << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n"
-    << "#define  alloca(x) __builtin_alloca((x))\n"
-    << "#define _alloca(x) __builtin_alloca((x))\n"
-    << "#elif defined(__APPLE__)\n"
-    << "extern void *__builtin_alloca(unsigned long);\n"
-    << "#define alloca(x) __builtin_alloca(x)\n"
-    << "#define longjmp _longjmp\n"
-    << "#define setjmp _setjmp\n"
-    << "#elif defined(__sun__)\n"
-    << "#if defined(__sparcv9)\n"
-    << "extern void *__builtin_alloca(unsigned long);\n"
-    << "#else\n"
-    << "extern void *__builtin_alloca(unsigned int);\n"
-    << "#endif\n"
-    << "#define alloca(x) __builtin_alloca(x)\n"
-    << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n"
-    << "#define alloca(x) __builtin_alloca(x)\n"
-    << "#elif defined(_MSC_VER)\n"
-    << "#define alloca(x) _alloca(x)\n"
-    << "#else\n"
-    << "#include <alloca.h>\n"
-    << "#endif\n\n";
+      << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n"
+      << "#define  alloca(x) __builtin_alloca((x))\n"
+      << "#define _alloca(x) __builtin_alloca((x))\n"
+      << "#elif defined(__APPLE__)\n"
+      << "extern void *__builtin_alloca(unsigned long);\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#define longjmp _longjmp\n"
+      << "#define setjmp _setjmp\n"
+      << "#elif defined(__sun__)\n"
+      << "#if defined(__sparcv9)\n"
+      << "extern void *__builtin_alloca(unsigned long);\n"
+      << "#else\n"
+      << "extern void *__builtin_alloca(unsigned int);\n"
+      << "#endif\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || "
+         "defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#elif defined(_MSC_VER)\n"
+      << "#define alloca(x) _alloca(x)\n"
+      << "#else\n"
+      << "#include <alloca.h>\n"
+      << "#endif\n\n";
 
   // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))".
   Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
-    << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n"
-    << "#elif defined(__GNUC__)\n"
-    << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n"
-    << "#else\n"
-    << "#define __EXTERNAL_WEAK__\n"
-    << "#endif\n\n";
+      << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n"
+      << "#elif defined(__GNUC__)\n"
+      << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n"
+      << "#else\n"
+      << "#define __EXTERNAL_WEAK__\n"
+      << "#endif\n\n";
 
   // For now, turn off the weak linkage attribute on Mac OS X. (See above.)
   Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
-    << "#define __ATTRIBUTE_WEAK__\n"
-    << "#elif defined(__GNUC__)\n"
-    << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n"
-    << "#else\n"
-    << "#define __ATTRIBUTE_WEAK__\n"
-    << "#endif\n\n";
+      << "#define __ATTRIBUTE_WEAK__\n"
+      << "#elif defined(__GNUC__)\n"
+      << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n"
+      << "#else\n"
+      << "#define __ATTRIBUTE_WEAK__\n"
+      << "#endif\n\n";
 
   // Add hidden visibility support. FIXME: APPLE_CC?
   Out << "#if defined(__GNUC__)\n"
-    << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n"
-    << "#endif\n\n";
+      << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n"
+      << "#endif\n\n";
 
   // Define unaligned-load helper macro
   Out << "#ifdef _MSC_VER\n";
-  Out << "#define __UNALIGNED_LOAD__(type, align, op) *((type __unaligned*)op)\n";
+  Out << "#define __UNALIGNED_LOAD__(type, align, op) *((type "
+         "__unaligned*)op)\n";
   Out << "#else\n";
-  Out << "#define __UNALIGNED_LOAD__(type, align, op) ((struct { type data __attribute__((packed, aligned(align))); }*)op)->data\n";
+  Out << "#define __UNALIGNED_LOAD__(type, align, op) ((struct { type data "
+         "__attribute__((packed, aligned(align))); }*)op)->data\n";
   Out << "#endif\n\n";
 
   // Define unaligned-load helper macro
@@ -1653,110 +1786,144 @@ static void generateCompilerSpecificCode(raw_ostream& Out,
   //
   // Similar to __builtin_inf, except the return type is float.
   Out << "#ifdef __GNUC__\n"
-    << "#define LLVM_NAN(NanStr)   __builtin_nan(NanStr)   /* Double */\n"
-    << "#define LLVM_NANF(NanStr)  __builtin_nanf(NanStr)  /* Float */\n"
-    //<< "#define LLVM_NANS(NanStr)  __builtin_nans(NanStr)  /* Double */\n"
-    //<< "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
-    << "#define LLVM_INF           __builtin_inf()         /* Double */\n"
-    << "#define LLVM_INFF          __builtin_inff()        /* Float */\n"
-    << "#define LLVM_PREFETCH(addr,rw,locality) "
-    "__builtin_prefetch(addr,rw,locality)\n"
-    << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
-    << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
-    << "#else\n"
-    << "#define LLVM_NAN(NanStr)   ((double)NAN)           /* Double */\n"
-    << "#define LLVM_NANF(NanStr)  ((float)NAN))           /* Float */\n"
-    //<< "#define LLVM_NANS(NanStr)  ((double)NAN)           /* Double */\n"
-    //<< "#define LLVM_NANSF(NanStr) ((single)NAN)           /* Float */\n"
-    << "#define LLVM_INF           ((double)INFINITY)      /* Double */\n"
-    << "#define LLVM_INFF          ((float)INFINITY)       /* Float */\n"
-    << "#define LLVM_PREFETCH(addr,rw,locality)            /* PREFETCH */\n"
-    << "#define __ATTRIBUTE_CTOR__ \"__attribute__((constructor)) not supported on this compiler\"\n"
-    << "#define __ATTRIBUTE_DTOR__ \"__attribute__((destructor)) not supported on this compiler\"\n"
-    << "#endif\n\n";
-
-  Out << "#if !defined(__GNUC__) || __GNUC__ < 4 /* Old GCC's, or compilers not GCC */ \n"
-    << "#define __builtin_stack_save() 0   /* not implemented */\n"
-    << "#define __builtin_stack_restore(X) /* noop */\n"
-    << "#endif\n\n";
+      << "#define LLVM_NAN(NanStr)   __builtin_nan(NanStr)   /* Double */\n"
+      << "#define LLVM_NANF(NanStr)  __builtin_nanf(NanStr)  /* Float */\n"
+      //<< "#define LLVM_NANS(NanStr)  __builtin_nans(NanStr)  /* Double */\n"
+      //<< "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
+      << "#define LLVM_INF           __builtin_inf()         /* Double */\n"
+      << "#define LLVM_INFF          __builtin_inff()        /* Float */\n"
+      << "#define LLVM_PREFETCH(addr,rw,locality) "
+         "__builtin_prefetch(addr,rw,locality)\n"
+      << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
+      << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
+      << "#else\n" // Fallback for non-GNU compilers: plain NAN / INFINITY.
+      << "#define LLVM_NAN(NanStr)   ((double)NAN)           /* Double */\n"
+      << "#define LLVM_NANF(NanStr)  ((float)NAN)            /* Float */\n"
+      //<< "#define LLVM_NANS(NanStr)  ((double)NAN)           /* Double */\n"
+      //<< "#define LLVM_NANSF(NanStr) ((single)NAN)           /* Float */\n"
+      << "#define LLVM_INF           ((double)INFINITY)      /* Double */\n"
+      << "#define LLVM_INFF          ((float)INFINITY)       /* Float */\n"
+      << "#define LLVM_PREFETCH(addr,rw,locality)            /* PREFETCH */\n"
+      << "#define __ATTRIBUTE_CTOR__ \"__attribute__((constructor)) not "
+         "supported on this compiler\"\n"
+      << "#define __ATTRIBUTE_DTOR__ \"__attribute__((destructor)) not "
+         "supported on this compiler\"\n"
+      << "#endif\n\n";
+
+  Out << "#if !defined(__GNUC__) || __GNUC__ < 4 /* Old GCC's, or compilers "
+         "not GCC */ \n"
+      << "#define __builtin_stack_save() 0   /* not implemented */\n"
+      << "#define __builtin_stack_restore(X) /* noop */\n"
+      << "#endif\n\n";
 
   // Output typedefs for 128-bit integers
-  Out << "#if defined(__GNUC__) && defined(__LP64__) /* 128-bit integer types */\n"
-    << "typedef int __attribute__((mode(TI))) int128_t;\n"
-    << "typedef unsigned __attribute__((mode(TI))) uint128_t;\n"
-    << "#define UINT128_C(hi, lo) (((uint128_t)(hi) << 64) | (uint128_t)(lo))\n"
-    << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {"
-    << " return UINT128_C(hi, lo); }\n"
-    << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t r) {"
-    << " return l == r; }\n"
-    << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t r) {"
-    << " return l != r; }\n"
-    << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t r) {"
-    << " return l <= r; }\n"
-    << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) {"
-    << " return l <= r; }\n"
-    << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t r) {"
-    << " return l >= r; }\n"
-    << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) {"
-    << " return l >= r; }\n"
-    << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t r) {"
-    << " return l < r; }\n"
-    << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) {"
-    << " return l < r; }\n"
-    << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t r) {"
-    << " return l > r; }\n"
-    << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) {"
-    << " return l > r; }\n"
-
-    << "#else /* manual 128-bit types */\n"
-    // TODO: field order should be reversed for big-endian
-    << "typedef struct { ulong lo; ulong hi; } uint128_t;\n"
-    << "typedef uint128_t int128_t;\n"
-    << "#define UINT128_C(hi, lo) {(lo), (hi)}\n" // only use in Static context
-    << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {"
-    << " uint128_t r; r.lo = lo; r.hi = hi; return r; }\n"
-    << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi == r.hi && l.lo == r.lo; }\n"
-    << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi != r.hi || l.lo != r.lo; }\n"
-    << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo <= l.lo : 0); }\n"
-    << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) {"
-    << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo <= (long)l.lo : 0); }\n"
-    << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo >= l.hi : 0); }\n"
-    << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) {"
-    << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo >= (long)l.lo : 0); }\n"
-    << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo < l.hi : 0); }\n"
-    << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) {"
-    << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo < (long)l.lo : 0); }\n"
-    << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo > l.hi : 0); }\n"
-    << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) {"
-    << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo > (long)l.lo : 0); }\n"
-    << "#define __emulate_i128\n"
-    << "#endif\n\n";
+  Out << "#if defined(__GNUC__) && defined(__LP64__) /* 128-bit integer types "
+         "*/\n"
+      << "typedef int __attribute__((mode(TI))) int128_t;\n"
+      << "typedef unsigned __attribute__((mode(TI))) uint128_t;\n"
+      << "#define UINT128_C(hi, lo) (((uint128_t)(hi) << 64) | "
+         "(uint128_t)(lo))\n"
+      << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {"
+      << " return UINT128_C(hi, lo); }\n"
+      << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l == r; }\n"
+      << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l != r; }\n"
+      << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l <= r; }\n"
+      << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) "
+         "{"
+      << " return l <= r; }\n"
+      << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l >= r; }\n"
+      << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) "
+         "{"
+      << " return l >= r; }\n"
+      << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l < r; }\n"
+      << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) "
+         "{"
+      << " return l < r; }\n"
+      << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l > r; }\n"
+      << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) "
+         "{"
+      << " return l > r; }\n"
+      // Emulation: hi words decide; on a tie the lo words compare unsigned.
+      << "#else /* manual 128-bit types */\n"
+      // TODO: field order should be reversed for big-endian
+      << "typedef struct { ulong lo; ulong hi; } uint128_t;\n"
+      << "typedef uint128_t int128_t;\n"
+      << "#define UINT128_C(hi, lo) {(lo), (hi)}\n" // only use in Static
+                                                    // context
+      << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {"
+      << " uint128_t r; r.lo = lo; r.hi = hi; return r; }\n"
+      << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi == r.hi && l.lo == r.lo; }\n"
+      << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi != r.hi || l.lo != r.lo; }\n"
+      << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo <= r.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) "
+         "{"
+      << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? l.lo <= "
+         "r.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo >= r.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) "
+         "{"
+      << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? l.lo >= "
+         "r.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo < r.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) "
+         "{"
+      << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? l.lo < "
+         "r.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo > r.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) "
+         "{"
+      << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? l.lo > "
+         "r.lo : 0); }\n"
+      << "#define __emulate_i128\n"
+      << "#endif\n\n";
 
   // We output GCC specific attributes to preserve 'linkonce'ness on globals.
   // If we aren't being compiled with GCC, just drop these attributes.
   Out << "#ifdef _MSC_VER  /* Can only support \"linkonce\" vars with GCC */\n"
-    << "#define __attribute__(X)\n"
-    << "#endif\n\n";
+      << "#define __attribute__(X)\n"
+      << "#endif\n\n";
 }
 
 /// FindStaticTors - Given a static ctor/dtor list, unpack its contents into
 /// the StaticTors set.
-static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){
+static void FindStaticTors(GlobalVariable *GV,
+                           std::set<Function *> &StaticTors) {
   ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
-  if (!InitList) return;
+  if (!InitList)
+    return;
 
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
-    if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){
-      if (CS->getNumOperands() != 2) return;  // Not array of 2-element structs.
+    if (ConstantStruct *CS =
+            dyn_cast<ConstantStruct>(InitList->getOperand(i))) {
+      if (CS->getNumOperands() != 2)
+        return; // Not array of 2-element structs.
 
       if (CS->getOperand(1)->isNullValue())
-        return;  // Found a null terminator, exit printing.
+        return; // Found a null terminator, exit printing.
       Constant *FP = CS->getOperand(1);
       if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP))
         if (CE->isCast())
@@ -1768,7 +1935,8 @@ static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){
 
 enum SpecialGlobalClass {
   NotSpecial = 0,
-  GlobalCtors, GlobalDtors,
+  GlobalCtors,
+  GlobalDtors,
   NotPrinted
 };
 
@@ -1785,8 +1953,7 @@ static SpecialGlobalClass getGlobalVariableClass(GlobalVariable *GV) {
 
   // Otherwise, if it is other metadata, don't print it.  This catches things
   // like debug information.
-  if (StringRef(GV->getSection()) == "llvm.metadata")
-  {
+  if (StringRef(GV->getSection()) == "llvm.metadata") {
     DEBUG(errs() << "Printing Metada!\n" << *GV << "\n");
     return NotPrinted;
   }
@@ -1796,7 +1963,7 @@ static SpecialGlobalClass getGlobalVariableClass(GlobalVariable *GV) {
 // PrintEscapedString - Print each character of the specified string, escaping
 // it if it is not printable or if it is an escape char.
 static void PrintEscapedString(const char *Str, unsigned Length,
-    raw_ostream &Out) {
+                               raw_ostream &Out) {
   for (unsigned i = 0; i != Length; ++i) {
     unsigned char C = Str[i];
     if (isprint(C) && C != '\\' && C != '"')
@@ -1823,9 +1990,10 @@ bool CWriter::doInitialization(Module &M) {
 
   TD = new DataLayout(&M);
   IL = new IntrinsicLowering(*TD);
-  // CHECK: Looking at lib/CodeGen/IntrinsicsLowering.cpp this func not supported
-  // This func creates defs which are created once each call is referenced anyway
-  //IL->AddPrototypes(M);
+  // CHECK: Looking at lib/CodeGen/IntrinsicsLowering.cpp, this func is not
+  // supported. This func creates defs which are created once each call is
+  // referenced anyway.
+  // IL->AddPrototypes(M);
 
 #if 0
   std::string Triple = TheModule->getTargetTriple();
@@ -1837,7 +2005,7 @@ bool CWriter::doInitialization(Module &M) {
     TAsm = Match->createMCAsmInfo(Triple);
 #endif
   TAsm = new CBEMCAsmInfo();
-  MRI  = new MCRegisterInfo();
+  MRI = new MCRegisterInfo();
   TCtx = new MCContext(TAsm, MRI, NULL);
   return false;
 }
@@ -1883,17 +2051,18 @@ bool CWriter::doFinalization(Module &M) {
 void CWriter::generateHeader(Module &M) {
   // Keep track of which functions are static ctors/dtors so they can have
   // an attribute added to their prototypes.
-  std::set<Function*> StaticCtors, StaticDtors;
-  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-      I != E; ++I) {
+  std::set<Function *> StaticCtors, StaticDtors;
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
+       ++I) {
     switch (getGlobalVariableClass(&*I)) {
-      default: break;
-      case GlobalCtors:
-               FindStaticTors(&*I, StaticCtors);
-               break;
-      case GlobalDtors:
-               FindStaticTors(&*I, StaticDtors);
-               break;
+    default:
+      break;
+    case GlobalCtors:
+      FindStaticTors(&*I, StaticCtors);
+      break;
+    case GlobalDtors:
+      FindStaticTors(&*I, StaticDtors);
+      break;
     }
   }
 
@@ -1903,8 +2072,9 @@ void CWriter::generateHeader(Module &M) {
   //  Out << "#include <setjmp.h>\n";      // Unwind support
   //  Out << "#include <limits.h>\n";      // With overflow intrinsics support.
   //  Out << "#include <stdint.h>\n";      // Sized integer support
-  //  Out << "#include <math.h>\n";        // definitions for some math functions and numeric constants
-  //  Out << "#include <APInt-C.h>\n";     // Implementations of many llvm intrinsics
+  //  Out << "#include <math.h>\n";        // definitions for some math
+  //  functions and numeric constants
+  //  Out << "#include <APInt-C.h>\n";     // Implementations of many llvm intrinsics
   //  // Provide a definition for `bool' if not compiling with a C++ compiler.
   //  Out << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n";
   //  Out << "\n";
@@ -1912,24 +2082,24 @@ void CWriter::generateHeader(Module &M) {
   //  generateCompilerSpecificCode(Out, TD);
 
   Out << "\n\n/* Support for floating point constants */\n"
-    << "typedef ulong ConstantDoubleTy;\n"
-    << "typedef uint ConstantFloatTy;\n"
-    << "typedef struct { ulong f1; ushort f2; "
-    "ushort pad[3]; } ConstantFP80Ty;\n"
-    // This is used for both kinds of 128-bit long double; meaning differs.
-    << "typedef struct { ulong f1; ulong f2; }"
-    " ConstantFP128Ty;\n"
-    << "\n\n/* OpenCL Pragmas */\n"
-    << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-    << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
-    << "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
-    << "\n\n/* Global Declarations */\n";
+      << "typedef ulong ConstantDoubleTy;\n"
+      << "typedef uint ConstantFloatTy;\n"
+      << "typedef struct { ulong f1; ushort f2; "
+         "ushort pad[3]; } ConstantFP80Ty;\n"
+      // This is used for both kinds of 128-bit long double; meaning differs.
+      << "typedef struct { ulong f1; ulong f2; }"
+         " ConstantFP128Ty;\n"
+      << "\n\n/* OpenCL Pragmas */\n"
+      << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
+      << "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
+      << "\n\n/* Global Declarations */\n";
 
   // First output all the declarations for the program, because C requires
   // Functions & globals to be declared before they are used.
   if (!M.getModuleInlineAsm().empty()) {
     Out << "\n/* Module asm statements */\n"
-      << "__asm__ (";
+        << "__asm__ (";
 
     // Split the string into lines, to make it easier to read the .ll file.
     std::string Asm = M.getModuleInlineAsm();
@@ -1939,22 +2109,22 @@ void CWriter::generateHeader(Module &M) {
       // We found a newline, print the portion of the asm string from the
       // last newline up to this newline.
       Out << "\"";
-      PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine),
-          Out);
+      PrintEscapedString(
+          std::string(Asm.begin() + CurPos, Asm.begin() + NewLine), Out);
       Out << "\\n\"\n";
-      CurPos = NewLine+1;
+      CurPos = NewLine + 1;
       NewLine = Asm.find_first_of('\n', CurPos);
     }
     Out << "\"";
-    PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out);
+    PrintEscapedString(std::string(Asm.begin() + CurPos, Asm.end()), Out);
     Out << "\");\n"
-      << "/* End Module asm statements */\n";
+        << "/* End Module asm statements */\n";
   }
 
   // collect any remaining types
   raw_null_ostream NullOut;
-  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-      I != E; ++I) {
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
+       ++I) {
     // Ignore special globals, such as debug info.
     if (getGlobalVariableClass(&*I))
       continue;
@@ -1966,8 +2136,9 @@ void CWriter::generateHeader(Module &M) {
   if (!M.global_empty()) {
     Out << "\n/* External Global Variable Declarations */\n";
     for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-        I != E; ++I) {
-      if (!I->isDeclaration() || isEmptyType(I->getType()->getPointerElementType()))
+         I != E; ++I) {
+      if (!I->isDeclaration() ||
+          isEmptyType(I->getType()->getPointerElementType()))
         continue;
 
       if (I->hasDLLImportStorageClass())
@@ -1987,8 +2158,8 @@ void CWriter::generateHeader(Module &M) {
 
       Type *ElTy = I->getType()->getElementType();
       unsigned Alignment = I->getAlignment();
-      bool IsOveraligned = Alignment &&
-        Alignment > TD->getABITypeAlignment(ElTy);
+      bool IsOveraligned =
+          Alignment && Alignment > TD->getABITypeAlignment(ElTy);
       //      if (IsOveraligned)
       //        Out << "__MSALIGN__(" << Alignment << ") ";
       printTypeName(Out, ElTy, false) << ' ' << GetValueName(&*I);
@@ -2005,64 +2176,53 @@ void CWriter::generateHeader(Module &M) {
   Out << "\n/* Function Declarations */\n";
 
   // Store the intrinsics which will be declared/defined below.
-  SmallVector<Function*, 16> intrinsicsToDefine;
+  SmallVector<Function *, 16> intrinsicsToDefine;
 
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
     // Don't print declarations for intrinsic functions.
     // Store the used intrinsics, which need to be explicitly defined.
     if (I->isIntrinsic()) {
       switch (I->getIntrinsicID()) {
-        default:
-          continue;
-        case Intrinsic::uadd_with_overflow:
-        case Intrinsic::sadd_with_overflow:
-        case Intrinsic::usub_with_overflow:
-        case Intrinsic::ssub_with_overflow:
-        case Intrinsic::umul_with_overflow:
-        case Intrinsic::smul_with_overflow:
-        case Intrinsic::bswap:
-        case Intrinsic::ceil:
-        case Intrinsic::ctlz:
-        case Intrinsic::ctpop:
-        case Intrinsic::cttz:
-        case Intrinsic::fabs:
-        case Intrinsic::floor:
-        case Intrinsic::fma:
-        case Intrinsic::fmuladd:
-        case Intrinsic::pow:
-        case Intrinsic::powi:
-        case Intrinsic::rint:
-        case Intrinsic::sqrt:
-        case Intrinsic::trunc:
-          intrinsicsToDefine.push_back(&*I);
-          continue;
+      default:
+        continue;
+      case Intrinsic::uadd_with_overflow:
+      case Intrinsic::sadd_with_overflow:
+      case Intrinsic::usub_with_overflow:
+      case Intrinsic::ssub_with_overflow:
+      case Intrinsic::umul_with_overflow:
+      case Intrinsic::smul_with_overflow:
+      case Intrinsic::bswap:
+      case Intrinsic::ceil:
+      case Intrinsic::ctlz:
+      case Intrinsic::ctpop:
+      case Intrinsic::cttz:
+      case Intrinsic::fabs:
+      case Intrinsic::floor:
+      case Intrinsic::fma:
+      case Intrinsic::fmuladd:
+      case Intrinsic::pow:
+      case Intrinsic::powi:
+      case Intrinsic::rint:
+      case Intrinsic::sqrt:
+      case Intrinsic::trunc:
+        intrinsicsToDefine.push_back(&*I);
+        continue;
       }
     }
 
     // Skip a few functions that have already been defined in headers
-    if (I->getName() == "setjmp" ||
-        I->getName() == "longjmp" ||
-        I->getName() == "_setjmp" ||
-        I->getName() == "siglongjmp" ||
-        I->getName() == "sigsetjmp" ||
-        I->getName() == "pow" ||
-        I->getName() == "powf" ||
-        I->getName() == "sqrt" ||
-        I->getName() == "sqrtf" ||
-        I->getName() == "trunc" ||
-        I->getName() == "truncf" ||
-        I->getName() == "rint" ||
-        I->getName() == "rintf" ||
-        I->getName() == "floor" ||
-        I->getName() == "floorf" ||
-        I->getName() == "ceil" ||
-        I->getName() == "ceilf" ||
-        I->getName() == "alloca" ||
-        I->getName() == "_alloca" ||
-        I->getName() == "_chkstk" ||
-        I->getName() == "__chkstk" ||
-        I->getName() == "___chkstk_ms")
-        continue;
+    if (I->getName() == "setjmp" || I->getName() == "longjmp" ||
+        I->getName() == "_setjmp" || I->getName() == "siglongjmp" ||
+        I->getName() == "sigsetjmp" || I->getName() == "pow" ||
+        I->getName() == "powf" || I->getName() == "sqrt" ||
+        I->getName() == "sqrtf" || I->getName() == "trunc" ||
+        I->getName() == "truncf" || I->getName() == "rint" ||
+        I->getName() == "rintf" || I->getName() == "floor" ||
+        I->getName() == "floorf" || I->getName() == "ceil" ||
+        I->getName() == "ceilf" || I->getName() == "alloca" ||
+        I->getName() == "_alloca" || I->getName() == "_chkstk" ||
+        I->getName() == "__chkstk" || I->getName() == "___chkstk_ms")
+      continue;
 
     if (I->hasDLLImportStorageClass())
       Out << "__declspec(dllimport) ";
@@ -2095,7 +2255,7 @@ void CWriter::generateHeader(Module &M) {
   if (!M.global_empty()) {
     Out << "\n\n/* Global Variable Definitions and Initialization */\n";
     for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-        I != E; ++I) {
+         I != E; ++I) {
       declareOneGlobalVariable(&*I);
     }
   }
@@ -2103,9 +2263,10 @@ void CWriter::generateHeader(Module &M) {
   // Alias declarations...
   if (!M.alias_empty()) {
     Out << "\n/* External Alias Declarations */\n";
-    for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
-        I != E; ++I) {
-      assert(!I->isDeclaration() && !isEmptyType(I->getType()->getPointerElementType()));
+    for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;
+         ++I) {
+      assert(!I->isDeclaration() &&
+             !isEmptyType(I->getType()->getPointerElementType()));
       if (I->hasLocalLinkage())
         continue; // Internal Global
 
@@ -2120,8 +2281,8 @@ void CWriter::generateHeader(Module &M) {
 
       Type *ElTy = I->getType()->getElementType();
       unsigned Alignment = I->getAlignment();
-      bool IsOveraligned = Alignment &&
-        Alignment > TD->getABITypeAlignment(ElTy);
+      bool IsOveraligned =
+          Alignment && Alignment > TD->getABITypeAlignment(ElTy);
       //      if (IsOveraligned)
       //        Out << "__MSALIGN__(" << Alignment << ") ";
       // GetValueName would resolve the alias, which is not what we want,
@@ -2176,9 +2337,11 @@ void CWriter::generateHeader(Module &M) {
   Out << "return 1; }\n";
 
   // Loop over all select operations
-  for (std::set<Type*>::iterator it = SelectDeclTypes.begin(), end = SelectDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline Rty llvm_select_u8x4(<bool x 4> condition, <u8 x 4> iftrue, <u8 x 4> ifnot) {
+  for (std::set<Type *>::iterator it = SelectDeclTypes.begin(),
+                                  end = SelectDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline Rty llvm_select_u8x4(<bool x 4> condition, <u8 x 4>
+    // iftrue, <u8 x 4> ifnot) {
     //   Rty r = {
     //     condition[0] ? iftrue[0] : ifnot[0],
     //     condition[1] ? iftrue[1] : ifnot[1],
@@ -2193,7 +2356,11 @@ void CWriter::generateHeader(Module &M) {
     printTypeString(Out, *it, false);
     Out << "(";
     if (isa<VectorType>(*it))
-      printTypeNameUnaligned(Out, VectorType::get(Type::getInt1Ty((*it)->getContext()), (*it)->getVectorNumElements()), false);
+      printTypeNameUnaligned(
+          Out,
+          VectorType::get(Type::getInt1Ty((*it)->getContext()),
+                          (*it)->getVectorNumElements()),
+          false);
     else
       Out << "bool";
     Out << " condition, ";
@@ -2206,19 +2373,22 @@ void CWriter::generateHeader(Module &M) {
     if (isa<VectorType>(*it)) {
       unsigned n, l = (*it)->getVectorNumElements();
       for (n = 0; n < l; n++) {
-        Out << "  r.vector[" << n << "] = condition.vector[" << n << "] ? iftrue.vector[" << n << "] : ifnot.vector[" << n << "];\n";
+        Out << "  r.vector[" << n << "] = condition.vector[" << n
+            << "] ? iftrue.vector[" << n << "] : ifnot.vector[" << n << "];\n";
       }
-    }
-    else {
+    } else {
       Out << "  r = condition ? iftrue : ifnot;\n";
     }
     Out << "  return r;\n}\n";
   }
 
   // Loop over all compare operations
-  for (std::set< std::pair<CmpInst::Predicate, VectorType*> >::iterator it = CmpDeclTypes.begin(), end = CmpDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline <bool x 4> llvm_icmp_ge_u8x4(<u8 x 4> l, <u8 x 4> r) {
+  for (std::set<std::pair<CmpInst::Predicate, VectorType *>>::iterator
+           it = CmpDeclTypes.begin(),
+           end = CmpDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline <bool x 4> llvm_icmp_ge_u8x4(<u8 x 4> l, <u8 x 4> r)
+    // {
     //   Rty c = {
     //     l[0] >= r[0],
     //     l[1] >= r[1],
@@ -2228,7 +2398,8 @@ void CWriter::generateHeader(Module &M) {
     //   return c;
     // }
     unsigned n, l = (*it).second->getVectorNumElements();
-    VectorType *RTy = VectorType::get(Type::getInt1Ty((*it).second->getContext()), l);
+    VectorType *RTy =
+        VectorType::get(Type::getInt1Ty((*it).second->getContext()), l);
     bool isSigned = CmpInst::isSigned((*it).first);
     Out << "static __forceinline ";
     printTypeName(Out, RTy, isSigned);
@@ -2248,25 +2419,38 @@ void CWriter::generateHeader(Module &M) {
     for (n = 0; n < l; n++) {
       Out << "  c.vector[" << n << "] = ";
       if (CmpInst::isFPPredicate((*it).first)) {
-        Out << "llvm_fcmp_ " << getCmpPredicateName((*it).first) << "(l.vector[" << n << "], r.vector[" << n << "]);\n";
+        Out << "llvm_fcmp_ " << getCmpPredicateName((*it).first) << "(l.vector["
+            << n << "], r.vector[" << n << "]);\n";
       } else {
         Out << "l.vector[" << n << "]";
         switch ((*it).first) {
-          case CmpInst::ICMP_EQ:  Out << " == "; break;
-          case CmpInst::ICMP_NE:  Out << " != "; break;
-          case CmpInst::ICMP_ULE:
-          case CmpInst::ICMP_SLE: Out << " <= "; break;
-          case CmpInst::ICMP_UGE:
-          case CmpInst::ICMP_SGE: Out << " >= "; break;
-          case CmpInst::ICMP_ULT:
-          case CmpInst::ICMP_SLT: Out << " < "; break;
-          case CmpInst::ICMP_UGT:
-          case CmpInst::ICMP_SGT: Out << " > "; break;
-          default:
+        case CmpInst::ICMP_EQ:
+          Out << " == ";
+          break;
+        case CmpInst::ICMP_NE:
+          Out << " != ";
+          break;
+        case CmpInst::ICMP_ULE:
+        case CmpInst::ICMP_SLE:
+          Out << " <= ";
+          break;
+        case CmpInst::ICMP_UGE:
+        case CmpInst::ICMP_SGE:
+          Out << " >= ";
+          break;
+        case CmpInst::ICMP_ULT:
+        case CmpInst::ICMP_SLT:
+          Out << " < ";
+          break;
+        case CmpInst::ICMP_UGT:
+        case CmpInst::ICMP_SGT:
+          Out << " > ";
+          break;
+        default:
 #ifndef NDEBUG
-                                  errs() << "Invalid icmp predicate!" << (*it).first;
+          errs() << "Invalid icmp predicate!" << (*it).first;
 #endif
-                                  llvm_unreachable(0);
+          llvm_unreachable(0);
         }
         Out << "r.vector[" << n << "];\n";
       }
@@ -2275,9 +2459,13 @@ void CWriter::generateHeader(Module &M) {
   }
 
   // Loop over all (vector) cast operations
-  for (std::set<std::pair<CastInst::CastOps, std::pair<Type*, Type*>>>::iterator it = CastOpDeclTypes.begin(), end = CastOpDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline <u32 x 4> llvm_ZExt_u8x4_u32x4(<u8 x 4> in) { // Src->isVector == Dst->isVector
+  for (std::set<
+           std::pair<CastInst::CastOps, std::pair<Type *, Type *>>>::iterator
+           it = CastOpDeclTypes.begin(),
+           end = CastOpDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline <u32 x 4> llvm_ZExt_u8x4_u32x4(<u8 x 4> in) { //
+    // Src->isVector == Dst->isVector
     //   Rty out = {
     //     in[0],
     //     in[1],
@@ -2286,7 +2474,8 @@ void CWriter::generateHeader(Module &M) {
     //   };
     //   return out;
     // }
-    // static __forceinline u32 llvm_BitCast_u8x4_u32(<u8 x 4> in) { // Src->bitsSize == Dst->bitsSize
+    // static __forceinline u32 llvm_BitCast_u8x4_u32(<u8 x 4> in) { //
+    // Src->bitsSize == Dst->bitsSize
     //   union {
     //     <u8 x 4> in;
     //     u32 out;
@@ -2299,18 +2488,18 @@ void CWriter::generateHeader(Module &M) {
     Type *DstTy = (*it).second.second;
     bool SrcSigned, DstSigned;
     switch (opcode) {
-      default:
-        SrcSigned = false;
-        DstSigned = false;
-      case Instruction::SIToFP:
-        SrcSigned = true;
-        DstSigned = false;
-      case Instruction::FPToSI:
-        SrcSigned = false;
-        DstSigned = true;
-      case Instruction::SExt:
-        SrcSigned = true;
-        DstSigned = true;
+    default:
+      SrcSigned = false;
+      DstSigned = false;
+    case Instruction::SIToFP:
+      SrcSigned = true;
+      DstSigned = false;
+    case Instruction::FPToSI:
+      SrcSigned = false;
+      DstSigned = true;
+    case Instruction::SExt:
+      SrcSigned = true;
+      DstSigned = true;
     }
 
     Out << "static __forceinline ";
@@ -2349,20 +2538,34 @@ void CWriter::generateHeader(Module &M) {
       Out << " out;\n";
       Out << "  LLVM";
       switch (opcode) {
-        case Instruction::UIToFP: Out << "UItoFP"; break;
-        case Instruction::SIToFP: Out << "SItoFP"; break;
-        case Instruction::Trunc: Out << "Trunc"; break;
-                                 //case Instruction::FPExt:
-                                 //case Instruction::FPTrunc:
-        case Instruction::ZExt: Out << "ZExt"; break;
-        case Instruction::FPToUI: Out << "FPtoUI"; break;
-        case Instruction::SExt: Out << "SExt"; break;
-        case Instruction::FPToSI: Out << "FPtoSI"; break;
-        default:
-                                  llvm_unreachable("Invalid cast opcode for i128");
+      case Instruction::UIToFP:
+        Out << "UItoFP";
+        break;
+      case Instruction::SIToFP:
+        Out << "SItoFP";
+        break;
+      case Instruction::Trunc:
+        Out << "Trunc";
+        break;
+        // case Instruction::FPExt:
+        // case Instruction::FPTrunc:
+      case Instruction::ZExt:
+        Out << "ZExt";
+        break;
+      case Instruction::FPToUI:
+        Out << "FPtoUI";
+        break;
+      case Instruction::SExt:
+        Out << "SExt";
+        break;
+      case Instruction::FPToSI:
+        Out << "FPtoSI";
+        break;
+      default:
+        llvm_unreachable("Invalid cast opcode for i128");
       }
       Out << "(" << SrcTy->getPrimitiveSizeInBits() << ", &in, "
-        << DstTy->getPrimitiveSizeInBits() << ", &out);\n";
+          << DstTy->getPrimitiveSizeInBits() << ", &out);\n";
       Out << "  return out;\n";
       Out << "#endif\n";
       Out << "}\n";
@@ -2370,9 +2573,12 @@ void CWriter::generateHeader(Module &M) {
   }
 
   // Loop over all simple vector operations
-  for (std::set<std::pair<unsigned, Type*>>::iterator it = InlineOpDeclTypes.begin(), end = InlineOpDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline <u32 x 4> llvm_BinOp_u32x4(<u32 x 4> a, <u32 x 4> b) {
+  for (std::set<std::pair<unsigned, Type *>>::iterator
+           it = InlineOpDeclTypes.begin(),
+           end = InlineOpDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline <u32 x 4> llvm_BinOp_u32x4(<u32 x 4> a, <u32 x 4> b)
+    // {
     //   Rty r = {
     //      a[0] OP b[0],
     //      a[1] OP b[1],
@@ -2416,7 +2622,7 @@ void CWriter::generateHeader(Module &M) {
     // C can't handle non-power-of-two integer types
     unsigned mask = 0;
     if (ElemTy->isIntegerTy()) {
-      IntegerType *ITy = static_cast<IntegerType*>(ElemTy);
+      IntegerType *ITy = static_cast<IntegerType *>(ElemTy);
       if (!ITy->isPowerOf2ByteWidth())
         mask = ITy->getBitMask();
     }
@@ -2438,34 +2644,54 @@ void CWriter::generateHeader(Module &M) {
             Out << "fmodf(a.vector[" << n << "], b.vector[" << n << "])";
           else if (ElemTy->isDoubleTy())
             Out << "fmod(a.vector[" << n << "], b.vector[" << n << "])";
-          else  // all 3 flavors of long double
+          else // all 3 flavors of long double
             Out << "fmodl(a.vector[" << n << "], b.vector[" << n << "])";
         } else {
           Out << "a.vector[" << n << "]";
           switch (opcode) {
-            case Instruction::Add:
-            case Instruction::FAdd: Out << " + "; break;
-            case Instruction::Sub:
-            case Instruction::FSub: Out << " - "; break;
-            case Instruction::Mul:
-            case Instruction::FMul: Out << " * "; break;
-            case Instruction::URem:
-            case Instruction::SRem:
-            case Instruction::FRem: Out << " % "; break;
-            case Instruction::UDiv:
-            case Instruction::SDiv:
-            case Instruction::FDiv: Out << " / "; break;
-            case Instruction::And:  Out << " & "; break;
-            case Instruction::Or:   Out << " | "; break;
-            case Instruction::Xor:  Out << " ^ "; break;
-            case Instruction::Shl : Out << " << "; break;
-            case Instruction::LShr:
-            case Instruction::AShr: Out << " >> "; break;
-            default:
+          case Instruction::Add:
+          case Instruction::FAdd:
+            Out << " + ";
+            break;
+          case Instruction::Sub:
+          case Instruction::FSub:
+            Out << " - ";
+            break;
+          case Instruction::Mul:
+          case Instruction::FMul:
+            Out << " * ";
+            break;
+          case Instruction::URem:
+          case Instruction::SRem:
+          case Instruction::FRem:
+            Out << " % ";
+            break;
+          case Instruction::UDiv:
+          case Instruction::SDiv:
+          case Instruction::FDiv:
+            Out << " / ";
+            break;
+          case Instruction::And:
+            Out << " & ";
+            break;
+          case Instruction::Or:
+            Out << " | ";
+            break;
+          case Instruction::Xor:
+            Out << " ^ ";
+            break;
+          case Instruction::Shl:
+            Out << " << ";
+            break;
+          case Instruction::LShr:
+          case Instruction::AShr:
+            Out << " >> ";
+            break;
+          default:
 #ifndef NDEBUG
-                                    errs() << "Invalid operator type!" << opcode;
+            errs() << "Invalid operator type!" << opcode;
 #endif
-                                    llvm_unreachable(0);
+            llvm_unreachable(0);
           }
           Out << "b.vector[" << n << "]";
         }
@@ -2486,24 +2712,44 @@ void CWriter::generateHeader(Module &M) {
       } else {
         Out << "a";
         switch (opcode) {
-          case Instruction::Add: Out << " + "; break;
-          case Instruction::Sub: Out << " - "; break;
-          case Instruction::Mul: Out << " * "; break;
-          case Instruction::URem:
-          case Instruction::SRem: Out << " % "; break;
-          case Instruction::UDiv:
-          case Instruction::SDiv: Out << " / "; break;
-          case Instruction::And:  Out << " & "; break;
-          case Instruction::Or:   Out << " | "; break;
-          case Instruction::Xor:  Out << " ^ "; break;
-          case Instruction::Shl:  Out << " << "; break;
-          case Instruction::LShr:
-          case Instruction::AShr: Out << " >> "; break;
-          default:
+        case Instruction::Add:
+          Out << " + ";
+          break;
+        case Instruction::Sub:
+          Out << " - ";
+          break;
+        case Instruction::Mul:
+          Out << " * ";
+          break;
+        case Instruction::URem:
+        case Instruction::SRem:
+          Out << " % ";
+          break;
+        case Instruction::UDiv:
+        case Instruction::SDiv:
+          Out << " / ";
+          break;
+        case Instruction::And:
+          Out << " & ";
+          break;
+        case Instruction::Or:
+          Out << " | ";
+          break;
+        case Instruction::Xor:
+          Out << " ^ ";
+          break;
+        case Instruction::Shl:
+          Out << " << ";
+          break;
+        case Instruction::LShr:
+        case Instruction::AShr:
+          Out << " >> ";
+          break;
+        default:
 #ifndef NDEBUG
-                                  errs() << "Invalid operator type!" << opcode;
+          errs() << "Invalid operator type!" << opcode;
 #endif
-                                  llvm_unreachable(0);
+          llvm_unreachable(0);
         }
         Out << "b;\n";
       }
@@ -2525,7 +2771,8 @@ void CWriter::generateHeader(Module &M) {
       } else if (opcode == Instruction::Xor) {
         Out << "  r.hi = a.hi ^ b.hi;\n";
         Out << "  r.lo = a.lo ^ b.lo;\n";
-      } else if (opcode == Instruction::Shl) { // reminder: undef behavior if b >= 128
+      } else if (opcode ==
+                 Instruction::Shl) { // reminder: undef behavior if b >= 128
         Out << "  if (b.lo >= 64) {\n";
         Out << "    r.hi = (a.lo << (b.lo - 64));\n";
         Out << "    r.lo = 0;\n";
@@ -2540,26 +2787,44 @@ void CWriter::generateHeader(Module &M) {
         // everything that hasn't been manually implemented above
         Out << "  LLVM";
         switch (opcode) {
-          //case BinaryNeg: Out << "Neg"; break;
-          //case BinaryNot: Out << "FlipAllBits"; break;
-          case Instruction::Add: Out << "Add"; break;
-          case Instruction::Sub: Out << "Sub"; break;
-          case Instruction::Mul: Out << "Mul"; break;
-          case Instruction::URem: Out << "URem"; break;
-          case Instruction::SRem: Out << "SRem"; break;
-          case Instruction::UDiv: Out << "UDiv"; break;
-          case Instruction::SDiv: Out << "SDiv"; break;
-                                  //case Instruction::And:  Out << "And"; break;
-                                  //case Instruction::Or:   Out << "Or"; break;
-                                  //case Instruction::Xor:  Out << "Xor"; break;
-                                  //case Instruction::Shl: Out << "Shl"; break;
-          case Instruction::LShr: Out << "LShr"; break;
-          case Instruction::AShr: Out << "AShr"; break;
-          default:
+        // case BinaryNeg: Out << "Neg"; break;
+        // case BinaryNot: Out << "FlipAllBits"; break;
+        case Instruction::Add:
+          Out << "Add";
+          break;
+        case Instruction::Sub:
+          Out << "Sub";
+          break;
+        case Instruction::Mul:
+          Out << "Mul";
+          break;
+        case Instruction::URem:
+          Out << "URem";
+          break;
+        case Instruction::SRem:
+          Out << "SRem";
+          break;
+        case Instruction::UDiv:
+          Out << "UDiv";
+          break;
+        case Instruction::SDiv:
+          Out << "SDiv";
+          break;
+          // case Instruction::And:  Out << "And"; break;
+          // case Instruction::Or:   Out << "Or"; break;
+          // case Instruction::Xor:  Out << "Xor"; break;
+          // case Instruction::Shl: Out << "Shl"; break;
+        case Instruction::LShr:
+          Out << "LShr";
+          break;
+        case Instruction::AShr:
+          Out << "AShr";
+          break;
+        default:
 #ifndef NDEBUG
-                                  errs() << "Invalid operator type!" << opcode;
+          errs() << "Invalid operator type!" << opcode;
 #endif
-                                  llvm_unreachable(0);
+          llvm_unreachable(0);
         }
         Out << "(16, &a, &b, &r);\n";
       }
@@ -2579,34 +2844,54 @@ void CWriter::generateHeader(Module &M) {
           Out << "fmodf(a, b)";
         else if (ElemTy->isDoubleTy())
           Out << "fmod(a, b)";
-        else  // all 3 flavors of long double
+        else // all 3 flavors of long double
           Out << "fmodl(a, b)";
       } else {
         Out << "a";
         switch (opcode) {
-          case Instruction::Add:
-          case Instruction::FAdd: Out << " + "; break;
-          case Instruction::Sub:
-          case Instruction::FSub: Out << " - "; break;
-          case Instruction::Mul:
-          case Instruction::FMul: Out << " * "; break;
-          case Instruction::URem:
-          case Instruction::SRem:
-          case Instruction::FRem: Out << " % "; break;
-          case Instruction::UDiv:
-          case Instruction::SDiv:
-          case Instruction::FDiv: Out << " / "; break;
-          case Instruction::And:  Out << " & "; break;
-          case Instruction::Or:   Out << " | "; break;
-          case Instruction::Xor:  Out << " ^ "; break;
-          case Instruction::Shl : Out << " << "; break;
-          case Instruction::LShr:
-          case Instruction::AShr: Out << " >> "; break;
-          default:
+        case Instruction::Add:
+        case Instruction::FAdd:
+          Out << " + ";
+          break;
+        case Instruction::Sub:
+        case Instruction::FSub:
+          Out << " - ";
+          break;
+        case Instruction::Mul:
+        case Instruction::FMul:
+          Out << " * ";
+          break;
+        case Instruction::URem:
+        case Instruction::SRem:
+        case Instruction::FRem:
+          Out << " % ";
+          break;
+        case Instruction::UDiv:
+        case Instruction::SDiv:
+        case Instruction::FDiv:
+          Out << " / ";
+          break;
+        case Instruction::And:
+          Out << " & ";
+          break;
+        case Instruction::Or:
+          Out << " | ";
+          break;
+        case Instruction::Xor:
+          Out << " ^ ";
+          break;
+        case Instruction::Shl:
+          Out << " << ";
+          break;
+        case Instruction::LShr:
+        case Instruction::AShr:
+          Out << " >> ";
+          break;
+        default:
 #ifndef NDEBUG
-                                  errs() << "Invalid operator type!" << opcode;
+          errs() << "Invalid operator type!" << opcode;
 #endif
-                                  llvm_unreachable(0);
+          llvm_unreachable(0);
         }
         Out << "b";
         if (mask)
@@ -2618,9 +2903,11 @@ void CWriter::generateHeader(Module &M) {
   }
 
   // Loop over all inline constructors
-  for (std::set<Type*>::iterator it = CtorDeclTypes.begin(), end = CtorDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline <u32 x 4> llvm_ctor_u32x4(u32 x1, u32 x2, u32 x3, u32 x4) {
+  for (std::set<Type *>::iterator it = CtorDeclTypes.begin(),
+                                  end = CtorDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline <u32 x 4> llvm_ctor_u32x4(u32 x1, u32 x2, u32 x3,
+    // u32 x4) {
     //   Rty r = {
     //     x1, x2, x3, x4
     //   };
@@ -2634,10 +2921,12 @@ void CWriter::generateHeader(Module &M) {
     StructType *STy = dyn_cast<StructType>(*it);
     ArrayType *ATy = dyn_cast<ArrayType>(*it);
     VectorType *VTy = dyn_cast<VectorType>(*it);
-    unsigned e = (STy ? STy->getNumElements() : (ATy ? ATy->getNumElements() : VTy->getNumElements()));
+    unsigned e = (STy ? STy->getNumElements()
+                      : (ATy ? ATy->getNumElements() : VTy->getNumElements()));
     bool printed = false;
     for (unsigned i = 0; i != e; ++i) {
-      Type *ElTy = STy ? STy->getElementType(i) : (*it)->getSequentialElementType();
+      Type *ElTy =
+          STy ? STy->getElementType(i) : (*it)->getSequentialElementType();
       if (isEmptyType(ElTy))
         Out << " /* ";
       else if (printed)
@@ -2653,7 +2942,8 @@ void CWriter::generateHeader(Module &M) {
     printTypeName(Out, *it);
     Out << " r;";
     for (unsigned i = 0; i != e; ++i) {
-      Type *ElTy = STy ? STy->getElementType(i) : (*it)->getSequentialElementType();
+      Type *ElTy =
+          STy ? STy->getElementType(i) : (*it)->getSequentialElementType();
       if (isEmptyType(ElTy))
         continue;
       if (STy)
@@ -2669,9 +2959,9 @@ void CWriter::generateHeader(Module &M) {
   }
 
   // Emit definitions of the intrinsics.
-  for (SmallVector<Function*, 16>::iterator
-      I = intrinsicsToDefine.begin(),
-      E = intrinsicsToDefine.end(); I != E; ++I) {
+  for (SmallVector<Function *, 16>::iterator I = intrinsicsToDefine.begin(),
+                                             E = intrinsicsToDefine.end();
+       I != E; ++I) {
     printIntrinsicDefinition(**I, Out);
   }
 
@@ -2679,7 +2969,7 @@ void CWriter::generateHeader(Module &M) {
     Out << "\n\n/* Function Bodies */\n";
 }
 
-void CWriter::declareOneGlobalVariable(GlobalVariable* I) {
+void CWriter::declareOneGlobalVariable(GlobalVariable *I) {
   if (I->isDeclaration() || isEmptyType(I->getType()->getPointerElementType()))
     return;
 
@@ -2701,8 +2991,7 @@ void CWriter::declareOneGlobalVariable(GlobalVariable* I) {
 
   Type *ElTy = I->getType()->getElementType();
   unsigned Alignment = I->getAlignment();
-  bool IsOveraligned = Alignment &&
-    Alignment > TD->getABITypeAlignment(ElTy);
+  bool IsOveraligned = Alignment && Alignment > TD->getABITypeAlignment(ElTy);
   //  if (IsOveraligned)
   //    Out << "__MSALIGN__(" << Alignment << ") ";
   printTypeName(Out, ElTy, false) << ' ' << GetValueName(I);
@@ -2726,13 +3015,13 @@ void CWriter::declareOneGlobalVariable(GlobalVariable* I) {
   // and common, so we disable this optimization.
   // FIXME common linkage should avoid this problem.
   if (!I->getInitializer()->isNullValue()) {
-    Out << " = " ;
+    Out << " = ";
     writeOperand(I->getInitializer(), ContextStatic);
   } else if (I->hasWeakLinkage()) {
     // We have to specify an initializer, but it doesn't have to be
     // complete.  If the value is an aggregate, print out { 0 }, and let
     // the compiler figure out the rest of the zeros.
-    Out << " = " ;
+    Out << " = ";
     if (I->getInitializer()->getType()->isStructTy() ||
         I->getInitializer()->getType()->isVectorTy()) {
       Out << "{ 0 }";
@@ -2756,7 +3045,8 @@ void CWriter::printFloatingPointConstants(Function &F) {
   // precision.
   //
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
-    for (Instruction::op_iterator I_Op = I->op_begin(), E_Op = I->op_end(); I_Op != E_Op; ++I_Op)
+    for (Instruction::op_iterator I_Op = I->op_begin(), E_Op = I->op_end();
+         I_Op != E_Op; ++I_Op)
       if (const Constant *C = dyn_cast<Constant>(I_Op))
         printFloatingPointConstants(C);
   Out << '\n';
@@ -2779,44 +3069,39 @@ void CWriter::printFloatingPointConstants(const Constant *C) {
       FPConstantMap.count(FPC))
     return;
 
-  FPConstantMap[FPC] = FPCounter;  // Number the FP constants
+  FPConstantMap[FPC] = FPCounter; // Number the FP constants
 
   if (FPC->getType() == Type::getDoubleTy(FPC->getContext())) {
     double Val = FPC->getValueAPF().convertToDouble();
     uint64_t i = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
-    Out << "const ConstantDoubleTy FPConstant" << FPCounter++
-      << " = 0x" << utohexstr(i)
-      << "ULL;    /* " << Val << " */\n";
+    Out << "const ConstantDoubleTy FPConstant" << FPCounter++ << " = 0x"
+        << utohexstr(i) << "ULL;    /* " << Val << " */\n";
   } else if (FPC->getType() == Type::getFloatTy(FPC->getContext())) {
     float Val = FPC->getValueAPF().convertToFloat();
-    uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt().
-      getZExtValue();
-    Out << "const ConstantFloatTy FPConstant" << FPCounter++
-      << " = 0x" << utohexstr(i)
-      << "U;    /* " << Val << " */\n";
+    uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+    Out << "const ConstantFloatTy FPConstant" << FPCounter++ << " = 0x"
+        << utohexstr(i) << "U;    /* " << Val << " */\n";
   } else if (FPC->getType() == Type::getX86_FP80Ty(FPC->getContext())) {
     // api needed to prevent premature destruction
     const APInt api = FPC->getValueAPF().bitcastToAPInt();
     const uint64_t *p = api.getRawData();
-    Out << "const ConstantFP80Ty FPConstant" << FPCounter++
-      << " = { 0x" << utohexstr(p[0])
-      << "ULL, 0x" << utohexstr((uint16_t)p[1]) << ",{0,0,0}"
-      << "}; /* Long double constant */\n";
+    Out << "const ConstantFP80Ty FPConstant" << FPCounter++ << " = { 0x"
+        << utohexstr(p[0]) << "ULL, 0x" << utohexstr((uint16_t)p[1])
+        << ",{0,0,0}"
+        << "}; /* Long double constant */\n";
   } else if (FPC->getType() == Type::getPPC_FP128Ty(FPC->getContext()) ||
-      FPC->getType() == Type::getFP128Ty(FPC->getContext())) {
+             FPC->getType() == Type::getFP128Ty(FPC->getContext())) {
     const APInt api = FPC->getValueAPF().bitcastToAPInt();
     const uint64_t *p = api.getRawData();
-    Out << "const ConstantFP128Ty FPConstant" << FPCounter++
-      << " = { 0x"
-      << utohexstr(p[0]) << ", 0x" << utohexstr(p[1])
-      << "}; /* Long double constant */\n";
+    Out << "const ConstantFP128Ty FPConstant" << FPCounter++ << " = { 0x"
+        << utohexstr(p[0]) << ", 0x" << utohexstr(p[1])
+        << "}; /* Long double constant */\n";
 
   } else {
     llvm_unreachable("Unknown float type!");
   }
 }
 
-
 /// printSymbolTable - Run through symbol table looking for type names.  If a
 /// type name is found, emit its declaration...
 ///
@@ -2830,7 +3115,7 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
   Out << "} llvmBitCastUnion;\n";
 
   // Keep track of which types have been printed so far.
-  std::set<Type*> TypesPrinted;
+  std::set<Type *> TypesPrinted;
 
   // Loop over all structures then push them into the stack so they are
   // printed in the correct order.
@@ -2839,8 +3124,9 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
   // forward-declare all structs here first
 
   {
-    std::set<Type*> TypesPrinted;
-    for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) {
+    std::set<Type *> TypesPrinted;
+    for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end();
+         it != end; ++it) {
       forwardDeclareStructs(Out, *it, TypesPrinted);
     }
   }
@@ -2848,31 +3134,35 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
   // forward-declare all function pointer typedefs (Issue #2)
 
   {
-    std::set<Type*> TypesPrinted;
-    for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) {
+    std::set<Type *> TypesPrinted;
+    for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end();
+         it != end; ++it) {
       forwardDeclareFunctionTypedefs(Out, *it, TypesPrinted);
     }
   }
 
-
   Out << "\n/* Types Definitions */\n";
 
-  for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) {
+  for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end();
+       it != end; ++it) {
     printContainedTypes(Out, *it, TypesPrinted);
   }
 
   Out << "\n/* Function definitions */\n";
 
   // Question: Is UnnamedFunctionIDs ever non-empty?
-  for (DenseMap<std::pair<FunctionType*,
-      std::pair<AttributeList, CallingConv::ID> >, unsigned>::iterator
-      I = UnnamedFunctionIDs.begin(), E = UnnamedFunctionIDs.end();
-      I != E; ++I) {
+  for (DenseMap<
+           std::pair<FunctionType *, std::pair<AttributeList, CallingConv::ID>>,
+           unsigned>::iterator I = UnnamedFunctionIDs.begin(),
+                               E = UnnamedFunctionIDs.end();
+       I != E; ++I) {
 
     Out << '\n';
-    std::pair<FunctionType*, std::pair<AttributeList, CallingConv::ID> > F = I->first;
+    std::pair<FunctionType *, std::pair<AttributeList, CallingConv::ID>> F =
+        I->first;
     if (F.second.first == AttributeList() && F.second.second == CallingConv::C)
-      if (!TypesPrinted.insert(F.first).second) continue; // already printed this above
+      if (!TypesPrinted.insert(F.first).second)
+        continue; // already printed this above
 
     // FIXME: Removing apparently unused function call - need to check
     printFunctionDeclaration(Out, F.first, F.second);
@@ -2880,9 +3170,9 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
 
   // We may have collected some intrinsic prototypes to emit.
   // Emit them now, before the function that uses them is emitted
-  for (std::vector<Function*>::iterator
-      I = prototypesToGen.begin(), E = prototypesToGen.end();
-      I != E; ++I) {
+  for (std::vector<Function *>::iterator I = prototypesToGen.begin(),
+                                         E = prototypesToGen.end();
+       I != E; ++I) {
     Out << '\n';
     Function *F = *I;
     printFunctionProto(Out, F);
@@ -2890,9 +3180,12 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
   }
 }
 
-void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty, std::set<Type*> &TypesPrinted) {
-  if (!TypesPrinted.insert(Ty).second) return;
-  if (isEmptyType(Ty)) return;
+void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty,
+                                    std::set<Type *> &TypesPrinted) {
+  if (!TypesPrinted.insert(Ty).second)
+    return;
+  if (isEmptyType(Ty))
+    return;
 
   for (auto I = Ty->subtype_begin(); I != Ty->subtype_end(); ++I) {
     forwardDeclareStructs(Out, *I, TypesPrinted);
@@ -2903,9 +3196,12 @@ void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty, std::set<Type*>
   }
 }
 
-void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, std::set<Type*> &TypesPrinted) {
-  if (!TypesPrinted.insert(Ty).second) return;
-  if (isEmptyType(Ty)) return;
+void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty,
+                                             std::set<Type *> &TypesPrinted) {
+  if (!TypesPrinted.insert(Ty).second)
+    return;
+  if (isEmptyType(Ty))
+    return;
 
   for (auto I = Ty->subtype_begin(); I != Ty->subtype_end(); ++I) {
     forwardDeclareFunctionTypedefs(Out, *I, TypesPrinted);
@@ -2920,15 +3216,17 @@ void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, std::se
 // this one depends on.
 //
 void CWriter::printContainedTypes(raw_ostream &Out, Type *Ty,
-    std::set<Type*> &TypesPrinted) {
+                                  std::set<Type *> &TypesPrinted) {
   // Check to see if we have already printed this struct.
-  if (!TypesPrinted.insert(Ty).second) return;
+  if (!TypesPrinted.insert(Ty).second)
+    return;
   // Skip empty structs
-  if (isEmptyType(Ty)) return;
+  if (isEmptyType(Ty))
+    return;
 
   // Print all contained types first.
-  for (Type::subtype_iterator I = Ty->subtype_begin(),
-      E = Ty->subtype_end(); I != E; ++I)
+  for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+       I != E; ++I)
     printContainedTypes(Out, *I, TypesPrinted);
 
   if (StructType *ST = dyn_cast<StructType>(Ty)) {
@@ -2949,21 +3247,22 @@ static inline bool isFPIntBitCast(Instruction &I) {
   Type *SrcTy = I.getOperand(0)->getType();
   Type *DstTy = I.getType();
   return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
-    (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
+         (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
 }
 
 void CWriter::printFunction(Function &F) {
   bool isKernel = false;
 
-  if (NamedMDNode * KernelMD = F.getParent()->getNamedMetadata("opencl.kernels")) {
+  if (NamedMDNode *KernelMD =
+          F.getParent()->getNamedMetadata("opencl.kernels")) {
     for (auto iter : KernelMD->operands()) {
-      DEBUG( errs() << "Kernel Metadata: " << *iter << "\n");
+      DEBUG(errs() << "Kernel Metadata: " << *iter << "\n");
       const MDOperand *KernelMDOp = iter->operands().begin();
       Metadata *KMD = KernelMDOp->get();
-      if(ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)){
+      if (ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)) {
         Value *KMDVal = KMDVAM->getValue();
         Function *KMDFunc = dyn_cast<Function>(KMDVal);
-        if(KMDFunc == &F) {
+        if (KMDFunc == &F) {
           DEBUG(errs() << "-->Kernel Func: " << KMDFunc->getName() << "\n");
           isKernel = true;
         }
@@ -2975,12 +3274,15 @@ void CWriter::printFunction(Function &F) {
   bool isStructReturn = F.hasStructRetAttr();
 
   assert(!F.isDeclaration());
-  if (F.hasDLLImportStorageClass()) Out << "__declspec(dllimport) ";
-  if (F.hasDLLExportStorageClass()) Out << "__declspec(dllexport) ";
-  if (F.hasLocalLinkage()) Out << "static ";
-  printFunctionProto(Out, F.getFunctionType(),
-      std::make_pair(F.getAttributes(), F.getCallingConv()),
-      GetValueName(&F),
+  if (F.hasDLLImportStorageClass())
+    Out << "__declspec(dllimport) ";
+  if (F.hasDLLExportStorageClass())
+    Out << "__declspec(dllexport) ";
+  if (F.hasLocalLinkage())
+    Out << "static ";
+  printFunctionProto(
+      Out, F.getFunctionType(),
+      std::make_pair(F.getAttributes(), F.getCallingConv()), GetValueName(&F),
       F.arg_begin(), // NOTE: replacing ArgumentList (LLVM-4) with arg iterator
       //&F.getArgumentList(),
       isKernel);
@@ -2990,16 +3292,17 @@ void CWriter::printFunction(Function &F) {
   // If this is a struct return function, handle the result with magic.
   if (isStructReturn) {
     Type *StructTy =
-      cast<PointerType>(F.arg_begin()->getType())->getElementType();
+        cast<PointerType>(F.arg_begin()->getType())->getElementType();
     Out << "  ";
-    printTypeName(Out, StructTy, false) << " StructReturn;  /* Struct return temporary */\n";
+    printTypeName(Out, StructTy, false)
+        << " StructReturn;  /* Struct return temporary */\n";
 
     Out << "  ";
     printTypeName(Out, F.arg_begin()->getType(), false);
     Out << GetValueName(&*F.arg_begin()) << " = &StructReturn;\n";
   }
-  
-	// Output all floating point constants that cannot be printed accurately.
+
+  // Output all floating point constants that cannot be printed accurately.
   printFloatingPointConstants(F);
 
   bool PrintedVar = false;
@@ -3009,8 +3312,8 @@ void CWriter::printFunction(Function &F) {
     if (AllocaInst *AI = isDirectAlloca(&*I)) {
       DEBUG(errs() << "Processing alloca inst: " << *AI << "\n");
       unsigned Alignment = AI->getAlignment();
-      bool IsOveraligned = Alignment &&
-        Alignment > TD->getABITypeAlignment(AI->getAllocatedType());
+      bool IsOveraligned = Alignment && Alignment > TD->getABITypeAlignment(
+                                                        AI->getAllocatedType());
       Out << "  ";
       //      if (IsOveraligned)
       //        Out << "__MSALIGN__(" << Alignment << ") ";
@@ -3020,20 +3323,21 @@ void CWriter::printFunction(Function &F) {
         Out << " __attribute__((aligned(" << Alignment << ")))";
       if (AI->isArrayAllocation()) {
         DEBUG(errs() << "Alloca is an array allocation!\n");
-        unsigned arraySize = dyn_cast<ConstantInt>(AI->getArraySize())->getZExtValue();
+        unsigned arraySize =
+            dyn_cast<ConstantInt>(AI->getArraySize())->getZExtValue();
         Out << "[" << arraySize << "]";
       }
       Out << ";    /* Address-exposed local */\n";
       PrintedVar = true;
-    } else if (!isEmptyType(I->getType()) &&
-        !isInlinableInst(*I)) {
+    } else if (!isEmptyType(I->getType()) && !isInlinableInst(*I)) {
       Out << "  ";
       printTypeName(Out, I->getType(), false) << ' ' << GetValueName(&*I);
       Out << ";\n";
 
-      if (isa<PHINode>(*I)) {  // Print out PHI node temporaries as well...
+      if (isa<PHINode>(*I)) { // Print out PHI node temporaries as well...
         Out << "  ";
-        printTypeName(Out, I->getType(), false) << ' ' << (GetValueName(&*I)+"__PHI_TEMPORARY");
+        printTypeName(Out, I->getType(), false)
+            << ' ' << (GetValueName(&*I) + "__PHI_TEMPORARY");
         Out << ";\n";
       }
       PrintedVar = true;
@@ -3043,7 +3347,7 @@ void CWriter::printFunction(Function &F) {
     // variable to hold the result of the BitCast.
     if (isFPIntBitCast(*I)) {
       Out << "  llvmBitCastUnion " << GetValueName(&*I)
-        << "__BITCAST_TEMPORARY;\n";
+          << "__BITCAST_TEMPORARY;\n";
       PrintedVar = true;
     }
   }
@@ -3054,11 +3358,13 @@ void CWriter::printFunction(Function &F) {
   // print the basic blocks
   //  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
 
-  std::set<BasicBlock*> VisitSet;
-  BasicBlock* entry = &(F.getEntryBlock());
-  // starting printing from entry, then CFG traversal will print the reachable blocks.
+  std::set<BasicBlock *> VisitSet;
+  BasicBlock *entry = &(F.getEntryBlock());
+  // Start printing from the entry block; the CFG traversal then prints all
+  // reachable blocks.
   printBBorLoop(entry);
-  //  for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry); BI!=BE; ++BI) { 
+  //  for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry);
+  //  BI!=BE; ++BI) {
   //    BasicBlock *BB = *BI;
   //    printBBorLoop(BB);
   //    if(VisitedBlocks.find(BB) == VisitedBlocks.end()) {
@@ -3075,29 +3381,28 @@ void CWriter::printFunction(Function &F) {
   Out << "}\n\n";
 }
 
-
-bool CWriter::extractIndVarChain(Instruction *Inst, std::stack<Instruction*> *IndVarChain, Instruction *Branch, unsigned indent) {
-  //Traverse def-use chain of induction variable to make sure that
-  //it ends at the branch. Keep stack of all instructions leading there.
-  for(User *U : Inst->users()) {
+bool CWriter::extractIndVarChain(Instruction *Inst,
+                                 std::stack<Instruction *> *IndVarChain,
+                                 Instruction *Branch, unsigned indent) {
+  // Traverse the def-use chain of the induction variable to make sure that it
+  // ends at the branch. Keep a stack of all instructions leading there.
+  for (User *U : Inst->users()) {
     DEBUG(errs() << std::string(indent, '-'));
     DEBUG(errs() << "->Found user: " << *U << "\n");
-    if(Instruction *UInst = dyn_cast<Instruction>(U)) {
-      if(UInst == Branch) {
+    if (Instruction *UInst = dyn_cast<Instruction>(U)) {
+      if (UInst == Branch) {
         DEBUG(errs() << "Found correct path, returning!\n");
         return true;
-      }
-      else if (isa<PHINode>(UInst)) {
+      } else if (isa<PHINode>(UInst)) {
         DEBUG(errs() << "Reached a PHI Node => Wrong path! Returning!\n");
         continue;
-      }
-      else {
+      } else {
         IndVarChain->push(UInst);
-        if(extractIndVarChain(UInst, IndVarChain, Branch, indent+2)) {
+        if (extractIndVarChain(UInst, IndVarChain, Branch, indent + 2)) {
           return true;
-        }
-        else {
-          DEBUG(errs() << "Wrong path, popping: " << *(IndVarChain->top()) << "\n");
+        } else {
+          DEBUG(errs() << "Wrong path, popping: " << *(IndVarChain->top())
+                       << "\n");
           IndVarChain->pop();
         }
       }
@@ -3107,53 +3412,60 @@ bool CWriter::extractIndVarChain(Instruction *Inst, std::stack<Instruction*> *In
   return false;
 }
 
-bool CWriter::findLoopBranch(BranchInst **LBranch, BasicBlock* CurBlock, BasicBlock* LHeader, std::set<BasicBlock*>*visitSet) {
+bool CWriter::findLoopBranch(BranchInst **LBranch, BasicBlock *CurBlock,
+                             BasicBlock *LHeader,
+                             std::set<BasicBlock *> *visitSet) {
   bool result = false;
   DEBUG(errs() << "Finding loop branch in " << CurBlock->getName() << "!\n");
-  if(BranchInst *LBranchTemp = dyn_cast<BranchInst>(CurBlock->getTerminator())) {
+  if (BranchInst *LBranchTemp =
+          dyn_cast<BranchInst>(CurBlock->getTerminator())) {
     DEBUG(errs() << "Branch: " << *LBranchTemp << "\n");
-    if(LBranchTemp->isConditional()) {
-      if(LBranchTemp->getSuccessor(0) == LHeader || LBranchTemp->getSuccessor(1) == LHeader) {
+    if (LBranchTemp->isConditional()) {
+      if (LBranchTemp->getSuccessor(0) == LHeader ||
+          LBranchTemp->getSuccessor(1) == LHeader) {
         *LBranch = LBranchTemp;
         DEBUG(errs() << "Found Loop branch: " << **LBranch << "\n");
         result = true;
       } else {
-        BasicBlock* NextBlock1 = LBranchTemp->getSuccessor(0);
-        BasicBlock* NextBlock2 = LBranchTemp->getSuccessor(1);
-        if(visitSet->find(NextBlock1) == visitSet->end()) {
-          DEBUG(errs() << "Visiting unvisited node: " << NextBlock1->getName() << "\n");
+        BasicBlock *NextBlock1 = LBranchTemp->getSuccessor(0);
+        BasicBlock *NextBlock2 = LBranchTemp->getSuccessor(1);
+        if (visitSet->find(NextBlock1) == visitSet->end()) {
+          DEBUG(errs() << "Visiting unvisited node: " << NextBlock1->getName()
+                       << "\n");
           visitSet->insert(NextBlock1);
           result |= findLoopBranch(LBranch, NextBlock1, LHeader, visitSet);
         }
-        if(visitSet->find(NextBlock2) == visitSet->end()) {
-          DEBUG(errs() << "Visiting unvisited node: " << NextBlock2->getName() << "\n");
+        if (visitSet->find(NextBlock2) == visitSet->end()) {
+          DEBUG(errs() << "Visiting unvisited node: " << NextBlock2->getName()
+                       << "\n");
           visitSet->insert(NextBlock2);
           result |= findLoopBranch(LBranch, NextBlock2, LHeader, visitSet);
         }
       }
 
     } else {
-      if(LBranchTemp->getSuccessor(0) == LHeader) {
+      if (LBranchTemp->getSuccessor(0) == LHeader) {
         *LBranch = LBranchTemp;
         DEBUG(errs() << "Found Loop branch: " << **LBranch << "\n");
         result = true;
       } else {
         BasicBlock *NextBlock = LBranchTemp->getSuccessor(0);
-        if(visitSet->find(NextBlock) == visitSet->end()) {
-          DEBUG(errs() << "Visiting unvisited node: " << NextBlock->getName() << "\n");
+        if (visitSet->find(NextBlock) == visitSet->end()) {
+          DEBUG(errs() << "Visiting unvisited node: " << NextBlock->getName()
+                       << "\n");
           visitSet->insert(NextBlock);
           result |= findLoopBranch(LBranch, NextBlock, LHeader, visitSet);
         }
       }
     }
   }
-  return result; 
+  return result;
 }
 
 bool CWriter::traverseUseDefChain(Instruction *I, PHINode *PI) {
   DEBUG(errs() << "traversing: " << *I << "\n");
   bool result = false;
-  if(PHINode *PHI = dyn_cast<PHINode>(I)) {
+  if (PHINode *PHI = dyn_cast<PHINode>(I)) {
     if (PI == PHI) {
       DEBUG(errs() << "returning true\n");
       result = true;
@@ -3164,9 +3476,9 @@ bool CWriter::traverseUseDefChain(Instruction *I, PHINode *PI) {
     }
   } else {
     for (Use &U : I->operands()) {
-      if(Instruction *UInst = dyn_cast<Instruction>(U)) {
+      if (Instruction *UInst = dyn_cast<Instruction>(U)) {
         result |= traverseUseDefChain(UInst, PI);
-      }  
+      }
     }
   }
   return result;
@@ -3178,1716 +3490,1839 @@ void CWriter::printLoop(Loop *L) {
   Out << "\n\n/* Processing Loop Block: " << L->getName() << " */\n";
   DEBUG(errs() << "\n\n/* Processing Loop Block: " << L->getName() << " */\n");
 
-
   PHINode *InductionVariable;
   //  auto *LoopLatch = L->getLoopLatch();
   InductionDescriptor ID;
   DEBUG(errs() << "Looking for induction variables\n");
   bool found = false;
   if (PHINode *IndVar = L->getCanonicalInductionVariable()) {
-		InductionVariable = IndVar;
-		found = true;
-		DEBUG(errs() << "Found canonical induction variable:\n" << *IndVar << "\n");
-	} else {
-		for (auto I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
-			PHINode *PHI = cast<PHINode>(I);
-			DEBUG(errs() << "Phi Node: " << *PHI << "\n");
-			if(InductionDescriptor::isInductionPHI(PHI,L,PSE,ID)) {
-				DEBUG(errs() << "Found induction: " << *PHI << "\n");
-				InductionVariable = PHI;
-				found = true;
-				break;
-			}
-		}
-	}
+    InductionVariable = IndVar;
+    found = true;
+    DEBUG(errs() << "Found canonical induction variable:\n" << *IndVar << "\n");
+  } else {
+    for (auto I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PHI = cast<PHINode>(I);
+      DEBUG(errs() << "Phi Node: " << *PHI << "\n");
+      if (InductionDescriptor::isInductionPHI(PHI, L, PSE, ID)) {
+        DEBUG(errs() << "Found induction: " << *PHI << "\n");
+        InductionVariable = PHI;
+        found = true;
+        break;
+      }
+    }
+  }
 
-	if(!found) {
-		llvm_unreachable("Couldn't find induction Variable in loop!\n");
-	}
+  if (!found) {
+    llvm_unreachable("Couldn't find induction Variable in loop!\n");
+  }
 
-	LInductionVars.insert(InductionVariable);
-	LoopIndVarsMap.insert(std::pair<Loop*, PHINode*>(L,InductionVariable));
-
-	Value *IV = dyn_cast<Value>(InductionVariable);
-	std::string IVName = GetValueName(IV);
-
-	Optional<Loop::LoopBounds> OLB = L->getBounds(*SE);
-	if(OLB.hasValue()) {
-		Loop::LoopBounds LB = OLB.getValue();
-		Value *StartValue = &(LB.getInitialIVValue());
-		Instruction *StepInstruction = &(LB.getStepInst());
-		Value *StepValue = LB.getStepValue();
-		Value *FinalValue = &(LB.getFinalIVValue());
-		ICmpInst::Predicate LoopPredicate = LB.getCanonicalPredicate();
-		std::string BranchPredicate;
-		switch(LoopPredicate) {
-			case ICmpInst::ICMP_EQ:  BranchPredicate = " == "; break;
-			case ICmpInst::ICMP_NE:  BranchPredicate = " != "; break;
-			case ICmpInst::ICMP_ULE:
-			case ICmpInst::ICMP_SLE: BranchPredicate = " < "; break;
-			case ICmpInst::ICMP_UGE:
-			case ICmpInst::ICMP_SGE: BranchPredicate = " > "; break;
-			case ICmpInst::ICMP_ULT:
-			case ICmpInst::ICMP_SLT: BranchPredicate = " <= "; break;
-			case ICmpInst::ICMP_UGT:
-			case ICmpInst::ICMP_SGT: BranchPredicate = " >= "; break;
-			default: llvm_unreachable("Illegal ICmp predicate");
-		}
-		DEBUG(
-    errs() << "Found a Loop Bounds Object!\n";
-		errs() << "IV: " << *IV<< "\n";
-		errs() << "StartValue: " << *StartValue<< "\n";
-		errs() << "StepInstruction: " << *StepInstruction<< "\n";
-		errs() << "StepValue: " << *StepValue<< "\n";
-		errs() << "FinalValue: " << *FinalValue<< "\n";
-		errs() << "Branch Predicate: " << BranchPredicate<< "\n";
-		errs() << "Direction: " << ((LB.getDirection() == Loop::LoopBounds::Direction::Increasing) ? "increasing" : "decreasing") << "\n";
-		)
-
-		std::string startStr; 
-		if (ConstantInt *startConst = dyn_cast<ConstantInt>(StartValue)) {
-			startStr = std::to_string(startConst->getSExtValue());
-		} else {
-			startStr = GetValueName(StartValue);
-		}
-		std::string finalStr; 
-		if (ConstantInt *finalConst = dyn_cast<ConstantInt>(FinalValue)) {
-			finalStr = std::to_string(finalConst->getSExtValue());
-		} else {
-			finalStr = GetValueName(FinalValue);
-		}
-		std::string stepStr; 
-		if (ConstantInt *stepConst = dyn_cast<ConstantInt>(StepValue)) {
-			stepStr = std::to_string(stepConst->getSExtValue());
-		} else {
-			stepStr = GetValueName(StepValue);
-		}
-	  
-    DEBUG(
-		errs() << "\n  for ( " << IVName << " = " << startStr << "; " 
-			<< IVName << BranchPredicate << finalStr << "; " 
-			<< IVName << " = " << IVName << " + " << stepStr << ") {\n";
-    )
+  LInductionVars.insert(InductionVariable);
+  LoopIndVarsMap.insert(std::pair<Loop *, PHINode *>(L, InductionVariable));
+
+  Value *IV = dyn_cast<Value>(InductionVariable);
+  std::string IVName = GetValueName(IV);
+
+  Optional<Loop::LoopBounds> OLB = L->getBounds(*SE);
+  if (OLB.hasValue()) {
+    Loop::LoopBounds LB = OLB.getValue();
+    Value *StartValue = &(LB.getInitialIVValue());
+    Instruction *StepInstruction = &(LB.getStepInst());
+    Value *StepValue = LB.getStepValue();
+    Value *FinalValue = &(LB.getFinalIVValue());
+    ICmpInst::Predicate LoopPredicate = LB.getCanonicalPredicate();
+    std::string BranchPredicate;
+    switch (LoopPredicate) {
+    case ICmpInst::ICMP_EQ:
+      BranchPredicate = " == ";
+      break;
+    case ICmpInst::ICMP_NE:
+      BranchPredicate = " != ";
+      break;
+    case ICmpInst::ICMP_ULE:
+    case ICmpInst::ICMP_SLE:
+      BranchPredicate = " < ";
+      break;
+    case ICmpInst::ICMP_UGE:
+    case ICmpInst::ICMP_SGE:
+      BranchPredicate = " > ";
+      break;
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_SLT:
+      BranchPredicate = " <= ";
+      break;
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_SGT:
+      BranchPredicate = " >= ";
+      break;
+    default:
+      llvm_unreachable("Illegal ICmp predicate");
+    }
+    DEBUG(errs() << "Found a Loop Bounds Object!\n";
+          errs() << "IV: " << *IV << "\n";
+          errs() << "StartValue: " << *StartValue << "\n";
+          errs() << "StepInstruction: " << *StepInstruction << "\n";
+          errs() << "StepValue: " << *StepValue << "\n";
+          errs() << "FinalValue: " << *FinalValue << "\n";
+          errs() << "Branch Predicate: " << BranchPredicate << "\n";
+          errs() << "Direction: "
+                 << ((LB.getDirection() ==
+                      Loop::LoopBounds::Direction::Increasing)
+                         ? "increasing"
+                         : "decreasing")
+                 << "\n";)
+
+    std::string startStr;
+    if (ConstantInt *startConst = dyn_cast<ConstantInt>(StartValue)) {
+      startStr = std::to_string(startConst->getSExtValue());
+    } else {
+      startStr = GetValueName(StartValue);
+    }
+    std::string finalStr;
+    if (ConstantInt *finalConst = dyn_cast<ConstantInt>(FinalValue)) {
+      finalStr = std::to_string(finalConst->getSExtValue());
+    } else {
+      finalStr = GetValueName(FinalValue);
+    }
+    std::string stepStr;
+    if (ConstantInt *stepConst = dyn_cast<ConstantInt>(StepValue)) {
+      stepStr = std::to_string(stepConst->getSExtValue());
+    } else {
+      stepStr = GetValueName(StepValue);
+    }
 
-		Out << "\n  for ( " << IVName << " = " << startStr << "; " 
-			<< IVName << BranchPredicate << finalStr << "; " 
-			<< IVName << " = " << IVName << " + " << stepStr << ") {\n";
+    DEBUG(errs() << "\n  for ( " << IVName << " = " << startStr << "; "
+                 << IVName << BranchPredicate << finalStr << "; " << IVName
+                 << " = " << IVName << " + " << stepStr << ") {\n";)
 
-	} else {
-		llvm_unreachable("No Loop Bounds!");
-    DEBUG(errs() << "could not find a loop bounds object, searching for bounds manually!\n");
+    Out << "\n  for ( " << IVName << " = " << startStr << "; " << IVName
+        << BranchPredicate << finalStr << "; " << IVName << " = " << IVName
+        << " + " << stepStr << ") {\n";
+
+  } else {
+    llvm_unreachable("No Loop Bounds!");
+    DEBUG(errs() << "could not find a loop bounds object, searching for bounds "
+                    "manually!\n");
     auto *ExitingBlock = L->getExitingBlock();
     DEBUG(errs() << "Exiting Block: " << ExitingBlock->getName() << "\n");
     auto *ExitingBranch = ExitingBlock->getTerminator();
     DEBUG(errs() << "Exiting Branch: " << *ExitingBranch << "\n");
-		Value *StartValue = ID.getStartValue();
-		const SCEV *Step = ID.getStep();
-		//  unsigned IterationCount = SE->getSmallConstantMaxTripCount(L); 
-
-		std::string IVOp;
-
-		if (const SCEVConstant *stepConst = dyn_cast<SCEVConstant>(Step)) {
-			if(stepConst->getAPInt().isNonNegative()) {
-				IVOp = " + ";  
-			}
-		}
+    Value *StartValue = ID.getStartValue();
+    const SCEV *Step = ID.getStep();
+    //  unsigned IterationCount = SE->getSmallConstantMaxTripCount(L);
 
+    std::string IVOp;
 
-		std::string BranchPredicate;
-		ICmpInst *BranchCondition = dyn_cast<ICmpInst>(dyn_cast<BranchInst>(ExitingBranch)->getCondition());
-		switch(BranchCondition->getPredicate()) {
-			case ICmpInst::ICMP_EQ:  BranchPredicate = " != "; break;
-			case ICmpInst::ICMP_NE:  BranchPredicate = " == "; break;
-			case ICmpInst::ICMP_ULE:
-			case ICmpInst::ICMP_SLE: BranchPredicate = " > "; break;
-			case ICmpInst::ICMP_UGE:
-			case ICmpInst::ICMP_SGE: BranchPredicate = " < "; break;
-			case ICmpInst::ICMP_ULT:
-			case ICmpInst::ICMP_SLT: BranchPredicate = " >= "; break;
-			case ICmpInst::ICMP_UGT:
-			case ICmpInst::ICMP_SGT: BranchPredicate = " <= "; break;
-			default: llvm_unreachable("Illegal ICmp predicate");
-		}
+    if (const SCEVConstant *stepConst = dyn_cast<SCEVConstant>(Step)) {
+      if (stepConst->getAPInt().isNonNegative()) {
+        IVOp = " + ";
+      }
+    }
 
-		DEBUG(errs() << "Branch Condition: " << *BranchCondition << "\n");
-
-		std::string compLHS, compRHS;
-		Value *CondOp1 = BranchCondition->getOperand(0);
-		DEBUG(errs() << "CondOp1: " << *CondOp1 << "\n");
-		if (Constant *constOp1 = dyn_cast<Constant>(CondOp1)) {
-			DEBUG(errs() << "Condition Operand is a constant, inserting it as is.\n");
-			compLHS = (constOp1->getUniqueInteger()).toString(10,1);
-		} else {
-			DEBUG(errs() << "Condition Operand is not a constant, ");
-			if(traverseUseDefChain(dyn_cast<Instruction>(CondOp1), InductionVariable)) {
-				DEBUG(errs() << "it is the IV.\n");
-				compLHS = GetValueName(IV);
-			} else {
-				DEBUG(errs() << "it is another variable.\n");
-				compLHS = GetValueName(CondOp1);
-			}
-		}
-		Value *CondOp2 = BranchCondition->getOperand(1);
-		DEBUG(errs() << "CondOp2: " << *CondOp2 << "\n");
-		if (Constant *constOp2 = dyn_cast<Constant>(CondOp2)) {
-			DEBUG(errs() << "Condition Operand is a constant, inserting it as is.\n");
-			compRHS = (constOp2->getUniqueInteger()).toString(10,1);
-		} else {
-			DEBUG(errs() << "Condition Operand is not a constant.\n");
-			if(traverseUseDefChain(dyn_cast<Instruction>(CondOp2), InductionVariable)) {
-				DEBUG(errs() << "It is the IV.\n");
-				compRHS = GetValueName(IV);
-			} else {
-				DEBUG(errs() << "It is another variable.\n");
-				compRHS = GetValueName(CondOp2);
-			}
-		}
+    std::string BranchPredicate;
+    ICmpInst *BranchCondition =
+        dyn_cast<ICmpInst>(dyn_cast<BranchInst>(ExitingBranch)->getCondition());
+    switch (BranchCondition->getPredicate()) {
+    case ICmpInst::ICMP_EQ:
+      BranchPredicate = " != ";
+      break;
+    case ICmpInst::ICMP_NE:
+      BranchPredicate = " == ";
+      break;
+    case ICmpInst::ICMP_ULE:
+    case ICmpInst::ICMP_SLE:
+      BranchPredicate = " > ";
+      break;
+    case ICmpInst::ICMP_UGE:
+    case ICmpInst::ICMP_SGE:
+      BranchPredicate = " < ";
+      break;
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_SLT:
+      BranchPredicate = " >= ";
+      break;
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_SGT:
+      BranchPredicate = " <= ";
+      break;
+    default:
+      llvm_unreachable("Illegal ICmp predicate");
+    }
 
-		std::string startStr; 
-		if (Constant *startConst = dyn_cast<Constant>(StartValue)) {
-			startStr = (startConst->getUniqueInteger()).toString(10,1);
-		} else {
-			startStr = GetValueName(StartValue);
-		}
+    DEBUG(errs() << "Branch Condition: " << *BranchCondition << "\n");
 
+    std::string compLHS, compRHS;
+    Value *CondOp1 = BranchCondition->getOperand(0);
+    DEBUG(errs() << "CondOp1: " << *CondOp1 << "\n");
+    if (Constant *constOp1 = dyn_cast<Constant>(CondOp1)) {
+      DEBUG(errs() << "Condition Operand is a constant, inserting it as is.\n");
+      compLHS = (constOp1->getUniqueInteger()).toString(10, 1);
+    } else {
+      DEBUG(errs() << "Condition Operand is not a constant, ");
+      if (traverseUseDefChain(dyn_cast<Instruction>(CondOp1),
+                              InductionVariable)) {
+        DEBUG(errs() << "it is the IV.\n");
+        compLHS = GetValueName(IV);
+      } else {
+        DEBUG(errs() << "it is another variable.\n");
+        compLHS = GetValueName(CondOp1);
+      }
+    }
+    Value *CondOp2 = BranchCondition->getOperand(1);
+    DEBUG(errs() << "CondOp2: " << *CondOp2 << "\n");
+    if (Constant *constOp2 = dyn_cast<Constant>(CondOp2)) {
+      DEBUG(errs() << "Condition Operand is a constant, inserting it as is.\n");
+      compRHS = (constOp2->getUniqueInteger()).toString(10, 1);
+    } else {
+      DEBUG(errs() << "Condition Operand is not a constant.\n");
+      if (traverseUseDefChain(dyn_cast<Instruction>(CondOp2),
+                              InductionVariable)) {
+        DEBUG(errs() << "It is the IV.\n");
+        compRHS = GetValueName(IV);
+      } else {
+        DEBUG(errs() << "It is another variable.\n");
+        compRHS = GetValueName(CondOp2);
+      }
+    }
 
-		DEBUG(errs() << "  for ( " << IVName << " = " << startStr << "; " 
-				    << compLHS << BranchPredicate << compRHS << "; " 
-				    << IVName << " = " << IVName << IVOp << *Step << ") {\n");
+    std::string startStr;
+    if (Constant *startConst = dyn_cast<Constant>(StartValue)) {
+      startStr = (startConst->getUniqueInteger()).toString(10, 1);
+    } else {
+      startStr = GetValueName(StartValue);
+    }
 
-			Out << "\n  for ( " << IVName << " = " << startStr << "; " 
-			<< compLHS << BranchPredicate << compRHS << "; " 
-			<< IVName << " = " << IVName << IVOp << *Step << ") {\n";
-	}
+    DEBUG(errs() << "  for ( " << IVName << " = " << startStr << "; " << compLHS
+                 << BranchPredicate << compRHS << "; " << IVName << " = "
+                 << IVName << IVOp << *Step << ") {\n");
 
+    Out << "\n  for ( " << IVName << " = " << startStr << "; " << compLHS
+        << BranchPredicate << compRHS << "; " << IVName << " = " << IVName
+        << IVOp << *Step << ") {\n";
+  }
 
-	BasicBlock *BB = L->getHeader();
-	//  printBBorLoop(BB);
-	printBasicBlock(BB);
-	//  Loop *BBLoop = LI->getLoopFor(BB);
-	//  if (BBLoop == L)
-	//    printBasicBlock(BB);
-	//  else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L)
-	//    printLoop(BBLoop);
-
-	//  Out << "  do {     /* Syntactic loop '" << L->getHeader()->getName()
-	//    << "' to make GCC happy */\n";
-	//  for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) {
-	//    BasicBlock *BB = L->getBlocks()[i];
-	//    Loop *BBLoop = LI->getLoopFor(BB);
-	//    if (BBLoop == L)
-	//      printBasicBlock(BB);
-	//    else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L)
-	//      printLoop(BBLoop);
-	//  }
-	//  Out << "  } \n";
+  BasicBlock *BB = L->getHeader();
+  //  printBBorLoop(BB);
+  printBasicBlock(BB);
+  //  Loop *BBLoop = LI->getLoopFor(BB);
+  //  if (BBLoop == L)
+  //    printBasicBlock(BB);
+  //  else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L)
+  //    printLoop(BBLoop);
+
+  //  Out << "  do {     /* Syntactic loop '" << L->getHeader()->getName()
+  //    << "' to make GCC happy */\n";
+  //  for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) {
+  //    BasicBlock *BB = L->getBlocks()[i];
+  //    Loop *BBLoop = LI->getLoopFor(BB);
+  //    if (BBLoop == L)
+  //      printBasicBlock(BB);
+  //    else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L)
+  //      printLoop(BBLoop);
+  //  }
+  //  Out << "  } \n";
 }
 
 void CWriter::printBasicBlock(BasicBlock *BB) {
-	DEBUG(errs() << "\n\nProcessing Basic Block: " << BB->getName() << "\n");
-	Out << "\n\n/* Processing Basic Block: " << BB->getName() << " */\n";
-
-	// Don't print the label for the basic block if there are no uses, or if
-	// the only terminator use is the predecessor basic block's terminator.
-	// We have to scan the use list because PHI nodes use basic blocks too but
-	// do not require a label to be generated.
-	//
-	bool NeedsLabel = false;
-	for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
-		if (isGotoCodeNecessary(*PI, BB)) {
-			NeedsLabel = true;
-			break;
-		}
+  DEBUG(errs() << "\n\nProcessing Basic Block: " << BB->getName() << "\n");
+  Out << "\n\n/* Processing Basic Block: " << BB->getName() << " */\n";
 
-	//  if (NeedsLabel) Out << "/* " << GetValueName(BB) << ": */\n";
-	Out << "/* " << GetValueName(BB) << ": */\n";
-
-	// Output all of the instructions in the basic block...
-	for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E;
-			++II) {
-		Instruction *I = &*II;
-		DEBUG(errs() << "*********Processing: " << *I << "\n");
-		bool skip = false;
-		for(Use &U : I->operands()) {
-			Value *v = U.get();
-			if(PHINode *PN = dyn_cast<PHINode>(v)) {
-				if (LInductionVars.find(PN) != LInductionVars.end()) {
-					bool UserPHI = false;
-					bool UserCMP = false;
-					bool UserOTHER = false;
-					DEBUG(errs() << "Instruction uses induction variable\n");
-					for (User *IUser : I->users()) {
-						if (Instruction *UserInst = dyn_cast<Instruction>(IUser)) {
-							DEBUG(errs() << "User: " << *UserInst << "\n");
-							if (dyn_cast<PHINode>(UserInst)) {
-								UserPHI = true;
-							} else if (dyn_cast<ICmpInst>(UserInst)) {
-								UserCMP = true;
-							} else {
-								UserOTHER = true;
-							}
-							//              skip = true;
-							//              break;
-						}
-					}
-					if (UserPHI && UserCMP && !UserOTHER) {
-						skip = true;
-					}
-				}
-			}
-			if (skip)
-				break;
-		}
-		if(skip){ 
-			DEBUG(errs() << "Skipping instruction that increments Induction Variable!\n");
-			Out << "/* Skipped induction variable use: " << *I << " */\n";
-			continue;
-		}
-		if(PHINode *PN = dyn_cast<PHINode>(I)) {
-			if (LInductionVars.find(PN) != LInductionVars.end()) { 
-				DEBUG(errs() << "Skipping PHINode for Induction Variable!\n");
-				Out << "/* PHINode of induction variable was here */\n";
-				continue;
-			}
-		}
-		if (!isInlinableInst(*II) && !isDirectAlloca(&*II)) {
-			if (!isEmptyType(II->getType()) &&
-					!isInlineAsm(*II))
-				outputLValue(&*II);
-			else
-				Out << "  ";
-			writeInstComputationInline(*II);
-			Out << ";\n";
-		} else {
-			DEBUG(errs() << "Skipping inlinable or direct alloca!\n");
-		}
-	}
+  // Don't print the label for the basic block if there are no uses, or if
+  // the only terminator use is the predecessor basic block's terminator.
+  // We have to scan the use list because PHI nodes use basic blocks too but
+  // do not require a label to be generated.
+  //
+  bool NeedsLabel = false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    if (isGotoCodeNecessary(*PI, BB)) {
+      NeedsLabel = true;
+      break;
+    }
 
-	// Don't emit prefix or suffix for the terminator.
-	visit(*BB->getTerminator());
-}
+  //  if (NeedsLabel) Out << "/* " << GetValueName(BB) << ": */\n";
+  Out << "/* " << GetValueName(BB) << ": */\n";
+
+  // Output all of the instructions in the basic block...
+  for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E; ++II) {
+    Instruction *I = &*II;
+    DEBUG(errs() << "*********Processing: " << *I << "\n");
+    bool skip = false;
+    for (Use &U : I->operands()) {
+      Value *v = U.get();
+      if (PHINode *PN = dyn_cast<PHINode>(v)) {
+        if (LInductionVars.find(PN) != LInductionVars.end()) {
+          bool UserPHI = false;
+          bool UserCMP = false;
+          bool UserOTHER = false;
+          DEBUG(errs() << "Instruction uses induction variable\n");
+          for (User *IUser : I->users()) {
+            if (Instruction *UserInst = dyn_cast<Instruction>(IUser)) {
+              DEBUG(errs() << "User: " << *UserInst << "\n");
+              if (dyn_cast<PHINode>(UserInst)) {
+                UserPHI = true;
+              } else if (dyn_cast<ICmpInst>(UserInst)) {
+                UserCMP = true;
+              } else {
+                UserOTHER = true;
+              }
+              //              skip = true;
+              //              break;
+            }
+          }
+          if (UserPHI && UserCMP && !UserOTHER) {
+            skip = true;
+          }
+        }
+      }
+      if (skip)
+        break;
+    }
+    if (skip) {
+      DEBUG(errs()
+            << "Skipping instruction that increments Induction Variable!\n");
+      Out << "/* Skipped induction variable use: " << *I << " */\n";
+      continue;
+    }
+    if (PHINode *PN = dyn_cast<PHINode>(I)) {
+      if (LInductionVars.find(PN) != LInductionVars.end()) {
+        DEBUG(errs() << "Skipping PHINode for Induction Variable!\n");
+        Out << "/* PHINode of induction variable was here */\n";
+        continue;
+      }
+    }
+    if (!isInlinableInst(*II) && !isDirectAlloca(&*II)) {
+      if (!isEmptyType(II->getType()) && !isInlineAsm(*II))
+        outputLValue(&*II);
+      else
+        Out << "  ";
+      writeInstComputationInline(*II);
+      Out << ";\n";
+    } else {
+      DEBUG(errs() << "Skipping inlinable or direct alloca!\n");
+    }
+  }
 
+  // Don't emit prefix or suffix for the terminator.
+  visit(*BB->getTerminator());
+}
 
 // Specific Instruction type classes... note that all of the casts are
 // necessary because we use the instruction classes as opaque types...
 //
 void CWriter::visitReturnInst(ReturnInst &I) {
-	// If this is a struct return function, return the temporary struct.
-	bool isStructReturn = I.getParent()->getParent()->hasStructRetAttr();
+  // If this is a struct return function, return the temporary struct.
+  bool isStructReturn = I.getParent()->getParent()->hasStructRetAttr();
 
-	if (isStructReturn) {
-		Out << "  return StructReturn;\n";
-		return;
-	}
+  if (isStructReturn) {
+    Out << "  return StructReturn;\n";
+    return;
+  }
 
-	// Don't output a void return if this is the last basic block in the function
-	// unless that would make the basic block empty
-	if (I.getNumOperands() == 0 &&
-			&*--I.getParent()->getParent()->end() == I.getParent() &&
-			&*I.getParent()->begin() != &I) {
-		return;
-	}
+  // Don't output a void return if this is the last basic block in the function
+  // unless that would make the basic block empty
+  if (I.getNumOperands() == 0 &&
+      &*--I.getParent()->getParent()->end() == I.getParent() &&
+      &*I.getParent()->begin() != &I) {
+    return;
+  }
 
-	Out << "  return";
-	if (I.getNumOperands()) {
-		Out << ' ';
-		writeOperand(I.getOperand(0), ContextCasted);
-	}
-	Out << ";\n";
+  Out << "  return";
+  if (I.getNumOperands()) {
+    Out << ' ';
+    writeOperand(I.getOperand(0), ContextCasted);
+  }
+  Out << ";\n";
 }
 
 void CWriter::visitSwitchInst(SwitchInst &SI) {
-	Value* Cond = SI.getCondition();
-	unsigned NumBits = cast<IntegerType>(Cond->getType())->getBitWidth();
-
-	if (SI.getNumCases() == 0) { // unconditional branch
-		printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
-		printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
-		Out << "\n";
-
-	} else if (NumBits <= 64) { // model as a switch statement
-		Out << "  switch (";
-		writeOperand(Cond);
-		Out << ") {\n  default:\n";
-		printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
-		printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
-
-
-		// CHECK: Needs much testing
-		for (auto Case : SI.cases()) {
-			ConstantInt* CaseVal = Case.getCaseValue();
-			BasicBlock* Succ = Case.getCaseSuccessor();
-			Out << "  case ";
-			writeOperand(CaseVal);
-			Out << ":\n";
-			printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
-			if (isGotoCodeNecessary(SI.getParent(), Succ))
-				printBranchToBlock(SI.getParent(), Succ, 2);
-			else
-				Out << "    break;\n";
-		}
-		Out << "  }\n";
-
-	} else { // model as a series of if statements
-		Out << "  ";
-		// CHECK: Needs much testing
-		for (auto Case : SI.cases()) {
-			Out << "if (";
-			ConstantInt* CaseVal = Case.getCaseValue();
-			BasicBlock* Succ = Case.getCaseSuccessor();
-			ICmpInst *icmp = new ICmpInst(CmpInst::ICMP_EQ, Cond, CaseVal);
-			visitICmpInst(*icmp);
-			delete icmp;
-			Out << ") {\n";
-			printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
-			printBranchToBlock(SI.getParent(), Succ, 2);
-			Out << "  } else ";
-		}
-		Out << "{\n";
-		printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
-		printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
-		Out << "  }\n";
-	}
-	Out << "\n";
+  Value *Cond = SI.getCondition();
+  unsigned NumBits = cast<IntegerType>(Cond->getType())->getBitWidth();
+
+  if (SI.getNumCases() == 0) { // unconditional branch
+    printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2);
+    printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
+    Out << "\n";
+
+  } else if (NumBits <= 64) { // model as a switch statement
+    Out << "  switch (";
+    writeOperand(Cond);
+    Out << ") {\n  default:\n";
+    printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2);
+    printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
+
+    // CHECK: Needs much testing
+    for (auto Case : SI.cases()) {
+      ConstantInt *CaseVal = Case.getCaseValue();
+      BasicBlock *Succ = Case.getCaseSuccessor();
+      Out << "  case ";
+      writeOperand(CaseVal);
+      Out << ":\n";
+      printPHICopiesForSuccessor(SI.getParent(), Succ, 2);
+      if (isGotoCodeNecessary(SI.getParent(), Succ))
+        printBranchToBlock(SI.getParent(), Succ, 2);
+      else
+        Out << "    break;\n";
+    }
+    Out << "  }\n";
+
+  } else { // model as a series of if statements
+    Out << "  ";
+    // CHECK: Needs much testing
+    for (auto Case : SI.cases()) {
+      Out << "if (";
+      ConstantInt *CaseVal = Case.getCaseValue();
+      BasicBlock *Succ = Case.getCaseSuccessor();
+      ICmpInst *icmp = new ICmpInst(CmpInst::ICMP_EQ, Cond, CaseVal);
+      visitICmpInst(*icmp);
+      delete icmp;
+      Out << ") {\n";
+      printPHICopiesForSuccessor(SI.getParent(), Succ, 2);
+      printBranchToBlock(SI.getParent(), Succ, 2);
+      Out << "  } else ";
+    }
+    Out << "{\n";
+    printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2);
+    printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
+    Out << "  }\n";
+  }
+  Out << "\n";
 }
 
 void CWriter::visitIndirectBrInst(IndirectBrInst &IBI) {
-	Out << "  goto *(void*)(";
-	writeOperand(IBI.getOperand(0));
-	Out << ");\n";
+  Out << "  goto *(void*)(";
+  writeOperand(IBI.getOperand(0));
+  Out << ");\n";
 }
 
 void CWriter::visitUnreachableInst(UnreachableInst &I) {
-	Out << "  __builtin_unreachable();\n\n";
+  Out << "  __builtin_unreachable();\n\n";
 }
 
 bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) {
-	/// FIXME: This should be reenabled, but loop reordering safe!!
-	return true;
+  /// FIXME: This should be reenabled, but loop reordering safe!!
+  return true;
 
-	if (std::next(Function::iterator(From)) != Function::iterator(To))
-		return true;  // Not the direct successor, we need a goto.
+  if (std::next(Function::iterator(From)) != Function::iterator(To))
+    return true; // Not the direct successor, we need a goto.
 
-	//isa<SwitchInst>(From->getTerminator())
+  // isa<SwitchInst>(From->getTerminator())
 
-	if (LI->getLoopFor(From) != LI->getLoopFor(To))
-		return true;
-	return false;
+  if (LI->getLoopFor(From) != LI->getLoopFor(To))
+    return true;
+  return false;
 }
 
-void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock,
-		BasicBlock *Successor,
-		unsigned Indent) {
-	Out << "/* Printing PHIs for " << CurBlock->getName() << "->" << Successor->getName() << " */\n";
-	DEBUG(errs() << "/* Printing PHIs for " << CurBlock->getName() << "->" << Successor->getName() << " */\n");
-	for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) {
-		PHINode *PN = cast<PHINode>(I);
-		if(LInductionVars.find(PN) == LInductionVars.end()) {
-			Out << "/* Printing phi node: " << *PN << " */\n";
-			DEBUG(errs() << "/* Printing phi node: " << *PN << " */\n");
-			// Now we have to do the printing.
-			Value *IV = PN->getIncomingValueForBlock(CurBlock);
-			if (!isa<UndefValue>(IV) && !isEmptyType(IV->getType())) {
-				Out << std::string(Indent, ' ');
-				Out << "  " << GetValueName(&*I) << "__PHI_TEMPORARY = ";
-				writeOperand(IV, ContextCasted);
-				Out << ";   /* for PHI node */\n";
-			}
-		} else {
-			Out << "/* Skipping (indvar) phi node: " << *PN << " */\n";
-			DEBUG(errs() << "/* Skipping (indvar) phi node: " << *PN << " */\n");
-		}
-	}
+void CWriter::printPHICopiesForSuccessor(BasicBlock *CurBlock,
+                                         BasicBlock *Successor,
+                                         unsigned Indent) {
+  Out << "/* Printing PHIs for " << CurBlock->getName() << "->"
+      << Successor->getName() << " */\n";
+  DEBUG(errs() << "/* Printing PHIs for " << CurBlock->getName() << "->"
+               << Successor->getName() << " */\n");
+  for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    if (LInductionVars.find(PN) == LInductionVars.end()) {
+      Out << "/* Printing phi node: " << *PN << " */\n";
+      DEBUG(errs() << "/* Printing phi node: " << *PN << " */\n");
+      // Now we have to do the printing.
+      Value *IV = PN->getIncomingValueForBlock(CurBlock);
+      if (!isa<UndefValue>(IV) && !isEmptyType(IV->getType())) {
+        Out << std::string(Indent, ' ');
+        Out << "  " << GetValueName(&*I) << "__PHI_TEMPORARY = ";
+        writeOperand(IV, ContextCasted);
+        Out << ";   /* for PHI node */\n";
+      }
+    } else {
+      Out << "/* Skipping (indvar) phi node: " << *PN << " */\n";
+      DEBUG(errs() << "/* Skipping (indvar) phi node: " << *PN << " */\n");
+    }
+  }
 }
 
 void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ,
-		unsigned Indent) {
-	if (isGotoCodeNecessary(CurBB, Succ)) {
-		Out << std::string(Indent, ' ') << "  goto ";
-		writeOperand(Succ);
-		Out << ";\n";
-	}
+                                 unsigned Indent) {
+  if (isGotoCodeNecessary(CurBB, Succ)) {
+    Out << std::string(Indent, ' ') << "  goto ";
+    writeOperand(Succ);
+    Out << ";\n";
+  }
 }
 
-void CWriter::printBBorLoop (BasicBlock *BB) {
-	DEBUG(errs() << "\nPrinting: " << BB->getName() << "\n");
-	Out << "\n/* Printing: " << BB->getName() << " */\n";
-	if(VisitedBlocks.find(BB)!=VisitedBlocks.end() && ReplicateBlocks.find(BB)==ReplicateBlocks.end()) {
-		DEBUG(errs() << "This BB has already been printed and is not marked for replication! exiting!\n");
-		Out << "/* This BB has already been printed and is not marked for replication! exiting! */\n";
-	} else if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == BB) {
-		DEBUG(errs() << "Reached block that is top of stack, return instead!\n");
-		Out << "/* " << BB->getName() << " is top of stack, return instead! */\n";
-		//    ImmPostDommBlocks.pop();
-	} else {
-		VisitedBlocks.insert(BB);
-		if(Loop *LL = LI->getLoopFor(BB)) {
-			if (LL->getHeader() == BB)
-				printLoop(LL);
-			else 
-				printBasicBlock(BB);
-		} else {
-			printBasicBlock(BB);
-		}
-	}
-
+void CWriter::printBBorLoop(BasicBlock *BB) {
+  DEBUG(errs() << "\nPrinting: " << BB->getName() << "\n");
+  Out << "\n/* Printing: " << BB->getName() << " */\n";
+  if (VisitedBlocks.find(BB) != VisitedBlocks.end() &&
+      ReplicateBlocks.find(BB) == ReplicateBlocks.end()) {
+    DEBUG(errs() << "This BB has already been printed and is not marked for "
+                    "replication! exiting!\n");
+    Out << "/* This BB has already been printed and is not marked for "
+           "replication! exiting! */\n";
+  } else if (!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == BB) {
+    DEBUG(errs() << "Reached block that is top of stack, return instead!\n");
+    Out << "/* " << BB->getName() << " is top of stack, return instead! */\n";
+    //    ImmPostDommBlocks.pop();
+  } else {
+    VisitedBlocks.insert(BB);
+    if (Loop *LL = LI->getLoopFor(BB)) {
+      if (LL->getHeader() == BB)
+        printLoop(LL);
+      else
+        printBasicBlock(BB);
+    } else {
+      printBasicBlock(BB);
+    }
+  }
 }
 
-bool CWriter::compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm) {
-	CompVisitedBlocks.insert(CurrBlock);
-	DEBUG(errs() << "--Comparing " << CurrBlock->getName() << " with " << CompBlock->getName() << "\n");
-	if (CurrBlock == ImmPostDomm) {
-		DEBUG(errs() << "----Reached Post Dominator, returning false!\n");
-		return false;
-	} else if (CurrBlock == CompBlock) {
-		DEBUG(errs() << "----Found a match! " << CurrBlock->getName() << " == " << CompBlock->getName() << "\n");
-		return true;
-	} else {
-		bool res = false;
-		for (auto succ: successors(CurrBlock)) {
-			if (CompVisitedBlocks.find(succ) == CompVisitedBlocks.end()) {
-				DEBUG(errs() << "----Visiting successor " << succ->getName() << " of " << CurrBlock->getName() << "\n");
-				res = res || compareBlocks(succ, CompBlock, ImmPostDomm);
-			} else {
-				DEBUG(errs() << "----Skipping successor " << succ->getName() << " of " << CurrBlock->getName() << "\n");
-			}
-		}
-		return res;
-	}
+bool CWriter::compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock,
+                            BasicBlock *ImmPostDomm) {
+  CompVisitedBlocks.insert(CurrBlock);
+  DEBUG(errs() << "--Comparing " << CurrBlock->getName() << " with "
+               << CompBlock->getName() << "\n");
+  if (CurrBlock == ImmPostDomm) {
+    DEBUG(errs() << "----Reached Post Dominator, returning false!\n");
+    return false;
+  } else if (CurrBlock == CompBlock) {
+    DEBUG(errs() << "----Found a match! " << CurrBlock->getName()
+                 << " == " << CompBlock->getName() << "\n");
+    return true;
+  } else {
+    bool res = false;
+    for (auto succ : successors(CurrBlock)) {
+      if (CompVisitedBlocks.find(succ) == CompVisitedBlocks.end()) {
+        DEBUG(errs() << "----Visiting successor " << succ->getName() << " of "
+                     << CurrBlock->getName() << "\n");
+        res = res || compareBlocks(succ, CompBlock, ImmPostDomm);
+      } else {
+        DEBUG(errs() << "----Skipping successor " << succ->getName() << " of "
+                     << CurrBlock->getName() << "\n");
+      }
+    }
+    return res;
+  }
 }
 
-bool CWriter::findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm) {
-	if (CompBlock == ImmPostDomm) {
-		DEBUG(errs() << "Reached PostDomm; returning!\n");
-		return false;
-	}
-	FindVisitedBlocks.insert(CompBlock);
-	DEBUG(errs() << "Finding match between " << CompBlock->getName() << " & " << CurrBlock->getName() << "\n");
-	bool compareResult = compareBlocks(CurrBlock, CompBlock, ImmPostDomm);
-	CompVisitedBlocks.clear();
-	if (compareResult){
-		DEBUG(errs() << "Match found, marking " << CompBlock->getName() << " for replication!\n");
-		// Flag for replication
-		ReplicateBlocks.insert(CompBlock);
-		return true;
-	} else {
-		bool res = false;
-		for (auto succ: successors(CompBlock)) {
-			if(FindVisitedBlocks.find(succ) == FindVisitedBlocks.end()) {
-				DEBUG(errs() << "Visiting successor " << succ->getName() << " of " << CompBlock->getName() << "\n");
-				res = res || findMatch(CurrBlock, succ, ImmPostDomm);
-				if (res == true) break;
-			} else {
-				DEBUG(errs() << "Skipping successor " << succ->getName() << " of " << CompBlock->getName() << "\n");
-			}
-		}
-		return res;
-	}
+bool CWriter::findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock,
+                        BasicBlock *ImmPostDomm) {
+  if (CompBlock == ImmPostDomm) {
+    DEBUG(errs() << "Reached PostDomm; returning!\n");
+    return false;
+  }
+  FindVisitedBlocks.insert(CompBlock);
+  DEBUG(errs() << "Finding match between " << CompBlock->getName() << " & "
+               << CurrBlock->getName() << "\n");
+  bool compareResult = compareBlocks(CurrBlock, CompBlock, ImmPostDomm);
+  CompVisitedBlocks.clear();
+  if (compareResult) {
+    DEBUG(errs() << "Match found, marking " << CompBlock->getName()
+                 << " for replication!\n");
+    // Flag for replication
+    ReplicateBlocks.insert(CompBlock);
+    return true;
+  } else {
+    bool res = false;
+    for (auto succ : successors(CompBlock)) {
+      if (FindVisitedBlocks.find(succ) == FindVisitedBlocks.end()) {
+        DEBUG(errs() << "Visiting successor " << succ->getName() << " of "
+                     << CompBlock->getName() << "\n");
+        res = res || findMatch(CurrBlock, succ, ImmPostDomm);
+        if (res == true)
+          break;
+      } else {
+        DEBUG(errs() << "Skipping successor " << succ->getName() << " of "
+                     << CompBlock->getName() << "\n");
+      }
+    }
+    return res;
+  }
 }
 // Branch instruction printing - Avoid printing out a branch to a basic block
 // that immediately succeeds the current one.
 //
 void CWriter::visitBranchInst(BranchInst &I) {
-	errs() << "Visiting Branch Instruction: " << I <<"\n";
-	Out << "\n/* Branch: " << I << " */\n";
-
-	if (I.isConditional()) {
-		BasicBlock *BB0 = I.getSuccessor(0);
-		BasicBlock *BB1 = I.getSuccessor(1);
-		BasicBlock *ImmPostDomm = PDT->findNearestCommonDominator(BB0,BB1);
-
-		// Iterate over all BBs in then & else to find a matching BB
-		// If found, mark it for replication
-		if (ImmPostDomm != BB1 && ImmPostDomm != BB0) {
-			findMatch(BB0, BB1, ImmPostDomm);
-			FindVisitedBlocks.clear();
-		}
-		if(Loop *L = LI->getLoopFor(I.getParent())) {
-			if(L == LI->getLoopFor(BB0) && !(L == LI->getLoopFor(BB1))) {
-				errs() << "This is a loop branch!\n";
-				Out << "/* This is a loop branch! */\n";
-				//BB0 is in the loop. Print it if it hsn't been printed
-				if(VisitedBlocks.find(BB0) != VisitedBlocks.end()) {
-					errs() << "Branching back to header: " << BB0->getName() << "\n";
-					errs() << "This is the end of the loop, closing!\n";
-					Out << "/* Branching back to header: " << BB0->getName() << " */\n";
-					Out << "/* Closing loop! */\n";
-					//BB0 is the loop header. CLose the loop then print BB1.
-					printPHICopiesForSuccessor (I.getParent(), BB0, 2);
-					Out << " }\n";
-					printPHICopiesForSuccessor (I.getParent(), BB1, 2);
-					printBBorLoop(BB1);
-				} else {
-					errs() << "Not branching to header! Branching to: " << BB0->getName() << "\n";
-					//BB0 is not the loop header. That means we are entering loop body
-
-					llvm_unreachable("loop branch unhandled!\n");
-				}
-			} else if(L == LI->getLoopFor(BB1) && !(L == LI->getLoopFor(BB0))) {
-				errs() << "This is a loop branch!\n";
-				Out << "/* This is a loop branch! */\n";
-				if(VisitedBlocks.find(BB1) != VisitedBlocks.end()) {
-					errs() << "Branching back to header: " << BB1->getName() << "\n";
-					errs() << "This is the end of the loop, closing!\n";
-					Out << "/* Branching back to header: " << BB1->getName() << " */\n";
-					Out << "/* Closing loop! */\n";
-					//BB0 is the loop header. CLose the loop then print BB1.
-					printPHICopiesForSuccessor (I.getParent(), BB1, 2);
-					Out << " }\n";
-					printPHICopiesForSuccessor (I.getParent(), BB0, 2);
-					printBBorLoop(BB0);
-				} else {
-					errs() << "Not branching to header! Branching to: " << BB1->getName() << "\n";
-					//BB1 is not the loop header. That means we are entering loop body
-					llvm_unreachable("loop branch unhandled!\n");
-				}
-			} else {
-				errs() << "This is a conditional statement within a loop!\n";
-				Out << "/* This is a conditional statement within a loop! */\n";
-				errs() << ImmPostDomm->getName() << " is the immediate post dominator of " << BB0->getName() << " and " << BB1->getName() << "\n";
-				if(VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) {
-					errs() << "Not pushing " << ImmPostDomm->getName() << " because it has already been visited!\n";
-				} else {
-					errs() << "Pushing " << ImmPostDomm->getName() << " onto stack!\n";
-					ImmPostDommBlocks.push(ImmPostDomm);
-				}
-
-				bool noElse = false;
-				if(BB1 == ImmPostDomm) {
-					noElse = true;
-				}
-				Out << "  if (";
-				writeOperand(I.getCondition(), ContextCasted);
-				Out << ") { /* " << I << "*/\n";
-				printPHICopiesForSuccessor (I.getParent(), BB0, 2);
-				printBBorLoop(BB0);
-				errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n";
-				Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n";
-				if (!noElse) {
-					errs() << "Printing else!\n";
-					Out << "  } else { /*" << I << "*/\n";
-					printPHICopiesForSuccessor (I.getParent(), BB1, 2);
-					ElseBlocks.push(BB1);
-					ElseBranches.push(&I);
-					printBBorLoop(BB1);
-					errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n";
-					errs() << "Check to see if else block is closed!\n";
-					Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n" ;
-					Out << "/* Check to see if else block is closed! */\n" ;
-					if(!ElseBlocks.empty() && ElseBlocks.top() == BB1) {
-						errs() << "Else block not closed, need to close braces!\n";
-						Out << "/* Else block not closed, need to close braces! */\n" ;
-						Out << "} /* closing " << *(ElseBranches.top()) << " */\n";
-						ElseBranches.pop();
-						ElseBlocks.pop();
-					}
-					if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == ImmPostDomm) {
-						errs() << "Will now pop post dom them handle it!\n";
-						ImmPostDommBlocks.pop();
-						printBBorLoop(ImmPostDomm);
-					} else {
-						errs() << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n";
-					}
-				} else {
-					errs() << "No else block. Adding one for phis, then moving to " << BB1->getName() << "!\n";
-					Out << "/* (3913) No else block. Adding one for phis, then moving to " << BB1->getName() << "! */\n";
-					Out << "  } /* closing " << I << "*/\n";
-					errs() << "Will now pop post dom them handle it!\n";
-					ImmPostDommBlocks.pop();
-					Out << "else {\n";
-					printPHICopiesForSuccessor (I.getParent(), BB1, 2);
-					Out << "}\n";
-					printBBorLoop(BB1);
-				}
-			}
-		} else {
-			errs() << "This is a conditional statement!\n";
-			errs() << ImmPostDomm->getName() << " is the immediate post dominator of " << BB0->getName() << " and " << BB1->getName() << "\n";
-			if(VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) {
-				errs() << "Not pushing " << ImmPostDomm->getName() << " because it has already been visited!\n";
-			} else {
-				errs() << "Pushing " << ImmPostDomm->getName() << " onto stack!\n";
-				ImmPostDommBlocks.push(ImmPostDomm);
-			}
-			bool noElse = false;
-			if(BB1 == ImmPostDomm) {
-				noElse = true;
-			}
-			Out << "  if (";
-			writeOperand(I.getCondition(), ContextCasted);
-			Out << ") { /* " << I << "*/\n";
-			printPHICopiesForSuccessor (I.getParent(), BB0, 2);
-			printBBorLoop(BB0);
-			errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n";
-			Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n" ;
-			if (!noElse) {
-				errs() << "Printing else!\n";
-				Out << "/* Printing else! */\n" ;
-				Out << "  } else { /*" << I << "*/\n";
-				printPHICopiesForSuccessor (I.getParent(), BB1, 2);
-				ElseBlocks.push(BB1);
-				ElseBranches.push(&I);
-				printBBorLoop(BB1);
-				errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n";
-				errs() << "Check to see if else block is closed!\n";
-				Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n";
-				Out << "/* Check to see if else block is closed! */\n";
-				if(!ElseBlocks.empty() && ElseBlocks.top() == BB1) {
-					errs() << "Else block not closed, need to close braces!\n";
-					Out << "/* Else block not closed, need to close braces! */\n";
-					Out << "} /* closing " << *(ElseBranches.top()) << " */\n";
-					ElseBranches.pop();
-					ElseBlocks.pop();
-				}
-				if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == ImmPostDomm) {
-					errs() << "Will now pop post dom them handle it!\n";
-					ImmPostDommBlocks.pop();
-					printBBorLoop(ImmPostDomm);
-				} else {
-					errs() << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n";
-				}
-			} else {
-				errs() << "No else block. Adding one for phis, then moving to " << BB1->getName() << "!\n";
-				Out << "/* (3985) No else block. Adding one for phis, then moving to " << BB1->getName() << "! */\n";
-				Out << "  } /* closing " << I << "*/\n";
-				errs() << "Will now pop post dom them handle it!\n";
-				ImmPostDommBlocks.pop();
-				Out << "else {\n";
-				printPHICopiesForSuccessor (I.getParent(), BB1, 2);
-				Out << "}\n";
-				printBBorLoop(BB1);
-			}
-		}
-	} else {
-		errs() << "This is an unconditional branch!\n";
-		BasicBlock *BB = I.getSuccessor(0); 
-		printPHICopiesForSuccessor (I.getParent(), BB, 2);
-		if (!ElseBlocks.empty() && I.getParent() == ElseBlocks.top()) {
-			errs() << "Branch marks end of else block, need to close braces!\n";
-			Out << "/* Branch marks end of else block, need to close braces! */\n";
-			Out << "} /* closing " << *(ElseBranches.top()) << " */\n";
-			ElseBranches.pop();
-			ElseBlocks.pop();
-		}
-		printBBorLoop(BB);
-	}
-	Out << "\n";
+  errs() << "Visiting Branch Instruction: " << I << "\n";
+  Out << "\n/* Branch: " << I << " */\n";
+
+  if (I.isConditional()) {
+    BasicBlock *BB0 = I.getSuccessor(0);
+    BasicBlock *BB1 = I.getSuccessor(1);
+    BasicBlock *ImmPostDomm = PDT->findNearestCommonDominator(BB0, BB1);
+
+    // Iterate over all BBs in then & else to find a matching BB
+    // If found, mark it for replication
+    if (ImmPostDomm != BB1 && ImmPostDomm != BB0) {
+      findMatch(BB0, BB1, ImmPostDomm);
+      FindVisitedBlocks.clear();
+    }
+    if (Loop *L = LI->getLoopFor(I.getParent())) {
+      if (L == LI->getLoopFor(BB0) && !(L == LI->getLoopFor(BB1))) {
+        errs() << "This is a loop branch!\n";
+        Out << "/* This is a loop branch! */\n";
+        // BB0 is in the loop. Print it if it hasn't been printed
+        if (VisitedBlocks.find(BB0) != VisitedBlocks.end()) {
+          errs() << "Branching back to header: " << BB0->getName() << "\n";
+          errs() << "This is the end of the loop, closing!\n";
+          Out << "/* Branching back to header: " << BB0->getName() << " */\n";
+          Out << "/* Closing loop! */\n";
+          // BB0 is the loop header. Close the loop then print BB1.
+          printPHICopiesForSuccessor(I.getParent(), BB0, 2);
+          Out << " }\n";
+          printPHICopiesForSuccessor(I.getParent(), BB1, 2);
+          printBBorLoop(BB1);
+        } else {
+          errs() << "Not branching to header! Branching to: " << BB0->getName()
+                 << "\n";
+          // BB0 is not the loop header. That means we are entering loop body
+
+          llvm_unreachable("loop branch unhandled!\n");
+        }
+      } else if (L == LI->getLoopFor(BB1) && !(L == LI->getLoopFor(BB0))) {
+        errs() << "This is a loop branch!\n";
+        Out << "/* This is a loop branch! */\n";
+        if (VisitedBlocks.find(BB1) != VisitedBlocks.end()) {
+          errs() << "Branching back to header: " << BB1->getName() << "\n";
+          errs() << "This is the end of the loop, closing!\n";
+          Out << "/* Branching back to header: " << BB1->getName() << " */\n";
+          Out << "/* Closing loop! */\n";
+          // BB1 is the loop header. Close the loop then print BB0.
+          printPHICopiesForSuccessor(I.getParent(), BB1, 2);
+          Out << " }\n";
+          printPHICopiesForSuccessor(I.getParent(), BB0, 2);
+          printBBorLoop(BB0);
+        } else {
+          errs() << "Not branching to header! Branching to: " << BB1->getName()
+                 << "\n";
+          // BB1 is not the loop header. That means we are entering loop body
+          llvm_unreachable("loop branch unhandled!\n");
+        }
+      } else {
+        errs() << "This is a conditional statement within a loop!\n";
+        Out << "/* This is a conditional statement within a loop! */\n";
+        errs() << ImmPostDomm->getName()
+               << " is the immediate post dominator of " << BB0->getName()
+               << " and " << BB1->getName() << "\n";
+        if (VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) {
+          errs() << "Not pushing " << ImmPostDomm->getName()
+                 << " because it has already been visited!\n";
+        } else {
+          errs() << "Pushing " << ImmPostDomm->getName() << " onto stack!\n";
+          ImmPostDommBlocks.push(ImmPostDomm);
+        }
+
+        bool noElse = false;
+        if (BB1 == ImmPostDomm) {
+          noElse = true;
+        }
+        Out << "  if (";
+        writeOperand(I.getCondition(), ContextCasted);
+        Out << ") { /* " << I << "*/\n";
+        printPHICopiesForSuccessor(I.getParent(), BB0, 2);
+        printBBorLoop(BB0);
+        errs() << "Back to handling " << I.getParent()->getName() << ": " << I
+               << "\n";
+        Out << "/* Back to handling " << I.getParent()->getName() << ": " << I
+            << " */\n";
+        if (!noElse) {
+          errs() << "Printing else!\n";
+          Out << "  } else { /*" << I << "*/\n";
+          printPHICopiesForSuccessor(I.getParent(), BB1, 2);
+          ElseBlocks.push(BB1);
+          ElseBranches.push(&I);
+          printBBorLoop(BB1);
+          errs() << "Back to handling " << I.getParent()->getName() << ": " << I
+                 << "\n";
+          errs() << "Check to see if else block is closed!\n";
+          Out << "/* Back to handling " << I.getParent()->getName() << ": " << I
+              << " */\n";
+          Out << "/* Check to see if else block is closed! */\n";
+          if (!ElseBlocks.empty() && ElseBlocks.top() == BB1) {
+            errs() << "Else block not closed, need to close braces!\n";
+            Out << "/* Else block not closed, need to close braces! */\n";
+            Out << "} /* closing " << *(ElseBranches.top()) << " */\n";
+            ElseBranches.pop();
+            ElseBlocks.pop();
+          }
+          if (!ImmPostDommBlocks.empty() &&
+              ImmPostDommBlocks.top() == ImmPostDomm) {
+            errs() << "Will now pop post dom them handle it!\n";
+            ImmPostDommBlocks.pop();
+            printBBorLoop(ImmPostDomm);
+          } else {
+            errs()
+                << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n";
+          }
+        } else {
+          errs() << "No else block. Adding one for phis, then moving to "
+                 << BB1->getName() << "!\n";
+          Out << "/* (3913) No else block. Adding one for phis, then moving to "
+              << BB1->getName() << "! */\n";
+          Out << "  } /* closing " << I << "*/\n";
+          errs() << "Will now pop post dom them handle it!\n";
+          ImmPostDommBlocks.pop();
+          Out << "else {\n";
+          printPHICopiesForSuccessor(I.getParent(), BB1, 2);
+          Out << "}\n";
+          printBBorLoop(BB1);
+        }
+      }
+    } else {
+      errs() << "This is a conditional statement!\n";
+      errs() << ImmPostDomm->getName() << " is the immediate post dominator of "
+             << BB0->getName() << " and " << BB1->getName() << "\n";
+      if (VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) {
+        errs() << "Not pushing " << ImmPostDomm->getName()
+               << " because it has already been visited!\n";
+      } else {
+        errs() << "Pushing " << ImmPostDomm->getName() << " onto stack!\n";
+        ImmPostDommBlocks.push(ImmPostDomm);
+      }
+      bool noElse = false;
+      if (BB1 == ImmPostDomm) {
+        noElse = true;
+      }
+      Out << "  if (";
+      writeOperand(I.getCondition(), ContextCasted);
+      Out << ") { /* " << I << "*/\n";
+      printPHICopiesForSuccessor(I.getParent(), BB0, 2);
+      printBBorLoop(BB0);
+      errs() << "Back to handling " << I.getParent()->getName() << ": " << I
+             << "\n";
+      Out << "/* Back to handling " << I.getParent()->getName() << ": " << I
+          << " */\n";
+      if (!noElse) {
+        errs() << "Printing else!\n";
+        Out << "/* Printing else! */\n";
+        Out << "  } else { /*" << I << "*/\n";
+        printPHICopiesForSuccessor(I.getParent(), BB1, 2);
+        ElseBlocks.push(BB1);
+        ElseBranches.push(&I);
+        printBBorLoop(BB1);
+        errs() << "Back to handling " << I.getParent()->getName() << ": " << I
+               << "\n";
+        errs() << "Check to see if else block is closed!\n";
+        Out << "/* Back to handling " << I.getParent()->getName() << ": " << I
+            << " */\n";
+        Out << "/* Check to see if else block is closed! */\n";
+        if (!ElseBlocks.empty() && ElseBlocks.top() == BB1) {
+          errs() << "Else block not closed, need to close braces!\n";
+          Out << "/* Else block not closed, need to close braces! */\n";
+          Out << "} /* closing " << *(ElseBranches.top()) << " */\n";
+          ElseBranches.pop();
+          ElseBlocks.pop();
+        }
+        if (!ImmPostDommBlocks.empty() &&
+            ImmPostDommBlocks.top() == ImmPostDomm) {
+          errs() << "Will now pop post dom them handle it!\n";
+          ImmPostDommBlocks.pop();
+          printBBorLoop(ImmPostDomm);
+        } else {
+          errs()
+              << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n";
+        }
+      } else {
+        errs() << "No else block. Adding one for phis, then moving to "
+               << BB1->getName() << "!\n";
+        Out << "/* (3985) No else block. Adding one for phis, then moving to "
+            << BB1->getName() << "! */\n";
+        Out << "  } /* closing " << I << "*/\n";
+        errs() << "Will now pop post dom them handle it!\n";
+        ImmPostDommBlocks.pop();
+        Out << "else {\n";
+        printPHICopiesForSuccessor(I.getParent(), BB1, 2);
+        Out << "}\n";
+        printBBorLoop(BB1);
+      }
+    }
+  } else {
+    errs() << "This is an unconditional branch!\n";
+    BasicBlock *BB = I.getSuccessor(0);
+    printPHICopiesForSuccessor(I.getParent(), BB, 2);
+    if (!ElseBlocks.empty() && I.getParent() == ElseBlocks.top()) {
+      errs() << "Branch marks end of else block, need to close braces!\n";
+      Out << "/* Branch marks end of else block, need to close braces! */\n";
+      Out << "} /* closing " << *(ElseBranches.top()) << " */\n";
+      ElseBranches.pop();
+      ElseBlocks.pop();
+    }
+    printBBorLoop(BB);
+  }
+  Out << "\n";
 }
 
 // PHI nodes get copied into temporary values at the end of predecessor basic
 // blocks.  We now need to copy these temporary values into the REAL value for
 // the PHI.
 void CWriter::visitPHINode(PHINode &I) {
-	if (LInductionVars.find(&I) == LInductionVars.end()) {
-		writeOperand(&I);
-		Out << "__PHI_TEMPORARY";
-	} 
-	else { 
-		DEBUG(errs() << "Skipping PHI node for induction variable!\n"); 
-	}
+  if (LInductionVars.find(&I) == LInductionVars.end()) {
+    writeOperand(&I);
+    Out << "__PHI_TEMPORARY";
+  } else {
+    DEBUG(errs() << "Skipping PHI node for induction variable!\n");
+  }
 }
 
-
 // NOTE: Moving LLVM-4 Binary Op functions here
 bool isNeg(const Value *V) {
-	if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-		if (Bop->getOpcode() == Instruction::Sub)
-			if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0)))
-				return C->isNegativeZeroValue();
-	return false;
+  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
+    if (Bop->getOpcode() == Instruction::Sub)
+      if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0)))
+        return C->isNegativeZeroValue();
+  return false;
 }
 
 bool isFNeg(const Value *V, bool IgnoreZeroSign) {
-	if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-		if (Bop->getOpcode() == Instruction::FSub)
-			if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0))) {
-				if (!IgnoreZeroSign)
-					IgnoreZeroSign = cast<Instruction>(V)->hasNoSignedZeros();
-				return !IgnoreZeroSign ? C->isNegativeZeroValue() : C->isZeroValue();
-			}
-	return false;
+  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
+    if (Bop->getOpcode() == Instruction::FSub)
+      if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0))) {
+        if (!IgnoreZeroSign)
+          IgnoreZeroSign = cast<Instruction>(V)->hasNoSignedZeros();
+        return !IgnoreZeroSign ? C->isNegativeZeroValue() : C->isZeroValue();
+      }
+  return false;
 }
 
-
 Value *getNegArgument(Value *BinOp) {
-	return cast<BinaryOperator>(BinOp)->getOperand(1);
+  return cast<BinaryOperator>(BinOp)->getOperand(1);
 }
 
 const Value *getNegArgument(const Value *BinOp) {
-	return getNegArgument(const_cast<Value*>(BinOp));
+  return getNegArgument(const_cast<Value *>(BinOp));
 }
 
 Value *getFNegArgument(Value *BinOp) {
-	return cast<BinaryOperator>(BinOp)->getOperand(1);
+  return cast<BinaryOperator>(BinOp)->getOperand(1);
 }
 
 const Value *getFNegArgument(const Value *BinOp) {
-	return getFNegArgument(const_cast<Value*>(BinOp));
+  return getFNegArgument(const_cast<Value *>(BinOp));
 }
 
 static inline bool isConstantAllOnes(const Value *V) {
-	if (const Constant *C = dyn_cast<Constant>(V))
-		return C->isAllOnesValue();
-	return false;
+  if (const Constant *C = dyn_cast<Constant>(V))
+    return C->isAllOnesValue();
+  return false;
 }
 
 bool isNot(const Value *V) {
-	if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-		return (Bop->getOpcode() == Instruction::Xor &&
-				(isConstantAllOnes(Bop->getOperand(1)) ||
-				 isConstantAllOnes(Bop->getOperand(0))));
-	return false;
+  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
+    return (Bop->getOpcode() == Instruction::Xor &&
+            (isConstantAllOnes(Bop->getOperand(1)) ||
+             isConstantAllOnes(Bop->getOperand(0))));
+  return false;
 }
 
-
 Value *getNotArgument(Value *BinOp) {
-	assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!");
-	BinaryOperator *BO = cast<BinaryOperator>(BinOp);
-	Value *Op0 = BO->getOperand(0);
-	Value *Op1 = BO->getOperand(1);
-	if (isConstantAllOnes(Op0)) return Op1;
-
-	assert(isConstantAllOnes(Op1));
-	return Op0;
+  assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!");
+  BinaryOperator *BO = cast<BinaryOperator>(BinOp);
+  Value *Op0 = BO->getOperand(0);
+  Value *Op1 = BO->getOperand(1);
+  if (isConstantAllOnes(Op0))
+    return Op1;
+
+  assert(isConstantAllOnes(Op1));
+  return Op0;
 }
 
 const Value *getNotArgument(const Value *BinOp) {
-	return getNotArgument(const_cast<Value*>(BinOp));
+  return getNotArgument(const_cast<Value *>(BinOp));
 }
 
+void CWriter::visitBinaryOperator(BinaryOperator &I) {
+  // binary instructions, shift instructions, setCond instructions.
+  assert(!I.getType()->isPointerTy());
+  DEBUG(errs() << "visiting binary operator!\n");
+
+  //  // We must cast the results of binary operations which might be promoted.
+  //  bool needsCast = false;
+  //  if ((I.getType() == Type::getInt8Ty(I.getContext())) ||
+  //      (I.getType() == Type::getInt16Ty(I.getContext()))
+  //      || (I.getType() == Type::getFloatTy(I.getContext()))) {
+  //    // types too small to work with directly
+  //    needsCast = true;
+  //  } else if (I.getType()->getPrimitiveSizeInBits() > 64) {
+  //    // types too big to work with directly
+  //    needsCast = true;
+  //  }
+  //  bool shouldCast;
+  //  bool castIsSigned;
+  //  opcodeNeedsCast(I.getOpcode(), shouldCast, castIsSigned);
+  //
+  //  if (I.getType()->isVectorTy() || needsCast || shouldCast) {
+  //
+  // DEBUG(
+  //       if(needsCast) errs() << "****Needs Cast: \n" << I << "\n";
+  //       else if(shouldCast) errs() << "****Should Cast: \n" << I << "\n";
+  //       else if(I.getType()->isVectorTy()) errs() << "****Is Vector Type: \n"
+  //       << I << "\n";
+  //       );
+  //
+  //    Type *VTy = I.getOperand(0)->getType();
+  //    unsigned opcode;
+  //    if (BinaryOperator::isNeg(&I)) {
+  //      opcode = BinaryNeg;
+  //      Out << "llvm_neg_";
+  //      printTypeString(Out, VTy, false);
+  //      Out << "(";
+  //      writeOperand(BinaryOperator::getNegArgument(&I), ContextCasted);
+  //    } else if (BinaryOperator::isFNeg(&I)) {
+  //      opcode = BinaryNeg;
+  //      Out << "llvm_neg_";
+  //      printTypeString(Out, VTy, false);
+  //      Out << "(";
+  //      writeOperand(BinaryOperator::getFNegArgument(&I), ContextCasted);
+  //    } else if (BinaryOperator::isNot(&I)) {
+  //      opcode = BinaryNot;
+  //      Out << "llvm_not_";
+  //      printTypeString(Out, VTy, false);
+  //      Out << "(";
+  //      writeOperand(BinaryOperator::getNotArgument(&I), ContextCasted);
+  //    } else {
+  //      opcode = I.getOpcode();
+  //      Out << "llvm_" << Instruction::getOpcodeName(opcode) << "_";
+  //      printTypeString(Out, VTy, false);
+  //      Out << "(";
+  //      writeOperand(I.getOperand(0), ContextCasted);
+  //      Out << ", ";
+  //      writeOperand(I.getOperand(1), ContextCasted);
+  //    }
+  //    Out << ")";
+  //    InlineOpDeclTypes.insert(std::pair<unsigned, Type*>(opcode, VTy));
+  //    return;
+  //  }
 
+  // If this is a negation operation, print it out as such.  For FP, we don't
+  // want to print "-0.0 - X".
 
+  // if (BinaryOperator::isNeg(&I)) {
+  if (isNeg(&I)) {
+    Out << "-(";
+    writeOperand(getNegArgument(&I));
+    Out << ")";
+  }
+  // else if (BinaryOperator::isFNeg(&I)) {
+  else if (isFNeg(&I, true)) {
+    Out << "-(";
+    writeOperand(getFNegArgument(&I));
+    Out << ")";
+  } else if (isNot(&I)) {
+    Out << "~(";
+    writeOperand(getNotArgument(&I));
+    Out << ")";
+  } else if (I.getOpcode() == Instruction::FRem) {
+    // Output a call to fmod/fmodf instead of emitting a%b
+    if (I.getType() == Type::getFloatTy(I.getContext()))
+      Out << "fmodf(";
+    else if (I.getType() == Type::getDoubleTy(I.getContext()))
+      Out << "fmod(";
+    else // all 3 flavors of long double
+      Out << "fmodl(";
+    writeOperand(I.getOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getOperand(1), ContextCasted);
+    Out << ")";
+  } else {
 
+    // Write out the cast of the instruction's value back to the proper type
+    // if necessary.
+    //    bool NeedsClosingParens = writeInstructionCast(I);
 
+    // Certain instructions require the operand to be forced to a specific type
+    // so we use writeOperandWithCast here instead of writeOperand. Similarly
+    // below for operand 1
+    writeOperandWithCast(I.getOperand(0), I.getOpcode());
 
-void CWriter::visitBinaryOperator(BinaryOperator &I) {
-	// binary instructions, shift instructions, setCond instructions.
-	assert(!I.getType()->isPointerTy());
-	DEBUG(errs() << "visiting binary operator!\n" );
-
-	//  // We must cast the results of binary operations which might be promoted.
-	//  bool needsCast = false;
-	//  if ((I.getType() == Type::getInt8Ty(I.getContext())) ||
-	//      (I.getType() == Type::getInt16Ty(I.getContext()))
-	//      || (I.getType() == Type::getFloatTy(I.getContext()))) {
-	//    // types too small to work with directly
-	//    needsCast = true;
-	//  } else if (I.getType()->getPrimitiveSizeInBits() > 64) {
-	//    // types too big to work with directly
-	//    needsCast = true;
-	//  }
-	//  bool shouldCast;
-	//  bool castIsSigned;
-	//  opcodeNeedsCast(I.getOpcode(), shouldCast, castIsSigned);
-	//
-	//  if (I.getType()->isVectorTy() || needsCast || shouldCast) {
-	//
-			// DEBUG(
-			//       if(needsCast) errs() << "****Needs Cast: \n" << I << "\n";
-			//       else if(shouldCast) errs() << "****Should Cast: \n" << I << "\n";
-			//       else if(I.getType()->isVectorTy()) errs() << "****Is Vector Type: \n" << I << "\n";
-			//       );
-			//
-			//    Type *VTy = I.getOperand(0)->getType();
-			//    unsigned opcode;
-			//    if (BinaryOperator::isNeg(&I)) {
-			//      opcode = BinaryNeg;
-			//      Out << "llvm_neg_";
-			//      printTypeString(Out, VTy, false);
-			//      Out << "(";
-			//      writeOperand(BinaryOperator::getNegArgument(&I), ContextCasted);
-			//    } else if (BinaryOperator::isFNeg(&I)) {
-			//      opcode = BinaryNeg;
-			//      Out << "llvm_neg_";
-			//      printTypeString(Out, VTy, false);
-			//      Out << "(";
-			//      writeOperand(BinaryOperator::getFNegArgument(&I), ContextCasted);
-			//    } else if (BinaryOperator::isNot(&I)) {
-			//      opcode = BinaryNot;
-			//      Out << "llvm_not_";
-			//      printTypeString(Out, VTy, false);
-			//      Out << "(";
-			//      writeOperand(BinaryOperator::getNotArgument(&I), ContextCasted);
-			//    } else {
-			//      opcode = I.getOpcode();
-			//      Out << "llvm_" << Instruction::getOpcodeName(opcode) << "_";
-			//      printTypeString(Out, VTy, false);
-			//      Out << "(";
-			//      writeOperand(I.getOperand(0), ContextCasted);
-			//      Out << ", ";
-			//      writeOperand(I.getOperand(1), ContextCasted);
-			//    }
-			//    Out << ")";
-			//    InlineOpDeclTypes.insert(std::pair<unsigned, Type*>(opcode, VTy));
-			//    return;
-			//  }
-
-			// If this is a negation operation, print it out as such.  For FP, we don't
-			// want to print "-0.0 - X".
-
-			//if (BinaryOperator::isNeg(&I)) {
-		if (isNeg(&I)) {
-			Out << "-(";
-			writeOperand(getNegArgument(&I));
-			Out << ")";
-		}
-	//else if (BinaryOperator::isFNeg(&I)) {
-		else if (isFNeg(&I, true)) {
-			Out << "-(";
-			writeOperand(getFNegArgument(&I));
-			Out << ")";
-		} else if (isNot(&I)) {
-			Out << "~(";
-			writeOperand(getNotArgument(&I));
-			Out << ")";
-		} else if (I.getOpcode() == Instruction::FRem) {
-			// Output a call to fmod/fmodf instead of emitting a%b
-			if (I.getType() == Type::getFloatTy(I.getContext()))
-				Out << "fmodf(";
-			else if (I.getType() == Type::getDoubleTy(I.getContext()))
-				Out << "fmod(";
-			else  // all 3 flavors of long double
-				Out << "fmodl(";
-			writeOperand(I.getOperand(0), ContextCasted);
-			Out << ", ";
-			writeOperand(I.getOperand(1), ContextCasted);
-			Out << ")";
-		} else {
-
-			// Write out the cast of the instruction's value back to the proper type
-			// if necessary.
-			//    bool NeedsClosingParens = writeInstructionCast(I);
-
-			// Certain instructions require the operand to be forced to a specific type
-			// so we use writeOperandWithCast here instead of writeOperand. Similarly
-			// below for operand 1
-			writeOperandWithCast(I.getOperand(0), I.getOpcode());
-
-			switch (I.getOpcode()) {
-				case Instruction::Add:
-				case Instruction::FAdd: Out << " + "; break;
-				case Instruction::Sub:
-				case Instruction::FSub: Out << " - "; break;
-				case Instruction::Mul:
-				case Instruction::FMul: Out << " * "; break;
-				case Instruction::URem:
-				case Instruction::SRem:
-				case Instruction::FRem: Out << " % "; break;
-				case Instruction::UDiv:
-				case Instruction::SDiv:
-				case Instruction::FDiv: Out << " / "; break;
-				case Instruction::And:  Out << " & "; break;
-				case Instruction::Or:   Out << " | "; break;
-				case Instruction::Xor:  Out << " ^ "; break;
-				case Instruction::Shl : Out << " << "; break;
-				case Instruction::LShr:
-				case Instruction::AShr: Out << " >> "; break;
-				default:
+    switch (I.getOpcode()) {
+    case Instruction::Add:
+    case Instruction::FAdd:
+      Out << " + ";
+      break;
+    case Instruction::Sub:
+    case Instruction::FSub:
+      Out << " - ";
+      break;
+    case Instruction::Mul:
+    case Instruction::FMul:
+      Out << " * ";
+      break;
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+      Out << " % ";
+      break;
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+      Out << " / ";
+      break;
+    case Instruction::And:
+      Out << " & ";
+      break;
+    case Instruction::Or:
+      Out << " | ";
+      break;
+    case Instruction::Xor:
+      Out << " ^ ";
+      break;
+    case Instruction::Shl:
+      Out << " << ";
+      break;
+    case Instruction::LShr:
+    case Instruction::AShr:
+      Out << " >> ";
+      break;
+    default:
 #ifndef NDEBUG
-																errs() << "Invalid operator type!" << I;
+      errs() << "Invalid operator type!" << I;
 #endif
-																llvm_unreachable(0);
-			}
+      llvm_unreachable(0);
+    }
 
-			writeOperandWithCast(I.getOperand(1), I.getOpcode());
-			//    if (NeedsClosingParens)
-			//      Out << "))";
-		}
+    writeOperandWithCast(I.getOperand(1), I.getOpcode());
+    //    if (NeedsClosingParens)
+    //      Out << "))";
+  }
 }
 
 void CWriter::visitICmpInst(ICmpInst &I) {
-	if (I.getType()->isVectorTy()
-			|| I.getOperand(0)->getType()->getPrimitiveSizeInBits() > 64) {
-		Out << "llvm_icmp_" << getCmpPredicateName(I.getPredicate()) << "_";
-		printTypeString(Out, I.getOperand(0)->getType(), I.isSigned());
-		Out << "(";
-		writeOperand(I.getOperand(0), ContextCasted);
-		Out << ", ";
-		writeOperand(I.getOperand(1), ContextCasted);
-		Out << ")";
-		if (VectorType *VTy = dyn_cast<VectorType>(I.getOperand(0)->getType())) {
-			CmpDeclTypes.insert(std::pair<CmpInst::Predicate, VectorType*>(I.getPredicate(), VTy));
-			TypedefDeclTypes.insert(I.getType()); // insert type not necessarily visible above
-		}
-		return;
-	}
+  if (I.getType()->isVectorTy() ||
+      I.getOperand(0)->getType()->getPrimitiveSizeInBits() > 64) {
+    Out << "llvm_icmp_" << getCmpPredicateName(I.getPredicate()) << "_";
+    printTypeString(Out, I.getOperand(0)->getType(), I.isSigned());
+    Out << "(";
+    writeOperand(I.getOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getOperand(1), ContextCasted);
+    Out << ")";
+    if (VectorType *VTy = dyn_cast<VectorType>(I.getOperand(0)->getType())) {
+      CmpDeclTypes.insert(
+          std::pair<CmpInst::Predicate, VectorType *>(I.getPredicate(), VTy));
+      TypedefDeclTypes.insert(
+          I.getType()); // insert type not necessarily visible above
+    }
+    return;
+  }
 
-	// Write out the cast of the instruction's value back to the proper type
-	// if necessary.
-	bool NeedsClosingParens = writeInstructionCast(I);
-
-	// Certain icmp predicate require the operand to be forced to a specific type
-	// so we use writeOperandWithCast here instead of writeOperand. Similarly
-	// below for operand 1
-	writeOperandWithCast(I.getOperand(0), I);
-
-	switch (I.getPredicate()) {
-		case ICmpInst::ICMP_EQ:  Out << " == "; break;
-		case ICmpInst::ICMP_NE:  Out << " != "; break;
-		case ICmpInst::ICMP_ULE:
-		case ICmpInst::ICMP_SLE: Out << " <= "; break;
-		case ICmpInst::ICMP_UGE:
-		case ICmpInst::ICMP_SGE: Out << " >= "; break;
-		case ICmpInst::ICMP_ULT:
-		case ICmpInst::ICMP_SLT: Out << " < "; break;
-		case ICmpInst::ICMP_UGT:
-		case ICmpInst::ICMP_SGT: Out << " > "; break;
-		default:
+  // Write out the cast of the instruction's value back to the proper type
+  // if necessary.
+  bool NeedsClosingParens = writeInstructionCast(I);
+
+  // Certain icmp predicates require the operand to be forced to a specific
+  // type, so we use writeOperandWithCast here instead of writeOperand.
+  // Similarly below for operand 1.
+  writeOperandWithCast(I.getOperand(0), I);
+
+  switch (I.getPredicate()) {
+  case ICmpInst::ICMP_EQ:
+    Out << " == ";
+    break;
+  case ICmpInst::ICMP_NE:
+    Out << " != ";
+    break;
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE:
+    Out << " <= ";
+    break;
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE:
+    Out << " >= ";
+    break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+    Out << " < ";
+    break;
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+    Out << " > ";
+    break;
+  default:
 #ifndef NDEBUG
-														 errs() << "Invalid icmp predicate!" << I;
+    errs() << "Invalid icmp predicate!" << I;
 #endif
-														 llvm_unreachable(0);
-	}
+    llvm_unreachable(0);
+  }
 
-	writeOperandWithCast(I.getOperand(1), I);
-	if (NeedsClosingParens)
-		Out << "))";
+  writeOperandWithCast(I.getOperand(1), I);
+  if (NeedsClosingParens)
+    Out << "))";
 }
 
 void CWriter::visitFCmpInst(FCmpInst &I) {
-	if (I.getType()->isVectorTy()) {
-		Out << "llvm_fcmp_" << getCmpPredicateName(I.getPredicate()) << "_";
-		printTypeString(Out, I.getOperand(0)->getType(), I.isSigned());
-		Out << "(";
-		writeOperand(I.getOperand(0), ContextCasted);
-		Out << ", ";
-		writeOperand(I.getOperand(1), ContextCasted);
-		Out << ")";
-		if (VectorType *VTy = dyn_cast<VectorType>(I.getOperand(0)->getType())) {
-			CmpDeclTypes.insert(std::pair<CmpInst::Predicate, VectorType*>(I.getPredicate(), VTy));
-			TypedefDeclTypes.insert(I.getType()); // insert type not necessarily visible above
-		}
-		return;
-	}
+  if (I.getType()->isVectorTy()) {
+    Out << "llvm_fcmp_" << getCmpPredicateName(I.getPredicate()) << "_";
+    printTypeString(Out, I.getOperand(0)->getType(), I.isSigned());
+    Out << "(";
+    writeOperand(I.getOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getOperand(1), ContextCasted);
+    Out << ")";
+    if (VectorType *VTy = dyn_cast<VectorType>(I.getOperand(0)->getType())) {
+      CmpDeclTypes.insert(
+          std::pair<CmpInst::Predicate, VectorType *>(I.getPredicate(), VTy));
+      TypedefDeclTypes.insert(
+          I.getType()); // insert type not necessarily visible above
+    }
+    return;
+  }
 
-	Out << "llvm_fcmp_" << getCmpPredicateName(I.getPredicate()) << "(";
-	// Write the first operand
-	writeOperand(I.getOperand(0), ContextCasted);
-	Out << ", ";
-	// Write the second operand
-	writeOperand(I.getOperand(1), ContextCasted);
-	Out << ")";
+  Out << "llvm_fcmp_" << getCmpPredicateName(I.getPredicate()) << "(";
+  // Write the first operand
+  writeOperand(I.getOperand(0), ContextCasted);
+  Out << ", ";
+  // Write the second operand
+  writeOperand(I.getOperand(1), ContextCasted);
+  Out << ")";
 }
 
-static const char * getFloatBitCastField(Type *Ty) {
-	switch (Ty->getTypeID()) {
-		default: llvm_unreachable("Invalid Type");
-		case Type::FloatTyID:  return "Float";
-		case Type::DoubleTyID: return "Double";
-		case Type::IntegerTyID: {
-															unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
-															if (NumBits <= 32)
-																return "Int32";
-															else
-																return "Int64";
-														}
-	}
+static const char *getFloatBitCastField(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  default:
+    llvm_unreachable("Invalid Type");
+  case Type::FloatTyID:
+    return "Float";
+  case Type::DoubleTyID:
+    return "Double";
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits <= 32)
+      return "Int32";
+    else
+      return "Int64";
+  }
+  }
 }
 
 void CWriter::visitCastInst(CastInst &I) {
-	DEBUG(errs() << "This is a cast instruction!\n");
-	Type *DstTy = I.getType();
-	Type *SrcTy = I.getOperand(0)->getType();
-
-	if (DstTy->isVectorTy() || SrcTy->isVectorTy()
-			|| DstTy->getPrimitiveSizeInBits() > 64
-			|| SrcTy->getPrimitiveSizeInBits() > 64) {
-		Out << "llvm_" << I.getOpcodeName() << "_";
-		printTypeString(Out, SrcTy, false);
-		Out << "_";
-		printTypeString(Out, DstTy, false);
-		Out << "(";
-		writeOperand(I.getOperand(0), ContextCasted);
-		Out << ")";
-		CastOpDeclTypes.insert(std::pair<Instruction::CastOps, std::pair<Type*, Type*> >(I.getOpcode(), std::pair<Type*, Type*>(SrcTy, DstTy)));
-		return;
-	}
+  DEBUG(errs() << "This is a cast instruction!\n");
+  Type *DstTy = I.getType();
+  Type *SrcTy = I.getOperand(0)->getType();
 
-	if (isFPIntBitCast(I)) {
-		Out << '(';
-		// These int<->float and long<->double casts need to be handled specially
-		Out << GetValueName(&I) << "__BITCAST_TEMPORARY."
-			<< getFloatBitCastField(I.getOperand(0)->getType()) << " = ";
-		writeOperand(I.getOperand(0), ContextCasted);
-		Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY."
-			<< getFloatBitCastField(I.getType());
-		Out << ')';
-		return;
-	}
+  if (DstTy->isVectorTy() || SrcTy->isVectorTy() ||
+      DstTy->getPrimitiveSizeInBits() > 64 ||
+      SrcTy->getPrimitiveSizeInBits() > 64) {
+    Out << "llvm_" << I.getOpcodeName() << "_";
+    printTypeString(Out, SrcTy, false);
+    Out << "_";
+    printTypeString(Out, DstTy, false);
+    Out << "(";
+    writeOperand(I.getOperand(0), ContextCasted);
+    Out << ")";
+    CastOpDeclTypes.insert(
+        std::pair<Instruction::CastOps, std::pair<Type *, Type *>>(
+            I.getOpcode(), std::pair<Type *, Type *>(SrcTy, DstTy)));
+    return;
+  }
+
+  if (isFPIntBitCast(I)) {
+    Out << '(';
+    // These int<->float and long<->double casts need to be handled specially
+    Out << GetValueName(&I) << "__BITCAST_TEMPORARY."
+        << getFloatBitCastField(I.getOperand(0)->getType()) << " = ";
+    writeOperand(I.getOperand(0), ContextCasted);
+    Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY."
+        << getFloatBitCastField(I.getType());
+    Out << ')';
+    return;
+  }
 
-	Out << '(';
-	printCast(I.getOpcode(), SrcTy, DstTy);
+  Out << '(';
+  printCast(I.getOpcode(), SrcTy, DstTy);
 
-	// Make a sext from i1 work by subtracting the i1 from 0 (an int).
-	if (SrcTy == Type::getInt1Ty(I.getContext()) &&
-			I.getOpcode() == Instruction::SExt)
-		Out << "0-";
+  // Make a sext from i1 work by subtracting the i1 from 0 (an int).
+  if (SrcTy == Type::getInt1Ty(I.getContext()) &&
+      I.getOpcode() == Instruction::SExt)
+    Out << "0-";
 
-	writeOperand(I.getOperand(0), ContextCasted);
+  writeOperand(I.getOperand(0), ContextCasted);
 
-	if (DstTy == Type::getInt1Ty(I.getContext()) &&
-			(I.getOpcode() == Instruction::Trunc ||
-			 I.getOpcode() == Instruction::FPToUI ||
-			 I.getOpcode() == Instruction::FPToSI ||
-			 I.getOpcode() == Instruction::PtrToInt)) {
-		// Make sure we really get a trunc to bool by anding the operand with 1
-		Out << "&1u";
-	}
-	Out << ')';
+  if (DstTy == Type::getInt1Ty(I.getContext()) &&
+      (I.getOpcode() == Instruction::Trunc ||
+       I.getOpcode() == Instruction::FPToUI ||
+       I.getOpcode() == Instruction::FPToSI ||
+       I.getOpcode() == Instruction::PtrToInt)) {
+    // Make sure we really get a trunc to bool by anding the operand with 1
+    Out << "&1u";
+  }
+  Out << ')';
 }
 
 void CWriter::visitSelectInst(SelectInst &I) {
-	Out << "llvm_select_";
-	printTypeString(Out, I.getType(), false);
-	Out << "(";
-	writeOperand(I.getCondition(), ContextCasted);
-	Out << ", ";
-	writeOperand(I.getTrueValue(), ContextCasted);
-	Out << ", ";
-	writeOperand(I.getFalseValue(), ContextCasted);
-	Out << ")";
-	SelectDeclTypes.insert(I.getType());
-	assert(I.getCondition()->getType()->isVectorTy() == I.getType()->isVectorTy()); // TODO: might be scalarty == vectorty
+  Out << "llvm_select_";
+  printTypeString(Out, I.getType(), false);
+  Out << "(";
+  writeOperand(I.getCondition(), ContextCasted);
+  Out << ", ";
+  writeOperand(I.getTrueValue(), ContextCasted);
+  Out << ", ";
+  writeOperand(I.getFalseValue(), ContextCasted);
+  Out << ")";
+  SelectDeclTypes.insert(I.getType());
+  assert(I.getCondition()->getType()->isVectorTy() ==
+         I.getType()->isVectorTy()); // TODO: might be scalarty == vectorty
 }
 
 // Returns the macro name or value of the max or min of an integer type
 // (as defined in limits.h).
 static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax,
-		raw_ostream &Out) {
-	const char* type;
-	const char* sprefix = "";
-
-	unsigned NumBits = Ty.getBitWidth();
-	if (NumBits <= 8) {
-		type = "CHAR";
-		sprefix = "S";
-	} else if (NumBits <= 16) {
-		type = "SHRT";
-	} else if (NumBits <= 32) {
-		type = "INT";
-	} else if (NumBits <= 64) {
-		type = "LLONG";
-	} else {
-		llvm_unreachable("Bit widths > 64 not implemented yet");
-	}
+                            raw_ostream &Out) {
+  const char *type;
+  const char *sprefix = "";
+
+  unsigned NumBits = Ty.getBitWidth();
+  if (NumBits <= 8) {
+    type = "CHAR";
+    sprefix = "S";
+  } else if (NumBits <= 16) {
+    type = "SHRT";
+  } else if (NumBits <= 32) {
+    type = "INT";
+  } else if (NumBits <= 64) {
+    type = "LLONG";
+  } else {
+    llvm_unreachable("Bit widths > 64 not implemented yet");
+  }
 
-	if (isSigned)
-		Out << sprefix << type << (isMax ? "_MAX" : "_MIN");
-	else
-		Out << "U" << type << (isMax ? "_MAX" : "0");
+  if (isSigned)
+    Out << sprefix << type << (isMax ? "_MAX" : "_MIN");
+  else
+    Out << "U" << type << (isMax ? "_MAX" : "0");
 }
 
 #ifndef NDEBUG
 static bool isSupportedIntegerSize(IntegerType &T) {
-	return T.getBitWidth() == 8 || T.getBitWidth() == 16 ||
-		T.getBitWidth() == 32 || T.getBitWidth() == 64 ||
-		T.getBitWidth() == 128;
+  return T.getBitWidth() == 8 || T.getBitWidth() == 16 ||
+         T.getBitWidth() == 32 || T.getBitWidth() == 64 ||
+         T.getBitWidth() == 128;
 }
 #endif
 
-void CWriter::printIntrinsicDefinition(FunctionType *funT,
-		unsigned Opcode, std::string OpName, raw_ostream &Out) {
-	Type *retT = funT->getReturnType();
-	Type *elemT = funT->getParamType(0);
-	IntegerType *elemIntT = dyn_cast<IntegerType>(elemT);
-	char i, numParams = funT->getNumParams();
-	bool isSigned;
-	switch (Opcode) {
-		default:
-			isSigned = false;
-			break;
-		case Intrinsic::sadd_with_overflow:
-		case Intrinsic::ssub_with_overflow:
-		case Intrinsic::smul_with_overflow:
-			isSigned = true;
-			break;
-	}
-	assert(numParams > 0 && numParams < 26);
+void CWriter::printIntrinsicDefinition(FunctionType *funT, unsigned Opcode,
+                                       std::string OpName, raw_ostream &Out) {
+  Type *retT = funT->getReturnType();
+  Type *elemT = funT->getParamType(0);
+  IntegerType *elemIntT = dyn_cast<IntegerType>(elemT);
+  char i, numParams = funT->getNumParams();
+  bool isSigned;
+  switch (Opcode) {
+  default:
+    isSigned = false;
+    break;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+    isSigned = true;
+    break;
+  }
+  assert(numParams > 0 && numParams < 26);
 
-	if (isa<VectorType>(retT)) {
-		// this looks general, but is only actually used for ctpop, ctlz, cttz
-		Type* *devecFunParams = (Type**)alloca(sizeof(Type*) * numParams);
-		for (i = 0; i < numParams; i++) {
-			devecFunParams[(int)i] = funT->params()[(int)i]->getScalarType();
-		}
-		FunctionType *devecFunT = FunctionType::get(funT->getReturnType()->getScalarType(),
-				makeArrayRef(devecFunParams, numParams), funT->isVarArg());
-		printIntrinsicDefinition(devecFunT, Opcode, OpName + "_devec", Out);
-	}
+  if (isa<VectorType>(retT)) {
+    // this looks general, but is only actually used for ctpop, ctlz, cttz
+    Type **devecFunParams = (Type **)alloca(sizeof(Type *) * numParams);
+    for (i = 0; i < numParams; i++) {
+      devecFunParams[(int)i] = funT->params()[(int)i]->getScalarType();
+    }
+    FunctionType *devecFunT = FunctionType::get(
+        funT->getReturnType()->getScalarType(),
+        makeArrayRef(devecFunParams, numParams), funT->isVarArg());
+    printIntrinsicDefinition(devecFunT, Opcode, OpName + "_devec", Out);
+  }
 
-	// static __forceinline Rty _llvm_op_ixx(unsigned ixx a, unsigned ixx b) {
-	//   Rty r;
-	//   <opcode here>
-	//   return r;
-	// }
-	Out << "static __forceinline ";
-	printTypeName(Out, retT);
-	Out << " ";
-	Out << OpName;
-	Out << "(";
-	for (i = 0; i < numParams; i++) {
-		switch (Opcode) {
-			// optional intrinsic validity assertion checks
-			default:
-				// default case: assume all parameters must have the same type
-				assert(elemT == funT->getParamType(i));
-				break;
-			case Intrinsic::ctlz:
-			case Intrinsic::cttz:
-			case Intrinsic::powi:
-				break;
-		}
-		printTypeNameUnaligned(Out, funT->getParamType(i), isSigned);
-		Out << " " << (char)('a' + i);
-		if (i != numParams - 1) Out << ", ";
-	}
-	Out << ") {\n  ";
-	printTypeName(Out, retT);
-	Out << " r;\n";
-
-	if (isa<VectorType>(retT)) {
-		for (i = 0; i < numParams; i++) {
-			Out << "  r.vector[" << (int)i << "] = " << OpName << "_devec(";
-			for (char j = 0; j < numParams; j++) {
-				Out << (char)('a' + j);
-				if (isa<VectorType>(funT->params()[j]))
-					Out << ".vector[" << (int)i << "]";
-				if (j != numParams - 1) Out << ", ";
-			}
-			Out << ");\n";
-		}
-	}
-	else if (elemIntT) {
-		// handle integer ops
-		assert(isSupportedIntegerSize(*elemIntT) &&
-				"CBackend does not support arbitrary size integers.");
-		switch (Opcode) {
-			default:
+  // static __forceinline Rty _llvm_op_ixx(unsigned ixx a, unsigned ixx b) {
+  //   Rty r;
+  //   <opcode here>
+  //   return r;
+  // }
+  Out << "static __forceinline ";
+  printTypeName(Out, retT);
+  Out << " ";
+  Out << OpName;
+  Out << "(";
+  for (i = 0; i < numParams; i++) {
+    switch (Opcode) {
+    // optional intrinsic validity assertion checks
+    default:
+      // default case: assume all parameters must have the same type
+      assert(elemT == funT->getParamType(i));
+      break;
+    case Intrinsic::ctlz:
+    case Intrinsic::cttz:
+    case Intrinsic::powi:
+      break;
+    }
+    printTypeNameUnaligned(Out, funT->getParamType(i), isSigned);
+    Out << " " << (char)('a' + i);
+    if (i != numParams - 1)
+      Out << ", ";
+  }
+  Out << ") {\n  ";
+  printTypeName(Out, retT);
+  Out << " r;\n";
+
+  if (VectorType *retVT = dyn_cast<VectorType>(retT)) { // one call per lane
+    for (unsigned lane = 0, e = retVT->getNumElements(); lane != e; ++lane) {
+      Out << "  r.vector[" << lane << "] = " << OpName << "_devec(";
+      for (char j = 0; j < numParams; j++) {
+        Out << (char)('a' + j);
+        if (isa<VectorType>(funT->params()[j]))
+          Out << ".vector[" << lane << "]";
+        if (j != numParams - 1)
+          Out << ", ";
+      }
+      Out << ");\n";
+    }
+  } else if (elemIntT) {
+    // handle integer ops
+    assert(isSupportedIntegerSize(*elemIntT) &&
+           "CBackend does not support arbitrary size integers.");
+    switch (Opcode) {
+    default:
 #ifndef NDEBUG
-				errs() << "Unsupported Intrinsic!" << Opcode;
+      errs() << "Unsupported Intrinsic!" << Opcode;
 #endif
-				llvm_unreachable(0);
-
-			case Intrinsic::uadd_with_overflow:
-				//   r.field0 = a + b;
-				//   r.field1 = (r.field0 < a);
-				assert(cast<StructType>(retT)->getElementType(0) == elemT);
-				Out << "  r.field0 = a + b;\n";
-				Out << "  r.field1 = (a >= -b);\n";
-				break;
-
-			case Intrinsic::sadd_with_overflow:
-				//   r.field0 = a + b;
-				//   r.field1 = (b > 0 && a > XX_MAX - b) ||
-				//              (b < 0 && a < XX_MIN - b);
-				assert(cast<StructType>(retT)->getElementType(0) == elemT);
-				Out << "  r.field0 = a + b;\n";
-				Out << "  r.field1 = (b >= 0 ? a > ";
-				printLimitValue(*elemIntT, true, true, Out);
-				Out << " - b : a < ";
-				printLimitValue(*elemIntT, true, false, Out);
-				Out << " - b);\n";
-				break;
-
-			case Intrinsic::usub_with_overflow:
-				assert(cast<StructType>(retT)->getElementType(0) == elemT);
-				Out << "  r.field0 = a - b;\n";
-				Out << "  r.field1 = (a < b);\n";
-				break;
-
-			case Intrinsic::ssub_with_overflow:
-				assert(cast<StructType>(retT)->getElementType(0) == elemT);
-				Out << "  r.field0 = a - b;\n";
-				Out << "  r.field1 = (b <= 0 ? a > ";
-				printLimitValue(*elemIntT, true, true, Out);
-				Out << " + b : a < ";
-				printLimitValue(*elemIntT, true, false, Out);
-				Out << " + b);\n";
-				break;
-
-			case Intrinsic::umul_with_overflow:
-				assert(cast<StructType>(retT)->getElementType(0) == elemT);
-				Out << "  r.field1 = LLVMMul_uov(8 * sizeof(a), &a, &b, &r.field0);\n";
-				break;
-
-			case Intrinsic::smul_with_overflow:
-				assert(cast<StructType>(retT)->getElementType(0) == elemT);
-				Out << "  r.field1 = LLVMMul_sov(8 * sizeof(a), &a, &b, &r.field0);\n";
-				break;
-
-			case Intrinsic::bswap:
-				assert(retT == elemT);
-				Out << "  LLVMFlipAllBits(8 * sizeof(a), &a, &r);\n";
-				break;
-
-			case Intrinsic::ctpop:
-				assert(retT == elemT);
-				Out << "  r = ";
-				if (retT->getPrimitiveSizeInBits() > 64)
-					Out << "llvm_ctor_u128(0, ";
-				Out << "LLVMCountPopulation(8 * sizeof(a), &a)";
-				if (retT->getPrimitiveSizeInBits() > 64)
-					Out << ")";
-				Out << ";\n";
-				break;
-
-			case Intrinsic::ctlz:
-				assert(retT == elemT);
-				Out << "  (void)b;\n  r = ";
-				if (retT->getPrimitiveSizeInBits() > 64)
-					Out << "llvm_ctor_u128(0, ";
-				Out << "LLVMCountLeadingZeros(8 * sizeof(a), &a)";
-				if (retT->getPrimitiveSizeInBits() > 64)
-					Out << ")";
-				Out << ";\n";
-				break;
-
-			case Intrinsic::cttz:
-				assert(retT == elemT);
-				Out << "  (void)b;\n  r = ";
-				if (retT->getPrimitiveSizeInBits() > 64)
-					Out << "llvm_ctor_u128(0, ";
-				Out << "LLVMCountTrailingZeros(8 * sizeof(a), &a)";
-				if (retT->getPrimitiveSizeInBits() > 64)
-					Out << ")";
-				Out << ";\n";
-				break;
-		}
+      llvm_unreachable(0);
+
+    case Intrinsic::uadd_with_overflow:
+      //   r.field0 = a + b;            (unsigned add wraps modulo 2^N)
+      //   r.field1 = (r.field0 < a);   (wrapped sum is smaller than a)
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field0 = a + b;\n";
+      Out << "  r.field1 = (r.field0 < a);\n";
+      break;
 
-	} else {
-		// handle FP ops
-		const char *suffix;
-		assert(retT == elemT);
-		if (elemT->isFloatTy() || elemT->isHalfTy()) {
-			suffix = "f";
-		} else if (elemT->isDoubleTy()) {
-			suffix = "";
-		} else if (elemT->isFP128Ty()) {
-		} else if (elemT->isX86_FP80Ty()) {
-		} else if (elemT->isPPC_FP128Ty()) {
-			suffix = "l";
-		} else {
+    case Intrinsic::sadd_with_overflow:
+      //   r.field0 = a + b;
+      //   r.field1 = (b > 0 && a > XX_MAX - b) ||
+      //              (b < 0 && a < XX_MIN - b);
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field0 = a + b;\n";
+      Out << "  r.field1 = (b >= 0 ? a > ";
+      printLimitValue(*elemIntT, true, true, Out);
+      Out << " - b : a < ";
+      printLimitValue(*elemIntT, true, false, Out);
+      Out << " - b);\n";
+      break;
+
+    case Intrinsic::usub_with_overflow:
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field0 = a - b;\n";
+      Out << "  r.field1 = (a < b);\n";
+      break;
+
+    case Intrinsic::ssub_with_overflow:
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field0 = a - b;\n";
+      Out << "  r.field1 = (b <= 0 ? a > ";
+      printLimitValue(*elemIntT, true, true, Out);
+      Out << " + b : a < ";
+      printLimitValue(*elemIntT, true, false, Out);
+      Out << " + b);\n";
+      break;
+
+    case Intrinsic::umul_with_overflow:
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field1 = LLVMMul_uov(8 * sizeof(a), &a, &b, &r.field0);\n";
+      break;
+
+    case Intrinsic::smul_with_overflow:
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field1 = LLVMMul_sov(8 * sizeof(a), &a, &b, &r.field0);\n";
+      break;
+
+    case Intrinsic::bswap:
+      assert(retT == elemT);
+      Out << "  LLVMFlipAllBits(8 * sizeof(a), &a, &r);\n";
+      break;
+
+    case Intrinsic::ctpop:
+      assert(retT == elemT);
+      Out << "  r = ";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << "llvm_ctor_u128(0, ";
+      Out << "LLVMCountPopulation(8 * sizeof(a), &a)";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << ")";
+      Out << ";\n";
+      break;
+
+    case Intrinsic::ctlz:
+      assert(retT == elemT);
+      Out << "  (void)b;\n  r = ";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << "llvm_ctor_u128(0, ";
+      Out << "LLVMCountLeadingZeros(8 * sizeof(a), &a)";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << ")";
+      Out << ";\n";
+      break;
+
+    case Intrinsic::cttz:
+      assert(retT == elemT);
+      Out << "  (void)b;\n  r = ";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << "llvm_ctor_u128(0, ";
+      Out << "LLVMCountTrailingZeros(8 * sizeof(a), &a)";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << ")";
+      Out << ";\n";
+      break;
+    }
+
+  } else {
+    // handle FP ops: pick the libm name suffix for the element type
+    const char *suffix;
+    assert(retT == elemT);
+    if (elemT->isFloatTy() || elemT->isHalfTy()) {
+      suffix = "f";
+    } else if (elemT->isDoubleTy()) {
+      suffix = "";
+    } else if (elemT->isFP128Ty() || elemT->isX86_FP80Ty() ||
+               elemT->isPPC_FP128Ty()) {
+      // all three long double flavors share the "l" suffix
+      suffix = "l";
+    } else {
 #ifndef NDEBUG
-			errs() << "Unsupported Intrinsic!" << Opcode;
+      errs() << "Unsupported Intrinsic!" << Opcode;
 #endif
-			llvm_unreachable(0);
-		}
+      llvm_unreachable(0);
+    }
 
-		switch (Opcode) {
-			default:
+    switch (Opcode) {
+    default:
 #ifndef NDEBUG
-				errs() << "Unsupported Intrinsic!" << Opcode;
+      errs() << "Unsupported Intrinsic!" << Opcode;
 #endif
-				llvm_unreachable(0);
+      llvm_unreachable(0);
 
-			case Intrinsic::ceil:
-				Out << "  r = ceil" << suffix << "(a);\n";
-				break;
-
-			case Intrinsic::fabs:
-				Out << "  r = fabs" << suffix << "(a);\n";
-				break;
+    case Intrinsic::ceil:
+      Out << "  r = ceil" << suffix << "(a);\n";
+      break;
 
-			case Intrinsic::floor:
-				Out << "  r = floor" << suffix << "(a);\n";
-				break;
+    case Intrinsic::fabs:
+      Out << "  r = fabs" << suffix << "(a);\n";
+      break;
 
-			case Intrinsic::fma:
-				Out << "  r = fma" << suffix << "(a, b, c);\n";
-				break;
+    case Intrinsic::floor:
+      Out << "  r = floor" << suffix << "(a);\n";
+      break;
 
-			case Intrinsic::fmuladd:
-				Out << "  r = a * b + c;\n";
-				break;
+    case Intrinsic::fma:
+      Out << "  r = fma" << suffix << "(a, b, c);\n";
+      break;
 
-			case Intrinsic::pow:
-			case Intrinsic::powi:
-				Out << "  r = pow" << suffix << "(a, b);\n";
-				break;
+    case Intrinsic::fmuladd:
+      Out << "  r = a * b + c;\n";
+      break;
 
-			case Intrinsic::rint:
-				Out << "  r = rint" << suffix << "(a);\n";
-				break;
+    case Intrinsic::pow:
+    case Intrinsic::powi:
+      Out << "  r = pow" << suffix << "(a, b);\n";
+      break;
 
-			case Intrinsic::sqrt:
-				Out << "  r = sqrt"  << "(a);\n";
-				break;
+    case Intrinsic::rint:
+      Out << "  r = rint" << suffix << "(a);\n";
+      break;
 
-			case Intrinsic::trunc:
-				Out << "  r = trunc" << suffix << "(a);\n";
-				break;
+    case Intrinsic::sqrt:
+      Out << "  r = sqrt"
+          << "(a);\n";
+      break;
 
-		}
-	}
+    case Intrinsic::trunc:
+      Out << "  r = trunc" << suffix << "(a);\n";
+      break;
+    }
+  }
 
-	Out << "  return r;\n}\n";
+  Out << "  return r;\n}\n";
 }
 
 void CWriter::printIntrinsicDefinition(Function &F, raw_ostream &Out) {
-	FunctionType *funT = F.getFunctionType();
-	unsigned Opcode = F.getIntrinsicID();
-	std::string OpName = GetValueName(&F);
-	printIntrinsicDefinition(funT, Opcode, OpName, Out);
+  FunctionType *funT = F.getFunctionType();
+  unsigned Opcode = F.getIntrinsicID();
+  std::string OpName = GetValueName(&F);
+  printIntrinsicDefinition(funT, Opcode, OpName, Out);
 }
 
 void CWriter::lowerIntrinsics(Function &F) {
-	// Examine all the instructions in this function to find the intrinsics that
-	// need to be lowered.
-	for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB)
-		for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
-			if (CallInst *CI = dyn_cast<CallInst>(I++))
-				if (Function *F = CI->getCalledFunction())
-					switch (F->getIntrinsicID()) {
-						case Intrinsic::not_intrinsic:
-						case Intrinsic::vastart:
-						case Intrinsic::vacopy:
-						case Intrinsic::vaend:
-						case Intrinsic::returnaddress:
-						case Intrinsic::frameaddress:
-						case Intrinsic::setjmp:
-						case Intrinsic::longjmp:
-						case Intrinsic::sigsetjmp:
-						case Intrinsic::siglongjmp:
-						case Intrinsic::prefetch:
-						case Intrinsic::x86_sse_cmp_ss:
-						case Intrinsic::x86_sse_cmp_ps:
-						case Intrinsic::x86_sse2_cmp_sd:
-						case Intrinsic::x86_sse2_cmp_pd:
-						case Intrinsic::ppc_altivec_lvsl:
-						case Intrinsic::uadd_with_overflow:
-						case Intrinsic::sadd_with_overflow:
-						case Intrinsic::usub_with_overflow:
-						case Intrinsic::ssub_with_overflow:
-						case Intrinsic::umul_with_overflow:
-						case Intrinsic::smul_with_overflow:
-						case Intrinsic::bswap:
-						case Intrinsic::ceil:
-						case Intrinsic::ctlz:
-						case Intrinsic::ctpop:
-						case Intrinsic::cttz:
-						case Intrinsic::fabs:
-						case Intrinsic::floor:
-						case Intrinsic::fma:
-						case Intrinsic::fmuladd:
-						case Intrinsic::pow:
-						case Intrinsic::powi:
-						case Intrinsic::rint:
-						case Intrinsic::sqrt:
-						case Intrinsic::trunc:
-						case Intrinsic::trap:
-						case Intrinsic::stackprotector:
-						case Intrinsic::dbg_value:
-						case Intrinsic::dbg_declare:
-							// We directly implement these intrinsics
-							break;
-						default:
-							// All other intrinsic calls we must lower.
-							BasicBlock::iterator Before = E;
-							if (CI != &BB->front())
-								Before = std::prev(BasicBlock::iterator(CI));
-
-							IL->LowerIntrinsicCall(CI);
-							if (Before != E) {        // Move iterator to instruction after call
-								I = Before; ++I;
-							} else {
-								I = BB->begin();
-							}
-							// If the intrinsic got lowered to another call, and that call has
-							// a definition then we need to make sure its prototype is emitted
-							// before any calls to it.
-							if (CallInst *Call = dyn_cast<CallInst>(I))
-								if (Function *NewF = Call->getCalledFunction())
-									if (!NewF->isDeclaration())
-										prototypesToGen.push_back(NewF);
-
-							break;
-					}
+  // Examine all the instructions in this function to find the intrinsics that
+  // need to be lowered.
+  for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB)
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;)
+      if (CallInst *CI = dyn_cast<CallInst>(I++))
+        if (Function *F = CI->getCalledFunction())
+          switch (F->getIntrinsicID()) {
+          case Intrinsic::not_intrinsic:
+          case Intrinsic::vastart:
+          case Intrinsic::vacopy:
+          case Intrinsic::vaend:
+          case Intrinsic::returnaddress:
+          case Intrinsic::frameaddress:
+          case Intrinsic::setjmp:
+          case Intrinsic::longjmp:
+          case Intrinsic::sigsetjmp:
+          case Intrinsic::siglongjmp:
+          case Intrinsic::prefetch:
+          case Intrinsic::x86_sse_cmp_ss:
+          case Intrinsic::x86_sse_cmp_ps:
+          case Intrinsic::x86_sse2_cmp_sd:
+          case Intrinsic::x86_sse2_cmp_pd:
+          case Intrinsic::ppc_altivec_lvsl:
+          case Intrinsic::uadd_with_overflow:
+          case Intrinsic::sadd_with_overflow:
+          case Intrinsic::usub_with_overflow:
+          case Intrinsic::ssub_with_overflow:
+          case Intrinsic::umul_with_overflow:
+          case Intrinsic::smul_with_overflow:
+          case Intrinsic::bswap:
+          case Intrinsic::ceil:
+          case Intrinsic::ctlz:
+          case Intrinsic::ctpop:
+          case Intrinsic::cttz:
+          case Intrinsic::fabs:
+          case Intrinsic::floor:
+          case Intrinsic::fma:
+          case Intrinsic::fmuladd:
+          case Intrinsic::pow:
+          case Intrinsic::powi:
+          case Intrinsic::rint:
+          case Intrinsic::sqrt:
+          case Intrinsic::trunc:
+          case Intrinsic::trap:
+          case Intrinsic::stackprotector:
+          case Intrinsic::dbg_value:
+          case Intrinsic::dbg_declare:
+            // We directly implement these intrinsics
+            break;
+          default:
+            // All other intrinsic calls we must lower.
+            BasicBlock::iterator Before = E;
+            if (CI != &BB->front())
+              Before = std::prev(BasicBlock::iterator(CI));
+
+            IL->LowerIntrinsicCall(CI);
+            if (Before != E) { // Move iterator to instruction after call
+              I = Before;
+              ++I;
+            } else {
+              I = BB->begin();
+            }
+            // If the intrinsic got lowered to another call, and that call has
+            // a definition then we need to make sure its prototype is emitted
+            // before any calls to it.
+            if (CallInst *Call = dyn_cast<CallInst>(I))
+              if (Function *NewF = Call->getCalledFunction())
+                if (!NewF->isDeclaration())
+                  prototypesToGen.push_back(NewF);
+
+            break;
+          }
 }
 
 void CWriter::visitCallInst(CallInst &I) {
-	if (isa<InlineAsm>(I.getCalledValue()))
-		return visitInlineAsm(I);
-
-	// Handle intrinsic function calls first...
-	if (Function *F = I.getCalledFunction())
-		if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
-			if (visitBuiltinCall(I, ID))
-				return;
-
-	Value *Callee = I.getCalledValue();
-
-	PointerType  *PTy   = cast<PointerType>(Callee->getType());
-	FunctionType *FTy   = cast<FunctionType>(PTy->getElementType());
-
-	// If this is a call to a struct-return function, assign to the first
-	// parameter instead of passing it to the call.
-
-	// CHECK: If AttributeList replaces AttributeSet for CallInst
-	const AttributeList PAL = I.getAttributes();
-	bool hasByVal = I.hasByValArgument();
-	bool isStructRet = I.hasStructRetAttr();
-	if (isStructRet) {
-		writeOperandDeref(I.getArgOperand(0));
-		Out << " = ";
-	}
+  if (isa<InlineAsm>(I.getCalledValue()))
+    return visitInlineAsm(I);
 
-	if (I.isTailCall()) Out << " /*tail*/ ";
-
-	// If this is an indirect call to a struct return function, we need to cast
-	// the pointer. Ditto for indirect calls with byval arguments.
-	bool NeedsCast = (hasByVal || isStructRet || I.getCallingConv() != CallingConv::C) && !isa<Function>(Callee);
-
-	// GCC is a real PITA.  It does not permit codegening casts of functions to
-	// function pointers if they are in a call (it generates a trap instruction
-	// instead!).  We work around this by inserting a cast to void* in between
-	// the function and the function pointer cast.  Unfortunately, we can't just
-	// form the constant expression here, because the folder will immediately
-	// nuke it.
-	//
-	// Note finally, that this is completely unsafe.  ANSI C does not guarantee
-	// that void* and function pointers have the same size. :( To deal with this
-	// in the common case, we handle casts where the number of arguments passed
-	// match exactly.
-	//
-	if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Callee))
-		if (CE->isCast())
-			if (Function *RF = dyn_cast<Function>(CE->getOperand(0))) {
-				NeedsCast = true;
-				Callee = RF;
-			}
-
-	if (NeedsCast) {
-		// Ok, just cast the pointer type.
-		Out << "((";
-		printTypeName(Out, I.getCalledValue()->getType()->getPointerElementType(), false, std::make_pair(PAL, I.getCallingConv()));
-		Out << "*)(void*)";
-	}
-	writeOperand(Callee, ContextCasted);
-	if (NeedsCast) Out << ')';
+  // Handle intrinsic function calls first...
+  if (Function *F = I.getCalledFunction())
+    if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
+      if (visitBuiltinCall(I, ID))
+        return;
 
-	Out << '(';
+  Value *Callee = I.getCalledValue();
 
-	bool PrintedArg = false;
-	if (FTy->isVarArg() && !FTy->getNumParams()) {
-		Out << "0 /*dummy arg*/";
-		PrintedArg = true;
-	}
+  PointerType *PTy = cast<PointerType>(Callee->getType());
+  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
 
-	unsigned NumDeclaredParams = FTy->getNumParams();
-	CallSite CS(&I);
-	CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
-	unsigned ArgNo = 0;
-	if (isStructRet) {   // Skip struct return argument.
-		++AI;
-		++ArgNo;
-	}
+  // If this is a call to a struct-return function, assign to the first
+  // parameter instead of passing it to the call.
 
-	Function *F = I.getCalledFunction();
-	if (F) {
-		StringRef Name = F->getName();
-		// emit cast for the first argument to type expected by header prototype
-		// the jmp_buf type is an array, so the array-to-pointer decay adds the
-		// strange extra *'s
-		if (Name == "sigsetjmp")
-			Out << "*(sigjmp_buf*)";
-		else if (Name == "setjmp")
-			Out << "*(jmp_buf*)";
-	}
+  // CHECK: If AttributeList replaces AttributeSet for CallInst
+  const AttributeList PAL = I.getAttributes();
+  bool hasByVal = I.hasByValArgument();
+  bool isStructRet = I.hasStructRetAttr();
+  if (isStructRet) {
+    writeOperandDeref(I.getArgOperand(0));
+    Out << " = ";
+  }
 
-	for (; AI != AE; ++AI, ++ArgNo) {
-		if (PrintedArg) Out << ", ";
-		if (ArgNo < NumDeclaredParams &&
-				(*AI)->getType() != FTy->getParamType(ArgNo)) {
-			Out << '(';
-			printTypeNameUnaligned(Out, FTy->getParamType(ArgNo),
-					/*isSigned=*/PAL.hasAttribute(ArgNo+1, Attribute::SExt));
-			Out << ')';
-		}
-		// Check if the argument is expected to be passed by value.
-		if (I.getAttributes().hasAttribute(ArgNo+1, Attribute::ByVal))
-			writeOperandDeref(*AI);
-		else
-			writeOperand(*AI, ContextCasted);
-		PrintedArg = true;
-	}
-	Out << ')';
+  if (I.isTailCall())
+    Out << " /*tail*/ ";
+
+  // If this is an indirect call to a struct return function, we need to cast
+  // the pointer. Ditto for indirect calls with byval arguments.
+  bool NeedsCast =
+      (hasByVal || isStructRet || I.getCallingConv() != CallingConv::C) &&
+      !isa<Function>(Callee);
+
+  // GCC is a real PITA.  It does not permit codegening casts of functions to
+  // function pointers if they are in a call (it generates a trap instruction
+  // instead!).  We work around this by inserting a cast to void* in between
+  // the function and the function pointer cast.  Unfortunately, we can't just
+  // form the constant expression here, because the folder will immediately
+  // nuke it.
+  //
+  // Note finally, that this is completely unsafe.  ANSI C does not guarantee
+  // that void* and function pointers have the same size. :( To deal with this
+  // in the common case, we handle casts where the number of arguments passed
+  // match exactly.
+  //
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Callee))
+    if (CE->isCast())
+      if (Function *RF = dyn_cast<Function>(CE->getOperand(0))) {
+        NeedsCast = true;
+        Callee = RF;
+      }
+
+  if (NeedsCast) {
+    // Ok, just cast the pointer type.
+    Out << "((";
+    printTypeName(Out, I.getCalledValue()->getType()->getPointerElementType(),
+                  false, std::make_pair(PAL, I.getCallingConv()));
+    Out << "*)(void*)";
+  }
+  writeOperand(Callee, ContextCasted);
+  if (NeedsCast)
+    Out << ')';
+
+  Out << '(';
+
+  bool PrintedArg = false;
+  if (FTy->isVarArg() && !FTy->getNumParams()) {
+    Out << "0 /*dummy arg*/";
+    PrintedArg = true;
+  }
+
+  unsigned NumDeclaredParams = FTy->getNumParams();
+  CallSite CS(&I);
+  CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+  unsigned ArgNo = 0;
+  if (isStructRet) { // Skip struct return argument.
+    ++AI;
+    ++ArgNo;
+  }
+
+  Function *F = I.getCalledFunction();
+  if (F) {
+    StringRef Name = F->getName();
+    // emit cast for the first argument to type expected by header prototype
+    // the jmp_buf type is an array, so the array-to-pointer decay adds the
+    // strange extra *'s
+    if (Name == "sigsetjmp")
+      Out << "*(sigjmp_buf*)";
+    else if (Name == "setjmp")
+      Out << "*(jmp_buf*)";
+  }
+
+  for (; AI != AE; ++AI, ++ArgNo) {
+    if (PrintedArg)
+      Out << ", ";
+    if (ArgNo < NumDeclaredParams &&
+        (*AI)->getType() != FTy->getParamType(ArgNo)) {
+      Out << '(';
+      printTypeNameUnaligned(
+          Out, FTy->getParamType(ArgNo),
+          /*isSigned=*/PAL.hasAttribute(ArgNo + 1, Attribute::SExt));
+      Out << ')';
+    }
+    // Check if the argument is expected to be passed by value.
+    if (I.getAttributes().hasAttribute(ArgNo + 1, Attribute::ByVal))
+      writeOperandDeref(*AI);
+    else
+      writeOperand(*AI, ContextCasted);
+    PrintedArg = true;
+  }
+  Out << ')';
 }
 
 /// visitBuiltinCall - Handle the call to the specified builtin.  Returns true
 /// if the entire call is handled, return false if it wasn't handled
 bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID) {
 
-	switch (ID) {
-		default: {
+  switch (ID) {
+  default: {
 #ifndef NDEBUG
-							 errs() << "Unknown LLVM intrinsic! " << I;
+    errs() << "Unknown LLVM intrinsic! " << I;
 #endif
-							 llvm_unreachable(0);
-							 return false;
-						 }
-
-		case Intrinsic::dbg_value:
-		case Intrinsic::dbg_declare:
-						 return true; // ignore these intrinsics
-		case Intrinsic::vastart:
-						 Out << "0; ";
-
-						 Out << "va_start(*(va_list*)";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ", ";
-						 // Output the last argument to the enclosing function.
-						 if (I.getParent()->getParent()->arg_empty())
-							 Out << "vararg_dummy_arg";
-						 else
-							 writeOperand(&*(I.getParent()->getParent()->arg_end() - 1));
-						 Out << ')';
-						 return true;
-		case Intrinsic::vaend:
-						 if (!isa<ConstantPointerNull>(I.getArgOperand(0))) {
-							 Out << "0; va_end(*(va_list*)";
-							 writeOperand(I.getArgOperand(0), ContextCasted);
-							 Out << ')';
-						 } else {
-							 Out << "va_end(*(va_list*)0)";
-						 }
-						 return true;
-		case Intrinsic::vacopy:
-						 Out << "0; ";
-						 Out << "va_copy(*(va_list*)";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ", *(va_list*)";
-						 writeOperand(I.getArgOperand(1), ContextCasted);
-						 Out << ')';
-						 return true;
-		case Intrinsic::returnaddress:
-						 Out << "__builtin_return_address(";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ')';
-						 return true;
-		case Intrinsic::frameaddress:
-						 Out << "__builtin_frame_address(";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ')';
-						 return true;
-		case Intrinsic::setjmp:
-						 Out << "setjmp(*(jmp_buf*)";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ')';
-						 return true;
-		case Intrinsic::longjmp:
-						 Out << "longjmp(*(jmp_buf*)";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ", ";
-						 writeOperand(I.getArgOperand(1), ContextCasted);
-						 Out << ')';
-						 return true;
-		case Intrinsic::sigsetjmp:
-						 Out << "sigsetjmp(*(sigjmp_buf*)";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ',';
-						 writeOperand(I.getArgOperand(1), ContextCasted);
-						 Out << ')';
-						 return true;
-		case Intrinsic::siglongjmp:
-						 Out << "siglongjmp(*(sigjmp_buf*)";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ", ";
-						 writeOperand(I.getArgOperand(1), ContextCasted);
-						 Out << ')';
-						 return true;
-		case Intrinsic::prefetch:
-						 Out << "LLVM_PREFETCH((const void *)";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ", ";
-						 writeOperand(I.getArgOperand(1), ContextCasted);
-						 Out << ", ";
-						 writeOperand(I.getArgOperand(2), ContextCasted);
-						 Out << ")";
-						 return true;
-		case Intrinsic::stacksave:
-						 // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save()
-						 // to work around GCC bugs (see PR1809).
-						 Out << "0; *((void**)&" << GetValueName(&I)
-							 << ") = __builtin_stack_save()";
-						 return true;
-		case Intrinsic::x86_sse_cmp_ss:
-		case Intrinsic::x86_sse_cmp_ps:
-		case Intrinsic::x86_sse2_cmp_sd:
-		case Intrinsic::x86_sse2_cmp_pd:
-						 Out << '(';
-						 printTypeName(Out, I.getType());
-						 Out << ')';
-						 // Multiple GCC builtins multiplex onto this intrinsic.
-						 switch (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()) {
-							 default: llvm_unreachable("Invalid llvm.x86.sse.cmp!");
-							 case 0: Out << "__builtin_ia32_cmpeq"; break;
-							 case 1: Out << "__builtin_ia32_cmplt"; break;
-							 case 2: Out << "__builtin_ia32_cmple"; break;
-							 case 3: Out << "__builtin_ia32_cmpunord"; break;
-							 case 4: Out << "__builtin_ia32_cmpneq"; break;
-							 case 5: Out << "__builtin_ia32_cmpnlt"; break;
-							 case 6: Out << "__builtin_ia32_cmpnle"; break;
-							 case 7: Out << "__builtin_ia32_cmpord"; break;
-						 }
-						 if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd)
-							 Out << 'p';
-						 else
-							 Out << 's';
-						 if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps)
-							 Out << 's';
-						 else
-							 Out << 'd';
-
-						 Out << "(";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ", ";
-						 writeOperand(I.getArgOperand(1), ContextCasted);
-						 Out << ")";
-						 return true;
-		case Intrinsic::ppc_altivec_lvsl:
-						 Out << '(';
-						 printTypeName(Out, I.getType());
-						 Out << ')';
-						 Out << "__builtin_altivec_lvsl(0, (void*)";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 Out << ")";
-						 return true;
-		case Intrinsic::stackprotector:
-						 writeOperandDeref(I.getArgOperand(1));
-						 Out << " = ";
-						 writeOperand(I.getArgOperand(0), ContextCasted);
-						 return true;
-		case Intrinsic::uadd_with_overflow:
-		case Intrinsic::sadd_with_overflow:
-		case Intrinsic::usub_with_overflow:
-		case Intrinsic::ssub_with_overflow:
-		case Intrinsic::umul_with_overflow:
-		case Intrinsic::smul_with_overflow:
-		case Intrinsic::bswap:
-		case Intrinsic::ceil:
-		case Intrinsic::ctlz:
-		case Intrinsic::ctpop:
-		case Intrinsic::cttz:
-		case Intrinsic::fabs:
-		case Intrinsic::floor:
-		case Intrinsic::fma:
-		case Intrinsic::fmuladd:
-		case Intrinsic::pow:
-		case Intrinsic::powi:
-		case Intrinsic::rint:
-		case Intrinsic::sqrt:
-		case Intrinsic::trap:
-		case Intrinsic::trunc:
-						 return false; // these use the normal function call emission
-	}
+    llvm_unreachable(0);
+    return false;
+  }
+
+  case Intrinsic::dbg_value:
+  case Intrinsic::dbg_declare:
+    return true; // ignore these intrinsics
+  case Intrinsic::vastart:
+    Out << "0; ";
+
+    Out << "va_start(*(va_list*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    // Output the last argument to the enclosing function.
+    if (I.getParent()->getParent()->arg_empty())
+      Out << "vararg_dummy_arg";
+    else
+      writeOperand(&*(I.getParent()->getParent()->arg_end() - 1));
+    Out << ')';
+    return true;
+  case Intrinsic::vaend:
+    if (!isa<ConstantPointerNull>(I.getArgOperand(0))) {
+      Out << "0; va_end(*(va_list*)";
+      writeOperand(I.getArgOperand(0), ContextCasted);
+      Out << ')';
+    } else {
+      Out << "va_end(*(va_list*)0)";
+    }
+    return true;
+  case Intrinsic::vacopy:
+    Out << "0; ";
+    Out << "va_copy(*(va_list*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", *(va_list*)";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::returnaddress:
+    Out << "__builtin_return_address(";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::frameaddress:
+    Out << "__builtin_frame_address(";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::setjmp:
+    Out << "setjmp(*(jmp_buf*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::longjmp:
+    Out << "longjmp(*(jmp_buf*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::sigsetjmp:
+    Out << "sigsetjmp(*(sigjmp_buf*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ',';
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::siglongjmp:
+    Out << "siglongjmp(*(sigjmp_buf*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::prefetch:
+    Out << "LLVM_PREFETCH((const void *)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(2), ContextCasted);
+    Out << ")";
+    return true;
+  case Intrinsic::stacksave:
+    // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save()
+    // to work around GCC bugs (see PR1809).
+    Out << "0; *((void**)&" << GetValueName(&I) << ") = __builtin_stack_save()";
+    return true;
+  case Intrinsic::x86_sse_cmp_ss:
+  case Intrinsic::x86_sse_cmp_ps:
+  case Intrinsic::x86_sse2_cmp_sd:
+  case Intrinsic::x86_sse2_cmp_pd:
+    Out << '(';
+    printTypeName(Out, I.getType());
+    Out << ')';
+    // Multiple GCC builtins multiplex onto this intrinsic.
+    switch (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()) {
+    default:
+      llvm_unreachable("Invalid llvm.x86.sse.cmp!");
+    case 0:
+      Out << "__builtin_ia32_cmpeq";
+      break;
+    case 1:
+      Out << "__builtin_ia32_cmplt";
+      break;
+    case 2:
+      Out << "__builtin_ia32_cmple";
+      break;
+    case 3:
+      Out << "__builtin_ia32_cmpunord";
+      break;
+    case 4:
+      Out << "__builtin_ia32_cmpneq";
+      break;
+    case 5:
+      Out << "__builtin_ia32_cmpnlt";
+      break;
+    case 6:
+      Out << "__builtin_ia32_cmpnle";
+      break;
+    case 7:
+      Out << "__builtin_ia32_cmpord";
+      break;
+    }
+    if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd)
+      Out << 'p';
+    else
+      Out << 's';
+    if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps)
+      Out << 's';
+    else
+      Out << 'd';
+
+    Out << "(";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ")";
+    return true;
+  case Intrinsic::ppc_altivec_lvsl:
+    Out << '(';
+    printTypeName(Out, I.getType());
+    Out << ')';
+    Out << "__builtin_altivec_lvsl(0, (void*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ")";
+    return true;
+  case Intrinsic::stackprotector:
+    writeOperandDeref(I.getArgOperand(1));
+    Out << " = ";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    return true;
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::umul_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::bswap:
+  case Intrinsic::ceil:
+  case Intrinsic::ctlz:
+  case Intrinsic::ctpop:
+  case Intrinsic::cttz:
+  case Intrinsic::fabs:
+  case Intrinsic::floor:
+  case Intrinsic::fma:
+  case Intrinsic::fmuladd:
+  case Intrinsic::pow:
+  case Intrinsic::powi:
+  case Intrinsic::rint:
+  case Intrinsic::sqrt:
+  case Intrinsic::trap:
+  case Intrinsic::trunc:
+    return false; // these use the normal function call emission
+  }
 }
 
-//This converts the llvm constraint string to something gcc is expecting.
-//TODO: work out platform independent constraints and factor those out
+// This converts the llvm constraint string to something gcc is expecting.
+// TODO: work out platform independent constraints and factor those out
 //      of the per target tables
 //      handle multiple constraint codes
-std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
-	return TargetLowering::AsmOperandInfo(c).ConstraintCode;
+std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo &c) {
+  return TargetLowering::AsmOperandInfo(c).ConstraintCode;
 #if 0
 	assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle");
 
@@ -4923,513 +5358,524 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
 #endif
 }
 
-//TODO: import logic from AsmPrinter.cpp
+// TODO: import logic from AsmPrinter.cpp
 static std::string gccifyAsm(std::string asmstr) {
-	for (std::string::size_type i = 0; i != asmstr.size(); ++i)
-		if (asmstr[i] == '\n')
-			asmstr.replace(i, 1, "\\n");
-		else if (asmstr[i] == '\t')
-			asmstr.replace(i, 1, "\\t");
-		else if (asmstr[i] == '$') {
-			if (asmstr[i + 1] == '{') {
-				std::string::size_type a = asmstr.find_first_of(':', i + 1);
-				std::string::size_type b = asmstr.find_first_of('}', i + 1);
-				std::string n = "%" +
-					asmstr.substr(a + 1, b - a - 1) +
-					asmstr.substr(i + 2, a - i - 2);
-				asmstr.replace(i, b - i + 1, n);
-				i += n.size() - 1;
-			} else
-				asmstr.replace(i, 1, "%");
-		}
-		else if (asmstr[i] == '%')//grr
-		{ asmstr.replace(i, 1, "%%"); ++i;}
+  for (std::string::size_type i = 0; i != asmstr.size(); ++i)
+    if (asmstr[i] == '\n')
+      asmstr.replace(i, 1, "\\n");
+    else if (asmstr[i] == '\t')
+      asmstr.replace(i, 1, "\\t");
+    else if (asmstr[i] == '$') {
+      if (asmstr[i + 1] == '{') {
+        std::string::size_type a = asmstr.find_first_of(':', i + 1);
+        std::string::size_type b = asmstr.find_first_of('}', i + 1);
+        std::string n = "%" + asmstr.substr(a + 1, b - a - 1) +
+                        asmstr.substr(i + 2, a - i - 2);
+        asmstr.replace(i, b - i + 1, n);
+        i += n.size() - 1;
+      } else
+        asmstr.replace(i, 1, "%");
+    } else if (asmstr[i] == '%') // grr
+    {
+      asmstr.replace(i, 1, "%%");
+      ++i;
+    }
 
-	return asmstr;
+  return asmstr;
 }
 
-//TODO: assumptions about what consume arguments from the call are likely wrong
+// TODO: assumptions about what consume arguments from the call are likely wrong
 //      handle communitivity
 void CWriter::visitInlineAsm(CallInst &CI) {
-	InlineAsm* as = cast<InlineAsm>(CI.getCalledValue());
-	InlineAsm::ConstraintInfoVector Constraints = as->ParseConstraints();
-
-	std::vector<std::pair<Value*, int> > ResultVals;
-	if (CI.getType() == Type::getVoidTy(CI.getContext()))
-		;
-	else if (StructType *ST = dyn_cast<StructType>(CI.getType())) {
-		for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
-			ResultVals.push_back(std::make_pair(&CI, (int)i));
-	} else {
-		ResultVals.push_back(std::make_pair(&CI, -1));
-	}
-
-	// Fix up the asm string for gcc and emit it.
-	Out << "__asm__ volatile (\"" << gccifyAsm(as->getAsmString()) << "\"\n";
-	Out << "        :";
-
-	unsigned ValueCount = 0;
-	bool IsFirst = true;
+  InlineAsm *as = cast<InlineAsm>(CI.getCalledValue());
+  InlineAsm::ConstraintInfoVector Constraints = as->ParseConstraints();
+
+  std::vector<std::pair<Value *, int>> ResultVals;
+  if (CI.getType() == Type::getVoidTy(CI.getContext()))
+    ;
+  else if (StructType *ST = dyn_cast<StructType>(CI.getType())) {
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
+      ResultVals.push_back(std::make_pair(&CI, (int)i));
+  } else {
+    ResultVals.push_back(std::make_pair(&CI, -1));
+  }
 
-	// Convert over all the output constraints.
-	for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-			E = Constraints.end(); I != E; ++I) {
+  // Fix up the asm string for gcc and emit it.
+  Out << "__asm__ volatile (\"" << gccifyAsm(as->getAsmString()) << "\"\n";
+  Out << "        :";
 
-		if (I->Type != InlineAsm::isOutput) {
-			++ValueCount;
-			continue;  // Ignore non-output constraints.
-		}
+  unsigned ValueCount = 0;
+  bool IsFirst = true;
 
-		assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-		std::string C = InterpretASMConstraint(*I);
-		if (C.empty()) continue;
+  // Convert over all the output constraints.
+  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
+                                                 E = Constraints.end();
+       I != E; ++I) {
 
-		if (!IsFirst) {
-			Out << ", ";
-			IsFirst = false;
-		}
+    if (I->Type != InlineAsm::isOutput) {
+      ++ValueCount;
+      continue; // Ignore non-output constraints.
+    }
 
-		// Unpack the dest.
-		Value *DestVal;
-		int DestValNo = -1;
+    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+    std::string C = InterpretASMConstraint(*I);
+    if (C.empty())
+      continue;
 
-		if (ValueCount < ResultVals.size()) {
-			DestVal = ResultVals[ValueCount].first;
-			DestValNo = ResultVals[ValueCount].second;
-		} else
-			DestVal = CI.getArgOperand(ValueCount-ResultVals.size());
+    if (!IsFirst) {
+      Out << ", ";
+      IsFirst = false;
+    }
 
-		if (I->isEarlyClobber)
-			C = "&"+C;
+    // Unpack the dest.
+    Value *DestVal;
+    int DestValNo = -1;
 
-		Out << "\"=" << C << "\"(" << GetValueName(DestVal);
-		if (DestValNo != -1)
-			Out << ".field" << DestValNo; // Multiple retvals.
-		Out << ")";
-		++ValueCount;
-	}
+    if (ValueCount < ResultVals.size()) {
+      DestVal = ResultVals[ValueCount].first;
+      DestValNo = ResultVals[ValueCount].second;
+    } else
+      DestVal = CI.getArgOperand(ValueCount - ResultVals.size());
 
+    if (I->isEarlyClobber)
+      C = "&" + C;
 
-	// Convert over all the input constraints.
-	Out << "\n        :";
-	IsFirst = true;
-	ValueCount = 0;
-	for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-			E = Constraints.end(); I != E; ++I) {
-		if (I->Type != InlineAsm::isInput) {
-			++ValueCount;
-			continue;  // Ignore non-input constraints.
-		}
+    Out << "\"=" << C << "\"(" << GetValueName(DestVal);
+    if (DestValNo != -1)
+      Out << ".field" << DestValNo; // Multiple retvals.
+    Out << ")";
+    ++ValueCount;
+  }
 
-		assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-		std::string C = InterpretASMConstraint(*I);
-		if (C.empty()) continue;
+  // Convert over all the input constraints.
+  Out << "\n        :";
+  IsFirst = true;
+  ValueCount = 0;
+  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
+                                                 E = Constraints.end();
+       I != E; ++I) {
+    if (I->Type != InlineAsm::isInput) {
+      ++ValueCount;
+      continue; // Ignore non-input constraints.
+    }
 
-		if (!IsFirst) {
-			Out << ", ";
-			IsFirst = false;
-		}
+    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+    std::string C = InterpretASMConstraint(*I);
+    if (C.empty())
+      continue;
 
-		assert(ValueCount >= ResultVals.size() && "Input can't refer to result");
-		Value *SrcVal = CI.getArgOperand(ValueCount-ResultVals.size());
+    if (!IsFirst) {
+      Out << ", ";
+      IsFirst = false;
+    }
 
-		Out << "\"" << C << "\"(";
-		if (!I->isIndirect)
-			writeOperand(SrcVal);
-		else
-			writeOperandDeref(SrcVal);
-		Out << ")";
-	}
+    assert(ValueCount >= ResultVals.size() && "Input can't refer to result");
+    Value *SrcVal = CI.getArgOperand(ValueCount - ResultVals.size());
 
-	// Convert over the clobber constraints.
-	IsFirst = true;
-	for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-			E = Constraints.end(); I != E; ++I) {
-		if (I->Type != InlineAsm::isClobber)
-			continue;  // Ignore non-input constraints.
+    Out << "\"" << C << "\"(";
+    if (!I->isIndirect)
+      writeOperand(SrcVal);
+    else
+      writeOperandDeref(SrcVal);
+    Out << ")";
+  }
 
-		assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
-		std::string C = InterpretASMConstraint(*I);
-		if (C.empty()) continue;
+  // Convert over the clobber constraints.
+  IsFirst = true;
+  for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
+                                                 E = Constraints.end();
+       I != E; ++I) {
+    if (I->Type != InlineAsm::isClobber)
+      continue; // Ignore non-input constraints.
+
+    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+    std::string C = InterpretASMConstraint(*I);
+    if (C.empty())
+      continue;
 
-		if (!IsFirst) {
-			Out << ", ";
-			IsFirst = false;
-		}
+    if (!IsFirst) {
+      Out << ", ";
+      IsFirst = false;
+    }
 
-		Out << '\"' << C << '"';
-	}
+    Out << '\"' << C << '"';
+  }
 
-	Out << ")";
+  Out << ")";
 }
 
 void CWriter::visitAllocaInst(AllocaInst &I) {
-	Out << '(';
-	printTypeName(Out, I.getType());
-	Out << ") alloca(sizeof(";
-	printTypeName(Out, I.getType()->getElementType());
-	if (I.isArrayAllocation()) { 
-		Out << ") * (" ;
-		writeOperand(I.getArraySize(), ContextCasted);
-	}
-	Out << "))";
+  Out << '(';
+  printTypeName(Out, I.getType());
+  Out << ") alloca(sizeof(";
+  printTypeName(Out, I.getType()->getElementType());
+  if (I.isArrayAllocation()) {
+    Out << ") * (";
+    writeOperand(I.getArraySize(), ContextCasted);
+  }
+  Out << "))";
 }
 
 void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
-		gep_type_iterator E, bool isArrayType, GetElementPtrInst *GEPI) {
-	DEBUG(errs() << "Printing GEP\n");
-	DEBUG(errs() << "\tPtr: " << *Ptr << "\n");
-	DEBUG(errs() << "\tGEPI: " << *GEPI <<"\n"); 
-	// If there are no indices, just print out the pointer.
-	if (I == E) {
-		DEBUG(errs() << "I==E: Calling writeOperand()\n");
-		writeOperand(Ptr);
-		return;
-	}
+                                 gep_type_iterator E, bool isArrayType,
+                                 GetElementPtrInst *GEPI) {
+  DEBUG(errs() << "Printing GEP\n");
+  DEBUG(errs() << "\tPtr: " << *Ptr << "\n");
+  DEBUG(errs() << "\tGEPI: " << *GEPI << "\n");
+  // If there are no indices, just print out the pointer.
+  if (I == E) {
+    DEBUG(errs() << "I==E: Calling writeOperand()\n");
+    writeOperand(Ptr);
+    return;
+  }
 
-	// Find out if the last index is into a vector.  If so, we have to print this
-	// specially.  Since vectors can't have elements of indexable type, only the
-	// last index could possibly be of a vector element.
-	VectorType *LastIndexIsVector = 0;
-	{
-		for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
-			//LastIndexIsVector = dyn_cast<VectorType>(TmpI.getCurTy());
-			// CHECK: This change needs thorough testing
-			LastIndexIsVector = dyn_cast<VectorType>(TmpI.getIndexedType());
-	}
-	Out << "(";
-
-	// If the last index is into a vector, we can't print it as &a[i][j] because
-	// we can't index into a vector with j in GCC.  Instead, emit this as
-	// (((float*)&a[i])+j)
-	// TODO: this is no longer true now that we don't represent vectors using gcc-extentions
-	if (LastIndexIsVector) {
-		DEBUG(errs() << "LastIndexIsVector\n");
-		Out << "((";
-		printTypeName(Out, PointerType::getUnqual(LastIndexIsVector->getElementType()));
-		Out << ")(";
-	}
-	bool isArrayAccess = false; 
+  // Find out if the last index is into a vector.  If so, we have to print this
+  // specially.  Since vectors can't have elements of indexable type, only the
+  // last index could possibly be of a vector element.
+  VectorType *LastIndexIsVector = 0;
+  {
+    for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
+      // LastIndexIsVector = dyn_cast<VectorType>(TmpI.getCurTy());
+      // CHECK: This change needs thorough testing
+      LastIndexIsVector = dyn_cast<VectorType>(TmpI.getIndexedType());
+  }
+  Out << "(";
+
+  // If the last index is into a vector, we can't print it as &a[i][j] because
+  // we can't index into a vector with j in GCC.  Instead, emit this as
+  // (((float*)&a[i])+j)
+  // TODO: this is no longer true now that we don't represent vectors using
+  // gcc-extentions
+  if (LastIndexIsVector) {
+    DEBUG(errs() << "LastIndexIsVector\n");
+    Out << "((";
+    printTypeName(Out,
+                  PointerType::getUnqual(LastIndexIsVector->getElementType()));
+    Out << ")(";
+  }
+  bool isArrayAccess = false;
 
-	if (GEPStack.size() > 0 && GEPStack.top() == GEPI) {
-		DEBUG(errs() << "Processing load-specific GEP\n");
-		GEPStack.pop();
-		isArrayAccess = true;
-	} else {
-		DEBUG(errs() << "I'm hereee!\n");
-		Out << '&';
-	}
-	DEBUG(errs() << "Here!\n");
-	// If the first index is 0 (very typical) we can do a number of
-	// simplifications to clean up the code.
-	Value *FirstOp = I.getOperand();
-	DEBUG(errs() << "FirstOp: " << *(I.getOperand()) << "\n");
-	if (!isa<Constant>(FirstOp) || !cast<Constant>(FirstOp)->isNullValue()) {
-		DEBUG(errs() << "Calling writeoperand()\n");
-		// First index isn't simple, print it the hard way.
-		writeOperand(Ptr, ContextNormal, isArrayAccess);
-	} else {
-		++I;  // Skip the zero index.
-		DEBUG(errs() << "Skipping zero index\n");
-
-		// Okay, emit the first operand. If Ptr is something that is already address
-		// exposed, like a global, avoid emitting (&foo)[0], just emit foo instead.
-		if (isAddressExposed(Ptr)) {
-			DEBUG(errs() << "Address exposed; calling writeoperandinternal()\n");
-			writeOperandInternal(Ptr);
-		}
-		//else if (I != E && (I.getCurTy())->isStructTy()) {
-		// NOTE: This change needs to be tested more
-		else if (I != E && (I.isStruct()) ) {
-			DEBUG(errs() << "Not address exposed; is struct type\n");
-			// If we didn't already emit the first operand, see if we can print it as
-			// P->f instead of "P[0].f"
-			writeOperand(Ptr);
-			Out << "->field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
-			++I;  // eat the struct index as well.
-		} else {
-			DEBUG(errs() << "In else; emitting *P\n");
-			// Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic.
-			Out << "(*";
-			writeOperand(Ptr);
-			Out << ")";
-		}
-	}
+  if (GEPStack.size() > 0 && GEPStack.top() == GEPI) {
+    DEBUG(errs() << "Processing load-specific GEP\n");
+    GEPStack.pop();
+    isArrayAccess = true;
+  } else {
+    DEBUG(errs() << "I'm hereee!\n");
+    Out << '&';
+  }
+  DEBUG(errs() << "Here!\n");
+  // If the first index is 0 (very typical) we can do a number of
+  // simplifications to clean up the code.
+  Value *FirstOp = I.getOperand();
+  DEBUG(errs() << "FirstOp: " << *(I.getOperand()) << "\n");
+  if (!isa<Constant>(FirstOp) || !cast<Constant>(FirstOp)->isNullValue()) {
+    DEBUG(errs() << "Calling writeoperand()\n");
+    // First index isn't simple, print it the hard way.
+    writeOperand(Ptr, ContextNormal, isArrayAccess);
+  } else {
+    ++I; // Skip the zero index.
+    DEBUG(errs() << "Skipping zero index\n");
+
+    // Okay, emit the first operand. If Ptr is something that is already address
+    // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead.
+    if (isAddressExposed(Ptr)) {
+      DEBUG(errs() << "Address exposed; calling writeoperandinternal()\n");
+      writeOperandInternal(Ptr);
+    }
+    // else if (I != E && (I.getCurTy())->isStructTy()) {
+    // NOTE: This change needs to be tested more
+    else if (I != E && (I.isStruct())) {
+      DEBUG(errs() << "Not address exposed; is struct type\n");
+      // If we didn't already emit the first operand, see if we can print it as
+      // P->f instead of "P[0].f"
+      writeOperand(Ptr);
+      Out << "->field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
+      ++I; // eat the struct index as well.
+    } else {
+      DEBUG(errs() << "In else; emitting *P\n");
+      // Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic.
+      Out << "(*";
+      writeOperand(Ptr);
+      Out << ")";
+    }
+  }
 
-	Type *Agg = GEPI->getSourceElementType();
-	unsigned CurIdx = 1;
-	for (; I != E; ++CurIdx, ++I) {
-		assert(I.getOperand()->getType()->isIntegerTy()); // TODO: indexing a Vector with a Vector is valid, but we don't support it here
-		DEBUG(errs() << "Type: " << *Agg << "; operand: " << *(I.getOperand()) << "\n");
-		if ((Agg->isStructTy())){
-			DEBUG(errs() << "Found a struct\n");
-			Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
-		} else if (Agg->isArrayTy()) {
-			DEBUG(errs() << "Found an array!\n");
-			Out << ".array[";
-			writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
-			Out << ']';
-		} else if (!Agg->isVectorTy()) {
-			DEBUG(errs() << "Not a vector!\n");
-			Out << '[';
-			writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
-			Out << ']';
-		} else {
-			DEBUG(errs() << "In else!\n");
-			// If the last index is into a vector, then print it out as "+j)".  This
-			// works with the 'LastIndexIsVector' code above.
-			if (isa<Constant>(I.getOperand()) &&
-					cast<Constant>(I.getOperand())->isNullValue()) {
-				Out << "))";  // avoid "+0".
-			} else {
-				Out << ")+(";
-				writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
-				Out << "))";
-			}
-		}
-		CompositeType *CT = dyn_cast<CompositeType>(Agg);
-		if (!CT || CT->isPointerTy()) 
-		{
-			DEBUG(errs() << "Something wrong!!\n");
-			break;
-		}
-		Value* Index = GEPI->getOperand(CurIdx);
-		if (!CT->indexValid(Index))
-			if (!CT || CT->isPointerTy()) 
-			{
-				DEBUG(errs() << "Something wrong 2!!\n");
-				break;
-			}
-		Agg = CT->getTypeAtIndex(Index);
-	}
-	Out << ")";
-	DEBUG(errs() << "Leaving printGEPExpression\n");
-	}
+  Type *Agg = GEPI->getSourceElementType();
+  unsigned CurIdx = 1;
+  for (; I != E; ++CurIdx, ++I) {
+    assert(I.getOperand()
+               ->getType()
+               ->isIntegerTy()); // TODO: indexing a Vector with a Vector is
+                                 // valid, but we don't support it here
+    DEBUG(errs() << "Type: " << *Agg << "; operand: " << *(I.getOperand())
+                 << "\n");
+    if ((Agg->isStructTy())) {
+      DEBUG(errs() << "Found a struct\n");
+      Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
+    } else if (Agg->isArrayTy()) {
+      DEBUG(errs() << "Found an array!\n");
+      Out << ".array[";
+      writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+      Out << ']';
+    } else if (!Agg->isVectorTy()) {
+      DEBUG(errs() << "Not a vector!\n");
+      Out << '[';
+      writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+      Out << ']';
+    } else {
+      DEBUG(errs() << "In else!\n");
+      // If the last index is into a vector, then print it out as "+j)".  This
+      // works with the 'LastIndexIsVector' code above.
+      if (isa<Constant>(I.getOperand()) &&
+          cast<Constant>(I.getOperand())->isNullValue()) {
+        Out << "))"; // avoid "+0".
+      } else {
+        Out << ")+(";
+        writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+        Out << "))";
+      }
+    }
+    CompositeType *CT = dyn_cast<CompositeType>(Agg);
+    if (!CT || CT->isPointerTy()) {
+      DEBUG(errs() << "Something wrong!!\n");
+      break;
+    }
+    Value *Index = GEPI->getOperand(CurIdx);
+    if (!CT->indexValid(Index))
+      if (!CT || CT->isPointerTy()) {
+        DEBUG(errs() << "Something wrong 2!!\n");
+        break;
+      }
+    Agg = CT->getTypeAtIndex(Index);
+  }
+  Out << ")";
+  DEBUG(errs() << "Leaving printGEPExpression\n");
+}
 
-	void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType,
-			bool IsVolatile, unsigned Alignment /*bytes*/) {
-		DEBUG(errs() << *OperandType << "; " << *Operand << "\n");
-		bool arrayAccess = false;
-		if(isa<GetElementPtrInst>(Operand)) {
-			DEBUG(errs() << "ISA Get Element Pointer!\n");
-			arrayAccess = true;
-			GEPStack.push(dyn_cast<GetElementPtrInst>(Operand));
-		}
-		//  if (isAddressExposed(Operand)) {
-		// DEBUG(errs() << "Is address exposed!!\n");
-		//    writeOperandInternal(Operand);
-		//    return;
-		//  }
-
-		bool IsUnaligned = Alignment &&
-			Alignment < TD->getABITypeAlignment(OperandType);
-		if (!arrayAccess) {
-			if (!IsUnaligned)
-				Out << '*';
-
-			else if (IsUnaligned) {
-				Out << "__UNALIGNED_LOAD__(";
-				printTypeNameUnaligned(Out, OperandType, false);
-				if (IsVolatile) Out << " volatile";
-				Out << ", " << Alignment << ", ";
-			}
-
-			else if (IsVolatile) {
-				Out << "(";
-				printTypeName(Out, OperandType, false);
-				Out << "volatile";
-				Out << "*)";
-			} 
-		}
+void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType,
+                                bool IsVolatile, unsigned Alignment /*bytes*/) {
+  DEBUG(errs() << *OperandType << "; " << *Operand << "\n");
+  bool arrayAccess = false;
+  if (isa<GetElementPtrInst>(Operand)) {
+    DEBUG(errs() << "ISA Get Element Pointer!\n");
+    arrayAccess = true;
+    GEPStack.push(dyn_cast<GetElementPtrInst>(Operand));
+  }
+  //  if (isAddressExposed(Operand)) {
+  // DEBUG(errs() << "Is address exposed!!\n");
+  //    writeOperandInternal(Operand);
+  //    return;
+  //  }
 
-		writeOperand(Operand,ContextNormal, arrayAccess );
+  bool IsUnaligned =
+      Alignment && Alignment < TD->getABITypeAlignment(OperandType);
+  if (!arrayAccess) {
+    if (!IsUnaligned)
+      Out << '*';
+
+    else if (IsUnaligned) {
+      Out << "__UNALIGNED_LOAD__(";
+      printTypeNameUnaligned(Out, OperandType, false);
+      if (IsVolatile)
+        Out << " volatile";
+      Out << ", " << Alignment << ", ";
+    }
 
-		if (IsUnaligned) {
-			Out << ")";
-		}
-	}
+    else if (IsVolatile) {
+      Out << "(";
+      printTypeName(Out, OperandType, false);
+      Out << "volatile";
+      Out << "*)";
+    }
+  }
 
-	void CWriter::visitLoadInst(LoadInst &I) {
-		DEBUG(errs() << "Visiting Load instruction!\n");
-		DEBUG(errs() << "Visiting load: " << I << "\n");
-		writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(),
-				I.getAlignment());
+  writeOperand(Operand, ContextNormal, arrayAccess);
 
-	}
+  if (IsUnaligned) {
+    Out << ")";
+  }
+}
 
-	void CWriter::visitStoreInst(StoreInst &I) {
-		DEBUG(errs() << "Visiting store instruction!\n");
-		writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(),
-				I.isVolatile(), I.getAlignment());
-		Out << " = ";
-		Value *Operand = I.getOperand(0);
-		unsigned BitMask = 0;
-		if (IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
-			if (!ITy->isPowerOf2ByteWidth())
-				// We have a bit width that doesn't match an even power-of-2 byte
-				// size. Consequently we must & the value with the type's bit mask
-				BitMask = ITy->getBitMask();
-		if (BitMask)
-			Out << "((";
-		writeOperand(Operand, BitMask ? ContextNormal : ContextCasted);
-		if (BitMask)
-			Out << ") & " << BitMask << ")";
-	}
+void CWriter::visitLoadInst(LoadInst &I) {
+  DEBUG(errs() << "Visiting Load instruction!\n");
+  DEBUG(errs() << "Visiting load: " << I << "\n");
+  writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(),
+                    I.getAlignment());
+}
 
-	void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) {
-		DEBUG(errs() <<"Visiting GEP: " << I << "\n");
-		printGEPExpression(I.getPointerOperand(), gep_type_begin(I),
-				gep_type_end(I), I.getSourceElementType()->isArrayTy(), &I);
-	}
+void CWriter::visitStoreInst(StoreInst &I) {
+  DEBUG(errs() << "Visiting store instruction!\n");
+  writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(),
+                    I.isVolatile(), I.getAlignment());
+  Out << " = ";
+  Value *Operand = I.getOperand(0);
+  unsigned BitMask = 0;
+  if (IntegerType *ITy = dyn_cast<IntegerType>(Operand->getType()))
+    if (!ITy->isPowerOf2ByteWidth())
+      // We have a bit width that doesn't match an even power-of-2 byte
+      // size. Consequently we must & the value with the type's bit mask
+      BitMask = ITy->getBitMask();
+  if (BitMask)
+    Out << "((";
+  writeOperand(Operand, BitMask ? ContextNormal : ContextCasted);
+  if (BitMask)
+    Out << ") & " << BitMask << ")";
+}
 
-	void CWriter::visitVAArgInst(VAArgInst &I) {
-		Out << "va_arg(*(va_list*)";
-		writeOperand(I.getOperand(0), ContextCasted);
-		Out << ", ";
-		printTypeName(Out, I.getType());
-		Out << ");\n ";
-	}
+void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) {
+  DEBUG(errs() << "Visiting GEP: " << I << "\n");
+  printGEPExpression(I.getPointerOperand(), gep_type_begin(I), gep_type_end(I),
+                     I.getSourceElementType()->isArrayTy(), &I);
+}
 
-	void CWriter::visitInsertElementInst(InsertElementInst &I) {
-		// Start by copying the entire aggregate value into the result variable.
-		writeOperand(I.getOperand(0));
-		Type *EltTy = I.getType()->getElementType();
-		assert(I.getOperand(1)->getType() == EltTy);
-		if (isEmptyType(EltTy)) return;
-
-		// Then do the insert to update the field.
-		Out << ";\n  ";
-		Out << GetValueName(&I) << ".vector[";
-		writeOperand(I.getOperand(2));
-		Out << "] = ";
-		writeOperand(I.getOperand(1), ContextCasted);
-	}
+void CWriter::visitVAArgInst(VAArgInst &I) {
+  Out << "va_arg(*(va_list*)";
+  writeOperand(I.getOperand(0), ContextCasted);
+  Out << ", ";
+  printTypeName(Out, I.getType());
+  Out << ");\n ";
+}
 
-	void CWriter::visitExtractElementInst(ExtractElementInst &I) {
-		assert(!isEmptyType(I.getType()));
-		if (isa<UndefValue>(I.getOperand(0))) {
-			Out << "(";
-			printTypeName(Out, I.getType());
-			Out << ") 0/*UNDEF*/";
-		} else {
-			Out << "(";
-			writeOperand(I.getOperand(0));
-			Out << ").vector[";
-			writeOperand(I.getOperand(1));
-			Out << "]";
-		}
-	}
+void CWriter::visitInsertElementInst(InsertElementInst &I) {
+  // Start by copying the entire aggregate value into the result variable.
+  writeOperand(I.getOperand(0));
+  Type *EltTy = I.getType()->getElementType();
+  assert(I.getOperand(1)->getType() == EltTy);
+  if (isEmptyType(EltTy))
+    return;
 
-	// <result> = shufflevector <n x <ty>> <v1>, <n x <ty>> <v2>, <m x i32> <mask>
-	// ; yields <m x <ty>>
-	void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
-		VectorType *VT = SVI.getType();
-		Type *EltTy = VT->getElementType();
-		VectorType *InputVT = cast<VectorType>(SVI.getOperand(0)->getType());
-		assert(!isEmptyType(VT));
-		assert(InputVT->getElementType() == VT->getElementType());
-
-		CtorDeclTypes.insert(VT);
-		Out << "llvm_ctor_";
-		printTypeString(Out, VT, false);
-		Out << "(";
-
-		Constant *Zero = Constant::getNullValue(EltTy);
-		unsigned NumElts = VT->getNumElements();
-		unsigned NumInputElts = InputVT->getNumElements(); // n
-		for (unsigned i = 0; i != NumElts; ++i) {
-			if (i) Out << ", ";
-			int SrcVal = SVI.getMaskValue(i);
-			if ((unsigned)SrcVal >= NumInputElts * 2) {
-				Out << "/*undef*/";
-				printConstant(Zero, ContextCasted);
-			} else {
-				// If SrcVal belongs [0, n - 1], it extracts value from <v1>
-				// If SrcVal belongs [n, 2 * n - 1], it extracts value from <v2>
-				// In C++, the value false is converted to zero and the value true is
-				// converted to one
-				Value *Op = SVI.getOperand((unsigned)SrcVal >= NumInputElts);
-				if (isa<Instruction>(Op)) {
-					// Do an extractelement of this value from the appropriate input.
-					Out << "(";
-					writeOperand(Op);
-					Out << ").vector[";
-					Out << ((unsigned)SrcVal >= NumInputElts ? SrcVal - NumInputElts : SrcVal);
-					Out << "]";
-				} else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) {
-					printConstant(Zero, ContextCasted);
-				} else {
-					printConstant(cast<ConstantVector>(Op)->getOperand(SrcVal &
-								(NumElts-1)),
-							ContextNormal);
-				}
-			}
-		}
-		Out << ")";
-	}
+  // Then do the insert to update the field.
+  Out << ";\n  ";
+  Out << GetValueName(&I) << ".vector[";
+  writeOperand(I.getOperand(2));
+  Out << "] = ";
+  writeOperand(I.getOperand(1), ContextCasted);
+}
 
-	void CWriter::visitInsertValueInst(InsertValueInst &IVI) {
-		// Start by copying the entire aggregate value into the result variable.
-		writeOperand(IVI.getOperand(0));
-		Type *EltTy = IVI.getOperand(1)->getType();
-		if (isEmptyType(EltTy)) return;
-
-		// Then do the insert to update the field.
-		Out << ";\n  ";
-		Out << GetValueName(&IVI);
-		for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end();
-				i != e; ++i) {
-			Type *IndexedTy =
-				ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(),
-						makeArrayRef(b, i));
-			assert(IndexedTy);
-			if (IndexedTy->isArrayTy())
-				Out << ".array[" << *i << "]";
-			else
-				Out << ".field" << *i;
-		}
-		Out << " = ";
-		writeOperand(IVI.getOperand(1), ContextCasted);
-	}
+void CWriter::visitExtractElementInst(ExtractElementInst &I) {
+  assert(!isEmptyType(I.getType()));
+  if (isa<UndefValue>(I.getOperand(0))) {
+    Out << "(";
+    printTypeName(Out, I.getType());
+    Out << ") 0/*UNDEF*/";
+  } else {
+    Out << "(";
+    writeOperand(I.getOperand(0));
+    Out << ").vector[";
+    writeOperand(I.getOperand(1));
+    Out << "]";
+  }
+}
 
-	void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
-		Out << "(";
-		if (isa<UndefValue>(EVI.getOperand(0))) {
-			Out << "(";
-			printTypeName(Out, EVI.getType());
-			Out << ") 0/*UNDEF*/";
-		} else {
-			writeOperand(EVI.getOperand(0));
-			for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
-					i != e; ++i) {
-				Type *IndexedTy =
-					ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(),
-							makeArrayRef(b, i));
-				if (IndexedTy->isArrayTy())
-					Out << ".array[" << *i << "]";
-				else
-					Out << ".field" << *i;
-			}
-		}
-		Out << ")";
-	}
+// <result> = shufflevector <n x <ty>> <v1>, <n x <ty>> <v2>, <m x i32> <mask>
+// ; yields <m x <ty>>
+void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  VectorType *VT = SVI.getType();
+  Type *EltTy = VT->getElementType();
+  VectorType *InputVT = cast<VectorType>(SVI.getOperand(0)->getType());
+  assert(!isEmptyType(VT));
+  assert(InputVT->getElementType() == VT->getElementType());
+
+  CtorDeclTypes.insert(VT);
+  Out << "llvm_ctor_";
+  printTypeString(Out, VT, false);
+  Out << "(";
+
+  Constant *Zero = Constant::getNullValue(EltTy);
+  unsigned NumElts = VT->getNumElements();
+  unsigned NumInputElts = InputVT->getNumElements(); // n
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (i)
+      Out << ", ";
+    int SrcVal = SVI.getMaskValue(i);
+    if ((unsigned)SrcVal >= NumInputElts * 2) {
+      Out << "/*undef*/";
+      printConstant(Zero, ContextCasted);
+    } else {
+      // If SrcVal belongs [0, n - 1], it extracts value from <v1>
+      // If SrcVal belongs [n, 2 * n - 1], it extracts value from <v2>
+      // In C++, the value false is converted to zero and the value true is
+      // converted to one
+      Value *Op = SVI.getOperand((unsigned)SrcVal >= NumInputElts);
+      if (isa<Instruction>(Op)) {
+        // Do an extractelement of this value from the appropriate input.
+        Out << "(";
+        writeOperand(Op);
+        Out << ").vector[";
+        Out << ((unsigned)SrcVal >= NumInputElts ? SrcVal - NumInputElts
+                                                 : SrcVal);
+        Out << "]";
+      } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) {
+        printConstant(Zero, ContextCasted);
+      } else {
+        printConstant(
+            cast<ConstantVector>(Op)->getOperand(SrcVal & (NumElts - 1)),
+            ContextNormal);
+      }
+    }
+  }
+  Out << ")";
+}
+
+void CWriter::visitInsertValueInst(InsertValueInst &IVI) {
+  // Start by copying the entire aggregate value into the result variable.
+  writeOperand(IVI.getOperand(0));
+  Type *EltTy = IVI.getOperand(1)->getType();
+  if (isEmptyType(EltTy))
+    return;
+
+  // Then do the insert to update the field.
+  Out << ";\n  ";
+  Out << GetValueName(&IVI);
+  for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end(); i != e;
+       ++i) {
+    Type *IndexedTy = ExtractValueInst::getIndexedType(
+        IVI.getOperand(0)->getType(), makeArrayRef(b, i));
+    assert(IndexedTy);
+    if (IndexedTy->isArrayTy())
+      Out << ".array[" << *i << "]";
+    else
+      Out << ".field" << *i;
+  }
+  Out << " = ";
+  writeOperand(IVI.getOperand(1), ContextCasted);
+}
 
-	//===----------------------------------------------------------------------===//
-	//                       External Interface declaration
-	//===----------------------------------------------------------------------===//
+void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
+  Out << "(";
+  if (isa<UndefValue>(EVI.getOperand(0))) {
+    Out << "(";
+    printTypeName(Out, EVI.getType());
+    Out << ") 0/*UNDEF*/";
+  } else {
+    writeOperand(EVI.getOperand(0));
+    for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
+         i != e; ++i) {
+      Type *IndexedTy = ExtractValueInst::getIndexedType(
+          EVI.getOperand(0)->getType(), makeArrayRef(b, i));
+      if (IndexedTy->isArrayTy())
+        Out << ".array[" << *i << "]";
+      else
+        Out << ".field" << *i;
+    }
+  }
+  Out << ")";
+}
 
-	bool CTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
-			raw_pwrite_stream &Out,
-			raw_pwrite_stream *Out2,
-			CodeGenFileType FileType,
-			bool DisableVerify,
-			MachineModuleInfo *MMI){
+//===----------------------------------------------------------------------===//
+//                       External Interface declaration
+//===----------------------------------------------------------------------===//
 
-		if (FileType != TargetMachine::CGFT_AssemblyFile) return true;
+bool CTargetMachine::addPassesToEmitFile(
+    PassManagerBase &PM, raw_pwrite_stream &Out, raw_pwrite_stream *Out2,
+    CodeGenFileType FileType, bool DisableVerify, MachineModuleInfo *MMI) {
 
-		PM.add(createGCLoweringPass());
-		PM.add(createLowerInvokePass());
-		PM.add(createCFGSimplificationPass());   // clean up after lower invoke.
-		PM.add(new CWriter(Out));
-		return false;
-	}
+  if (FileType != TargetMachine::CGFT_AssemblyFile)
+    return true;
+
+  PM.add(createGCLoweringPass());
+  PM.add(createLowerInvokePass());
+  PM.add(createCFGSimplificationPass()); // clean up after lower invoke.
+  PM.add(new CWriter(Out));
+  return false;
+}
diff --git a/hpvm/projects/llvm-cbe/test/APInt-C.cpp b/hpvm/projects/llvm-cbe/test/APInt-C.cpp
index c44440985a0b50a57bd25e1995d39cd904ec32c5..d37b2a4f799fb28cba55d85bb3048b189885a357 100644
--- a/hpvm/projects/llvm-cbe/test/APInt-C.cpp
+++ b/hpvm/projects/llvm-cbe/test/APInt-C.cpp
@@ -22,12 +22,12 @@ inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align,
 #define CREATE(s)                                                              \
   APInt s;                                                                     \
   if ((numbits % integerPartWidth) != 0) {                                     \
-    /* use LLT_ALIGN to round the memory area up to the nearest \                                                                             \
+    /* use LLT_ALIGN to round the memory area up to the nearest \ \ \                                                                             \
      * integerPart-sized chunk */                                              \
     unsigned nbytes =                                                          \
         RoundUpToAlignment(numbits, integerPartWidth) / host_char_bit;         \
     integerPart *data_a64 = (integerPart *)alloca(nbytes);                     \
-    /* TODO: this memcpy assumes little-endian, \ for big-endian, need to                                                       \
+    /* TODO: this memcpy assumes little-endian, \ for big-endian, need to \ \                                                                             \
      * align the copy to the other end */                                      \
     memcpy(data_a64, p##s,                                                     \
            RoundUpToAlignment(numbits, host_char_bit) / host_char_bit);        \
diff --git a/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp~ b/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp~
index a86a11ce6fcc144055b168739b4de0110b05ae0c..79125a86ec523a4674b222ea1263735aed93765f 100644
--- a/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp~
+++ b/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp~
@@ -61,19 +61,19 @@ extern "C" void LLVMInitializeCBackendTargetMC();
 // and back-end code generation options are specified with the target machine.
 //
 static cl::opt<std::string>
-InputFilename(cl::Positional, cl::desc("<input bitcode>"), cl::init("-"));
+    InputFilename(cl::Positional, cl::desc("<input bitcode>"), cl::init("-"));
 
-static cl::opt<std::string>
-OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename"));
+static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
+                                           cl::value_desc("filename"));
 
 static cl::opt<unsigned>
-TimeCompilations("time-compilations", cl::Hidden, cl::init(1u),
-                 cl::value_desc("N"),
-                 cl::desc("Repeat compilation N times for timing"));
+    TimeCompilations("time-compilations", cl::Hidden, cl::init(1u),
+                     cl::value_desc("N"),
+                     cl::desc("Repeat compilation N times for timing"));
 
 static cl::opt<bool>
-NoIntegratedAssembler("no-integrated-as", cl::Hidden,
-                      cl::desc("Disable integrated assembler"));
+    NoIntegratedAssembler("no-integrated-as", cl::Hidden,
+                          cl::desc("Disable integrated assembler"));
 
 static cl::opt<bool>
     PreserveComments("preserve-as-comments", cl::Hidden,
@@ -82,21 +82,20 @@ static cl::opt<bool>
 
 // Determine optimization level.
 static cl::opt<char>
-OptLevel("O",
-         cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
-                  "(default = '-O2')"),
-         cl::Prefix,
-         cl::ZeroOrMore,
-         cl::init(' '));
+    OptLevel("O",
+             cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
+                      "(default = '-O2')"),
+             cl::Prefix, cl::ZeroOrMore, cl::init(' '));
 
 static cl::opt<std::string>
-TargetTriple("mtriple", cl::desc("Override target triple for module"));
+    TargetTriple("mtriple", cl::desc("Override target triple for module"));
 
 static cl::opt<bool> NoVerify("disable-verify", cl::Hidden,
                               cl::desc("Do not verify input module"));
 
-static cl::opt<bool> DisableSimplifyLibCalls("disable-simplify-libcalls",
-                                             cl::desc("Disable simplify-libcalls"));
+static cl::opt<bool>
+    DisableSimplifyLibCalls("disable-simplify-libcalls",
+                            cl::desc("Disable simplify-libcalls"));
 
 static cl::opt<bool> ShowMCEncoding("show-mc-encoding", cl::Hidden,
                                     cl::desc("Show encoding in .s output"));
@@ -120,14 +119,13 @@ static cl::opt<bool> DiscardValueNames(
     cl::desc("Discard names from Value (other than GlobalValue)."),
     cl::init(false), cl::Hidden);
 
-static cl::opt<std::string> StopAfter("stop-after",
-                            cl::desc("Stop compilation after a specific pass"),
-                            cl::value_desc("pass-name"),
-                                      cl::init(""));
-static cl::opt<std::string> StartAfter("start-after",
-                          cl::desc("Resume compilation after a specific pass"),
-                          cl::value_desc("pass-name"),
-                          cl::init(""));
+static cl::opt<std::string>
+    StopAfter("stop-after", cl::desc("Stop compilation after a specific pass"),
+              cl::value_desc("pass-name"), cl::init(""));
+static cl::opt<std::string>
+    StartAfter("start-after",
+               cl::desc("Resume compilation after a specific pass"),
+               cl::value_desc("pass-name"), cl::init(""));
 
 namespace {
 static ManagedStatic<std::vector<std::string>> RunPassNames;
@@ -142,7 +140,7 @@ struct RunPassOption {
       RunPassNames->push_back(PassName);
   }
 };
-}
+} // namespace
 
 static RunPassOption RunPassOpt;
 
@@ -153,9 +151,9 @@ static cl::opt<RunPassOption, true, cl::parser<std::string>> RunPass(
 
 static int compileModule(char **, LLVMContext &);
 
-static std::unique_ptr<tool_output_file>
-GetOutputStream(const char *TargetName, Triple::OSType OS,
-                const char *ProgName) {
+static std::unique_ptr<tool_output_file> GetOutputStream(const char *TargetName,
+                                                         Triple::OSType OS,
+                                                         const char *ProgName) {
   // If we don't yet have an output filename, make one.
   if (OutputFilename.empty()) {
     if (InputFilename == "-")
@@ -175,7 +173,7 @@ GetOutputStream(const char *TargetName, Triple::OSType OS,
         if (TargetName[0] == 'c') {
           if (TargetName[1] == 0)
             OutputFilename += ".cl";
-//            OutputFilename += ".cbe.c";
+          //            OutputFilename += ".cbe.c";
           else if (TargetName[1] == 'p' && TargetName[2] == 'p')
             OutputFilename += ".cpp";
           else
@@ -212,8 +210,8 @@ GetOutputStream(const char *TargetName, Triple::OSType OS,
   sys::fs::OpenFlags OpenFlags = sys::fs::F_None;
   if (!Binary)
     OpenFlags |= sys::fs::F_Text;
-  auto FDOut = llvm::make_unique<tool_output_file>(OutputFilename, EC,
-                                                   OpenFlags);
+  auto FDOut =
+      llvm::make_unique<tool_output_file>(OutputFilename, EC, OpenFlags);
   if (EC) {
     errs() << EC.message() << '\n';
     return nullptr;
@@ -243,7 +241,7 @@ int main(int argc, char **argv) {
   EnableDebugBuffering = true;
 
   LLVMContext Context;
-  llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
 
   // Initialize targets first, so that --version shows registered targets.
   InitializeAllTargets();
@@ -267,7 +265,7 @@ int main(int argc, char **argv) {
   initializeScalarEvolutionWrapperPassPass(*Registry);
   initializeDominatorTreeWrapperPassPass(*Registry);
   initializeAssumptionCacheTrackerPass(*Registry);
-  //initializeUnreachableBlockElimLegacyPassPass(*Registry);
+  // initializeUnreachableBlockElimLegacyPassPass(*Registry);
 
   // Register the target printer for --version.
   cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
@@ -288,8 +286,8 @@ int main(int argc, char **argv) {
   return 0;
 }
 
-static bool addPass(PassManagerBase &PM, const char *argv0,
-                    StringRef PassName, TargetPassConfig &TPC) {
+static bool addPass(PassManagerBase &PM, const char *argv0, StringRef PassName,
+                    TargetPassConfig &TPC) {
   if (PassName == "none")
     return false;
 
@@ -323,17 +321,17 @@ static int compileModule(char **argv, LLVMContext &Context) {
   std::unique_ptr<MIRParser> MIR;
   Triple TheTriple;
 
-  bool SkipModule = MCPU == "help" ||
-                    (!MAttrs.empty() && MAttrs.front() == "help");
+  bool SkipModule =
+      MCPU == "help" || (!MAttrs.empty() && MAttrs.front() == "help");
 
   // If user just wants to list available options, skip module loading
   if (!SkipModule) {
-    //if (StringRef(InputFilename).endswith_lower(".mir")) {
-      //MIR = createMIRParserFromFile(InputFilename, Err, Context);
-      //if (MIR)
-        //M = MIR->parseLLVMModule();
+    // if (StringRef(InputFilename).endswith_lower(".mir")) {
+    // MIR = createMIRParserFromFile(InputFilename, Err, Context);
+    // if (MIR)
+    // M = MIR->parseLLVMModule();
     //} else
-      M = parseIRFile(InputFilename, Err, Context);
+    M = parseIRFile(InputFilename, Err, Context);
     if (!M) {
       Err.print(argv[0], errs());
       return 1;
@@ -361,9 +359,9 @@ static int compileModule(char **argv, LLVMContext &Context) {
   // Get the target specific parser.
   std::string Error;
   // Override MArch
-  MArch = "c"; //FIX ME
-  const Target *TheTarget = TargetRegistry::lookupTarget(MArch, TheTriple,
-                                                         Error);
+  MArch = "c"; // FIX ME
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(MArch, TheTriple, Error);
   if (!TheTarget) {
     errs() << argv[0] << ": " << Error;
     return 1;
@@ -376,11 +374,20 @@ static int compileModule(char **argv, LLVMContext &Context) {
   default:
     errs() << argv[0] << ": invalid optimization level.\n";
     return 1;
-  case ' ': break;
-  case '0': OLvl = CodeGenOpt::None; break;
-  case '1': OLvl = CodeGenOpt::Less; break;
-  case '2': OLvl = CodeGenOpt::Default; break;
-  case '3': OLvl = CodeGenOpt::Aggressive; break;
+  case ' ':
+    break;
+  case '0':
+    OLvl = CodeGenOpt::None;
+    break;
+  case '1':
+    OLvl = CodeGenOpt::Less;
+    break;
+  case '2':
+    OLvl = CodeGenOpt::Default;
+    break;
+  case '3':
+    OLvl = CodeGenOpt::Aggressive;
+    break;
   }
 
   TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
@@ -390,7 +397,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
   Options.MCOptions.AsmVerbose = AsmVerbose;
   Options.MCOptions.PreserveAsmComments = PreserveComments;
 
-//  std::unique_ptr<TargetMachine> Target(
+  //  std::unique_ptr<TargetMachine> Target(
   TargetMachine *Target(
       TheTarget->createTargetMachine(TheTriple.getTriple(), CPUStr, FeaturesStr,
                                      Options, getRelocModel(), CMModel, OLvl));
@@ -410,7 +417,8 @@ static int compileModule(char **argv, LLVMContext &Context) {
   // Figure out where we are going to send the output.
   std::unique_ptr<tool_output_file> Out =
       GetOutputStream(TheTarget->getName(), TheTriple.getOS(), argv[0]);
-  if (!Out) return 1;
+  if (!Out)
+    return 1;
 
   // Build up all of the passes that we want to do to the module.
   legacy::PassManager PM;
@@ -433,7 +441,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
   if (RelaxAll.getNumOccurrences() > 0 &&
       FileType != TargetMachine::CGFT_ObjectFile)
     errs() << argv[0]
-             << ": warning: ignoring -mc-relax-all because filetype != obj";
+           << ": warning: ignoring -mc-relax-all because filetype != obj";
 
   {
     raw_pwrite_stream *OS = &Out->os();
@@ -455,24 +463,25 @@ static int compileModule(char **argv, LLVMContext &Context) {
     const PassRegistry *PR = PassRegistry::getPassRegistry();
     if (!RunPassNames->empty()) {
       if (!StartAfter.empty() || !StopAfter.empty()) {
-        errs() << argv[0] << ": start-after and/or stop-after passes are "
-                             "redundant when run-pass is specified.\n";
+        errs() << argv[0]
+               << ": start-after and/or stop-after passes are "
+                  "redundant when run-pass is specified.\n";
         return 1;
       }
       if (!MIR) {
         errs() << argv[0] << ": run-pass needs a .mir input.\n";
         return 1;
       }
-      LLVMTargetMachine *LLVMTM = static_cast<LLVMTargetMachine*>(Target);
+      LLVMTargetMachine *LLVMTM = static_cast<LLVMTargetMachine *>(Target);
       TargetPassConfig *TPC = LLVMTM->createPassConfig(PM);
       PM.add(TPC);
-      
-//      LLVMTM.addMachineModuleInfo(PM);
-//      LLVMTM.addMachineFunctionAnalysis(PM, MIR.get());
+
+      //      LLVMTM.addMachineModuleInfo(PM);
+      //      LLVMTM.addMachineFunctionAnalysis(PM, MIR.get());
       MachineModuleInfo *MMI = new MachineModuleInfo(LLVMTM);
       MMI->setMachineFunctionInitializer(MIR.get());
       PM.add(MMI);
-      
+
       TPC->printAndVerify("");
 
       for (const std::string &RunPassName : *RunPassNames) {
@@ -554,4 +563,4 @@ static int compileModule(char **argv, LLVMContext &Context) {
   Out->keep();
 
   return 0;
-} 
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/hpvm/projects/visc-rt/CMakeLists.txt b/hpvm/projects/visc-rt/CMakeLists.txt
deleted file mode 100644
index 0395624253e4bc2b62e9eca51bd98a1a6a86436e..0000000000000000000000000000000000000000
--- a/hpvm/projects/visc-rt/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-add_definitions(-DNUM_CORES=8)
-
-SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang)
-SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++)
-
-add_llvm_library(visc-rt.ll visc-rt.cpp
-
-  DEPENDS
-  clang
-  )
-
-
-target_compile_options(visc-rt.ll PUBLIC -flto )
-target_compile_options(visc-rt.ll PUBLIC -std=c++11)
-
-add_custom_target(visc-rt.cpp.o ALL
-  COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libvisc-rt.ll.a
-  COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/visc-rt/visc-rt.bc
-  )
-
-add_dependencies(visc-rt.cpp.o   visc-rt.ll)
diff --git a/hpvm/test/CTestSuite/Makefile b/hpvm/test/CTestSuite/Makefile
index 226a83287d743360d9cd64a7c57e864871829b0b..1169e4e896a861975ac0562ebff8b208828bbf89 100644
--- a/hpvm/test/CTestSuite/Makefile
+++ b/hpvm/test/CTestSuite/Makefile
@@ -9,7 +9,7 @@ LLVM_CC:=$(LLVM_INSTALL)/bin/clang
 LLVM_OPT:=$(LLVM_INSTALL)/bin/opt
 BUILD_DIR:=build
 
-all: $(BUILD_DIR) $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll)
+all: $(BUILD_DIR) $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -17,10 +17,10 @@ $(BUILD_DIR):
 $(HOST:%=$(BUILD_DIR)/%.ll):$(BUILD_DIR)/%.ll:%.c
 	$(LLVM_CC) -S -emit-llvm $< -O3 -o $@
 
-$(HOST:%=$(BUILD_DIR)/%.visc.ll):$(BUILD_DIR)/%.visc.ll:$(BUILD_DIR)/%.ll
-	$(LLVM_OPT) -load $(LLVM_SRC_ROOT)/Release+Asserts/lib/LLVMGenVISC.so -genvisc -globaldce $< -S -o $@
+$(HOST:%=$(BUILD_DIR)/%.hpvm.ll):$(BUILD_DIR)/%.hpvm.ll:$(BUILD_DIR)/%.ll
+	$(LLVM_OPT) -load $(LLVM_SRC_ROOT)/Release+Asserts/lib/LLVMGenHPVM.so -genhpvm -globaldce $< -S -o $@
 	@cat RUN.script $@ > $@.tmp
 	@mv $@.tmp $@
 
 clean :
-	rm -f $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll.kernels.ll) $(HOST:%=$(BUILD_DIR)/%.visc.ll.nvptx.s)  $(BUILD_DIR)/DataflowGraph.dot*
+	rm -f $(HOST:%=$(BUILD_DIR)/%.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll.kernels.ll) $(HOST:%=$(BUILD_DIR)/%.hpvm.ll.nvptx.s)  $(BUILD_DIR)/DataflowGraph.dot*
diff --git a/hpvm/test/CTestSuite/RUN.script b/hpvm/test/CTestSuite/RUN.script
index 10bf667818824719af2e041fc6b2dc3e449d9158..23fa1694ebf4b7448c731327b96b949c0509b62e 100644
--- a/hpvm/test/CTestSuite/RUN.script
+++ b/hpvm/test/CTestSuite/RUN.script
@@ -1,6 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
 ; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
 ; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll %llvm_src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin
diff --git a/hpvm/test/CTestSuite/gemm.c b/hpvm/test/CTestSuite/gemm.c
index d0a69ba25c27fb65ea549023deed2dfb0197b882..eb0a3c5e9204d9621c4a15ae7f07ef5158ac1d07 100644
--- a/hpvm/test/CTestSuite/gemm.c
+++ b/hpvm/test/CTestSuite/gemm.c
@@ -54,14 +54,14 @@ __attribute__((noinline)) int checkResults(float *A, float *B, float *C) {
   return 1; // Success
 }
 
-// Dummy visc node execution call
-// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned),
+// Dummy hpvm node execution call
+// void __hpvm__node(void kernel (float*, float*, float*, unsigned, unsigned),
 // int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void*
 // outputs);
 
 void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) {
 
-  __visc__attributes(2, A, B, 1, C);
+  __hpvm__attributes(2, A, B, 1, C);
   // printf("Entered function\n");
   int tx = get_local_id(0); // 2D Global Thread ID x
   int ty = get_local_id(1); // 2D Global Thread ID y
@@ -130,10 +130,10 @@ int main(int argc, char **argv) {
 
   // Compute using OpenCL
   // matrixMul(h_A, h_B, h_C, WA, WB);
-  //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
-  unsigned graphMM = __visc__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B,
+  //__hpvm__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
+  unsigned graphMM = __hpvm__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B,
                                   bytes_B, h_C, bytes_C, WA, WB, 0);
-  __visc__wait(graphMM);
+  __hpvm__wait(graphMM);
   if (checkResults(h_A, h_B, h_C))
     printf("\nPass!\n");
   else
diff --git a/hpvm/test/CTestSuite/gemm_2.c b/hpvm/test/CTestSuite/gemm_2.c
index bd7ab27fc0160275442d23faf507851b7c2369f7..df4555936316703cfccd4048f2ade4e28592e53a 100644
--- a/hpvm/test/CTestSuite/gemm_2.c
+++ b/hpvm/test/CTestSuite/gemm_2.c
@@ -54,13 +54,13 @@ __attribute__((noinline)) int checkResults(float *A, float *B, float *C) {
   return 1; // Success
 }
 
-// Dummy visc node execution call
-// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned),
+// Dummy hpvm node execution call
+// void __hpvm__node(void kernel (float*, float*, float*, unsigned, unsigned),
 // int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void*
 // outputs);
 
 void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) {
-  __visc__attributes(2, A, B, 1, C);
+  __hpvm__attributes(2, A, B, 1, C);
 
   // printf("Entered function\n");
   int tx = get_global_id(0); // 2D Global Thread ID x
@@ -130,11 +130,11 @@ int main(int argc, char **argv) {
 
   // Compute using OpenCL
   // matrixMul(h_A, h_B, h_C, WA, WB);
-  //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
+  //__hpvm__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
   unsigned graphMM =
-      __visc__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A,
+      __hpvm__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A,
                    h_B, bytes_B, h_C, bytes_C, WA, WB, 0);
-  __visc__wait(graphMM);
+  __hpvm__wait(graphMM);
   if (checkResults(h_A, h_B, h_C))
     printf("\nPass!\n");
   else
diff --git a/hpvm/test/README.md b/hpvm/test/README.md
index 94103affb668afc29d32e52d85d0d60182bd16d8..1cc9abf4f963cffca8d6dbf52e14413172b5a218 100644
--- a/hpvm/test/README.md
+++ b/hpvm/test/README.md
@@ -9,11 +9,11 @@ Tests may be built for the cpu or gpu with hpvm.
 # sgemm example
 cd parboil/benchmarks/sgemm
 # HPVM cpu
-make TARGET=seq VERSION=visc
-make run TARGET=seq VERSION=visc
+make TARGET=seq VERSION=hpvm
+make run TARGET=seq VERSION=hpvm
 # HPVM gpu
-make TARGET=gpu VERSION=visc
-make run TARGET=gpu VERSION=visc
+make TARGET=gpu VERSION=hpvm
+make run TARGET=gpu VERSION=hpvm
 ```
 
 ## Cava
@@ -27,4 +27,4 @@ make TARGET={seq, gpu}
 
 ## Your own project
 See `template/` for an example Makefile and config.
-Include `visc.h` to use HPVM C api functions, found in the `test/include/visc.h`.
+Include `hpvm.h`, found at `test/include/hpvm.h`, to use the HPVM C API functions.
diff --git a/hpvm/test/hpvm-cava/.gitignore b/hpvm/test/hpvm-cava/.gitignore
index 2fc1b235647962ac761edda7dfbda4499cbcd4f0..f08b880bf9b4b8171e9fb878bea3a6d266a1f9c0 100644
--- a/hpvm/test/hpvm-cava/.gitignore
+++ b/hpvm/test/hpvm-cava/.gitignore
@@ -1,5 +1,5 @@
 build/
-cava-visc
+cava-hpvm
 Makefile.config
 
 example-face/*.bin
diff --git a/hpvm/test/hpvm-cava/Makefile b/hpvm/test/hpvm-cava/Makefile
index 0054af8c4d9cc39c21b00e73a5b53c8ac2a089b8..dd8e4825c8b72ebf44b1acdbe7db2127987d6684 100644
--- a/hpvm/test/hpvm-cava/Makefile
+++ b/hpvm/test/hpvm-cava/Makefile
@@ -26,21 +26,21 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
 
 INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR)
-INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
+INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include
 ifneq ($(CONFUSE_ROOT),)
   INCLUDES += -I$(CONFUSE_ROOT)/include
   LFLAGS += -L$(CONFUSE_ROOT)/lib
 endif
 
-EXE = cava-visc-$(VERSION)-$(TARGET)
+EXE = cava-hpvm-$(VERSION)-$(TARGET)
 
 LFLAGS += -pthread
 
 ## BEGIN HPVM MAKEFILE
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS= load_cam_model.ll cam_pipe_utility.ll dma_interface.ll utility.ll
 OBJS_SRC=src/cam_pipe.c src/pipe_stages.c src/load_cam_model.c src/cam_pipe_utility.c src/dma_interface.c src/utility.c
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP = $(EXE)
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3
@@ -52,23 +52,23 @@ OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
 LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
 
-VISC_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/visc-rt
+HPVM_RT_PATH = $(LLVM_SRC_ROOT)/tools/hpvm/projects/hpvm-rt
 
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc
+HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.bc
 
 
-TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
 
 ifeq ($(TARGET),seq)
   DEVICE = CPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86
 else
   DEVICE = GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
 endif
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 
 CFLAGS += -DDEVICE=$(DEVICE)
 CXXFLAGS += -DDEVICE=$(DEVICE)
@@ -79,7 +79,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
 .PRECIOUS: $(BUILD_DIR)/%.ll
 
 OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
 KERNEL = $(TEST_OBJS).kernels.ll
 
 ifeq ($(TARGET),gpu)
@@ -105,11 +105,11 @@ $(KERNEL_OCL) : $(KERNEL)
 $(EXE) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
 
-$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
+$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
-	$(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST)
+$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS)
+	$(OPT) $(HPVM_OPTFLAGS) -S $< -o $(HOST)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -120,7 +120,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c
 $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c
 	$(CC) $(CFLAGS) -emit-llvm -S -o $@ $<
 
-$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
+$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll
 	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
 
 ## END HPVM MAKEFILE
diff --git a/hpvm/test/hpvm-cava/README.md b/hpvm/test/hpvm-cava/README.md
index 890b629d172a2f53bf77d6d52bda27637c71afeb..1106c4781b285c47d59548d47e5cd03f09063b28 100644
--- a/hpvm/test/hpvm-cava/README.md
+++ b/hpvm/test/hpvm-cava/README.md
@@ -12,7 +12,7 @@ See the original camera/vision pipeline repo (repo: `yaoyuannnn/cava`) for detai
 After building HPVM, the following steps are required to build and run the camera pipeline:
 
 1. Build with `make TARGET=seq` for CPU and `make TARGET=gpu` for gpu.
-2. Run with `./cava-visc-<Target> example-tulip-small/raw_tulip-small.bin example-tulip-small/tulip-small`. 
+2. Run with `./cava-hpvm-<Target> example-tulip-small/raw_tulip-small.bin example-tulip-small/tulip-small`. 
     * `<Target>` can be either `seq` or `gpu` depending on what target is used to build.
     * This processes the raw image `example-tulip-small/raw_tulip-small.bin`. Note that raw images are different from bitmaps, so you might need to obtain them using special software.
     * This generates: `tulip-small.bin` and `tulip-small-<stage>.bin` where `<stage>` represents the stage of the pipeline.
diff --git a/hpvm/test/hpvm-cava/src/cam_pipe.c b/hpvm/test/hpvm-cava/src/cam_pipe.c
index 7874ff9d529afebc40d1660637e85b3a1e00f23e..cdeaf393320121706d13d423212896e2551142c8 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe.c
+++ b/hpvm/test/hpvm-cava/src/cam_pipe.c
@@ -1,11 +1,11 @@
+#include "cam_pipe_utility.h"
+#include "dma_interface.h"
+#include "load_cam_model.h"
+#include "pipe_stages.h"
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
-#include "pipe_stages.h"
-#include "load_cam_model.h"
-#include "cam_pipe_utility.h"
-#include "dma_interface.h"
 #ifdef DMA_MODE
 #include "gem5_harness.h"
 #endif
@@ -13,7 +13,7 @@
 // FIXME: Include gem5/dma_interface.cc/h separately
 #ifndef DMA_INTERFACE_V3
 #define DMA_INTERFACE_V3
-#endif//DMA_INTERFACE_V3
+#endif // DMA_INTERFACE_V3
 
 ///////////////////////////////////////////////////////////////
 // Camera Model Parameters
@@ -71,7 +71,8 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   uint8_t *acc_input, *acc_result;
   float *acc_input_scaled, *acc_result_scaled;
   float *host_TsTw, *host_ctrl_pts, *host_weights, *host_coefs, *host_tone_map;
-  float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map, *acc_l2_dist;
+  float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map,
+      *acc_l2_dist;
 
   strcat(cam_model_path, "cam_models/NikonD7000/");
 
@@ -84,20 +85,25 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   host_coefs = get_coefs(cam_model_path, num_ctrl_pts);
   host_tone_map = get_tone_map(cam_model_path);
 
-  acc_input = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
-  acc_result = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
-  acc_input_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
-  acc_result_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
-  acc_TsTw = (float*) malloc_aligned(sizeof(float) * 9);
-  acc_ctrl_pts = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
-  acc_weights = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
-  acc_coefs = (float*) malloc_aligned(sizeof(float) * 12);
-  acc_tone_map = (float*) malloc_aligned(sizeof(float) * 256 * CHAN_SIZE);
-  acc_l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);
+  acc_input = (uint8_t *)malloc_aligned(sizeof(uint8_t) * row_size * col_size *
+                                        CHAN_SIZE);
+  acc_result = (uint8_t *)malloc_aligned(sizeof(uint8_t) * row_size * col_size *
+                                         CHAN_SIZE);
+  acc_input_scaled =
+      (float *)malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
+  acc_result_scaled =
+      (float *)malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
+  acc_TsTw = (float *)malloc_aligned(sizeof(float) * 9);
+  acc_ctrl_pts =
+      (float *)malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  acc_weights =
+      (float *)malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  acc_coefs = (float *)malloc_aligned(sizeof(float) * 12);
+  acc_tone_map = (float *)malloc_aligned(sizeof(float) * 256 * CHAN_SIZE);
+  acc_l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts);
 
   // Load camera model parameters for the ISP
-  MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw,
-                     sizeof(float) * 9);
+  MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw, sizeof(float) * 9);
   MAP_ARRAY_TO_ACCEL(ISP, "host_ctrl_pts", host_ctrl_pts,
                      sizeof(float) * num_ctrl_pts * CHAN_SIZE);
   MAP_ARRAY_TO_ACCEL(ISP, "host_weights", host_weights,
@@ -136,4 +142,3 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   free(acc_tone_map);
   free(acc_l2_dist);
 }
-
diff --git a/hpvm/test/hpvm-cava/src/cam_pipe_utility.c b/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
index f806e9ee1a2e288fabcb8ad658a47c3919fbb661..864f02d5b28f2c4738279cf66cba5f4312c2a3de 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
+++ b/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
@@ -1,6 +1,6 @@
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <assert.h>
 
 #include "cam_pipe_utility.h"
 //#include "pipe_stages.h"
@@ -26,10 +26,11 @@ uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size) {
   return image;
 }
 
-void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int col_size) {
+void write_image_to_binary(char *file_path, uint8_t *image, int row_size,
+                           int col_size) {
   FILE *fp = fopen(file_path, "w");
 
-  int shape[3] = { row_size, col_size, CHAN_SIZE };
+  int shape[3] = {row_size, col_size, CHAN_SIZE};
   fwrite(shape, sizeof(int), 3, fp);
 
   int size = row_size * col_size * CHAN_SIZE;
@@ -40,8 +41,8 @@ void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int co
 float *transpose_mat(float *inmat, int width, int height) {
   // Define vectors
   float *outmat;
-  int err =
-      posix_memalign((void **)&outmat, CACHELINE_SIZE, sizeof(float) * height * width);
+  int err = posix_memalign((void **)&outmat, CACHELINE_SIZE,
+                           sizeof(float) * height * width);
   assert(err == 0 && "Failed to allocate memory!");
 
   // Transpose the matrix
@@ -71,7 +72,7 @@ void convert_chw_to_hwc(uint8_t *input, int row_size, int col_size,
                         uint8_t **result) {
   if (*result == NULL) {
     *result = (uint8_t *)malloc_aligned(row_size * col_size * CHAN_SIZE *
-                                      sizeof(uint8_t));
+                                        sizeof(uint8_t));
   }
   ARRAY_3D(uint8_t, _input, input, row_size, col_size);
   ARRAY_3D(uint8_t, _result, *result, col_size, CHAN_SIZE);
diff --git a/hpvm/test/hpvm-cava/src/cam_pipe_utility.h b/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
index b4fb6cde0c438b23c2b596cf0418953aaedca501..b61b7cc9b52aa59522f93661895fca960b947f17 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
+++ b/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
@@ -1,8 +1,8 @@
 #ifndef _CAM_PIPE_UTILITY_H_
 #define _CAM_PIPE_UTILITY_H_
 
-#include "utility.h"
 #include "pipe_stages.h"
+#include "utility.h"
 
 uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size);
 void write_image_to_binary(char *file_path, uint8_t *image, int row_size,
diff --git a/hpvm/test/hpvm-cava/src/defs.h b/hpvm/test/hpvm-cava/src/defs.h
index ccc8acc857c36fd13115670932a38dc3a406dc29..0fa95ef3d2ea55c67a921e0bc5fc8a6ec6ba949f 100644
--- a/hpvm/test/hpvm-cava/src/defs.h
+++ b/hpvm/test/hpvm-cava/src/defs.h
@@ -10,46 +10,46 @@ typedef unsigned long uint64_t;
 
 // Debugging message macros.
 #if DEBUG_LEVEL >= 1
-  #define INFO_MSG(args...) printf(args)
-
-  #if DEBUG_LEVEL >= 2
-    #define PRINT_MSG(args...) printf(args)
-    #define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
-        print_debug(hid, rows, cols, num_cols)
-    #define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
-        print_debug4d(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
-        print_debug4d_fp16(hid, num, height, rows, cols)
-
-    #if DEBUG_LEVEL >= 3
-      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
-          print_debug(hid, rows, cols, num_cols)
-      #define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
-          print_debug4d(hid, rows, cols, height)
-      #define PRINT_MSG_V(args...) printf(args)
-    #else
-      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)
-      #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-      #define PRINT_MSG_V(args...)
-    #endif
-  #else
-    #define PRINT_MSG(args...)
-    #define PRINT_DEBUG(hid, rows, cols, num_cols)
-    #define PRINT_DEBUG4D(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
-    #define PRINT_DEBUG_V(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-    #define PRINT_MSG_V(args...)
-  #endif
+#define INFO_MSG(args...) printf(args)
+
+#if DEBUG_LEVEL >= 2
+#define PRINT_MSG(args...) printf(args)
+#define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
+  print_debug(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
+  print_debug4d(hid, rows, cols, height)
+#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
+  print_debug4d_fp16(hid, num, height, rows, cols)
+
+#if DEBUG_LEVEL >= 3
+#define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
+  print_debug(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
+  print_debug4d(hid, rows, cols, height)
+#define PRINT_MSG_V(args...) printf(args)
 #else
-  #define INFO_MSG(args...)
-  #define PRINT_DEBUG(hid, rows, cols, num_cols)
-  #define PRINT_DEBUG4D(hid, rows, cols, height)
-  #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
-  #define PRINT_MSG(args...)
-  #define PRINT_DEBUG_V(hid, rows, cols, height)
-  #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-  #define PRINT_MSG_V(args...)
+#define PRINT_DEBUG_V(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D_V(hid, rows, cols, height)
+#define PRINT_MSG_V(args...)
+#endif
+#else
+#define PRINT_MSG(args...)
+#define PRINT_DEBUG(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D(hid, rows, cols, height)
+#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+#define PRINT_DEBUG_V(hid, rows, cols, height)
+#define PRINT_DEBUG4D_V(hid, rows, cols, height)
+#define PRINT_MSG_V(args...)
+#endif
+#else
+#define INFO_MSG(args...)
+#define PRINT_DEBUG(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D(hid, rows, cols, height)
+#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+#define PRINT_MSG(args...)
+#define PRINT_DEBUG_V(hid, rows, cols, height)
+#define PRINT_DEBUG4D_V(hid, rows, cols, height)
+#define PRINT_MSG_V(args...)
 #endif
 
 #define STRING(arg) #arg
@@ -72,9 +72,9 @@ typedef unsigned long uint64_t;
 #define max3(e0, e1, e2) max2(max2(e0, e1), e2)
 #define max4(e0, e1, e2, e3) max2(max2(e0, e1), max2(e2, e3))
 #define max8(e0, e1, e2, e3, e4, e5, e6, e7)                                   \
-    max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
+  max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
 #define max9(e0, e1, e2, e3, e4, e5, e6, e7, e8)                               \
-    max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
+  max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
 
 #define min2(A, B) (((A) < (B)) ? (A) : (B))
 
@@ -92,7 +92,8 @@ typedef unsigned long uint64_t;
 //  If GEM5_HARNESS is defined:
 //
 //     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
-//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, mySize)
+//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr,
+//                                     mySize)
 //
 //     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
 //        ===>   invokeAcceleratorAndBlock(myReqCode)
@@ -107,69 +108,69 @@ typedef unsigned long uint64_t;
 #ifdef GEM5_HARNESS
 
 #define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
-    mapArrayToAccelerator(req_code, name, base_addr, size)
+  mapArrayToAccelerator(req_code, name, base_addr, size)
 #define INVOKE_KERNEL(req_code, kernel_ptr, args...)                           \
-    do {                                                                       \
-        UNUSED(kernel_ptr);                                                    \
-        invokeAcceleratorAndBlock(req_code);                                   \
-    } while (0)
+  do {                                                                         \
+    UNUSED(kernel_ptr);                                                        \
+    invokeAcceleratorAndBlock(req_code);                                       \
+  } while (0)
 #define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
-    do {                                                                       \
-        UNUSED(kernel_ptr);                                                    \
-        invokeAcceleratorAndReturn2(req_code, finish_flag);                    \
-    } while (0)
+  do {                                                                         \
+    UNUSED(kernel_ptr);                                                        \
+    invokeAcceleratorAndReturn2(req_code, finish_flag);                        \
+  } while (0)
 
 #define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);       \
-    } while (0)
+  do {                                                                         \
+    invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);           \
+  } while (0)
 #define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);        \
-    } while (0)
+  do {                                                                         \
+    invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);            \
+  } while (0)
 #define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);        \
-    } while (0)
+  do {                                                                         \
+    invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);            \
+  } while (0)
 #define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);         \
-    } while (0)
+  do {                                                                         \
+    invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);             \
+  } while (0)
 
 #else
 
 #define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
-    do {                                                                       \
-        INFO_MSG("Mapping array %s @ %p, size %d.\n",                          \
-                 name, (void*)base_addr, (int)(size));                         \
-        UNUSED(req_code);                                                      \
-        UNUSED(name);                                                          \
-        UNUSED(base_addr);                                                     \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    INFO_MSG("Mapping array %s @ %p, size %d.\n", name, (void *)base_addr,     \
+             (int)(size));                                                     \
+    UNUSED(req_code);                                                          \
+    UNUSED(name);                                                              \
+    UNUSED(base_addr);                                                         \
+    UNUSED(size);                                                              \
+  } while (0)
 #define INVOKE_KERNEL(req_code, kernel_ptr, args...) kernel_ptr(args)
 #define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
-    kernel_ptr(args)
+  kernel_ptr(args)
 #define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    UNUSED(start_addr);                                                        \
+    UNUSED(size);                                                              \
+  } while (0)
 #define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    UNUSED(start_addr);                                                        \
+    UNUSED(size);                                                              \
+  } while (0)
 #define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    UNUSED(start_addr);                                                        \
+    UNUSED(size);                                                              \
+  } while (0)
 #define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    UNUSED(start_addr);                                                        \
+    UNUSED(size);                                                              \
+  } while (0)
 
 #endif
 
@@ -177,14 +178,14 @@ typedef unsigned long uint64_t;
 //
 // This assumes that the current name of the base pointer is also the name of
 // the array in the top level function of the dynamic trace. THIS IS VERY
-// IMPORTANT - if the argument passed to a top level function has been renamed in
-// the function, then this WILL NOT WORK!
+// IMPORTANT - if the argument passed to a top level function has been renamed
+// in the function, then this WILL NOT WORK!
 //
 // MAP_ARRAY(myReqCode, myArray, mySize)
 //    ===>   MAP_ARRAY_TO_ACCEL(myReqCode, "myArray", myArray, mySize)
 #define MAP_ARRAY(req_code, name_and_base_addr, size)                          \
-    MAP_ARRAY_TO_ACCEL(                                                        \
-            req_code, STRING(name_and_base_addr), name_and_base_addr, size)
+  MAP_ARRAY_TO_ACCEL(req_code, STRING(name_and_base_addr), name_and_base_addr, \
+                     size)
 
 // Use these convenience macros to cast a raw pointer into a multidimensional
 // variable-length array, which lets us use [] notation inside of the ugly
@@ -202,23 +203,24 @@ typedef unsigned long uint64_t;
 //
 //   And so on...
 #define ARRAY_1D(TYPE, output_array_name, input_array_name)                    \
-    TYPE* output_array_name = (TYPE*)input_array_name
+  TYPE *output_array_name = (TYPE *)input_array_name
 
 #define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1)             \
-    TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
+  TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
 
 #define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2)      \
-    TYPE(*output_array_name)[DIM_1][DIM_2] =                                   \
-        (TYPE(*)[DIM_1][DIM_2])input_array_name
-
-#define ARRAY_4D(                                                              \
-    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3)            \
-        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3] =                        \
-            (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
-
-#define ARRAY_5D(                                                              \
-    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3, DIM_4)     \
-        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3][DIM_4] =                 \
-            (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
+  TYPE(*output_array_name)                                                     \
+  [DIM_1][DIM_2] = (TYPE(*)[DIM_1][DIM_2])input_array_name
+
+#define ARRAY_4D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2,      \
+                 DIM_3)                                                        \
+  TYPE(*output_array_name)                                                     \
+  [DIM_1][DIM_2][DIM_3] = (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
+
+#define ARRAY_5D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2,      \
+                 DIM_3, DIM_4)                                                 \
+  TYPE(*output_array_name)                                                     \
+  [DIM_1][DIM_2][DIM_3][DIM_4] =                                               \
+      (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
 
 #endif
diff --git a/hpvm/test/hpvm-cava/src/dma_interface.c b/hpvm/test/hpvm-cava/src/dma_interface.c
index 81bce54469886153170f994a77250a784cc9b7d7..68698635a4fceb4fe67e323bd0f354bd70bca99d 100644
--- a/hpvm/test/hpvm-cava/src/dma_interface.c
+++ b/hpvm/test/hpvm-cava/src/dma_interface.c
@@ -1,6 +1,6 @@
+#include "dma_interface.h"
 #include <assert.h>
 #include <string.h>
-#include "dma_interface.h"
 
 // All _dmaImplN functions must be always inlined or we'll get extra functions
 // in the trace.
@@ -10,22 +10,22 @@
 // Starting with version 3, all versioning will be distinguished by the return
 // value of the DMA functions.
 
-__attribute__((__always_inline__))
-int _dmaImpl3(void* dst_addr, void* src_addr, size_t size) {
+__attribute__((__always_inline__)) int _dmaImpl3(void *dst_addr, void *src_addr,
+                                                 size_t size) {
   assert(size > 0);
   memmove(dst_addr, src_addr, size);
   return 3;
 }
 
-int dmaLoad(void* dst_addr, void* src_host_addr, size_t size) {
+int dmaLoad(void *dst_addr, void *src_host_addr, size_t size) {
   return _dmaImpl3(dst_addr, src_host_addr, size);
 }
 
-int dmaStore(void* dst_host_addr, void* src_addr, size_t size) {
+int dmaStore(void *dst_host_addr, void *src_addr, size_t size) {
   return _dmaImpl3(dst_host_addr, src_addr, size);
 }
 
-int setReadyBits(void* start_addr, size_t size, unsigned value) {
+int setReadyBits(void *start_addr, size_t size, unsigned value) {
   asm("");
   return 0;
 }
@@ -35,39 +35,37 @@ int setReadyBits(void* start_addr, size_t size, unsigned value) {
 // With version 2 and earlier, we return (void*)NULL and use the number of
 // function arguments to distinguish the DMA functions.
 
-__attribute__((__always_inline__))
-void* _dmaImpl2(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+__attribute__((__always_inline__)) void *
+_dmaImpl2(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
   assert(size > 0);
   memmove(base_addr + dst_off, base_addr + src_off, size);
   return NULL;
 }
 
-void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+void *dmaLoad(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
   return _dmaImpl2(base_addr, src_off, dst_off, size);
 }
 
-void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+void *dmaStore(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
   return _dmaImpl2(base_addr, src_off, dst_off, size);
 }
 
 #else
 
-__attribute__((__always_inline__))
-void* _dmaImpl1(void* base_addr, size_t offset, size_t size) {
+__attribute__((__always_inline__)) void *_dmaImpl1(void *base_addr,
+                                                   size_t offset, size_t size) {
   assert(size > 0);
   asm("");
   return NULL;
 }
 
-void* dmaLoad(void* addr, size_t offset, size_t size) {
+void *dmaLoad(void *addr, size_t offset, size_t size) {
   return _dmaImpl1(addr, offset, size);
 }
 
-void* dmaStore(void* addr, size_t offset, size_t size) {
+void *dmaStore(void *addr, size_t offset, size_t size) {
   return _dmaImpl1(addr, offset, size);
 }
 #endif
 
-void dmaFence() {
-  asm("");
-}
+void dmaFence() { asm(""); }
diff --git a/hpvm/test/hpvm-cava/src/dma_interface.h b/hpvm/test/hpvm-cava/src/dma_interface.h
index f23234eede4df99db84b144646530dfe240c6e62..771ece523824cff5923581aca671ab7d26fae706 100644
--- a/hpvm/test/hpvm-cava/src/dma_interface.h
+++ b/hpvm/test/hpvm-cava/src/dma_interface.h
@@ -10,12 +10,12 @@
 // Version 3 of the DMA interface enables memcpy operations from arbitrary
 // source and destination addresses.
 
-int dmaLoad(void* dst_addr, void* src_host_addr, size_t size);
-int dmaStore(void* dst_host_addr, void* src_addr, size_t size);
+int dmaLoad(void *dst_addr, void *src_host_addr, size_t size);
+int dmaStore(void *dst_host_addr, void *src_addr, size_t size);
 
 // The user can explicitly toggle the state of ready bits, if ready mode is
 // enabled. This requires support from DMA v3.
-int setReadyBits(void* start_addr, size_t size, unsigned value);
+int setReadyBits(void *start_addr, size_t size, unsigned value);
 
 #elif defined(DMA_INTERFACE_V2)
 
@@ -26,17 +26,18 @@ int setReadyBits(void* start_addr, size_t size, unsigned value);
 // actually copied from source to destination (the memory copy will not show up
 // in the trace).
 
-void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size);
-void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size);
+void *dmaLoad(void *base_addr, size_t src_off, size_t dst_off, size_t size);
+void *dmaStore(void *base_addr, size_t src_off, size_t dst_off, size_t size);
 
 #else
 
 #warning "DMA interface v1 is deprecated!"
 
-// Version 1 of the DMA interface is now deprecated and will be removed entirely.
+// Version 1 of the DMA interface is now deprecated and will be removed
+// entirely.
 
-void* dmaLoad(void* addr, size_t offset, size_t size);
-void* dmaStore(void* addr, size_t offset, size_t size);
+void *dmaLoad(void *addr, size_t offset, size_t size);
+void *dmaStore(void *addr, size_t offset, size_t size);
 
 #endif
 void dmaFence();
diff --git a/hpvm/test/hpvm-cava/src/load_cam_model.c b/hpvm/test/hpvm-cava/src/load_cam_model.c
index 124fe0b7d175c2655feac562ecd6e2a5b73cc96a..dffb12b04b1e8f1cc56060737527a33de074d4a5 100644
--- a/hpvm/test/hpvm-cava/src/load_cam_model.c
+++ b/hpvm/test/hpvm-cava/src/load_cam_model.c
@@ -1,13 +1,14 @@
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
+
 #include "utility.h"
 #include "pipe_stages.h"
 #include "load_cam_model.h"
 
 // Get color space transform
-float* get_Ts(char* cam_model_path) {
+float *get_Ts(char *cam_model_path) {
   float *Ts;
   int err = posix_memalign((void **)&Ts, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -32,7 +33,7 @@ float* get_Ts(char* cam_model_path) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str); 
+      line_data[i] = atof(str);
       str = strtok(NULL, " \n");
       i++;
     }
@@ -50,7 +51,7 @@ float* get_Ts(char* cam_model_path) {
 }
 
 // Get white balance transform
-float* get_Tw(char* cam_model_path, int wb_index) {
+float *get_Tw(char *cam_model_path, int wb_index) {
   float *Tw;
   int err = posix_memalign((void **)&Tw, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -62,7 +63,7 @@ float* get_Tw(char* cam_model_path, int wb_index) {
 
   // Calculate base for the white balance transform selected
   // For more details see the camera model readme
-  int wb_base  = 8 + 5*(wb_index-1);
+  int wb_base = 8 + 5 * (wb_index - 1);
 
   // Open file for reading
   // Open file for reading
@@ -81,15 +82,15 @@ float* get_Tw(char* cam_model_path, int wb_index) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str); 
+      line_data[i] = atof(str);
       str = strtok(NULL, " \n");
       i++;
     }
 
     if (line_idx == wb_base) {
       // Convert the white balance vector into a diagaonal matrix
-      for (int i=0; i<3; i++) {
-        for (int j=0; j<3; j++) {
+      for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
           if (i == j) {
             Tw[i * 3 + j] = line_data[i];
           } else {
@@ -105,9 +106,8 @@ float* get_Tw(char* cam_model_path, int wb_index) {
   return Tw;
 }
 
-
 // Get combined transforms for checking
-float* get_TsTw(char* cam_model_path, int wb_index) {
+float *get_TsTw(char *cam_model_path, int wb_index) {
   float *TsTw;
   int err = posix_memalign((void **)&TsTw, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -119,7 +119,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) {
 
   // Calculate base for the white balance transform selected
   // For more details see the camera model readme
-  int wb_base  = 5 + 5*(wb_index-1);
+  int wb_base = 5 + 5 * (wb_index - 1);
 
   // Open file for reading
   char file_name[] = "raw2jpg_transform.txt";
@@ -137,7 +137,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str); 
+      line_data[i] = atof(str);
       str = strtok(NULL, " \n");
       i++;
     }
@@ -155,7 +155,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) {
 }
 
 // Get control points
-float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) {
+float *get_ctrl_pts(char *cam_model_path, int num_cntrl_pts) {
   float *ctrl_pnts;
   int err = posix_memalign((void **)&ctrl_pnts, CACHELINE_SIZE,
                            sizeof(float) * num_cntrl_pts * 3);
@@ -200,7 +200,7 @@ float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) {
 }
 
 // Get weights
-float* get_weights(char* cam_model_path, int num_cntrl_pts) {
+float *get_weights(char *cam_model_path, int num_cntrl_pts) {
   float *weights;
   int err = posix_memalign((void **)&weights, CACHELINE_SIZE,
                            sizeof(float) * num_cntrl_pts * 3);
@@ -245,7 +245,7 @@ float* get_weights(char* cam_model_path, int num_cntrl_pts) {
 }
 
 // Get coeficients
-float* get_coefs(char* cam_model_path, int num_cntrl_pts) {
+float *get_coefs(char *cam_model_path, int num_cntrl_pts) {
   float *coefs;
   int err = posix_memalign((void **)&coefs, CACHELINE_SIZE, sizeof(float) * 12);
   assert(err == 0 && "Failed to allocate memory!");
@@ -288,9 +288,8 @@ float* get_coefs(char* cam_model_path, int num_cntrl_pts) {
   return coefs;
 }
 
-
 // Get tone mapping table
-float* get_tone_map(char* cam_model_path) {
+float *get_tone_map(char *cam_model_path) {
   float *tone_map;
   int err = posix_memalign((void **)&tone_map, CACHELINE_SIZE,
                            sizeof(float) * 256 * CHAN_SIZE);
diff --git a/hpvm/test/hpvm-cava/src/main.c b/hpvm/test/hpvm-cava/src/main.c
index e43bbb4f25c4c97c9907ebae37251c854860c3b5..d3834165a86ba114ef4b2369af980b02dbfb62c1 100644
--- a/hpvm/test/hpvm-cava/src/main.c
+++ b/hpvm/test/hpvm-cava/src/main.c
@@ -1,16 +1,16 @@
+#include "utility.h"
 #include <argp.h>
+#include <assert.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <assert.h>
 #include <string.h>
-#include <math.h>
-#include "utility.h"
 
 #include "cam_pipe_utility.h"
-#include "pipe_stages.h"
 #include "load_cam_model.h"
+#include "pipe_stages.h"
 
-#include "visc.h"
+#include "hpvm.h"
 
 int NUM_TEST_CASES;
 int NUM_CLASSES;
@@ -20,117 +20,129 @@ int NUM_WORKER_THREADS;
 // Type of struct that is used to pass arguments to the HPVM dataflow graph
 // using the hpvm launch operation
 typedef struct __attribute__((__packed__)) {
-    uint8_t *input; size_t bytes_input;
-    uint8_t *result; size_t bytes_result;
-    float *input_scaled; size_t bytes_input_scaled; 
-    float *result_scaled; size_t bytes_result_scaled;
-    float *demosaic_out; size_t bytes_demosaic_out;
-    float *denoise_out; size_t bytes_denoise_out;
-    float *transform_out; size_t bytes_transform_out;
-    float *gamut_out;size_t bytes_gamut_out;
-    float *TsTw; size_t bytes_TsTw;
-    float *ctrl_pts; size_t bytes_ctrl_pts;
-    float *weights; size_t bytes_weights;
-    float*coefs; size_t bytes_coefs;
-    float *l2_dist; size_t bytes_l2_dist;
-    float *tone_map; size_t bytes_tone_map;
-    size_t row_size; size_t col_size;
-} 
-RootIn;
+  uint8_t *input;
+  size_t bytes_input;
+  uint8_t *result;
+  size_t bytes_result;
+  float *input_scaled;
+  size_t bytes_input_scaled;
+  float *result_scaled;
+  size_t bytes_result_scaled;
+  float *demosaic_out;
+  size_t bytes_demosaic_out;
+  float *denoise_out;
+  size_t bytes_denoise_out;
+  float *transform_out;
+  size_t bytes_transform_out;
+  float *gamut_out;
+  size_t bytes_gamut_out;
+  float *TsTw;
+  size_t bytes_TsTw;
+  float *ctrl_pts;
+  size_t bytes_ctrl_pts;
+  float *weights;
+  size_t bytes_weights;
+  float *coefs;
+  size_t bytes_coefs;
+  float *l2_dist;
+  size_t bytes_l2_dist;
+  float *tone_map;
+  size_t bytes_tone_map;
+  size_t row_size;
+  size_t col_size;
+} RootIn;
 
 typedef enum _argnum {
-    RAW_IMAGE_BIN,
-    OUTPUT_IMAGE_BIN,
-    NUM_REQUIRED_ARGS,
-    DATA_FILE = NUM_REQUIRED_ARGS,
-    NUM_ARGS,
+  RAW_IMAGE_BIN,
+  OUTPUT_IMAGE_BIN,
+  NUM_REQUIRED_ARGS,
+  DATA_FILE = NUM_REQUIRED_ARGS,
+  NUM_ARGS,
 } argnum;
 
 typedef struct _arguments {
-    char* args[NUM_ARGS];
-    int num_inputs;
-    int num_threads;
+  char *args[NUM_ARGS];
+  int num_inputs;
+  int num_threads;
 } arguments;
 
 static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n";
 static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary";
 static struct argp_option options[] = {
-    { "num-inputs", 'n', "N", 0, "Number of input images" }, { 0 },
-    { "data-file", 'f', "F", 0,
-      "File to read data and weights from (if data-init-mode == READ_FILE or "
-      "save-params is true). *.txt files are decoded as text files, while "
-      "*.bin files are decoded as binary files." },
+    {"num-inputs", 'n', "N", 0, "Number of input images"},
+    {"data-file", 'f', "F", 0,
+     "File to read data and weights from (if data-init-mode == READ_FILE or "
+     "save-params is true). *.txt files are decoded as text files, while "
+     "*.bin files are decoded as binary files."},
+    {0}, // all-zero entry ends the table for argp, so it must come last
 };
 
-static error_t parse_opt(int key, char* arg, struct argp_state* state) {
-    arguments* args = (arguments*)(state->input);
-    switch (key) {
-        case 'n': {
-            args->num_inputs = strtol(arg, NULL, 10);
-            break;
-        }
-        case 'f': {
-            args->args[DATA_FILE] = arg;
-            break;
-        }
-        case 't': {
-            args->num_threads = strtol(arg, NULL, 10);
-            break;
-        }
-        case ARGP_KEY_ARG: {
-            if (state->arg_num >= NUM_REQUIRED_ARGS)
-                argp_usage(state);
-            args->args[state->arg_num] = arg;
-            break;
-        }
-        case ARGP_KEY_END: {
-            if (state->arg_num < NUM_REQUIRED_ARGS) {
-                fprintf(stderr,
-                        "Not enough arguments! Got %d, require %d.\n",
-                        state->arg_num,
-                        NUM_REQUIRED_ARGS);
-                argp_usage(state);
-            }
-            break;
-        }
-        default:
-            return ARGP_ERR_UNKNOWN;
+static error_t parse_opt(int key, char *arg, struct argp_state *state) {
+  arguments *args = (arguments *)(state->input);
+  switch (key) {
+  case 'n': {
+    args->num_inputs = strtol(arg, NULL, 10);
+    break;
+  }
+  case 'f': {
+    args->args[DATA_FILE] = arg;
+    break;
+  }
+  case 't': {
+    args->num_threads = strtol(arg, NULL, 10);
+    break;
+  }
+  case ARGP_KEY_ARG: {
+    if (state->arg_num >= NUM_REQUIRED_ARGS)
+      argp_usage(state);
+    args->args[state->arg_num] = arg;
+    break;
+  }
+  case ARGP_KEY_END: {
+    if (state->arg_num < NUM_REQUIRED_ARGS) {
+      fprintf(stderr, "Not enough arguments! Got %d, require %d.\n",
+              state->arg_num, NUM_REQUIRED_ARGS);
+      argp_usage(state);
     }
-    return 0;
+    break;
+  }
+  default:
+    return ARGP_ERR_UNKNOWN;
+  }
+  return 0;
 }
 
-void set_default_args(arguments* args) {
-    args->num_inputs = 1;
-    args->num_threads = 0;
-    for (int i = 0; i < NUM_ARGS; i++) {
-        args->args[i] = NULL;
-    }
+void set_default_args(arguments *args) {
+  args->num_inputs = 1;
+  args->num_threads = 0;
+  for (int i = 0; i < NUM_ARGS; i++) {
+    args->args[i] = NULL;
+  }
 }
 
-static struct argp parser = { options, parse_opt, args_doc, prog_doc };
+static struct argp parser = {options, parse_opt, args_doc, prog_doc};
 
 // Helper function for printing intermediate results
-void descale_cpu(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
-  
+void descale_cpu(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
+
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
       for (int col = 0; col < col_size; col++) {
-        int index = (chan*row_size + row) * col_size + col;
+        int index = (chan * row_size + row) * col_size + col;
         output[index] = min(max(input[index] * 255, 0), 255);
       }
 }
 
 static void sort(float arr[], int n) {
-    int i, j;
-    for (i = 0; i < n - 1; i++)
-        for (j = 0; j < n - i - 1; j++)
-            if (arr[j] > arr[j + 1]) {
-                float temp = arr[j];
-                arr[j] = arr[j + 1];
-                arr[j + 1] = temp;
-            }
+  int i, j;
+  for (i = 0; i < n - 1; i++)
+    for (j = 0; j < n - i - 1; j++)
+      if (arr[j] > arr[j + 1]) {
+        float temp = arr[j];
+        arr[j] = arr[j + 1];
+        arr[j + 1] = temp;
+      }
 }
 
 /**************************************************************/
@@ -140,256 +152,259 @@ static void sort(float arr[], int n) {
 // In this benchmark, no use of HPVM query intrinsics in the leaf node functions
 
 // Leaf HPVM node function for scale
-void scale_fxp(uint8_t *input, size_t bytes_input, 
-               float *output, size_t bytes_output,
-               size_t row_size, size_t col_size) {
+void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
+               size_t bytes_output, size_t row_size, size_t col_size) {
 
-  //Specifies compilation target for current node
-  __visc__hint(CPU_TARGET);
+  // Specifies compilation target for current node
+  __hpvm__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
-  __visc__attributes(2, input, output, 1, output);
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+  __hpvm__attributes(2, input, output, 1, output);
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++){
-        int index = (chan*row_size + row) * col_size + col;
-        output[index] = input[index] * 1.0 / 255;
-      }
-  __visc__return(1, bytes_output);
+    //    for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      output[index] = input[index] * 1.0 / 255;
+    }
+  __hpvm__return(1, bytes_output);
 }
 
 // Leaf HPVM node function for descale
-void descale_fxp(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, output, 1, output);
-  
+void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, output, 1, output);
+
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
       for (int col = 0; col < col_size; col++) {
-        int index = (chan*row_size + row) * col_size + col;
+        int index = (chan * row_size + row) * col_size + col;
         output[index] = min(max(input[index] * 255, 0), 255);
       }
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 // Leaf HPVM node function for demosaicing
-void demosaic_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  size_t row_size, size_t col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, result, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
-//  for (int row = 1; row < row_size - 1; row++)
-    for (int col = 1; col < col_size - 1; col++) {
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        if (row % 2 == 0 && col % 2 == 0) {
-            // Green pixel
-            // Getting the R values
-            float R1 = input[index_0 - 1];
-            float R2 = input[index_0 + 1];
-            // Getting the B values
-            float B1 = input[index_2 - col_size];
-            float B2 = input[index_2 + col_size];
-            // R
-            result[index_0] = (R1 + R2) / 2;
-            // G
-            result[index_1] = input[index_1] * 2;
-            // B
-            result[index_2] = (B1 + B2) / 2;
-        } else if (row % 2 == 0 && col % 2 == 1) {
-            // Red pixel
-            // Getting the G values
-            float G1 = input[index_1 - col_size];
-            float G2 = input[index_1 + col_size];
-            float G3 = input[index_1 - 1];
-            float G4 = input[index_1 + 1];
-            // Getting the B values
-            float B1 = input[index_2 - col_size - 1];
-            float B2 = input[index_2 - col_size + 1];
-            float B3 = input[index_2 + col_size - 1];
-            float B4 = input[index_2 + col_size + 1];
-            // R
-            result[index_0] = input[index_0];
-            // G
-            result[index_1] = (G1 + G2 + G3 + G4) / 2;
-            // B (center pixel)
-            result[index_2] = (B1 + B2 + B3 + B4) / 4;
-        } else if (row % 2 == 1 && col % 2 == 0) {
-            // Blue pixel
-            // Getting the R values
-            float R1 = input[index_0 - col_size - 1];
-            float R2 = input[index_0 + col_size - 1];
-            float R3 = input[index_0 - col_size + 1];
-            float R4 = input[index_0 + col_size + 1];
-            // Getting the G values
-            float G1 = input[index_1 - col_size];
-            float G2 = input[index_1 + col_size];
-            float G3 = input[index_1 - 1];
-            float G4 = input[index_1 + 1];
-            // R
-            result[index_0] = (R1 + R2 + R3 + R4) / 4;
-            // G
-            result[index_1] = (G1 + G2 + G3 + G4) / 2;
-            // B
-            result[index_2] = input[index_2];
-        } else {
-            // Bottom Green pixel
-            // Getting the R values
-            float R1 = input[index_0 - col_size];
-            float R2 = input[index_0 + col_size];
-            // Getting the B values
-            float B1 = input[index_2 - 1];
-            float B2 = input[index_2 + 1];
-            // R
-            result[index_0] = (R1 + R2) / 2;
-            // G
-            result[index_1] = input[index_1] * 2;
-            // B
-            result[index_2] = (B1 + B2) / 2;
-        }
-      }
-  __visc__return(1, bytes_result);
+void demosaic_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, size_t row_size, size_t col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, result, 1, result);
+
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
+  //  for (int row = 1; row < row_size - 1; row++)
+  for (int col = 1; col < col_size - 1; col++) {
+    int index_0 = (0 * row_size + row) * col_size + col;
+    int index_1 = (1 * row_size + row) * col_size + col;
+    int index_2 = (2 * row_size + row) * col_size + col;
+    if (row % 2 == 0 && col % 2 == 0) {
+      // Green pixel
+      // Getting the R values
+      float R1 = input[index_0 - 1];
+      float R2 = input[index_0 + 1];
+      // Getting the B values
+      float B1 = input[index_2 - col_size];
+      float B2 = input[index_2 + col_size];
+      // R
+      result[index_0] = (R1 + R2) / 2;
+      // G
+      result[index_1] = input[index_1] * 2;
+      // B
+      result[index_2] = (B1 + B2) / 2;
+    } else if (row % 2 == 0 && col % 2 == 1) {
+      // Red pixel
+      // Getting the G values
+      float G1 = input[index_1 - col_size];
+      float G2 = input[index_1 + col_size];
+      float G3 = input[index_1 - 1];
+      float G4 = input[index_1 + 1];
+      // Getting the B values
+      float B1 = input[index_2 - col_size - 1];
+      float B2 = input[index_2 - col_size + 1];
+      float B3 = input[index_2 + col_size - 1];
+      float B4 = input[index_2 + col_size + 1];
+      // R
+      result[index_0] = input[index_0];
+      // G
+      result[index_1] = (G1 + G2 + G3 + G4) / 2;
+      // B (center pixel)
+      result[index_2] = (B1 + B2 + B3 + B4) / 4;
+    } else if (row % 2 == 1 && col % 2 == 0) {
+      // Blue pixel
+      // Getting the R values
+      float R1 = input[index_0 - col_size - 1];
+      float R2 = input[index_0 + col_size - 1];
+      float R3 = input[index_0 - col_size + 1];
+      float R4 = input[index_0 + col_size + 1];
+      // Getting the G values
+      float G1 = input[index_1 - col_size];
+      float G2 = input[index_1 + col_size];
+      float G3 = input[index_1 - 1];
+      float G4 = input[index_1 + 1];
+      // R
+      result[index_0] = (R1 + R2 + R3 + R4) / 4;
+      // G
+      result[index_1] = (G1 + G2 + G3 + G4) / 2;
+      // B
+      result[index_2] = input[index_2];
+    } else {
+      // Bottom Green pixel
+      // Getting the R values
+      float R1 = input[index_0 - col_size];
+      float R2 = input[index_0 + col_size];
+      // Getting the B values
+      float B1 = input[index_2 - 1];
+      float B2 = input[index_2 + 1];
+      // R
+      result[index_0] = (R1 + R2) / 2;
+      // G
+      result[index_1] = input[index_1] * 2;
+      // B
+      result[index_2] = (B1 + B2) / 2;
+    }
+  }
+  __hpvm__return(1, bytes_result);
 }
 
 // Leaf HPVM node function for denoise
-void denoise_fxp(float *input, size_t bytes_input, 
-                 float *result, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+void denoise_fxp(float *input, size_t bytes_input, float *result,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
+
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++)
-        if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
-          float filter[9];
-          for (int i = -1; i < 2; i++)
-            for (int j = -1; j < 2; j++) {
-              int index = ((i+row) - row + 1) * 3 + (j+col) - col + 1;
-              filter[index] = input[(chan * row_size + (i + row)) * col_size + (j + col)];
-            }
-          sort(filter, 9);
-          result[(chan * row_size + row) * col_size + col] = filter[4];
-        } else {
-      result[(chan * row_size + row) * col_size + col] = input[(chan * row_size + row) * col_size + col];
-        }
-  __visc__return(1, bytes_result);
+    //    for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++)
+      if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
+        float filter[9];
+        for (int i = -1; i < 2; i++)
+          for (int j = -1; j < 2; j++) {
+            int index = ((i + row) - row + 1) * 3 + (j + col) - col + 1;
+            filter[index] =
+                input[(chan * row_size + (i + row)) * col_size + (j + col)];
+          }
+        sort(filter, 9);
+        result[(chan * row_size + row) * col_size + col] = filter[4];
+      } else {
+        result[(chan * row_size + row) * col_size + col] =
+            input[(chan * row_size + row) * col_size + col];
+      }
+  __hpvm__return(1, bytes_result);
 }
 
 // Leaf HPVM node function, for color map and white balance transform
-void transform_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                    size_t row_size, size_t col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(3, input, result, TsTw_tran, 1, result);
+
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++) {
-        int index = (chan * row_size + row) * col_size + col;
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        int index_2d_0 = 0 * CHAN_SIZE + chan;
-        int index_2d_1 = 1 * CHAN_SIZE + chan;
-        int index_2d_2 = 2 * CHAN_SIZE + chan;
-        result[index] =
-            max(input[index_0] * TsTw_tran[index_2d_0] +
-                input[index_1] * TsTw_tran[index_2d_1] +
-                input[index_2] * TsTw_tran[index_2d_2],
-                0);
-      }
-  __visc__return(1, bytes_result);
+    //    for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      int index_0 = (0 * row_size + row) * col_size + col;
+      int index_1 = (1 * row_size + row) * col_size + col;
+      int index_2 = (2 * row_size + row) * col_size + col;
+      int index_2d_0 = 0 * CHAN_SIZE + chan;
+      int index_2d_1 = 1 * CHAN_SIZE + chan;
+      int index_2d_2 = 2 * CHAN_SIZE + chan;
+      result[index] = max(input[index_0] * TsTw_tran[index_2d_0] +
+                              input[index_1] * TsTw_tran[index_2d_1] +
+                              input[index_2] * TsTw_tran[index_2d_2],
+                          0);
+    }
+  __hpvm__return(1, bytes_result);
 }
 
 // Leaf HPVM node function, for gamut mapping
-void gamut_map_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights,
-                   float *coefs, size_t bytes_coefs,
-                   float *l2_dist, size_t bytes_l2_dist,
+void gamut_map_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights, float *coefs,
+                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                    size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist);
-
- // First, get the L2 norm from every pixel to the control points,
- // Then, sum it and weight it. Finally, add the bias.
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
-//  for (int row = 0; row < row_size; row++)
-    for (int col = 0; col < col_size; col++) {
-      float chan_val_0 = 0.0;
-      float chan_val_1 = 0.0;
-      float chan_val_2 = 0.0;
-      for (int cp = 0; cp < 3702; cp++) {
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); 
-        float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
-        float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
-        float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
-        float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); 
-        float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
-        float val = val1 * val2 + val3 * val4 + val5 * val6;
-        float sqrt_val = sqrt(val);
-        chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
-        chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
-        chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
-      }
-        chan_val_0 += coefs[0 * CHAN_SIZE + 0] + 
-                    coefs[1 * CHAN_SIZE + 0] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 0] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
-        chan_val_1 += coefs[0 * CHAN_SIZE + 1] + 
-                    coefs[1 * CHAN_SIZE + 1] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 1] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
-        chan_val_2 += coefs[0 * CHAN_SIZE + 2] + 
-                    coefs[1 * CHAN_SIZE + 2] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 2] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
-        result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
-        result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
-        result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2,
+                     result, l2_dist);
+
+  // First, get the L2 norm from every pixel to the control points,
+  // Then, sum it and weight it. Finally, add the bias.
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
+  //  for (int row = 0; row < row_size; row++)
+  for (int col = 0; col < col_size; col++) {
+    float chan_val_0 = 0.0;
+    float chan_val_1 = 0.0;
+    float chan_val_2 = 0.0;
+    for (int cp = 0; cp < 3702; cp++) {
+      int index_0 = (0 * row_size + row) * col_size + col;
+      int index_1 = (1 * row_size + row) * col_size + col;
+      int index_2 = (2 * row_size + row) * col_size + col;
+      float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
+      float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
+      float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
+      float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
+      float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
+      float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
+      float val = val1 * val2 + val3 * val4 + val5 * val6;
+      float sqrt_val = sqrt(val);
+      chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
+      chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
+      chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
     }
-  __visc__return(1, bytes_result);
+    chan_val_0 +=
+        coefs[0 * CHAN_SIZE + 0] +
+        coefs[1 * CHAN_SIZE + 0] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 0] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
+    chan_val_1 +=
+        coefs[0 * CHAN_SIZE + 1] +
+        coefs[1 * CHAN_SIZE + 1] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 1] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
+    chan_val_2 +=
+        coefs[0 * CHAN_SIZE + 2] +
+        coefs[1 * CHAN_SIZE + 2] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 2] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
+    result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
+    result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
+    result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
+  }
+  __hpvm__return(1, bytes_result);
 }
 
 // HPVM leaf node function, for tone mapping
-void tone_map_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                   size_t row_size, size_t col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(3, input, result, tone_map, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(3, input, result, tone_map, 1, result);
+
+  void *thisNode = __hpvm__getNode();
+  int row = __hpvm__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++) {
-        int index = (chan * row_size + row) * col_size + col;
-        uint8_t x = input[index] * 255;
-        result[index] = tone_map[x * CHAN_SIZE + chan];
-      }
-  __visc__return(1, bytes_result);
+    //    for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      uint8_t x = input[index] * 255;
+      result[index] = tone_map[x * CHAN_SIZE + chan];
+    }
+  __hpvm__return(1, bytes_result);
 }
 
 /********************************************************************/
@@ -400,185 +415,184 @@ void tone_map_fxp(float *input, size_t bytes_input,
 // requirement for the FPGA backend . The CPU backend also supports this,
 // so it does not cause a portability issue.
 
-void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
+void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result,
+                       size_t bytes_result, size_t row_size, size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
 
   // Create an 1D (specified by 1st argument) HPVM node with 1 dynamic
   // instance (last argument) associated with node function scale_fxp
-  void *ScaleNode = __visc__createNodeND(1, scale_fxp, row_size);
+  void *ScaleNode = __hpvm__createNodeND(1, scale_fxp, row_size);
 
   // Binds inputs of current node with specified node
   // - destination node
   // - argument position in argument list of function of source node
   // - argument position in argument list of function of destination node
   // - streaming (1) or non-streaming (0)
-  __visc__bindIn(ScaleNode, 0, 0, 0); // bind input
-  __visc__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(ScaleNode, 2, 2, 0); // bind result
-  __visc__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(ScaleNode, 4, 4, 0); // bind row_size
-  __visc__bindIn(ScaleNode, 5, 5, 0); // bind col_size
+  __hpvm__bindIn(ScaleNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(ScaleNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(ScaleNode, 4, 4, 0); // bind row_size
+  __hpvm__bindIn(ScaleNode, 5, 5, 0); // bind col_size
 
   // Similar to bindIn, but for the output. Output of a node is a struct, and
   // we consider the fields in increasing ordering.
-  __visc__bindOut(ScaleNode, 0, 0, 0);
+  __hpvm__bindOut(ScaleNode, 0, 0, 0);
 }
 
-void descale_fxp_wrapper(float *input, size_t bytes_input, 
-                       uint8_t *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
-  void *DescaleNode = __visc__createNodeND(1, descale_fxp, row_size);
-  __visc__bindIn(DescaleNode, 0, 0, 0); // bind input
-  __visc__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(DescaleNode, 2, 2, 0); // bind result
-  __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size
-  __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size
-  
-  __visc__bindOut(DescaleNode, 0, 0, 0);
+void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result,
+                         size_t bytes_result, size_t row_size,
+                         size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
+  void *DescaleNode = __hpvm__createNodeND(1, descale_fxp, row_size);
+  __hpvm__bindIn(DescaleNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(DescaleNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(DescaleNode, 4, 4, 0); // bind row_size
+  __hpvm__bindIn(DescaleNode, 5, 5, 0); // bind col_size
+
+  __hpvm__bindOut(DescaleNode, 0, 0, 0);
 }
 
-void demosaic_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
-  void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, row_size);
-  __visc__bindIn(DemosaicNode, 0, 0, 0); // bind input
-  __visc__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(DemosaicNode, 2, 2, 0); // bind result
-  __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(DemosaicNode, 4, 4, 0); // bind row_size
-  __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size
-  
-  __visc__bindOut(DemosaicNode, 0, 0, 0);
+void demosaic_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                          size_t bytes_result, size_t row_size,
+                          size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
+  void *DemosaicNode = __hpvm__createNodeND(1, demosaic_fxp, row_size);
+  __hpvm__bindIn(DemosaicNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(DemosaicNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(DemosaicNode, 4, 4, 0); // bind row_size
+  __hpvm__bindIn(DemosaicNode, 5, 5, 0); // bind col_size
+
+  __hpvm__bindOut(DemosaicNode, 0, 0, 0);
 }
 
-void denoise_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, input, result, 1, result);
-  void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, row_size);
-  __visc__bindIn(DenoiseNode, 0, 0, 0); // bind input
-  __visc__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(DenoiseNode, 2, 2, 0); // bind result
-  __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size
-  __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size
-  
-  __visc__bindOut(DenoiseNode, 0, 0, 0);
+void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                         size_t bytes_result, size_t row_size,
+                         size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, input, result, 1, result);
+  void *DenoiseNode = __hpvm__createNodeND(1, denoise_fxp, row_size);
+  __hpvm__bindIn(DenoiseNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(DenoiseNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(DenoiseNode, 4, 4, 0); // bind row_size
+  __hpvm__bindIn(DenoiseNode, 5, 5, 0); // bind col_size
+
+  __hpvm__bindOut(DenoiseNode, 0, 0, 0);
 }
 
-void transform_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *TsTw_tran, size_t bytes_TsTw,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  void *TransformNode = __visc__createNodeND(1, transform_fxp, row_size);
-  __visc__bindIn(TransformNode, 0, 0, 0); // bind input
-  __visc__bindIn(TransformNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(TransformNode, 2, 2, 0); // bind result
-  __visc__bindIn(TransformNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(TransformNode, 4, 4, 0); // bind tstw
-  __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw
-  __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size
-  __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size
-  
-  __visc__bindOut(TransformNode, 0, 0, 0);
+void transform_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                           size_t bytes_result, float *TsTw_tran,
+                           size_t bytes_TsTw, size_t row_size,
+                           size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(3, input, result, TsTw_tran, 1, result);
+  void *TransformNode = __hpvm__createNodeND(1, transform_fxp, row_size);
+  __hpvm__bindIn(TransformNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(TransformNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(TransformNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(TransformNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(TransformNode, 4, 4, 0); // bind tstw
+  __hpvm__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw
+  __hpvm__bindIn(TransformNode, 6, 6, 0); // bind row_size
+  __hpvm__bindIn(TransformNode, 7, 7, 0); // bind col_size
+
+  __hpvm__bindOut(TransformNode, 0, 0, 0);
 }
 
-void gamut_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *ctrl_pts, size_t bytes_ctrl_pts,
-                       float *weights, size_t bytes_weights,
-                       float *coefs, size_t bytes_coefs,
-                       float *l2_dist, size_t bytes_l2_dist,
-                       size_t row_size, size_t col_size) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
-  void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, row_size);
-  __visc__bindIn(GamutNode, 0, 0, 0); // bind input
-  __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(GamutNode, 2, 2, 0); // bind result
-  __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts
-  __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts
-  __visc__bindIn(GamutNode, 6, 6, 0); // bind weights
-  __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights
-  __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs
-  __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs
-  __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist
-  __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist
-  __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size
-  __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size
-  
-  __visc__bindOut(GamutNode, 0, 0, 0);
+void gamut_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                       size_t bytes_result, float *ctrl_pts,
+                       size_t bytes_ctrl_pts, float *weights,
+                       size_t bytes_weights, float *coefs, size_t bytes_coefs,
+                       float *l2_dist, size_t bytes_l2_dist, size_t row_size,
+                       size_t col_size) {
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
+                     result);
+  void *GamutNode = __hpvm__createNodeND(1, gamut_map_fxp, row_size);
+  __hpvm__bindIn(GamutNode, 0, 0, 0);   // bind input
+  __hpvm__bindIn(GamutNode, 1, 1, 0);   // bind bytes_input
+  __hpvm__bindIn(GamutNode, 2, 2, 0);   // bind result
+  __hpvm__bindIn(GamutNode, 3, 3, 0);   // bind bytes_result
+  __hpvm__bindIn(GamutNode, 4, 4, 0);   // bind ctrl_pts
+  __hpvm__bindIn(GamutNode, 5, 5, 0);   // bind bytes_ctrl_pts
+  __hpvm__bindIn(GamutNode, 6, 6, 0);   // bind weights
+  __hpvm__bindIn(GamutNode, 7, 7, 0);   // bind bytes_weights
+  __hpvm__bindIn(GamutNode, 8, 8, 0);   // bind coefs
+  __hpvm__bindIn(GamutNode, 9, 9, 0);   // bind bytes_coefs
+  __hpvm__bindIn(GamutNode, 10, 10, 0); // bind l2_dist
+  __hpvm__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist
+  __hpvm__bindIn(GamutNode, 12, 12, 0); // bind row_size
+  __hpvm__bindIn(GamutNode, 13, 13, 0); // bind col_size
+
+  __hpvm__bindOut(GamutNode, 0, 0, 0);
 }
-void tone_map_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *tone_map, size_t bytes_tone_map,
-                       size_t row_size, size_t col_size) {
-
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(3, input, result, tone_map, 1, result);
-  void *ToneMapNode = __visc__createNodeND(1, tone_map_fxp, row_size);
-  __visc__bindIn(ToneMapNode, 0, 0, 0); // bind input
-  __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result
-  __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map 
-  __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map
-  __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size
-  __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size
-  
-  __visc__bindOut(ToneMapNode, 0, 0, 0);
+void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                          size_t bytes_result, float *tone_map,
+                          size_t bytes_tone_map, size_t row_size,
+                          size_t col_size) {
+
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(3, input, result, tone_map, 1, result);
+  void *ToneMapNode = __hpvm__createNodeND(1, tone_map_fxp, row_size);
+  __hpvm__bindIn(ToneMapNode, 0, 0, 0); // bind input
+  __hpvm__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
+  __hpvm__bindIn(ToneMapNode, 2, 2, 0); // bind result
+  __hpvm__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result
+  __hpvm__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map
+  __hpvm__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map
+  __hpvm__bindIn(ToneMapNode, 6, 6, 0); // bind row_size
+  __hpvm__bindIn(ToneMapNode, 7, 7, 0); // bind col_size
+
+  __hpvm__bindOut(ToneMapNode, 0, 0, 0);
 }
 
-
 /*** ROOT Node - Top Level of the Graph Hierarchy ***/
-void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input, 
-                 /*2*/ uint8_t *result,        /*3*/ size_t bytes_result,
-                 /*4*/ float *input_scaled,    /*5*/ size_t bytes_input_scaled,
-                 /*6*/ float *result_scaled,   /*7*/ size_t bytes_result_scaled,
-                 /*8*/ float *demosaic_out,    /*9*/ size_t bytes_demosaic_out,
-                 /*10*/ float *denoise_out,    /*11*/ size_t bytes_denoise_out,
-                 /*12*/ float *transform_out,  /*13*/ size_t bytes_transform_out,
-                 /*14*/ float *gamut_out,      /*15*/ size_t bytes_gamut_out,
-                 /*16*/ float *TsTw,           /*17*/ size_t bytes_TsTw,
-                 /*18*/ float *ctrl_pts,       /*19*/ size_t bytes_ctrl_pts,
-                 /*20*/ float *weights,        /*21*/ size_t bytes_weights,
-                 /*22*/ float*coefs,           /*23*/ size_t bytes_coefs,
-                 /*24*/ float *l2_dist,        /*25*/ size_t bytes_l2_dist,
-                 /*26*/ float *tone_map,       /*27*/ size_t bytes_tone_map,
-                 /*28*/ size_t row_size,          /*29*/ size_t col_size) {
-
-  //Specifies compilation target for current node
-    __visc__hint(CPU_TARGET);
+void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input,
+                 /*2*/ uint8_t *result, /*3*/ size_t bytes_result,
+                 /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled,
+                 /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled,
+                 /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out,
+                 /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out,
+                 /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out,
+                 /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out,
+                 /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw,
+                 /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts,
+                 /*20*/ float *weights, /*21*/ size_t bytes_weights,
+                 /*22*/ float *coefs, /*23*/ size_t bytes_coefs,
+                 /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist,
+                 /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map,
+                 /*28*/ size_t row_size, /*29*/ size_t col_size) {
+
+  // Specifies compilation target for current node
+  __hpvm__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
-    __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, 
-                       transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, 
-                       5, result, demosaic_out, denoise_out, transform_out, gamut_out);
+  __hpvm__attributes(14, input, result, input_scaled, result_scaled,
+                     demosaic_out, denoise_out, transform_out, gamut_out, TsTw,
+                     ctrl_pts, weights, coefs, tone_map, l2_dist, 5, result,
+                     demosaic_out, denoise_out, transform_out, gamut_out);
 
   // Create an 0D (specified by 1st argument) HPVM node - so a single node
   // associated with node function ---_fxp_wrapper
-    void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper);
-    void* DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper);
-    void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper);
-    void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper);
-    void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper);
-    void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper);
-    void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper);
-    
+  void *ScNode = __hpvm__createNodeND(0, scale_fxp_wrapper);
+  void *DmNode = __hpvm__createNodeND(0, demosaic_fxp_wrapper);
+  void *DnNode = __hpvm__createNodeND(0, denoise_fxp_wrapper);
+  void *TrNode = __hpvm__createNodeND(0, transform_fxp_wrapper);
+  void *GmNode = __hpvm__createNodeND(0, gamut_fxp_wrapper);
+  void *TnNode = __hpvm__createNodeND(0, tone_map_fxp_wrapper);
+  void *DsNode = __hpvm__createNodeND(0, descale_fxp_wrapper);
+
   // BindIn binds inputs of current node with specified node
   // - destination node
   // - argument position in argument list of function of source node
@@ -592,268 +606,281 @@ void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input,
   // - destination position (in argument list of destination node)
   // - streaming (1) or non-streaming (0)
 
-    // scale_fxp inputs
-    __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input
-    __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
-    __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result
-    __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result
-    __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
-    __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
-
-    // demosaic_fxp inputs
-    __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
-    __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // SCNode:bytes_result -> DmNode:bytes_input
-    __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result
-    __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result
-    __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size 
-    __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
-
-    // denoise_fxp inputs
-    __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
-    __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DMNode:bytes_result -> DnNode:bytes_input
-    __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
-    __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
-    __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size 
-    __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
-    
-    // transform_fxp inputs
-    __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
-    __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input
-    __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
-    __visc__bindIn(TrNode, 13, 3, 0); // bytes_result_scaled -> TrNode:bytes_result
-    __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann
-    __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
-    __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size 
-    __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
-    
-    // gamut_fxp inputs
-    __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
-    __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input
-    __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
-    __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
-    __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
-    __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
-    __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
-    __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
-    __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
-    __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
-    __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
-    __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
-    __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size 
-    __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
-    
-    // tone_map_fxp inputs
-    __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
-    __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input
-    __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
-    __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
-    __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
-    __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
-    __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size 
-    __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
-
-    // descale_fxp inputs
-    __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
-    __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input
-    __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result
-    __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result
-    __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
-    __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
+  // scale_fxp inputs
+  __hpvm__bindIn(ScNode, 0, 0, 0);  // input -> ScNode:input
+  __hpvm__bindIn(ScNode, 1, 1, 0);  // bytes_input -> ScNode:bytes_input
+  __hpvm__bindIn(ScNode, 4, 2, 0);  // input_scaled -> ScNode:result
+  __hpvm__bindIn(ScNode, 5, 3, 0);  // bytes_input_scaled -> ScNode:bytes_result
+  __hpvm__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
+  __hpvm__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
+
+  // demosaic_fxp inputs
+  __hpvm__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
+  __hpvm__edge(ScNode, DmNode, 1, 0, 1,
+               0);                  // ScNode:bytes_result -> DmNode:bytes_input
+  __hpvm__bindIn(DmNode, 8, 2, 0);  // demosaic_out -> DmNode:result
+  __hpvm__bindIn(DmNode, 9, 3, 0);  // bytes_demosaic_out -> DmNode:bytes_result
+  __hpvm__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size
+  __hpvm__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
+
+  // denoise_fxp inputs
+  __hpvm__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
+  __hpvm__edge(DmNode, DnNode, 1, 0, 1,
+               0);                  // DmNode:bytes_result -> DnNode:bytes_input
+  __hpvm__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
+  __hpvm__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
+  __hpvm__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size
+  __hpvm__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
+
+  // transform_fxp inputs
+  __hpvm__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
+  __hpvm__edge(DnNode, TrNode, 1, 0, 1,
+               0);                  // DnNode:bytes_result -> TrNode:bytes_input
+  __hpvm__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
+  __hpvm__bindIn(TrNode, 13, 3,
+                 0); // bytes_transform_out -> TrNode:bytes_result
+  __hpvm__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann
+  __hpvm__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
+  __hpvm__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size
+  __hpvm__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
+
+  // gamut_fxp inputs
+  __hpvm__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
+  __hpvm__edge(TrNode, GmNode, 1, 0, 1,
+               0);                  // TrNode:bytes_result -> GmNode:bytes_input
+  __hpvm__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
+  __hpvm__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
+  __hpvm__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
+  __hpvm__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
+  __hpvm__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
+  __hpvm__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
+  __hpvm__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
+  __hpvm__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
+  __hpvm__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
+  __hpvm__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
+  __hpvm__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size
+  __hpvm__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
+
+  // tone_map_fxp inputs
+  __hpvm__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
+  __hpvm__edge(GmNode, TnNode, 1, 0, 1,
+               0);                 // GmNode:bytes_result -> TnNode:bytes_input
+  __hpvm__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
+  __hpvm__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
+  __hpvm__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
+  __hpvm__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
+  __hpvm__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size
+  __hpvm__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
+
+  // descale_fxp inputs
+  __hpvm__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
+  __hpvm__edge(TnNode, DsNode, 1, 0, 1,
+               0);                  // TnNode:bytes_result -> DsNode:bytes_input
+  __hpvm__bindIn(DsNode, 2, 2, 0);  // result -> DsNode:result
+  __hpvm__bindIn(DsNode, 3, 3, 0);  // bytes_result -> DsNode:bytes_result
+  __hpvm__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
+  __hpvm__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
 
   // Similar to bindIn, but for the output. Output of a node is a struct, and
   // we consider the fields in increasing ordering.
-    __visc__bindOut(DsNode, 0, 0, 0);
-    
+  __hpvm__bindOut(DsNode, 0, 0, 0);
 }
 
-int main(int argc, char* argv[]) {
-    // Parse the arguments.
-    arguments args;
-    set_default_args(&args);
-    argp_parse(&parser, argc, argv, 0, 0, &args);
-
-    // Read a raw image.
-    // NOTE: We deliberately perform this file I/O outside of the kernel.
-    printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
-    size_t row_size, col_size;
-    uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
-
-    printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
-
-    // Allocate a buffer for storing the output image data.
-    // (This is currently the same size as the input image data.)
-    size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
-    size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
-    uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image);
-
-    __visc__init();
-
-    ///////////////////////////////////////////////////////////////
-    // Camera Model Parameters
-    ///////////////////////////////////////////////////////////////
-    // Path to the camera model to be used
-//    char cam_model_path[100];
-//    char cam_model_path = "cam_models/NikonD7000/";
-    // White balance index (select white balance from transform file)
-    // The first white balance in the file has a wb_index of 1
-    // For more information on model format see the readme
-    int wb_index = 6;
-
-    // Number of control points
-    int num_ctrl_pts = 3702;
-    uint8_t *input, *result;
-    float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out;
-    float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
-
-    TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
-    float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
-    free(TsTw);
-    TsTw = trans;
-    ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
-    weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
-    coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
-    tone_map = get_tone_map("cam_models/NikonD7000/");
-    
-    input_scaled = (float*) malloc_aligned(bytes_fimage);
-    result_scaled = (float*) malloc_aligned(bytes_fimage);
-    demosaic_out = (float*) malloc_aligned(bytes_fimage);
-    denoise_out = (float*) malloc_aligned(bytes_fimage);
-    transform_out  = (float*) malloc_aligned(bytes_fimage);
-    gamut_out = (float*) malloc_aligned(bytes_fimage);
-    l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);    
-    
-    // This is host_input in cam_pipe()
-    input = (uint8_t*) malloc_aligned(bytes_image);
-    convert_hwc_to_chw(image_in, row_size, col_size, &input);
-    
-    // This is host_result in cam_pipe()
-    result = (uint8_t*) malloc_aligned(bytes_image);
-
-    // Allocate struct to pass DFG inputs
-    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
-
-    // Set up HPVM DFG inputs in the rootArgs struct.
-    rootArgs->input = input;
-    rootArgs->bytes_input = bytes_image;
-    
-    rootArgs->result = result;
-    rootArgs->bytes_result = bytes_image;
-    
-    rootArgs->input_scaled = input_scaled;
-    rootArgs->bytes_input_scaled = bytes_fimage;
-    
-    rootArgs->result_scaled = result_scaled;
-    rootArgs->bytes_result_scaled = bytes_fimage;
-    
-    rootArgs->demosaic_out = demosaic_out;
-    rootArgs->bytes_demosaic_out = bytes_fimage;
-    
-    rootArgs->denoise_out = denoise_out;
-    rootArgs->bytes_denoise_out = bytes_fimage;
-    
-    rootArgs->transform_out = transform_out;
-    rootArgs->bytes_transform_out = bytes_fimage;
-
-    rootArgs->gamut_out = gamut_out;
-    rootArgs->bytes_gamut_out = bytes_fimage;
-
-    rootArgs->TsTw = TsTw;
-    rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->ctrl_pts = ctrl_pts;
-    rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->weights = weights;
-    rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->coefs = coefs;
-    rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->tone_map = tone_map;
-    rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->l2_dist = l2_dist;
-    rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
-    
-    rootArgs->row_size = row_size;
-    rootArgs->col_size = col_size;
-
-    // Memory tracking is required for pointer arguments.
-    // Nodes can be scheduled on different targets, and 
-    // dataflow edge implementation needs to request data.
-    // The pair (pointer, size) is inserted in memory tracker using this call
-    llvm_visc_track_mem(input, bytes_image);
-    llvm_visc_track_mem(result, bytes_image);
-    llvm_visc_track_mem(input_scaled, bytes_fimage);
-    llvm_visc_track_mem(result_scaled, bytes_fimage);
-    llvm_visc_track_mem(demosaic_out, bytes_fimage);
-    llvm_visc_track_mem(denoise_out, bytes_fimage);
-    llvm_visc_track_mem(transform_out, bytes_fimage);
-    llvm_visc_track_mem(gamut_out, bytes_fimage);
-    llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); 
-    llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float));
-    llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
-    
-    printf("\n\nLaunching CAVA pipeline!\n");
-
-    void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs);
-    __visc__wait(camPipeDFG);
-
-    printf("\n\nPipeline execution completed!\n");
-    printf("\n\nRequesting memory!\n");
-
-    // Request data from graph.    
-    llvm_visc_request_mem(result, bytes_image);
-    llvm_visc_request_mem(demosaic_out, bytes_fimage);
-    llvm_visc_request_mem(denoise_out, bytes_fimage);
-    llvm_visc_request_mem(transform_out, bytes_fimage);
-    llvm_visc_request_mem(gamut_out, bytes_fimage);
-    printf("\n\nDone requesting memory!\n");
-
-
-    uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-  uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    
-  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size);
-    
-    convert_chw_to_hwc(result, row_size, col_size, &image_out);
-   convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
-    convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic);
-    convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise);
-    convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform);
-
-    
-    // Remove tracked pointers.
-    llvm_visc_untrack_mem(input);
-    llvm_visc_untrack_mem(result);
-    llvm_visc_untrack_mem(input_scaled);
-    llvm_visc_untrack_mem(result_scaled);
-    llvm_visc_untrack_mem(demosaic_out);
-    llvm_visc_untrack_mem(denoise_out);
-    llvm_visc_untrack_mem(transform_out);
-    llvm_visc_untrack_mem(gamut_out);
-    
-    llvm_visc_untrack_mem(TsTw); 
-    llvm_visc_untrack_mem(ctrl_pts);
-    llvm_visc_untrack_mem(weights);
-    llvm_visc_untrack_mem(coefs);
-    llvm_visc_untrack_mem(tone_map);
-    llvm_visc_untrack_mem(l2_dist);
-
-    // Output the image.
-    // NOTE: We deliberately perform this file I/O outside of the kernel.
+int main(int argc, char *argv[]) {
+  // Parse the arguments.
+  arguments args;
+  set_default_args(&args);
+  argp_parse(&parser, argc, argv, 0, 0, &args);
+
+  // Read a raw image.
+  // NOTE: We deliberately perform this file I/O outside of the kernel.
+  printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
+  size_t row_size, col_size;
+  uint8_t *image_in =
+      read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
+
+  printf("Raw image shape: %zu x %zu x %d\n", row_size, col_size, CHAN_SIZE);
+
+  // Allocate a buffer for storing the output image data.
+  // (This is currently the same size as the input image data.)
+  size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
+  size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
+  uint8_t *image_out = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_gamut = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_demosaic = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_denoise = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_transform = (uint8_t *)malloc_aligned(bytes_image);
+
+  __hpvm__init();
+
+  ///////////////////////////////////////////////////////////////
+  // Camera Model Parameters
+  ///////////////////////////////////////////////////////////////
+  // Path to the camera model to be used
+  //    char cam_model_path[100];
+  //    char cam_model_path = "cam_models/NikonD7000/";
+  // White balance index (select white balance from transform file)
+  // The first white balance in the file has a wb_index of 1
+  // For more information on model format see the readme
+  int wb_index = 6;
+
+  // Number of control points
+  int num_ctrl_pts = 3702;
+  uint8_t *input, *result;
+  float *input_scaled, *result_scaled, *demosaic_out, *denoise_out,
+      *transform_out, *gamut_out;
+  float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
+
+  TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
+  float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
+  free(TsTw);
+  TsTw = trans;
+  ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
+  weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
+  coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
+  tone_map = get_tone_map("cam_models/NikonD7000/");
+
+  input_scaled = (float *)malloc_aligned(bytes_fimage);
+  result_scaled = (float *)malloc_aligned(bytes_fimage);
+  demosaic_out = (float *)malloc_aligned(bytes_fimage);
+  denoise_out = (float *)malloc_aligned(bytes_fimage);
+  transform_out = (float *)malloc_aligned(bytes_fimage);
+  gamut_out = (float *)malloc_aligned(bytes_fimage);
+  l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts);
+
+  // This is host_input in cam_pipe()
+  input = (uint8_t *)malloc_aligned(bytes_image);
+  convert_hwc_to_chw(image_in, row_size, col_size, &input);
+
+  // This is host_result in cam_pipe()
+  result = (uint8_t *)malloc_aligned(bytes_image);
+
+  // Allocate struct to pass DFG inputs
+  RootIn *rootArgs = (RootIn *)malloc(sizeof(RootIn));
+
+  // Set up HPVM DFG inputs in the rootArgs struct.
+  rootArgs->input = input;
+  rootArgs->bytes_input = bytes_image;
+
+  rootArgs->result = result;
+  rootArgs->bytes_result = bytes_image;
+
+  rootArgs->input_scaled = input_scaled;
+  rootArgs->bytes_input_scaled = bytes_fimage;
+
+  rootArgs->result_scaled = result_scaled;
+  rootArgs->bytes_result_scaled = bytes_fimage;
+
+  rootArgs->demosaic_out = demosaic_out;
+  rootArgs->bytes_demosaic_out = bytes_fimage;
+
+  rootArgs->denoise_out = denoise_out;
+  rootArgs->bytes_denoise_out = bytes_fimage;
+
+  rootArgs->transform_out = transform_out;
+  rootArgs->bytes_transform_out = bytes_fimage;
+
+  rootArgs->gamut_out = gamut_out;
+  rootArgs->bytes_gamut_out = bytes_fimage;
+
+  rootArgs->TsTw = TsTw;
+  rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
+
+  rootArgs->ctrl_pts = ctrl_pts;
+  rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+
+  rootArgs->weights = weights;
+  rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+
+  rootArgs->coefs = coefs;
+  rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
+
+  rootArgs->tone_map = tone_map;
+  rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
+
+  rootArgs->l2_dist = l2_dist;
+  rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
+
+  rootArgs->row_size = row_size;
+  rootArgs->col_size = col_size;
+
+  // Memory tracking is required for pointer arguments.
+  // Nodes can be scheduled on different targets, and
+  // dataflow edge implementation needs to request data.
+  // The pair (pointer, size) is inserted in memory tracker using this call
+  llvm_hpvm_track_mem(input, bytes_image);
+  llvm_hpvm_track_mem(result, bytes_image);
+  llvm_hpvm_track_mem(input_scaled, bytes_fimage);
+  llvm_hpvm_track_mem(result_scaled, bytes_fimage);
+  llvm_hpvm_track_mem(demosaic_out, bytes_fimage);
+  llvm_hpvm_track_mem(denoise_out, bytes_fimage);
+  llvm_hpvm_track_mem(transform_out, bytes_fimage);
+  llvm_hpvm_track_mem(gamut_out, bytes_fimage);
+  llvm_hpvm_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(coefs, 4 * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
+  llvm_hpvm_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
+
+  printf("\n\nLaunching CAVA pipeline!\n");
+
+  void *camPipeDFG = __hpvm__launch(0, CamPipeRoot, (void *)rootArgs);
+  __hpvm__wait(camPipeDFG);
+
+  printf("\n\nPipeline execution completed!\n");
+  printf("\n\nRequesting memory!\n");
+
+  // Request data from graph.
+  llvm_hpvm_request_mem(result, bytes_image);
+  llvm_hpvm_request_mem(demosaic_out, bytes_fimage);
+  llvm_hpvm_request_mem(denoise_out, bytes_fimage);
+  llvm_hpvm_request_mem(transform_out, bytes_fimage);
+  llvm_hpvm_request_mem(gamut_out, bytes_fimage);
+  printf("\n\nDone requesting memory!\n");
+
+  uint8_t *gamut_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *demosaic_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *transform_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *denoise_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+
+  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image,
+              row_size, col_size);
+
+  convert_chw_to_hwc(result, row_size, col_size, &image_out);
+  convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
+  convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size,
+                     &image_out_demosaic);
+  convert_chw_to_hwc(denoise_out_descaled, row_size, col_size,
+                     &image_out_denoise);
+  convert_chw_to_hwc(transform_out_descaled, row_size, col_size,
+                     &image_out_transform);
+
+  // Remove tracked pointers.
+  llvm_hpvm_untrack_mem(input);
+  llvm_hpvm_untrack_mem(result);
+  llvm_hpvm_untrack_mem(input_scaled);
+  llvm_hpvm_untrack_mem(result_scaled);
+  llvm_hpvm_untrack_mem(demosaic_out);
+  llvm_hpvm_untrack_mem(denoise_out);
+  llvm_hpvm_untrack_mem(transform_out);
+  llvm_hpvm_untrack_mem(gamut_out);
+
+  llvm_hpvm_untrack_mem(TsTw);
+  llvm_hpvm_untrack_mem(ctrl_pts);
+  llvm_hpvm_untrack_mem(weights);
+  llvm_hpvm_untrack_mem(coefs);
+  llvm_hpvm_untrack_mem(tone_map);
+  llvm_hpvm_untrack_mem(l2_dist);
+
+  // Output the image.
+  // NOTE: We deliberately perform this file I/O outside of the kernel.
   char str[50], base_str[50];
   strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]);
   strcpy(str, base_str);
@@ -877,8 +904,7 @@ int main(int argc, char* argv[]) {
   printf("Writing output image to %s\n", str);
   write_image_to_binary(str, image_out_transform, row_size, col_size);
 
-    __visc__cleanup();
+  __hpvm__cleanup();
 
-    return 0;
+  return 0;
 }
-
diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.c b/hpvm/test/hpvm-cava/src/pipe_stages.c
index 2ebedec936915b5e7f11881c5001c84b6db26474..05bb06697fa8df130aa0d0d324f9bc39bc575fb2 100644
--- a/hpvm/test/hpvm-cava/src/pipe_stages.c
+++ b/hpvm/test/hpvm-cava/src/pipe_stages.c
@@ -1,172 +1,169 @@
-#include <stdio.h>
-#include <math.h>
 #include "pipe_stages.h"
 #include "cam_pipe_utility.h"
+#include <math.h>
+#include <stdio.h>
+
+// void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) {
+void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
+               size_t bytes_output, int row_size, int col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, output, 1, output);
 
-//void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) {
-void scale_fxp(uint8_t *input, size_t bytes_input, 
-               float *output, size_t bytes_output,
-               int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, output, 1, output);
-  
   ARRAY_3D(uint8_t, _input, input, row_size, col_size);
   ARRAY_3D(float, _output, output, row_size, col_size);
-  sl_chan:
+sl_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    sl_row:
+  sl_row:
     for (int row = 0; row < row_size; row++)
-      sl_col:
+    sl_col:
       for (int col = 0; col < col_size; col++)
         _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255;
 
-  __visc__return(1, bytes_output);
+  __hpvm__return(1, bytes_output);
 }
 
-//void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) {
-void descale_fxp(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, output, 1, output);
-  
+// void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) {
+void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, int row_size, int col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, output, 1, output);
+
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(uint8_t, _output, output, row_size, col_size);
-  dsl_chan:
+dsl_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    dsl_row:
+  dsl_row:
     for (int row = 0; row < row_size; row++)
-      dsl_col:
+    dsl_col:
       for (int col = 0; col < col_size; col++)
-        _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255);
+        _output[chan][row][col] =
+            min(max(_input[chan][row][col] * 255, 0), 255);
 
-  __visc__return(1, bytes_output);
+  __hpvm__return(1, bytes_result); // bytes_result: size param of `output`
 }
 
 // Demosaicing stage
 // G R
 // B G
-//void demosaic_fxp(float *input, int row_size, int col_size, float *result) {
-void demosaic_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, result, 1, result);
-  
+// void demosaic_fxp(float *input, int row_size, int col_size, float *result) {
+void demosaic_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, int row_size, int col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, result, 1, result);
+
   printf("Demosaicing.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
 
-  dm_row:
+dm_row:
   for (int row = 1; row < row_size - 1; row++)
-    dm_col:
+  dm_col:
     for (int col = 1; col < col_size - 1; col++)
-        if (row % 2 == 0 && col % 2 == 0) {
-            // Green pixel
-            // Getting the R values
-            float R1 = _input[0][row][col - 1];
-            float R2 = _input[0][row][col + 1];
-            // Getting the B values
-            float B1 = _input[2][row - 1][col];
-            float B2 = _input[2][row + 1][col];
-            // R
-            _result[0][row][col] = (R1 + R2) / 2;
-            // G
-            _result[1][row][col] = _input[1][row][col] * 2;
-            // B
-            _result[2][row][col] = (B1 + B2) / 2;
-        } else if (row % 2 == 0 && col % 2 == 1) {
-            // Red pixel
-            // Getting the G values
-            float G1 = _input[1][row - 1][col];
-            float G2 = _input[1][row + 1][col];
-            float G3 = _input[1][row][col - 1];
-            float G4 = _input[1][row][col + 1];
-            // Getting the B values
-            float B1 = _input[2][row - 1][col - 1];
-            float B2 = _input[2][row - 1][col + 1];
-            float B3 = _input[2][row + 1][col - 1];
-            float B4 = _input[2][row + 1][col + 1];
-            // R
-            _result[0][row][col] = _input[0][row][col];
-            // G
-            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
-            // B (center pixel)
-            _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
-        } else if (row % 2 == 1 && col % 2 == 0) {
-            // Blue pixel
-            // Getting the R values
-            float R1 = _input[0][row - 1][col - 1];
-            float R2 = _input[0][row + 1][col - 1];
-            float R3 = _input[0][row - 1][col + 1];
-            float R4 = _input[0][row + 1][col + 1];
-            // Getting the G values
-            float G1 = _input[1][row - 1][col];
-            float G2 = _input[1][row + 1][col];
-            float G3 = _input[1][row][col - 1];
-            float G4 = _input[1][row][col + 1];
-            // R
-            _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
-            // G
-            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
-            // B
-            _result[2][row][col] = _input[2][row][col];
-        } else {
-            // Bottom Green pixel
-            // Getting the R values
-            float R1 = _input[0][row - 1][col];
-            float R2 = _input[0][row + 1][col];
-            // Getting the B values
-            float B1 = _input[2][row][col - 1];
-            float B2 = _input[2][row][col + 1];
-            // R
-            _result[0][row][col] = (R1 + R2) / 2;
-            // G
-            _result[1][row][col] = _input[1][row][col] * 2;
-            // B
-            _result[2][row][col] = (B1 + B2) / 2;
-        }
+      if (row % 2 == 0 && col % 2 == 0) {
+        // Green pixel
+        // Getting the R values
+        float R1 = _input[0][row][col - 1];
+        float R2 = _input[0][row][col + 1];
+        // Getting the B values
+        float B1 = _input[2][row - 1][col];
+        float B2 = _input[2][row + 1][col];
+        // R
+        _result[0][row][col] = (R1 + R2) / 2;
+        // G
+        _result[1][row][col] = _input[1][row][col] * 2;
+        // B
+        _result[2][row][col] = (B1 + B2) / 2;
+      } else if (row % 2 == 0 && col % 2 == 1) {
+        // Red pixel
+        // Getting the G values
+        float G1 = _input[1][row - 1][col];
+        float G2 = _input[1][row + 1][col];
+        float G3 = _input[1][row][col - 1];
+        float G4 = _input[1][row][col + 1];
+        // Getting the B values
+        float B1 = _input[2][row - 1][col - 1];
+        float B2 = _input[2][row - 1][col + 1];
+        float B3 = _input[2][row + 1][col - 1];
+        float B4 = _input[2][row + 1][col + 1];
+        // R
+        _result[0][row][col] = _input[0][row][col];
+        // G
+        _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+        // B (center pixel)
+        _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
+      } else if (row % 2 == 1 && col % 2 == 0) {
+        // Blue pixel
+        // Getting the R values
+        float R1 = _input[0][row - 1][col - 1];
+        float R2 = _input[0][row + 1][col - 1];
+        float R3 = _input[0][row - 1][col + 1];
+        float R4 = _input[0][row + 1][col + 1];
+        // Getting the G values
+        float G1 = _input[1][row - 1][col];
+        float G2 = _input[1][row + 1][col];
+        float G3 = _input[1][row][col - 1];
+        float G4 = _input[1][row][col + 1];
+        // R
+        _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
+        // G
+        _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+        // B
+        _result[2][row][col] = _input[2][row][col];
+      } else {
+        // Bottom Green pixel
+        // Getting the R values
+        float R1 = _input[0][row - 1][col];
+        float R2 = _input[0][row + 1][col];
+        // Getting the B values
+        float B1 = _input[2][row][col - 1];
+        float B2 = _input[2][row][col + 1];
+        // R
+        _result[0][row][col] = (R1 + R2) / 2;
+        // G
+        _result[1][row][col] = _input[1][row][col] * 2;
+        // B
+        _result[2][row][col] = (B1 + B2) / 2;
+      }
 
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 static void sort(float arr[], int n) {
-    int i, j;
-    dn_sort_i:
-    for (i = 0; i < n - 1; i++)
-        dn_sort_j:
-        for (j = 0; j < n - i - 1; j++)
-            if (arr[j] > arr[j + 1]) {
-                float temp = arr[j];
-                arr[j] = arr[j + 1];
-                arr[j + 1] = temp;
-            }
+  int i, j;
+dn_sort_i:
+  for (i = 0; i < n - 1; i++)
+  dn_sort_j:
+    for (j = 0; j < n - i - 1; j++)
+      if (arr[j] > arr[j + 1]) {
+        float temp = arr[j];
+        arr[j] = arr[j + 1];
+        arr[j + 1] = temp;
+      }
 }
 
 // Simple denoise
-//void denoise_fxp(float *input, int row_size, int col_size, float *result) {
-void denoise_fxp(float *input, size_t bytes_input, 
-                 float *result, size_t bytes_result,
-                 int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(2, input, result, 1, result);
-  
+// void denoise_fxp(float *input, int row_size, int col_size, float *result) {
+void denoise_fxp(float *input, size_t bytes_input, float *result,
+                 size_t bytes_result, int row_size, int col_size) {
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(2, input, result, 1, result);
+
   printf("Denoising.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
 
-  dn_chan:
+dn_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    dn_row:
+  dn_row:
     for (int row = 0; row < row_size; row++)
-      dn_col:
+    dn_col:
       for (int col = 0; col < col_size; col++)
         if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
           float filter[9];
-          dn_slide_row:
-          for (int i = row-1; i < row+2; i++)
-            dn_slide_col:
-            for (int j = col-1; j < col+2; j++) {
+        dn_slide_row:
+          for (int i = row - 1; i < row + 2; i++)
+          dn_slide_col:
+            for (int j = col - 1; j < col + 2; j++) {
               int index = (i - row + 1) * 3 + j - col + 1;
               filter[index] = _input[chan][i][j];
             }
@@ -175,53 +172,52 @@ void denoise_fxp(float *input, size_t bytes_input,
         } else {
           _result[chan][row][col] = _input[chan][row][col];
         }
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 // Color map and white balance transform
-//void transform_fxp(float *input, int row_size, int col_size, float *result,
+// void transform_fxp(float *input, int row_size, int col_size, float *result,
 //                   float *TsTw_tran) {
-void transform_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                    int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(3, input, result, TsTw_tran, 1, result);
+
   printf("Color mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
   ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3);
 
-  tr_chan:
+tr_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    tr_row:
+  tr_row:
     for (int row = 0; row < row_size; row++)
-      tr_col:
+    tr_col:
       for (int col = 0; col < col_size; col++)
         _result[chan][row][col] =
             max(_input[0][row][col] * _TsTw_tran[0][chan] +
                     _input[1][row][col] * _TsTw_tran[1][chan] +
                     _input[2][row][col] * _TsTw_tran[2][chan],
                 0);
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 //
 // Weighted radial basis function for gamut mapping
 //
-//void gamut_map_fxp(float *input, int row_size, int col_size, float *result,
-//                   float *ctrl_pts, float *weights, float *coefs, float *l2_dist) {
-void gamut_map_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights,
-                   float *coefs, size_t bytes_coefs,
-                   float *l2_dist, size_t bytes_l2_dist,
+// void gamut_map_fxp(float *input, int row_size, int col_size, float *result,
+//                   float *ctrl_pts, float *weights, float *coefs, float
+//                   *l2_dist) {
+void gamut_map_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights, float *coefs,
+                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                    int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
-  
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
+                     result);
+
   printf("Gamut mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
@@ -229,26 +225,25 @@ void gamut_map_fxp(float *input, size_t bytes_input,
   ARRAY_2D(float, _weights, weights, 3);
   ARRAY_2D(float, _coefs, coefs, 3);
 
-  // First, get the L2 norm from every pixel to the control points,
-  // Then, sum it and weight it. Finally, add the bias.
-  gm_rbf_row:
+// First, get the L2 norm from every pixel to the control points,
+// Then, sum it and weight it. Finally, add the bias.
+gm_rbf_row:
   for (int row = 0; row < row_size; row++)
-    gm_rbf_col:
+  gm_rbf_col:
     for (int col = 0; col < col_size; col++) {
-      gm_rbf_cp0:
+    gm_rbf_cp0:
       for (int cp = 0; cp < num_ctrl_pts; cp++) {
-        l2_dist[cp] =
-            sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
-                     (_input[0][row][col] - _ctrl_pts[cp][0]) +
-                 (_input[1][row][col] - _ctrl_pts[cp][1]) *
-                     (_input[1][row][col] - _ctrl_pts[cp][1]) +
-                 (_input[2][row][col] - _ctrl_pts[cp][2]) *
-                     (_input[2][row][col] - _ctrl_pts[cp][2]));
+        l2_dist[cp] = sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
+                               (_input[0][row][col] - _ctrl_pts[cp][0]) +
+                           (_input[1][row][col] - _ctrl_pts[cp][1]) *
+                               (_input[1][row][col] - _ctrl_pts[cp][1]) +
+                           (_input[2][row][col] - _ctrl_pts[cp][2]) *
+                               (_input[2][row][col] - _ctrl_pts[cp][2]));
       }
-      gm_rbf_chan:
+    gm_rbf_chan:
       for (int chan = 0; chan < CHAN_SIZE; chan++) {
         float chan_val = 0.0;
-        gm_rbf_cp1:
+      gm_rbf_cp1:
         for (int cp = 0; cp < num_ctrl_pts; cp++) {
           chan_val += l2_dist[cp] * _weights[cp][chan];
         }
@@ -259,32 +254,31 @@ void gamut_map_fxp(float *input, size_t bytes_input,
         _result[chan][row][col] = max(chan_val, 0);
       }
     }
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
 
 // Tone mapping
-//void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
+// void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
 //                  float *result) {
-void tone_map_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                   int row_size, int col_size) {
-  __visc__hint(DEVICE);
-  __visc__attributes(3, input, result, tone_map, 1, result);
-  
+  __hpvm__hint(DEVICE);
+  __hpvm__attributes(3, input, result, tone_map, 1, result);
+
   printf("Tone mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
   ARRAY_2D(float, _tone_map, tone_map, 3);
 
-  tm_chan:
+tm_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    tm_row:
+  tm_row:
     for (int row = 0; row < row_size; row++)
-      tm_col:
+    tm_col:
       for (int col = 0; col < col_size; col++) {
         uint8_t x = _input[chan][row][col] * 255;
         _result[chan][row][col] = _tone_map[x][chan];
       }
-  __visc__return(1, bytes_result);
+  __hpvm__return(1, bytes_result);
 }
diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.h b/hpvm/test/hpvm-cava/src/pipe_stages.h
index 8d98cb65cc8af7353cc1faf08988f3b1a6758046..f960822a03326638189c8d294938452ba2670b41 100644
--- a/hpvm/test/hpvm-cava/src/pipe_stages.h
+++ b/hpvm/test/hpvm-cava/src/pipe_stages.h
@@ -7,54 +7,52 @@
 
 #define ISP 0x4
 
-#define max(a,b) \
-  ({ __typeof__ (a) _a = (a); \
-      __typeof__ (b) _b = (b); \
-    _a > _b ? _a : _b; })
-
-#define min(a,b) \
-  ({ __typeof__ (a) _a = (a); \
-      __typeof__ (b) _b = (b); \
-    _a < _b ? _a : _b; })
-
-#define abs(a) \
-  ({ __typeof__ (a) _a = (a); \
-    _a < 0 ? -_a : _a; })
+#define max(a, b)                                                              \
+  ({                                                                           \
+    __typeof__(a) _a = (a);                                                    \
+    __typeof__(b) _b = (b);                                                    \
+    _a > _b ? _a : _b;                                                         \
+  })
+
+#define min(a, b)                                                              \
+  ({                                                                           \
+    __typeof__(a) _a = (a);                                                    \
+    __typeof__(b) _b = (b);                                                    \
+    _a < _b ? _a : _b;                                                         \
+  })
+
+#define abs(a)                                                                 \
+  ({                                                                           \
+    __typeof__(a) _a = (a);                                                    \
+    _a < 0 ? -_a : _a;                                                         \
+  })
 
 extern int num_ctrl_pts;
 
-void scale_fxp(uint8_t *input, size_t bytes_input, 
-               float *output, size_t bytes_output,
-               size_t row_size, size_t col_size);
+void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
+               size_t bytes_output, size_t row_size, size_t col_size);
 
-void descale_fxp(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 size_t row_size, size_t col_size);
+void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, size_t row_size, size_t col_size);
 
-void demosaic_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  size_t row_size, size_t col_size);
+void demosaic_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, size_t row_size, size_t col_size);
 
-void denoise_fxp(float *input, size_t bytes_input, 
-                 float *result, size_t bytes_result,
-                 size_t row_size, size_t col_size);
+void denoise_fxp(float *input, size_t bytes_input, float *result,
+                 size_t bytes_result, size_t row_size, size_t col_size);
 
-void transform_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                    size_t row_size, size_t col_size);
 
-void gamut_map_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights,
-                   float *coefs, size_t bytes_coefs,
-                   float *l2_dist, size_t bytes_l2_dist,
+void gamut_map_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights, float *coefs,
+                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                    size_t row_size, size_t col_size);
 
-void tone_map_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                   size_t row_size, size_t col_size);
 
 void tone_map_approx_fxp(float *input, size_t row_size, size_t col_size,
diff --git a/hpvm/test/hpvm-cava/src/utility.c b/hpvm/test/hpvm-cava/src/utility.c
index c1eaee3333c2afffdcae827f956efa4e25705352..86bd018183403f637ca8fb7cfb634a09c3ceace8 100644
--- a/hpvm/test/hpvm-cava/src/utility.c
+++ b/hpvm/test/hpvm-cava/src/utility.c
@@ -1,7 +1,7 @@
-#include <stdlib.h>
-#include <assert.h>
-#include "defs.h"
 #include "utility.h"
+#include "defs.h"
+#include <assert.h>
+#include <stdlib.h>
 
 void *malloc_aligned(size_t size) {
   void *ptr = NULL;
diff --git a/hpvm/test/include/hpvm.h b/hpvm/test/include/hpvm.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e31c98946f00e32d84933fe4bfd443e65cb92a9
--- /dev/null
+++ b/hpvm/test/include/hpvm.h
@@ -0,0 +1,73 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef DEVICE
+#define DEVICE GPU_TARGET
+#endif
+
+#include "../../include/SupportHPVM/HPVMHint.h"
+
+#ifndef __cplusplus
+#define noexcept
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+void __hpvm__hint(hpvm::Target) noexcept;
+#else
+void __hpvm__hint(enum Target) noexcept;
+#endif
+
+void *__hpvm__createNodeND(unsigned, ...) noexcept;
+void __hpvm__return(unsigned, ...) noexcept;
+
+void __hpvm__attributes(unsigned, ...) noexcept;
+void __hpvm__init() noexcept;
+void __hpvm__cleanup() noexcept;
+
+void __hpvm__bindIn(void *, unsigned, unsigned, unsigned) noexcept;
+void __hpvm__bindOut(void *, unsigned, unsigned, unsigned) noexcept;
+void *__hpvm__edge(void *, void *, unsigned, unsigned, unsigned,
+                   unsigned) noexcept;
+
+void __hpvm__push(void *, void *) noexcept;
+void *__hpvm__pop(void *) noexcept;
+void *__hpvm__launch(unsigned, ...) noexcept;
+void __hpvm__wait(void *) noexcept;
+
+void *__hpvm__getNode() noexcept;
+void *__hpvm__getParentNode(void *) noexcept;
+void __hpvm__barrier() noexcept;
+void *__hpvm__malloc(long) noexcept;
+long __hpvm__getNodeInstanceID_x(void *) noexcept;
+long __hpvm__getNodeInstanceID_y(void *) noexcept;
+long __hpvm__getNodeInstanceID_z(void *) noexcept;
+long __hpvm__getNumNodeInstances_x(void *) noexcept;
+long __hpvm__getNumNodeInstances_y(void *) noexcept;
+long __hpvm__getNumNodeInstances_z(void *) noexcept;
+
+// Atomic
+// signed int
+int __hpvm__atomic_add(int *, int) noexcept;
+int __hpvm__atomic_sub(int *, int) noexcept;
+int __hpvm__atomic_xchg(int *, int) noexcept;
+int __hpvm__atomic_inc(int *) noexcept;
+int __hpvm__atomic_dec(int *) noexcept;
+int __hpvm__atomic_min(int *, int) noexcept;
+int __hpvm__atomic_max(int *, int) noexcept;
+int __hpvm__atomic_and(int *, int) noexcept;
+int __hpvm__atomic_or(int *, int) noexcept;
+int __hpvm__atomic_xor(int *, int) noexcept;
+
+void llvm_hpvm_track_mem(void *, size_t) noexcept;
+void llvm_hpvm_untrack_mem(void *) noexcept;
+void llvm_hpvm_request_mem(void *, size_t) noexcept;
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/hpvm/test/include/visc.h b/hpvm/test/include/visc.h
deleted file mode 100644
index 18b29500261362be66ea23feecf9a5f85ac68005..0000000000000000000000000000000000000000
--- a/hpvm/test/include/visc.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef DEVICE
-#define DEVICE GPU_TARGET
-#endif
-
-#include "../../include/SupportVISC/VISCHint.h"
-
-#ifndef __cplusplus
-#define noexcept
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-void __visc__hint(visc::Target) noexcept;
-#else
-void __visc__hint(enum Target) noexcept;
-#endif
-
-void *__visc__createNodeND(unsigned, ...) noexcept;
-void __visc__return(unsigned, ...) noexcept;
-
-void __visc__attributes(unsigned, ...) noexcept;
-void __visc__init() noexcept;
-void __visc__cleanup() noexcept;
-
-void __visc__bindIn(void *, unsigned, unsigned, unsigned) noexcept;
-void __visc__bindOut(void *, unsigned, unsigned, unsigned) noexcept;
-void *__visc__edge(void *, void *, unsigned, unsigned, unsigned,
-                   unsigned) noexcept;
-
-void __visc__push(void *, void *) noexcept;
-void *__visc__pop(void *) noexcept;
-void *__visc__launch(unsigned, ...) noexcept;
-void __visc__wait(void *) noexcept;
-
-void *__visc__getNode() noexcept;
-void *__visc__getParentNode(void *) noexcept;
-void __visc__barrier() noexcept;
-void *__visc__malloc(long) noexcept;
-long __visc__getNodeInstanceID_x(void *) noexcept;
-long __visc__getNodeInstanceID_y(void *) noexcept;
-long __visc__getNodeInstanceID_z(void *) noexcept;
-long __visc__getNumNodeInstances_x(void *) noexcept;
-long __visc__getNumNodeInstances_y(void *) noexcept;
-long __visc__getNumNodeInstances_z(void *) noexcept;
-
-// Atomic
-// signed int
-int __visc__atomic_add(int *, int) noexcept;
-int __visc__atomic_sub(int *, int) noexcept;
-int __visc__atomic_xchg(int *, int) noexcept;
-int __visc__atomic_inc(int *) noexcept;
-int __visc__atomic_dec(int *) noexcept;
-int __visc__atomic_min(int *, int) noexcept;
-int __visc__atomic_max(int *, int) noexcept;
-int __visc__atomic_and(int *, int) noexcept;
-int __visc__atomic_or(int *, int) noexcept;
-int __visc__atomic_xor(int *, int) noexcept;
-
-void llvm_visc_track_mem(void *, size_t) noexcept;
-void llvm_visc_untrack_mem(void *) noexcept;
-void llvm_visc_request_mem(void *, size_t) noexcept;
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/hpvm/test/parboil/README.md b/hpvm/test/parboil/README.md
index 1166e4f10f6a6e29e4f5d40871674c27da975acc..853b46ed515455fbcb206630a74d5490c79ffd88 100644
--- a/hpvm/test/parboil/README.md
+++ b/hpvm/test/parboil/README.md
@@ -2,7 +2,7 @@
 
 | Benchmark | Version | Supported on CPU | Supported on GPU |
 | :-------- | :------ | :--------------: | :--------------: |
-| sgemm     | visc    | ✔                | ✔                |
-| stencil   | visc    | ✔                | ✔                |
-| spmv      | visc    | ✔                | ✘                |
-| lbm       | visc    | ✔                | ✘                |
+| sgemm     | hpvm    | ✔                | ✔                |
+| stencil   | hpvm    | ✔                | ✔                |
+| spmv      | hpvm    | ✔                | ✘                |
+| lbm       | hpvm    | ✔                | ✘                |
diff --git a/hpvm/test/parboil/benchmarks/lbm/Makefile b/hpvm/test/parboil/benchmarks/lbm/Makefile
index 4ebf6fc0af2f05cd10f6d556e0b52bee186540d8..af7215ff7039795e2d09ce98af675a851b32b0cb 100644
--- a/hpvm/test/parboil/benchmarks/lbm/Makefile
+++ b/hpvm/test/parboil/benchmarks/lbm/Makefile
@@ -5,9 +5,9 @@ ifeq ($(NUM_CORES),)
   NUM_CORES=8
 endif
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile
similarity index 85%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile
index d1664ee9880312ccfa2677e6a284851ecadf1f24..5aa206f758e87a94cdaa1cbaadfa3bf9b661d120 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=lbm.ll
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm
 APP_CFLAGS=-ffast-math -O3 -DNUM_CORES=$(NUM_CORES)
 APP_CXXFLAGS=-ffast-math -O3 -DNUM_CORES=$(NUM_CORES)
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/layout_config.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/layout_config.h
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.cpp
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.cpp
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm.h
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm_macros.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/lbm_macros.h
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp
similarity index 86%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp
index bb9f6ed1f03d203e412df679775a83c6ff5c349d..445978c086aaab4c2c45be93da6031bf06da7123 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.cpp
@@ -12,7 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <visc.h>
+#include <hpvm.h>
 
 #include "lbm_macros.h"
 #include "layout_config.h"
@@ -92,18 +92,18 @@ typedef struct __attribute__((__packed__)) {
 
 void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG,
                                  size_t bytes_dstG) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, srcG, dstG, 1, dstG);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, srcG, dstG, 1, dstG);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
 
   srcG += MARGIN;
   dstG += MARGIN;
 
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gy = __visc__getNodeInstanceID_y(parentNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gy = __hpvm__getNodeInstanceID_y(parentNode);
 
   // Using some predefined macros here.  Consider this the declaration
   //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
@@ -274,40 +274,40 @@ void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG,
 
 void lbmLvl1(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
              size_t dim_X1) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, srcG, dstG, 1, dstG);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, srcG, dstG, 1, dstG);
   void *lbm_node =
-      __visc__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1);
-  __visc__bindIn(lbm_node, 0, 0, 0);
-  __visc__bindIn(lbm_node, 1, 1, 0);
-  __visc__bindIn(lbm_node, 2, 2, 0);
-  __visc__bindIn(lbm_node, 3, 3, 0);
+      __hpvm__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1);
+  __hpvm__bindIn(lbm_node, 0, 0, 0);
+  __hpvm__bindIn(lbm_node, 1, 1, 0);
+  __hpvm__bindIn(lbm_node, 2, 2, 0);
+  __hpvm__bindIn(lbm_node, 3, 3, 0);
 }
 
 void lbmLvl2(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
              size_t dim_X1, size_t dim_X2, size_t dim_Y2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, srcG, dstG, 1, dstG);
-  void *lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2);
-  __visc__bindIn(lbm_node, 0, 0, 0);
-  __visc__bindIn(lbm_node, 1, 1, 0);
-  __visc__bindIn(lbm_node, 2, 2, 0);
-  __visc__bindIn(lbm_node, 3, 3, 0);
-  __visc__bindIn(lbm_node, 4, 4, 0);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, srcG, dstG, 1, dstG);
+  void *lbm_node = __hpvm__createNodeND(2, lbmLvl1, dim_X2, dim_Y2);
+  __hpvm__bindIn(lbm_node, 0, 0, 0);
+  __hpvm__bindIn(lbm_node, 1, 1, 0);
+  __hpvm__bindIn(lbm_node, 2, 2, 0);
+  __hpvm__bindIn(lbm_node, 3, 3, 0);
+  __hpvm__bindIn(lbm_node, 4, 4, 0);
 }
 
 void lbmLvl3(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
              size_t dim_X1, size_t dim_X2, size_t dim_Y2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, srcG, dstG, 1, dstG);
-  void *lbm_node = __visc__createNodeND(0, lbmLvl2);
-  __visc__bindIn(lbm_node, 0, 0, 0);
-  __visc__bindIn(lbm_node, 1, 1, 0);
-  __visc__bindIn(lbm_node, 2, 2, 0);
-  __visc__bindIn(lbm_node, 3, 3, 0);
-  __visc__bindIn(lbm_node, 4, 4, 0);
-  __visc__bindIn(lbm_node, 5, 5, 0);
-  __visc__bindIn(lbm_node, 6, 6, 0);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, srcG, dstG, 1, dstG);
+  void *lbm_node = __hpvm__createNodeND(0, lbmLvl2);
+  __hpvm__bindIn(lbm_node, 0, 0, 0);
+  __hpvm__bindIn(lbm_node, 1, 1, 0);
+  __hpvm__bindIn(lbm_node, 2, 2, 0);
+  __hpvm__bindIn(lbm_node, 3, 3, 0);
+  __hpvm__bindIn(lbm_node, 4, 4, 0);
+  __hpvm__bindIn(lbm_node, 5, 5, 0);
+  __hpvm__bindIn(lbm_node, 6, 6, 0);
 }
 
 __attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src,
@@ -321,9 +321,9 @@ __attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src,
   RootIn root_in_local = {src - MARGIN, size,   dst - MARGIN, size,
                           SIZE_X,       SIZE_Y, SIZE_Z};
   *(RootIn *)root_in = root_in_local;
-  void *lbmDFG = __visc__launch(0, lbmLvl3, root_in);
+  void *lbmDFG = __hpvm__launch(0, lbmLvl3, root_in);
 
-  __visc__wait(lbmDFG);
+  __hpvm__wait(lbmDFG);
 }
 
 void MAIN_initialize(const MAIN_Param *param) {
@@ -379,12 +379,12 @@ int main(int nArgs, char *arg[]) {
   MAIN_initialize(&param);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-  llvm_visc_track_mem(srcGrid - MARGIN, size);
-  llvm_visc_track_mem(dstGrid - MARGIN, size);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(srcGrid - MARGIN, size);
+  llvm_hpvm_track_mem(dstGrid - MARGIN, size);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   for (t = 1; t <= param.nTimeSteps; t++) {
@@ -404,15 +404,15 @@ int main(int nArgs, char *arg[]) {
   }
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(srcGrid - MARGIN, size);
+  llvm_hpvm_request_mem(srcGrid - MARGIN, size);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(srcGrid - MARGIN);
-  llvm_visc_untrack_mem(dstGrid - MARGIN);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(srcGrid - MARGIN);
+  llvm_hpvm_untrack_mem(dstGrid - MARGIN);
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
   MAIN_finalize(&param);
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h b/hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/lbm/src/visc/main.h
rename to hpvm/test/parboil/benchmarks/lbm/src/hpvm/main.h
diff --git a/hpvm/test/parboil/benchmarks/sgemm/Makefile b/hpvm/test/parboil/benchmarks/sgemm/Makefile
index ace9ded22b6ef365c9cd0f6262245dd2e086643d..4757432d224ea5a1aaa762bfc89c1c89e869bd32 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = sgemm
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc_sh
+  VERSION = hpvm_sh
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile
similarity index 83%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile
index d1f6c96d0c279bc2f2e3e70313369d49881b62b8..6e63f8384190ff75c281592df1ab3843b017d07f 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O1
 APP_CXXFLAGS=-ffast-math -O1
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/io.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/io.cc
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/kernel.cl
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc
similarity index 69%
rename from hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
rename to hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc
index 627f5a82412374cff4a9061620ce1f27ea3c14a6..de36705707d7062b4cef2042197902c2c415e312 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm/main.cc
@@ -10,6 +10,7 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -19,7 +20,6 @@
 #include <string.h>
 #include <sys/time.h>
 #include <vector>
-#include <visc.h>
 
 // I/O routines
 extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
@@ -57,17 +57,17 @@ typedef struct __attribute__((__packed__)) {
 void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
                int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha,
                float beta) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(3, A, B, C, 1, C);
-
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int ly = __visc__getNodeInstanceID_y(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gy = __visc__getNodeInstanceID_y(parentNode);
-  int gridx = __visc__getNumNodeInstances_x(thisNode);
-  int gridy = __visc__getNumNodeInstances_y(thisNode);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(3, A, B, C, 1, C);
+
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int ly = __hpvm__getNodeInstanceID_y(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gy = __hpvm__getNodeInstanceID_y(parentNode);
+  int gridx = __hpvm__getNumNodeInstances_x(thisNode);
+  int gridy = __hpvm__getNumNodeInstances_y(thisNode);
   int m = gx * gridx + lx;
   int n = gy * gridy + ly;
 
@@ -83,46 +83,46 @@ void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
 void basicSgemmLvl1(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
                     int ldb, float *C, size_t bytes_C, int ldc, int k,
                     float alpha, float beta, size_t dim_X1, size_t dim_Y1) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(3, A, B, C, 1, C);
   void *sgemm_node =
-      __visc__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1);
-  __visc__bindIn(sgemm_node, 0, 0, 0);
-  __visc__bindIn(sgemm_node, 1, 1, 0);
-  __visc__bindIn(sgemm_node, 2, 2, 0);
-  __visc__bindIn(sgemm_node, 3, 3, 0);
-  __visc__bindIn(sgemm_node, 4, 4, 0);
-  __visc__bindIn(sgemm_node, 5, 5, 0);
-  __visc__bindIn(sgemm_node, 6, 6, 0);
-  __visc__bindIn(sgemm_node, 7, 7, 0);
-  __visc__bindIn(sgemm_node, 8, 8, 0);
-  __visc__bindIn(sgemm_node, 9, 9, 0);
-  __visc__bindIn(sgemm_node, 10, 10, 0);
-  __visc__bindIn(sgemm_node, 11, 11, 0);
+      __hpvm__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1);
+  __hpvm__bindIn(sgemm_node, 0, 0, 0);
+  __hpvm__bindIn(sgemm_node, 1, 1, 0);
+  __hpvm__bindIn(sgemm_node, 2, 2, 0);
+  __hpvm__bindIn(sgemm_node, 3, 3, 0);
+  __hpvm__bindIn(sgemm_node, 4, 4, 0);
+  __hpvm__bindIn(sgemm_node, 5, 5, 0);
+  __hpvm__bindIn(sgemm_node, 6, 6, 0);
+  __hpvm__bindIn(sgemm_node, 7, 7, 0);
+  __hpvm__bindIn(sgemm_node, 8, 8, 0);
+  __hpvm__bindIn(sgemm_node, 9, 9, 0);
+  __hpvm__bindIn(sgemm_node, 10, 10, 0);
+  __hpvm__bindIn(sgemm_node, 11, 11, 0);
 }
 
 void basicSgemmLvl2(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
                     int ldb, float *C, size_t bytes_C, int ldc, int k,
                     float alpha, float beta, size_t dim_X1, size_t dim_Y1,
                     size_t dim_X2, size_t dim_Y2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
   void *sgemm_node =
-      __visc__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2);
-  __visc__bindIn(sgemm_node, 0, 0, 0);
-  __visc__bindIn(sgemm_node, 1, 1, 0);
-  __visc__bindIn(sgemm_node, 2, 2, 0);
-  __visc__bindIn(sgemm_node, 3, 3, 0);
-  __visc__bindIn(sgemm_node, 4, 4, 0);
-  __visc__bindIn(sgemm_node, 5, 5, 0);
-  __visc__bindIn(sgemm_node, 6, 6, 0);
-  __visc__bindIn(sgemm_node, 7, 7, 0);
-  __visc__bindIn(sgemm_node, 8, 8, 0);
-  __visc__bindIn(sgemm_node, 9, 9, 0);
-  __visc__bindIn(sgemm_node, 10, 10, 0);
-  __visc__bindIn(sgemm_node, 11, 11, 0);
-  __visc__bindIn(sgemm_node, 12, 12, 0);
-  __visc__bindIn(sgemm_node, 13, 13, 0);
+      __hpvm__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2);
+  __hpvm__bindIn(sgemm_node, 0, 0, 0);
+  __hpvm__bindIn(sgemm_node, 1, 1, 0);
+  __hpvm__bindIn(sgemm_node, 2, 2, 0);
+  __hpvm__bindIn(sgemm_node, 3, 3, 0);
+  __hpvm__bindIn(sgemm_node, 4, 4, 0);
+  __hpvm__bindIn(sgemm_node, 5, 5, 0);
+  __hpvm__bindIn(sgemm_node, 6, 6, 0);
+  __hpvm__bindIn(sgemm_node, 7, 7, 0);
+  __hpvm__bindIn(sgemm_node, 8, 8, 0);
+  __hpvm__bindIn(sgemm_node, 9, 9, 0);
+  __hpvm__bindIn(sgemm_node, 10, 10, 0);
+  __hpvm__bindIn(sgemm_node, 11, 11, 0);
+  __hpvm__bindIn(sgemm_node, 12, 12, 0);
+  __hpvm__bindIn(sgemm_node, 13, 13, 0);
 }
 
 // A wrapper level used in codegen for some backends
@@ -130,25 +130,25 @@ void basicSgemmLvl3(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
                     int ldb, float *C, size_t bytes_C, int ldc, int k,
                     float alpha, float beta, size_t dim_X1, size_t dim_Y1,
                     size_t dim_X2, size_t dim_Y2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
-  void *sgemm_node = __visc__createNodeND(0, basicSgemmLvl2);
-  __visc__bindIn(sgemm_node, 0, 0, 0);
-  __visc__bindIn(sgemm_node, 1, 1, 0);
-  __visc__bindIn(sgemm_node, 2, 2, 0);
-  __visc__bindIn(sgemm_node, 3, 3, 0);
-  __visc__bindIn(sgemm_node, 4, 4, 0);
-  __visc__bindIn(sgemm_node, 5, 5, 0);
-  __visc__bindIn(sgemm_node, 6, 6, 0);
-  __visc__bindIn(sgemm_node, 7, 7, 0);
-  __visc__bindIn(sgemm_node, 8, 8, 0);
-  __visc__bindIn(sgemm_node, 9, 9, 0);
-  __visc__bindIn(sgemm_node, 10, 10, 0);
-  __visc__bindIn(sgemm_node, 11, 11, 0);
-  __visc__bindIn(sgemm_node, 12, 12, 0);
-  __visc__bindIn(sgemm_node, 13, 13, 0);
-  __visc__bindIn(sgemm_node, 14, 14, 0);
-  __visc__bindIn(sgemm_node, 15, 15, 0);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+  void *sgemm_node = __hpvm__createNodeND(0, basicSgemmLvl2);
+  __hpvm__bindIn(sgemm_node, 0, 0, 0);
+  __hpvm__bindIn(sgemm_node, 1, 1, 0);
+  __hpvm__bindIn(sgemm_node, 2, 2, 0);
+  __hpvm__bindIn(sgemm_node, 3, 3, 0);
+  __hpvm__bindIn(sgemm_node, 4, 4, 0);
+  __hpvm__bindIn(sgemm_node, 5, 5, 0);
+  __hpvm__bindIn(sgemm_node, 6, 6, 0);
+  __hpvm__bindIn(sgemm_node, 7, 7, 0);
+  __hpvm__bindIn(sgemm_node, 8, 8, 0);
+  __hpvm__bindIn(sgemm_node, 9, 9, 0);
+  __hpvm__bindIn(sgemm_node, 10, 10, 0);
+  __hpvm__bindIn(sgemm_node, 11, 11, 0);
+  __hpvm__bindIn(sgemm_node, 12, 12, 0);
+  __hpvm__bindIn(sgemm_node, 13, 13, 0);
+  __hpvm__bindIn(sgemm_node, 14, 14, 0);
+  __hpvm__bindIn(sgemm_node, 15, 15, 0);
 }
 
 __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
@@ -194,8 +194,8 @@ __attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
                           dg[0] / db[0],
                           dg[1] / db[1]};
   *(RootIn *)root_in = root_in_local;
-  void *sgemmDFG = __visc__launch(0, basicSgemmLvl3, root_in);
-  __visc__wait(sgemmDFG);
+  void *sgemmDFG = __hpvm__launch(0, basicSgemmLvl3, root_in);
+  __hpvm__wait(sgemmDFG);
 }
 
 int main(int argc, char *argv[]) {
@@ -233,7 +233,7 @@ int main(int argc, char *argv[]) {
   readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
@@ -246,9 +246,9 @@ int main(int argc, char *argv[]) {
   // OpenCL memory allocation
   std::vector<float> matC(matArow * matBcol);
 
-  llvm_visc_track_mem(&matA.front(), A_sz);
-  llvm_visc_track_mem(&matBT.front(), B_sz);
-  llvm_visc_track_mem(&matC.front(), C_sz);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
   // Copy A and B^T into device memory
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -263,16 +263,16 @@ int main(int argc, char *argv[]) {
              matArow);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(&matC.front(), C_sz);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-  llvm_visc_untrack_mem(&matA.front());
-  llvm_visc_untrack_mem(&matBT.front());
-  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (params->outFile) {
 
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=hpvm
+SRCDIR_OBJS=io.ll #compute_gold.o
+HPVM_OBJS=main.hpvm.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04744f404ebaf6e669c2bbe91600519742b57dc9
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/io.cc
@@ -0,0 +1,84 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
+
+  char *buffer;
+
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
+
+  f.close();
+
+  return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if (!f.good()) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f >> nr_row;
+  f >> nr_col;
+
+  float data;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
+    f >> data;
+    v.push_back(data);
+  }
+  v.pop_back(); // remove the duplicated last element
+  return true;
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if (!f.good()) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f << nr_row << " " << nr_col << " ";
+
+  float data;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  for (int i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..ae0f5b60f4b800515bd84a04b02926acd625665c
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/kernel.cl
@@ -0,0 +1,40 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Kernel of dense matrix-matrix multiplication kernel.
+ */
+
+__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
+{
+    // Partial results 
+    float c[TILE_N];
+    for (int i=0; i < TILE_N; i++)
+	c[i] = 0.0f;
+   
+    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
+    int m = get_group_id(0) * TILE_M + mid;
+
+    int b_base = 0;
+
+    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
+	float a; 
+        b_base = get_group_id(1) * TILE_N + i * ldb;
+
+	for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+	    a = A[m + (i+j)*lda];
+	    for (int kk = 0; kk < TILE_N; kk++)
+		c[kk] += a * B[b_base + j * ldb + kk];
+
+	}
+    }
+    int t = ldc * get_group_id(1) * TILE_N + m;
+    for (int i = 0; i < TILE_N; i++) {
+	C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
+    }
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a1db2e56a5c5639319d7be5f6a890d44c3a28421
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_opt/main.cc
@@ -0,0 +1,186 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <hpvm.h>
+#include <iostream>
+#include <malloc.h>
+#include <math.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <vector>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
+
+// Parameters of tile sizes
+#define TILE_N 16
+#define TILE_TB_HEIGHT 8
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
+  }
+
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) {
+  __hpvm__hint(hpvm::GPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+
+  float c[TILE_N];
+  for (int i = 0; i < TILE_N; i++)
+    c[i] = 0.0f;
+
+  int mid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+  int m = get_group_id(0) * TILE_M + mid;
+
+  int b_base = 0;
+
+  for (int i = 0; i < k; i += TILE_TB_HEIGHT) {
+    float a;
+    b_base = get_group_id(1) * TILE_N + i * ldb;
+
+    for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+      a = A[m + (i + j) * lda];
+      for (int kk = 0; kk < TILE_N; kk++)
+        c[kk] += a * B[b_base + j * ldb + kk];
+    }
+  }
+  int t = ldc * get_group_id(1) * TILE_N + m;
+  for (int i = 0; i < TILE_N; i++) {
+    C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
+  }
+}
+
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
+    return;
+  }
+
+  unsigned db[2] = {TILE_N, TILE_TB_HEIGHT};
+  //    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
+  unsigned dg[2] = {m * db[0] / TILE_M, n * db[1] / TILE_N};
+
+  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __hpvm__wait(sgemmDFG);
+}
+
+int main(int argc, char *argv[]) {
+
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
+
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
+
+  /* Read in data */
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
+
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
+
+  pb_InitializeTimerSet(&timers);
+  __hpvm__init();
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // copy A to device memory
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
+
+  // allocate space for C
+  C_sz = matArow * matBcol * sizeof(float);
+
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow * matBcol);
+
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
+
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  __hpvm__cleanup();
+
+  if (params->outFile) {
+
+    /* Write C to file */
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
+
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
+
+  return 0;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f81bac47072bc017dcdcdccf373cdfbd0f21ceac
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/Makefile
@@ -0,0 +1,9 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=hpvm
+SRCDIR_OBJS=io.ll #compute_gold.o
+HPVM_OBJS=main.hpvm.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
+APP_OPTFLAGS=-unroll-threshold=300 -loop-unroll -sroa
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04744f404ebaf6e669c2bbe91600519742b57dc9
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/io.cc
@@ -0,0 +1,84 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
+
+  char *buffer;
+
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
+
+  f.close();
+
+  return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if (!f.good()) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f >> nr_row;
+  f >> nr_col;
+
+  float data;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
+    f >> data;
+    v.push_back(data);
+  }
+  v.pop_back(); // remove the duplicated last element
+  return true;
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if (!f.good()) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f << nr_row << " " << nr_col << " ";
+
+  float data;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  for (int i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de0d473ed6fe6724ef81f99b13e02d0de29b103b
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_sh/main.cc
@@ -0,0 +1,350 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <hpvm.h>
+#include <iostream>
+#include <malloc.h>
+#include <math.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <vector>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
+
+// Parameters of tile sizes
+#define TILE_N 16
+#define TILE_TB_HEIGHT 8
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
+  }
+
+typedef struct __attribute__((__packed__)) { // argument record given to __hpvm__launch()
+  float *A;      // first input matrix
+  size_t bytesA; // size of A in bytes
+  int lda;       // index stride applied to A in SgemmLeaf
+  float *B;      // second input matrix (main() supplies B^T)
+  size_t bytesB; // size of B in bytes
+  int ldb;       // index stride applied to B in SgemmLeaf
+  float *C;      // result matrix, updated in place
+  size_t bytesC; // size of C in bytes
+  int ldc;       // index stride applied to C in SgemmLeaf
+  int k;         // bound of the accumulation loop in SgemmLeaf
+  float alpha;   // multiplies the accumulated product
+  float beta;    // multiplies the previous contents of C
+  long block_x;  // dimensions passed when creating the SgemmLeaf node
+  long block_y;
+  long grid_x;   // dimensions passed when creating the SgemmTB node
+  long grid_y;
+} RootIn; // fields appear in the same order as SgemmWrapper's parameters
+
+// Populates the launch-argument record *args: every parameter after `args`
+// is copied into the RootIn field that carries the same name. Nothing else is
+// read or written.
+void packData(RootIn *args, float *A, size_t bytesA, int lda, float *B,
+              size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k,
+              float alpha, float beta, long block_x, long block_y, long grid_x,
+              long grid_y) {
+  // Aggregate initialisation is positional, so the values are listed in the
+  // order in which RootIn declares its fields.
+  const RootIn packed = {
+      A,       bytesA,  lda, // matrix A
+      B,       bytesB,  ldb, // matrix B
+      C,       bytesC,  ldc, // matrix C
+      k,                     // accumulation length
+      alpha,   beta,         // scale factors
+      block_x, block_y,      // block dimensions
+      grid_x,  grid_y,       // grid dimensions
+  };
+
+  *args = packed;
+}
+
+void Allocation(long block_x, long block_y) { // node that supplies SgemmLeaf's shB buffer
+  void *shB = __hpvm__malloc(block_x * block_y * sizeof(float)); // room for block_x * block_y floats
+  __hpvm__return(2, shB, block_x * block_y * sizeof(float)); // output 0: the buffer, output 1: its size in bytes
+}
+
+void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB,
+               int ldb, float *C, size_t bytesC, int ldc, int k, float alpha,
+               float beta, float *shB, size_t bytesshB) { // leaf (work item) node: updates TILE_N elements of C
+  __hpvm__hint(hpvm::DEVICE);
+  //__hpvm__hint(hpvm::SPIR_TARGET);
+  //__hpvm__hint(hpvm::GPU_TARGET);
+
+  __hpvm__attributes(3, A, B, C, 1, C); // presumably 3 inputs (A, B, C) and 1 output (C) -- TODO confirm
+
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
+
+  long lx = __hpvm__getNodeInstanceID_x(thisNode); // x/y id of this instance
+  long ly = __hpvm__getNodeInstanceID_y(thisNode);
+
+  long gx = __hpvm__getNodeInstanceID_x(parentNode); // x/y id of the parent instance
+  long gy = __hpvm__getNodeInstanceID_y(parentNode);
+
+  long dimx = __hpvm__getNumNodeInstances_x(thisNode); // instance count along x; used as row stride of shB
+
+  float c[TILE_N]; // private accumulators, one per updated element of C
+  for (int i = 0; i < TILE_N; i++)
+    c[i] = 0.0f;
+
+  int mid = ly * dimx + lx;  // flattened (x, y) id of this instance
+  int m = gx * TILE_M + mid; // first-dimension index into A and C
+  int n = gy * TILE_N + lx;  // first-dimension index into B
+
+  for (int i = 0; i < k; i += TILE_TB_HEIGHT) { // consume k in steps of TILE_TB_HEIGHT
+    float a;
+    // shB[ly][lx] = B[n+(i+ly)*ldb];
+    shB[ly * dimx + lx] = B[n + (i + ly) * ldb]; // each instance stores one element of B
+
+    __hpvm__barrier(); // presumably: every store to shB completes before it is read
+    for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+      a = A[m + (i + j) * lda];
+      for (int kk = 0; kk < TILE_N; kk++) {
+        // c[kk] += a * shB[j][kk];
+        c[kk] += a * shB[j * dimx + kk]; // reads elements stored by the other instances
+      }
+    }
+    __hpvm__barrier(); // presumably: every read completes before shB is overwritten
+  }
+
+  int t = ldc * gy * TILE_N + m; // offset of the first element of C written here
+  for (int i = 0; i < TILE_N; i++) {
+    C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
+  }
+}
+
+// Work group node for sgemm - Creates allocation node and leaf (work item) node
+void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb,
+             float *C, size_t bytesC, int ldc, int k, float alpha, float beta,
+             long block_x, long block_y) {
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+  void *AllocationNode = __hpvm__createNodeND(0, Allocation); // created with no dimension arguments
+  void *SgemmLeafNode = __hpvm__createNodeND(2, SgemmLeaf, block_x, block_y); // presumably block_x by block_y instances
+
+  // Bind edges: SgemmTB input i is forwarded to SgemmLeaf input i
+  __hpvm__bindIn(SgemmLeafNode, 0, 0, 0);   // Bind A
+  __hpvm__bindIn(SgemmLeafNode, 1, 1, 0);   // Bind bytesA
+  __hpvm__bindIn(SgemmLeafNode, 2, 2, 0);   // Bind lda
+  __hpvm__bindIn(SgemmLeafNode, 3, 3, 0);   // Bind B
+  __hpvm__bindIn(SgemmLeafNode, 4, 4, 0);   // Bind bytesB
+  __hpvm__bindIn(SgemmLeafNode, 5, 5, 0);   // Bind ldb
+  __hpvm__bindIn(SgemmLeafNode, 6, 6, 0);   // Bind C
+  __hpvm__bindIn(SgemmLeafNode, 7, 7, 0);   // Bind bytesC
+  __hpvm__bindIn(SgemmLeafNode, 8, 8, 0);   // Bind ldc
+  __hpvm__bindIn(SgemmLeafNode, 9, 9, 0);   // Bind k
+  __hpvm__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
+  __hpvm__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
+
+  __hpvm__bindIn(AllocationNode, 12, 0, 0); // Bind block_x (SgemmTB input 12 -> Allocation input 0)
+  __hpvm__bindIn(AllocationNode, 13, 1, 0); // Bind block_y (SgemmTB input 13 -> Allocation input 1)
+
+  // Create Edges between AllocationNode and SgemmLeafNode
+  __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Allocation output 0 -> SgemmLeaf input 12 (shB)
+  __hpvm__edge(AllocationNode, SgemmLeafNode, 1, 1, 13,
+               0); // Allocation output 1 -> SgemmLeaf input 13 (bytesshB)
+}
+
+// Root node for sgemm - Creates work group node
+void SgemmRoot(float *A, size_t bytesA, int lda,                    // 0-2
+               float *B, size_t bytesB, int ldb,                    // 3-5
+               float *C, size_t bytesC, int ldc,                    // 6-8
+               int k, float alpha, float beta,                      // 9-11
+               long block_x, long block_y, long grid_x, long grid_y // 12-15
+) {
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+  void *SgemmTBNode = __hpvm__createNodeND(2, SgemmTB, grid_x, grid_y); // grid_x/grid_y are consumed here, not forwarded
+
+  // Bind edges: SgemmRoot input i is forwarded to SgemmTB input i (0-13)
+  __hpvm__bindIn(SgemmTBNode, 0, 0, 0);   // Bind A
+  __hpvm__bindIn(SgemmTBNode, 1, 1, 0);   // Bind bytesA
+  __hpvm__bindIn(SgemmTBNode, 2, 2, 0);   // Bind lda
+  __hpvm__bindIn(SgemmTBNode, 3, 3, 0);   // Bind B
+  __hpvm__bindIn(SgemmTBNode, 4, 4, 0);   // Bind bytesB
+  __hpvm__bindIn(SgemmTBNode, 5, 5, 0);   // Bind ldb
+  __hpvm__bindIn(SgemmTBNode, 6, 6, 0);   // Bind C
+  __hpvm__bindIn(SgemmTBNode, 7, 7, 0);   // Bind bytesC
+  __hpvm__bindIn(SgemmTBNode, 8, 8, 0);   // Bind ldc
+  __hpvm__bindIn(SgemmTBNode, 9, 9, 0);   // Bind k
+  __hpvm__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
+  __hpvm__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
+  __hpvm__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
+  __hpvm__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
+}
+
+void SgemmWrapper(float *A, size_t bytesA, int lda,                    // 0-2
+                  float *B, size_t bytesB, int ldb,                    // 3-5
+                  float *C, size_t bytesC, int ldc,                    // 6-8
+                  int k, float alpha, float beta,                      // 9-11
+                  long block_x, long block_y, long grid_x, long grid_y // 12-15
+) { // graph function handed to __hpvm__launch() by basicSgemm()
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C);
+  void *SgemmRootNode = __hpvm__createNodeND(0, SgemmRoot); // created with no dimension arguments
+
+  // Bind edges: all 16 inputs are forwarded unchanged to SgemmRoot
+  __hpvm__bindIn(SgemmRootNode, 0, 0, 0);   // Bind A
+  __hpvm__bindIn(SgemmRootNode, 1, 1, 0);   // Bind bytesA
+  __hpvm__bindIn(SgemmRootNode, 2, 2, 0);   // Bind lda
+  __hpvm__bindIn(SgemmRootNode, 3, 3, 0);   // Bind B
+  __hpvm__bindIn(SgemmRootNode, 4, 4, 0);   // Bind bytesB
+  __hpvm__bindIn(SgemmRootNode, 5, 5, 0);   // Bind ldb
+  __hpvm__bindIn(SgemmRootNode, 6, 6, 0);   // Bind C
+  __hpvm__bindIn(SgemmRootNode, 7, 7, 0);   // Bind bytesC
+  __hpvm__bindIn(SgemmRootNode, 8, 8, 0);   // Bind ldc
+  __hpvm__bindIn(SgemmRootNode, 9, 9, 0);   // Bind k
+  __hpvm__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha
+  __hpvm__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta
+  __hpvm__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x
+  __hpvm__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y
+  __hpvm__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x
+  __hpvm__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y
+}
+
+// Host entry point: checks the arguments, packs them into a RootIn record,
+// launches the SgemmWrapper graph and blocks until it completes. Unsupported
+// arguments are reported on stderr and the call returns without launching.
+__attribute__((noinline)) void basicSgemm(struct pb_TimerSet *timers,
+                                          char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
+    return;
+  }
+
+  long block_x = TILE_N;
+  long block_y = TILE_TB_HEIGHT;
+  long grid_x = m / TILE_M;
+  long grid_y = n / TILE_N;
+  // Pack data in struct
+  RootIn *args = (RootIn *)malloc(sizeof(RootIn));
+  if (args == NULL) {
+    std::cerr << "out of memory in basicSgemm()" << std::endl;
+    return;
+  }
+  packData(args, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta,
+           block_x, block_y, grid_x, grid_y);
+
+  pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION);
+  void *sgemmDFG = __hpvm__launch(0, SgemmWrapper, (void *)args);
+  __hpvm__wait(sgemmDFG);
+  free(args); // the graph has finished, so its argument record is released
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
+}
+
+// Benchmark driver: loads A and B^T from the input files, multiplies them
+// through basicSgemm(), optionally writes C, and reports timing information.
+int main(int argc, char *argv[]) {
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
+
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
+
+  /* Read in data: A from the first file, B^T from the third. Stop if either
+     cannot be loaded or holds fewer values than its header announces, since
+     the pointers handed to the runtime below assume rows * cols elements. */
+  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA) ||
+      !readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT) ||
+      matArow <= 0 || matAcol <= 0 || matBrow <= 0 || matBcol <= 0 ||
+      matA.size() < (size_t)matArow * matAcol ||
+      matBT.size() < (size_t)matBrow * matBcol) {
+    fprintf(stderr, "Cannot read input matrices\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  __hpvm__init();
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Byte sizes, computed in size_t so that large matrices cannot overflow int
+  A_sz = (size_t)matArow * matAcol * sizeof(float);
+  B_sz = (size_t)matBrow * matBcol * sizeof(float);
+  C_sz = (size_t)matArow * matBcol * sizeof(float);
+
+  std::vector<float> matC((size_t)matArow * matBcol);
+
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
+
+  // Use standard sgemm interface
+  basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(),
+             A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(),
+             C_sz, matArow);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  __hpvm__cleanup();
+
+  if (params->outFile) {
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
+
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
+
+  return 0;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=hpvm
+SRCDIR_OBJS=io.ll #compute_gold.o
+HPVM_OBJS=main.hpvm.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04744f404ebaf6e669c2bbe91600519742b57dc9
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/io.cc
@@ -0,0 +1,84 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+// Reads the whole of `fileName` into a newly allocated, NUL-terminated buffer.
+// Returns NULL (after logging to stderr) when the file cannot be opened;
+// otherwise the caller owns the buffer and releases it with delete[].
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  std::streamoff length = f.tellg(); // -1 when the size cannot be determined
+  f.seekg(0, std::ios::beg);
+
+  // Reserve one byte beyond the contents for the terminator, so that the last
+  // character of the file is kept and every path allocates with new[].
+  std::streamsize count = 0;
+  char *buffer = new char[length > 0 ? length + 1 : 1];
+  if (length > 0) {
+    f.read(buffer, length);
+    count = f.gcount(); // bytes actually delivered; may be fewer than length
+  }
+  buffer[count] = 0;
+
+  return buffer;
+}
+
+// Loads a matrix from text file `fn`: two integers (stored in nr_row and
+// nr_col) followed by whitespace-separated floats, which are appended to `v`.
+// Returns false if the file cannot be opened or the two integers are missing.
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  // Read # of rows and cols; give up if the file or this header is unusable
+  if (!f.good() || !(f >> nr_row >> nr_col)) {
+    return false;
+  }
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+
+  // Keep a value only when its extraction succeeded: checking the stream first
+  // stored a bogus element at EOF, and pop_back() could then drop a real one.
+  float data;
+  while (f >> data) {
+    v.push_back(data);
+  }
+  return true;
+}
+
+// Writes nr_row, nr_col and then every element of `v` to text file `fn`, each
+// followed by a space, and ends the file with a newline. Returns false if the
+// file cannot be opened or the data cannot be written out.
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if (!f.good()) {
+    return false;
+  }
+  // Write # of rows and cols
+  f << nr_row << " " << nr_col << " ";
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return f.flush().good(); // flush now so a failed write is reported
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be39d713d55d1cb518083679fb1ea1ce717a4ca9
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc/main.cc
@@ -0,0 +1,180 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <hpvm.h>
+#include <iostream>
+#include <malloc.h>
+#include <math.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <vector>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
+
+// Parameters of tile sizes
+#define TILE_SZ 16
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
+  }
+
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) { // kernel: each instance updates 4 consecutive elements of C
+  __hpvm__attributes(3, A, B, C, 1, C); // presumably 3 inputs (A, B, C) and 1 output (C) -- TODO confirm
+  float c0, c1, c2, c3; // accumulators for indices m .. m + 3
+  c0 = c1 = c2 = c3 = 0.0f;
+  int m = 4 * get_global_id(0); // first of the four first-dimension indices handled here
+  int n = get_global_id(1);     // second-dimension index of C; first-dimension index of B
+
+  for (int i = 0; i < k; ++i) {
+    float a0 = A[m + i * lda];
+    float a1 = A[m + 1 + i * lda];
+    float a2 = A[m + 2 + i * lda];
+    float a3 = A[m + 3 + i * lda];
+
+    float b = B[n + i * ldb]; // one element of B is reused for all four products
+
+    c0 += a0 * b;
+    c1 += a1 * b;
+    c2 += a2 * b;
+    c3 += a3 * b;
+  }
+  C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c0; // C = beta * C + alpha * sum
+  C[m + 1 + n * ldc] = C[m + 1 + n * ldc] * beta + alpha * c1;
+  C[m + 2 + n * ldc] = C[m + 2 + n * ldc] * beta + alpha * c2;
+  C[m + 3 + n * ldc] = C[m + 3 + n * ldc] * beta + alpha * c3;
+} // NOTE(review): basicSgemm() passes 12 values (incl. bytesA/bytesB/bytesC) to __hpvm__node, but this signature has 9 parameters -- confirm
+
+// Validates the arguments and runs mysgemmNT as an HPVM node, waiting for it
+// to finish. Unsupported arguments are logged and nothing is launched.
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
+    return; // do not launch: the grid below would cover only part of C
+  }
+
+  unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
+  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
+  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __hpvm__wait(sgemmDFG);
+}
+
+// Benchmark driver: loads A and B^T from the input files, multiplies them
+// through basicSgemm(), optionally writes C, and reports timing information.
+int main(int argc, char *argv[]) {
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
+
+  pb_InitializeTimerSet(&timers);
+  __hpvm__init();
+
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
+
+  /* Read in data: A from the first file, B^T from the third. Stop if either
+     cannot be loaded or holds fewer values than its header announces, since
+     the pointers handed to the runtime below assume rows * cols elements. */
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA) ||
+      !readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT) ||
+      matArow <= 0 || matAcol <= 0 || matBrow <= 0 || matBcol <= 0 ||
+      matA.size() < (size_t)matArow * matAcol ||
+      matBT.size() < (size_t)matBrow * matBcol) {
+    fprintf(stderr, "Cannot read input matrices\n");
+    exit(-1);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Byte sizes, computed in size_t so that large matrices cannot overflow int
+  A_sz = (size_t)matArow * matAcol * sizeof(float);
+  B_sz = (size_t)matBrow * matBcol * sizeof(float);
+  C_sz = (size_t)matArow * matBcol * sizeof(float);
+
+  std::vector<float> matC((size_t)matArow * matBcol);
+
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
+
+  if (params->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    /* Write C to file */
+    llvm_hpvm_request_mem(&matC.front(), C_sz);
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
+
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_PrintTimerSet(&timers);
+  __hpvm__cleanup();
+  pb_FreeParameters(params);
+
+  return 0;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=hpvm
+SRCDIR_OBJS=io.ll #compute_gold.o
+HPVM_OBJS=main.hpvm.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04744f404ebaf6e669c2bbe91600519742b57dc9
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/io.cc
@@ -0,0 +1,84 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+// Reads the whole of `fileName` into a newly allocated, NUL-terminated buffer.
+// Returns NULL (after logging to stderr) when the file cannot be opened;
+// otherwise the caller owns the buffer and releases it with delete[].
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  std::streamoff length = f.tellg(); // -1 when the size cannot be determined
+  f.seekg(0, std::ios::beg);
+
+  // Reserve one byte beyond the contents for the terminator, so that the last
+  // character of the file is kept and every path allocates with new[].
+  std::streamsize count = 0;
+  char *buffer = new char[length > 0 ? length + 1 : 1];
+  if (length > 0) {
+    f.read(buffer, length);
+    count = f.gcount(); // bytes actually delivered; may be fewer than length
+  }
+  buffer[count] = 0;
+
+  return buffer;
+}
+
+// Loads a matrix from text file `fn`: two integers (stored in nr_row and
+// nr_col) followed by whitespace-separated floats, which are appended to `v`.
+// Returns false if the file cannot be opened or the two integers are missing.
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  // Read # of rows and cols; give up if the file or this header is unusable
+  if (!f.good() || !(f >> nr_row >> nr_col)) {
+    return false;
+  }
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+
+  // Keep a value only when its extraction succeeded: checking the stream first
+  // stored a bogus element at EOF, and pop_back() could then drop a real one.
+  float data;
+  while (f >> data) {
+    v.push_back(data);
+  }
+  return true;
+}
+
+// Writes nr_row, nr_col and then every element of `v` to text file `fn`, each
+// followed by a space, and ends the file with a newline. Returns false if the
+// file cannot be opened or the data cannot be written out.
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if (!f.good()) {
+    return false;
+  }
+  // Write # of rows and cols
+  f << nr_row << " " << nr_col << " ";
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return f.flush().good(); // flush now so a failed write is reported
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be39d713d55d1cb518083679fb1ea1ce717a4ca9
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_tc_vec/main.cc
@@ -0,0 +1,180 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <hpvm.h>
+#include <iostream>
+#include <malloc.h>
+#include <math.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <vector>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
+
+// Parameters of tile sizes
+#define TILE_SZ 16
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
+  }
+
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) { // kernel: each instance updates 4 consecutive elements of C
+  __hpvm__attributes(3, A, B, C, 1, C); // presumably 3 inputs (A, B, C) and 1 output (C) -- TODO confirm
+  float c0, c1, c2, c3; // accumulators for indices m .. m + 3
+  c0 = c1 = c2 = c3 = 0.0f;
+  int m = 4 * get_global_id(0); // first of the four first-dimension indices handled here
+  int n = get_global_id(1);     // second-dimension index of C; first-dimension index of B
+
+  for (int i = 0; i < k; ++i) {
+    float a0 = A[m + i * lda];
+    float a1 = A[m + 1 + i * lda];
+    float a2 = A[m + 2 + i * lda];
+    float a3 = A[m + 3 + i * lda];
+
+    float b = B[n + i * ldb]; // one element of B is reused for all four products
+
+    c0 += a0 * b;
+    c1 += a1 * b;
+    c2 += a2 * b;
+    c3 += a3 * b;
+  }
+  C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c0; // C = beta * C + alpha * sum
+  C[m + 1 + n * ldc] = C[m + 1 + n * ldc] * beta + alpha * c1;
+  C[m + 2 + n * ldc] = C[m + 2 + n * ldc] * beta + alpha * c2;
+  C[m + 3 + n * ldc] = C[m + 3 + n * ldc] * beta + alpha * c3;
+} // NOTE(review): basicSgemm() passes 12 values (incl. bytesA/bytesB/bytesC) to __hpvm__node, but this signature has 9 parameters -- confirm
+
+// Validates the arguments and runs mysgemmNT as an HPVM node, waiting for it
+// to finish. Unsupported arguments are logged and nothing is launched.
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
+    return; // do not launch: the grid below would cover only part of C
+  }
+
+  unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
+  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
+  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __hpvm__wait(sgemmDFG);
+}
+
+// Benchmark driver: loads A and B^T from the input files, multiplies them
+// through basicSgemm(), optionally writes C, and reports timing information.
+int main(int argc, char *argv[]) {
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
+
+  pb_InitializeTimerSet(&timers);
+  __hpvm__init();
+
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
+
+  /* Read in data: A from the first file, B^T from the third. Stop if either
+     cannot be loaded or holds fewer values than its header announces, since
+     the pointers handed to the runtime below assume rows * cols elements. */
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA) ||
+      !readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT) ||
+      matArow <= 0 || matAcol <= 0 || matBrow <= 0 || matBcol <= 0 ||
+      matA.size() < (size_t)matArow * matAcol ||
+      matBT.size() < (size_t)matBrow * matBcol) {
+    fprintf(stderr, "Cannot read input matrices\n");
+    exit(-1);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Byte sizes, computed in size_t so that large matrices cannot overflow int
+  A_sz = (size_t)matArow * matAcol * sizeof(float);
+  B_sz = (size_t)matBrow * matBcol * sizeof(float);
+  C_sz = (size_t)matArow * matBcol * sizeof(float);
+
+  std::vector<float> matC((size_t)matArow * matBcol);
+
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
+
+  if (params->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    /* Write C to file */
+    llvm_hpvm_request_mem(&matC.front(), C_sz);
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
+
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_PrintTimerSet(&timers);
+  __hpvm__cleanup();
+  pb_FreeParameters(params);
+
+  return 0;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=hpvm
+SRCDIR_OBJS=io.ll #compute_gold.o
+HPVM_OBJS=main.hpvm.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04744f404ebaf6e669c2bbe91600519742b57dc9
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/io.cc
@@ -0,0 +1,84 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+// Reads the whole file into a new[]-allocated, NUL-terminated buffer that the
+// caller releases with delete[]; returns NULL if the file cannot be opened.
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  // Measure the file by seeking to its end; tellg() reports -1 on failure.
+  f.seekg(0, std::ios::end);
+  std::streamoff length = f.tellg();
+  f.seekg(0, std::ios::beg);
+  char *buffer;
+  if (length > 0) {
+    // One extra byte for the terminator so the last file byte is preserved.
+    buffer = new char[length + 1];
+    f.read(buffer, length);
+    buffer[f.gcount()] = 0; // terminate after the bytes actually read
+  } else {
+    buffer = new char[1]; // array form, so delete[] is valid on every path
+    buffer[0] = 0;
+  }
+  f.close();
+  return buffer;
+}
+
+// Reads "rows cols v0 v1 ..." from fn and appends the values to v in file
+// order. Returns false only when the file cannot be opened.
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if (!f.good()) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f >> nr_row;
+  f >> nr_col;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  // Store a value only after it was extracted successfully, so there is no
+  // duplicated last element to pop and nothing is popped from an empty vector.
+  float data;
+  while (f >> data)
+    v.push_back(data);
+  return true;
+}
+
+// Writes "rows cols v0 v1 ... \n" to fn, i.e. the layout that
+// readColMajorMatrixFile parses. Returns false if fn cannot be opened.
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if (!f.good()) {
+    return false;
+  }
+
+  // Write # of rows and cols
+  f << nr_row << " " << nr_col << " ";
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..7530a400759e2d6db6ffd466c3f6aaf9dfab2117
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/kernel.cl
@@ -0,0 +1,53 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Dense matrix-matrix multiplication kernel.
+ */
+
+__kernel void mysgemmNT( __global float *A, size_t bytesA, int lda, __global float *B, size_t bytesB, int ldb, __global float* C, size_t bytesC, int ldc, int k, float alpha, float beta )
+{
+/* Disabled scalar formulation of the float8 code below, kept for reference.
+    // Partial results
+    float c[8];
+    for (int i=0; i < 8; i++)
+        c[i] = 0.0f;
+    float a[8];
+    float b[8];
+
+    int m = get_global_id(0) * 8;
+    int n = get_global_id(1);
+
+    for (int i = 0; i < k; ++i) {
+        for (int id = 0; id < 8; id++) {
+            a[id] = A[m + id + i * lda];
+            b[id] = B[n + i * ldb];
+            c[id] += a[id] * b[id];
+        }
+    }
+
+    for (int id = 0; id < 8; id++)
+        C[m+id+n*ldc] = C[m+id+n*ldc] * beta + alpha * c[id];
+*/
+    // Each work-item updates the 8 floats of C starting at offset m + n * ldc.
+    // Partial results
+    float8 cp = (float8)(0.0f);
+
+    int m = get_global_id(0) * 8;
+    int n = get_global_id(1);
+
+    for (int i = 0; i < k; ++i) {
+        float8 a = vload8(0, A + (m + i * lda)); // 8 contiguous floats of A
+        float8 b = (float8)(B[n + i * ldb]); // single B value in all 8 lanes
+        cp += a * b;
+    }
+
+    float8 c = vload8(0, C + (m+n*ldc));
+    c = c * beta + alpha * cp; // scale the old C by beta, add alpha * sums
+    vstore8(c, 0, C + (m+n*ldc));
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..286297d6fefe0b6f72bdc9e8a9079a131a7b16bf
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec/main.cc
@@ -0,0 +1,189 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <hpvm.h>
+#include <iostream>
+#include <malloc.h>
+#include <math.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <vector>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
+
+// Parameters of tile sizes
+#define TILE_SZ 16
+#define VEC_SZ 8
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
+  }
+
+// Scalar sgemm node body: each invocation updates C[m + n * ldc] in place.
+// NOTE(review): basicSgemm launches it with 12 arguments, 9 are declared here.
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) {
+  __hpvm__hint(hpvm::GPU_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C); // presumably 3 inputs, 1 output - confirm
+  float c = 0.0f;
+  int m = get_global_id(0);
+  int n = get_global_id(1);
+  for (int i = 0; i < k; ++i) { // c = sum over i of A[m + i*lda] * B[n + i*ldb]
+    float a = A[m + i * lda];
+    float b = B[n + i * ldb];
+    c += a * b;
+  }
+  C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c;
+  /*
+      Will be substituted by this kernel at the llvm level
+      // Partial results
+      float8 cp = (float8)(0.0f);
+
+      int m = get_global_id(0) * 8;
+      int n = get_global_id(1);
+
+      for (int i = 0; i < k; ++i) {
+          float8 a = vload8(0, A + (m + i * lda));
+          float8 b = (float8)(B[n + i * ldb]);
+          cp += a * b;
+      }
+
+      float8 c = vload8(0, C + (m+n*ldc));
+      c = c * beta + alpha * cp;
+      vstore8(c, 0, C + (m+n*ldc));
+  */
+}
+
+// Validates the transpose flags and matrix sizes, then launches the mysgemmNT
+// node and waits on it. Logs to stderr and returns early on invalid input.
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
+    return; // the grid computed below would not cover the matrices
+  }
+
+  unsigned db[2] = {TILE_SZ / VEC_SZ, TILE_SZ};
+  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
+  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __hpvm__wait(sgemmDFG);
+}
+
+// Benchmark driver: reads A and B^T, runs basicSgemm through the HPVM runtime,
+// optionally writes C, and prints the timers and a GFLOPs figure.
+int main(int argc, char *argv[]) {
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
+
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
+
+  /* Read in data: A, then B^T. Stop if a file cannot be opened; otherwise the
+     dimensions stay uninitialized and the buffers tracked below are empty. */
+  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA) ||
+      !readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT)) {
+    fprintf(stderr, "Cannot read input matrices\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  __hpvm__init();
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // sizes in bytes of A, B^T and C
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
+
+  // host buffer for C
+  std::vector<float> matC(matArow * matBcol);
+
+  // register the host buffers with the HPVM memory tracker
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
+  // zero-initialize C
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
+
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  __hpvm__cleanup();
+
+  if (params->outFile) {
+    /* Write C to file */
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
+
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
+
+  return 0;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2234bf54e1e665f95b38dd0e25c2fe1b5539ce4e
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=hpvm
+SRCDIR_OBJS=io.ll #compute_gold.o
+HPVM_OBJS=main.hpvm.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04744f404ebaf6e669c2bbe91600519742b57dc9
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/io.cc
@@ -0,0 +1,84 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+// Reads the whole file into a new[]-allocated, NUL-terminated buffer that the
+// caller releases with delete[]; returns NULL if the file cannot be opened.
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  // Measure the file by seeking to its end; tellg() reports -1 on failure.
+  f.seekg(0, std::ios::end);
+  std::streamoff length = f.tellg();
+  f.seekg(0, std::ios::beg);
+  char *buffer;
+  if (length > 0) {
+    // One extra byte for the terminator so the last file byte is preserved.
+    buffer = new char[length + 1];
+    f.read(buffer, length);
+    buffer[f.gcount()] = 0; // terminate after the bytes actually read
+  } else {
+    buffer = new char[1]; // array form, so delete[] is valid on every path
+    buffer[0] = 0;
+  }
+  f.close();
+  return buffer;
+}
+
+// Reads "rows cols v0 v1 ..." from fn and appends the values to v in file
+// order. Returns false only when the file cannot be opened.
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if (!f.good()) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f >> nr_row;
+  f >> nr_col;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  // Store a value only after it was extracted successfully, so there is no
+  // duplicated last element to pop and nothing is popped from an empty vector.
+  float data;
+  while (f >> data)
+    v.push_back(data);
+  return true;
+}
+
+// Writes "rows cols v0 v1 ... \n" to fn, i.e. the layout that
+// readColMajorMatrixFile parses. Returns false if fn cannot be opened.
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if (!f.good()) {
+    return false;
+  }
+
+  // Write # of rows and cols
+  f << nr_row << " " << nr_col << " ";
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..cc6e708148f40c80186004d3febd66988c67ae37
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/kernel.cl
@@ -0,0 +1,86 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Dense matrix-matrix multiplication kernel.
+ */
+
+// Parameters of tile sizes
+#define TILE_N 8
+#define TILE_TB_HEIGHT 8
+#define TILE_M (TILE_N*TILE_TB_HEIGHT)
+
+/* Each work-item accumulates TILE_N partial results from A[m + (i+j)*lda] and
+ * TILE_N consecutive B values, then applies C = beta * C + alpha * partial.
+ * The i/j loops read indices up to i + TILE_TB_HEIGHT - 1, so k must be a multiple of it. */
+__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
+{
+
+    float c[TILE_N];
+    for (int i=0; i < TILE_N; i++)
+        c[i] = 0.0f;
+
+    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
+    int m = get_group_id(0) * TILE_M + mid;
+
+    int b_base = 0;
+
+    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
+        float a;
+        b_base = get_group_id(1) * TILE_N + i * ldb;
+
+        for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+            a = A[m + (i+j)*lda];
+            for (int kk = 0; kk < TILE_N; kk++)
+                c[kk] += a * B[b_base + j * ldb + kk];
+
+        }
+    }
+    int t = ldc * get_group_id(1) * TILE_N + m;
+    for (int i = 0; i < TILE_N; i++) {
+        C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
+    }
+/*
+    Will be substituted by this kernel at the llvm level
+
+    // Partial results
+    floatn cp = (floatn)(0.0f);
+
+    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
+    int m = get_group_id(0) * TILE_M + mid;
+
+    int b_base = 0;
+
+    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
+        float a;
+        b_base = get_group_id(1) * TILE_N + i * ldb;
+
+        for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+            a = A[m + (i+j)*lda];
+            cp += a * vloadn(0, B + b_base + j * ldb);
+        }
+    }
+
+    cp = alpha * cp;
+    float c[TILE_N];
+    c[0] = cp.s0;
+    c[1] = cp.s1;
+    c[2] = cp.s2;
+    c[3] = cp.s3;
+    c[4] = cp.s4;
+    c[5] = cp.s5;
+    c[6] = cp.s6;
+    c[7] = cp.s7;
+
+    int t = ldc * get_group_id(1) * TILE_N + m;
+    for (int i = 0; i < TILE_N; i++) {
+        C[t+i*ldc] = C[t+i*ldc] * beta + c[i];
+    }
+
+*/
+}
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8fbc45e08a9e2fd1e3af6cc03360086b354665d7
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/hpvm_vec_opt/main.cc
@@ -0,0 +1,227 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <hpvm.h>
+#include <iostream>
+#include <malloc.h>
+#include <math.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <vector>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
+
+// Parameters of tile sizes
+#define TILE_N 8
+#define TILE_TB_HEIGHT 8
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
+  }
+
+// Tiled sgemm node body: each invocation accumulates TILE_N partial results
+// and updates C[t + i * ldc], i = 0..TILE_N-1, in place.
+// NOTE(review): basicSgemm launches it with 12 arguments, 9 are declared here.
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) {
+  __hpvm__hint(hpvm::SPIR_TARGET);
+  __hpvm__attributes(3, A, B, C, 1, C); // presumably 3 inputs, 1 output - confirm
+  float c[TILE_N];
+  for (int i = 0; i < TILE_N; i++)
+    c[i] = 0.0f;
+
+  int mid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+  int m = get_group_id(0) * TILE_M + mid;
+  int b_base = 0;
+  for (int i = 0; i < k; i += TILE_TB_HEIGHT) { // reads i + j up to i + 7
+    float a;
+    b_base = get_group_id(1) * TILE_N + i * ldb;
+
+    for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+      a = A[m + (i + j) * lda];
+      for (int kk = 0; kk < TILE_N; kk++)
+        c[kk] += a * B[b_base + j * ldb + kk];
+    }
+  }
+  int t = ldc * get_group_id(1) * TILE_N + m;
+  for (int i = 0; i < TILE_N; i++) {
+    C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
+  }
+  /*
+      Will be substituted by this kernel at the llvm level
+
+      // Partial results
+      floatn cp = (floatn)(0.0f);
+
+      int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
+      int m = get_group_id(0) * TILE_M + mid;
+
+      int b_base = 0;
+
+      for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
+          float a;
+          b_base = get_group_id(1) * TILE_N + i * ldb;
+
+          for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+              a = A[m + (i+j)*lda];
+              cp += a * vloadn(0, B + b_base + j * ldb);
+          }
+      }
+
+      cp = alpha * cp;
+      float c[TILE_N];
+      c[0] = cp.s0;
+      c[1] = cp.s1;
+      c[2] = cp.s2;
+      c[3] = cp.s3;
+      c[4] = cp.s4;
+      c[5] = cp.s5;
+      c[6] = cp.s6;
+      c[7] = cp.s7;
+
+      int t = ldc * get_group_id(1) * TILE_N + m;
+      for (int i = 0; i < TILE_N; i++) {
+          C[t+i*ldc] = C[t+i*ldc] * beta + c[i];
+      }
+
+  */
+}
+
+// Guards the transpose flags and tile divisibility, then launches a single
+// mysgemmNT node and waits on it; every rejected input is logged to stderr.
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  const bool transaIsN = (transa == 'N') || (transa == 'n');
+  if (!transaIsN) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+  const bool transbIsT = (transb == 'T') || (transb == 't');
+  if (!transbIsT) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+  // Matrix sizes are assumed to be whole multiples of the tile sizes
+  if ((m % TILE_M != 0) || (n % TILE_N != 0)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
+    return;
+  }
+
+  const unsigned blockX = TILE_N, blockY = TILE_TB_HEIGHT;
+  const unsigned gridX = m * TILE_N / TILE_M;
+  const unsigned gridY = n * TILE_TB_HEIGHT / TILE_N;
+  void *sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, blockX, blockY, gridX / blockX,
+                                gridY / blockY, 12, A, bytesA, lda, B, bytesB,
+                                ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __hpvm__wait(sgemmDFG);
+}
+
+// Benchmark driver: reads A and B^T, runs basicSgemm through the HPVM runtime,
+// optionally writes C, and prints the timers and a GFLOPs figure.
+int main(int argc, char *argv[]) {
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
+
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
+
+  /* Read in data: A, then B^T. Stop if a file cannot be opened; otherwise the
+     dimensions stay uninitialized and the buffers tracked below are empty. */
+  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA) ||
+      !readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT)) {
+    fprintf(stderr, "Cannot read input matrices\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  __hpvm__init();
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // sizes in bytes of A, B^T and C
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
+
+  // host buffer for C
+  std::vector<float> matC(matArow * matBcol);
+
+  // register the host buffers with the HPVM memory tracker
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(&matA.front(), A_sz);
+  llvm_hpvm_track_mem(&matBT.front(), B_sz);
+  llvm_hpvm_track_mem(&matC.front(), C_sz);
+
+  // zero-initialize C
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_hpvm_request_mem(&matC.front(), C_sz);
+
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(&matA.front());
+  llvm_hpvm_untrack_mem(&matBT.front());
+  llvm_hpvm_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  __hpvm__cleanup();
+
+  if (params->outFile) {
+    /* Write C to file */
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
+
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
+
+  return 0;
+}
diff --git a/hpvm/test/parboil/benchmarks/spmv/Makefile b/hpvm/test/parboil/benchmarks/spmv/Makefile
index 0a85b9253f12d6a084df7347677353be04b4d367..b0582e60a05d1a81b2facaf169f6dbd2d70ad8dd 100644
--- a/hpvm/test/parboil/benchmarks/spmv/Makefile
+++ b/hpvm/test/parboil/benchmarks/spmv/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = spmv
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile
similarity index 88%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile
index a289d68f342ba488f8ce4d90faf26816d4d00829..06af6bebea2aa6a94f56196e0399a25ebfdda030 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/Makefile
@@ -1,9 +1,9 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 TOOLS_SRC=common_src/convert-dataset
 SRCDIR_OBJS=gpu_info.ll file.ll
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm
 APP_CFLAGS=-ffast-math -O1 -I$(TOOLS_SRC)
 APP_CXXFLAGS=-ffast-math -O1 -I$(TOOLS_SRC)
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.cpp
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.cpp
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/file.h
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/file.h
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.cpp
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.cpp
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/gpu_info.h
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/kernel.cl
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp
similarity index 68%
rename from hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
rename to hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp
index 4f72d2000afd70a986c8a1c82aa06866e2606511..4414744b4995a9ae09bb88fdda297150dfbe1031 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/spmv/src/hpvm/main.cpp
@@ -8,15 +8,15 @@
 
 //#include <CL/cl.h>
 //#include <CL/cl_ext.h>
+#include <hpvm.h>
 #include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <visc.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
-#include "convert_dataset.h"
 
 #define WARP_BITS 5
 
@@ -54,15 +54,15 @@ void spmv_jds(float *dst_vector, size_t bytes_dst_vector, float *d_data,
               size_t bytes_x_vec, int dim, int *jds_ptr_int,
               size_t bytes_jds_ptr_int, int *sh_zcnt_int,
               size_t bytes_sh_zcnt_int) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
                      sh_zcnt_int, 1, dst_vector);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gridx = __visc__getNumNodeInstances_x(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gridx = __hpvm__getNumNodeInstances_x(thisNode);
 
   int ix = gx * gridx + lx;
   int warp_id = ix >> WARP_BITS;
@@ -126,25 +126,25 @@ void spmvLvl1(float *dst_vector, size_t bytes_dst_vector, float *d_data,
               size_t bytes_x_vec, int dim, int *jds_ptr_int,
               size_t bytes_jds_ptr_int, int *sh_zcnt_int,
               size_t bytes_sh_zcnt_int, size_t dim_X1) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
                      sh_zcnt_int, 1, dst_vector);
-  void *spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1);
-  __visc__bindIn(spmv_node, 0, 0, 0);
-  __visc__bindIn(spmv_node, 1, 1, 0);
-  __visc__bindIn(spmv_node, 2, 2, 0);
-  __visc__bindIn(spmv_node, 3, 3, 0);
-  __visc__bindIn(spmv_node, 4, 4, 0);
-  __visc__bindIn(spmv_node, 5, 5, 0);
-  __visc__bindIn(spmv_node, 6, 6, 0);
-  __visc__bindIn(spmv_node, 7, 7, 0);
-  __visc__bindIn(spmv_node, 8, 8, 0);
-  __visc__bindIn(spmv_node, 9, 9, 0);
-  __visc__bindIn(spmv_node, 10, 10, 0);
-  __visc__bindIn(spmv_node, 11, 11, 0);
-  __visc__bindIn(spmv_node, 12, 12, 0);
-  __visc__bindIn(spmv_node, 13, 13, 0);
-  __visc__bindIn(spmv_node, 14, 14, 0);
+  void *spmv_node = __hpvm__createNodeND(1, spmv_jds, dim_X1);
+  __hpvm__bindIn(spmv_node, 0, 0, 0);
+  __hpvm__bindIn(spmv_node, 1, 1, 0);
+  __hpvm__bindIn(spmv_node, 2, 2, 0);
+  __hpvm__bindIn(spmv_node, 3, 3, 0);
+  __hpvm__bindIn(spmv_node, 4, 4, 0);
+  __hpvm__bindIn(spmv_node, 5, 5, 0);
+  __hpvm__bindIn(spmv_node, 6, 6, 0);
+  __hpvm__bindIn(spmv_node, 7, 7, 0);
+  __hpvm__bindIn(spmv_node, 8, 8, 0);
+  __hpvm__bindIn(spmv_node, 9, 9, 0);
+  __hpvm__bindIn(spmv_node, 10, 10, 0);
+  __hpvm__bindIn(spmv_node, 11, 11, 0);
+  __hpvm__bindIn(spmv_node, 12, 12, 0);
+  __hpvm__bindIn(spmv_node, 13, 13, 0);
+  __hpvm__bindIn(spmv_node, 14, 14, 0);
 }
 
 void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data,
@@ -153,26 +153,26 @@ void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data,
               size_t bytes_x_vec, int dim, int *jds_ptr_int,
               size_t bytes_jds_ptr_int, int *sh_zcnt_int,
               size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
                      sh_zcnt_int, 1, dst_vector);
-  void *spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2);
-  __visc__bindIn(spmv_node, 0, 0, 0);
-  __visc__bindIn(spmv_node, 1, 1, 0);
-  __visc__bindIn(spmv_node, 2, 2, 0);
-  __visc__bindIn(spmv_node, 3, 3, 0);
-  __visc__bindIn(spmv_node, 4, 4, 0);
-  __visc__bindIn(spmv_node, 5, 5, 0);
-  __visc__bindIn(spmv_node, 6, 6, 0);
-  __visc__bindIn(spmv_node, 7, 7, 0);
-  __visc__bindIn(spmv_node, 8, 8, 0);
-  __visc__bindIn(spmv_node, 9, 9, 0);
-  __visc__bindIn(spmv_node, 10, 10, 0);
-  __visc__bindIn(spmv_node, 11, 11, 0);
-  __visc__bindIn(spmv_node, 12, 12, 0);
-  __visc__bindIn(spmv_node, 13, 13, 0);
-  __visc__bindIn(spmv_node, 14, 14, 0);
-  __visc__bindIn(spmv_node, 15, 15, 0);
+  void *spmv_node = __hpvm__createNodeND(1, spmvLvl1, dim_X2);
+  __hpvm__bindIn(spmv_node, 0, 0, 0);
+  __hpvm__bindIn(spmv_node, 1, 1, 0);
+  __hpvm__bindIn(spmv_node, 2, 2, 0);
+  __hpvm__bindIn(spmv_node, 3, 3, 0);
+  __hpvm__bindIn(spmv_node, 4, 4, 0);
+  __hpvm__bindIn(spmv_node, 5, 5, 0);
+  __hpvm__bindIn(spmv_node, 6, 6, 0);
+  __hpvm__bindIn(spmv_node, 7, 7, 0);
+  __hpvm__bindIn(spmv_node, 8, 8, 0);
+  __hpvm__bindIn(spmv_node, 9, 9, 0);
+  __hpvm__bindIn(spmv_node, 10, 10, 0);
+  __hpvm__bindIn(spmv_node, 11, 11, 0);
+  __hpvm__bindIn(spmv_node, 12, 12, 0);
+  __hpvm__bindIn(spmv_node, 13, 13, 0);
+  __hpvm__bindIn(spmv_node, 14, 14, 0);
+  __hpvm__bindIn(spmv_node, 15, 15, 0);
 }
 
 void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data,
@@ -181,27 +181,27 @@ void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data,
               size_t bytes_x_vec, int dim, int *jds_ptr_int,
               size_t bytes_jds_ptr_int, int *sh_zcnt_int,
               size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
                      sh_zcnt_int, 1, dst_vector);
-  void *spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2);
-  __visc__bindIn(spmv_node, 0, 0, 0);
-  __visc__bindIn(spmv_node, 1, 1, 0);
-  __visc__bindIn(spmv_node, 2, 2, 0);
-  __visc__bindIn(spmv_node, 3, 3, 0);
-  __visc__bindIn(spmv_node, 4, 4, 0);
-  __visc__bindIn(spmv_node, 5, 5, 0);
-  __visc__bindIn(spmv_node, 6, 6, 0);
-  __visc__bindIn(spmv_node, 7, 7, 0);
-  __visc__bindIn(spmv_node, 8, 8, 0);
-  __visc__bindIn(spmv_node, 9, 9, 0);
-  __visc__bindIn(spmv_node, 10, 10, 0);
-  __visc__bindIn(spmv_node, 11, 11, 0);
-  __visc__bindIn(spmv_node, 12, 12, 0);
-  __visc__bindIn(spmv_node, 13, 13, 0);
-  __visc__bindIn(spmv_node, 14, 14, 0);
-  __visc__bindIn(spmv_node, 15, 15, 0);
-  __visc__bindIn(spmv_node, 16, 16, 0);
+  void *spmv_node = __hpvm__createNodeND(1, spmvLvl2, dim_X2);
+  __hpvm__bindIn(spmv_node, 0, 0, 0);
+  __hpvm__bindIn(spmv_node, 1, 1, 0);
+  __hpvm__bindIn(spmv_node, 2, 2, 0);
+  __hpvm__bindIn(spmv_node, 3, 3, 0);
+  __hpvm__bindIn(spmv_node, 4, 4, 0);
+  __hpvm__bindIn(spmv_node, 5, 5, 0);
+  __hpvm__bindIn(spmv_node, 6, 6, 0);
+  __hpvm__bindIn(spmv_node, 7, 7, 0);
+  __hpvm__bindIn(spmv_node, 8, 8, 0);
+  __hpvm__bindIn(spmv_node, 9, 9, 0);
+  __hpvm__bindIn(spmv_node, 10, 10, 0);
+  __hpvm__bindIn(spmv_node, 11, 11, 0);
+  __hpvm__bindIn(spmv_node, 12, 12, 0);
+  __hpvm__bindIn(spmv_node, 13, 13, 0);
+  __hpvm__bindIn(spmv_node, 14, 14, 0);
+  __hpvm__bindIn(spmv_node, 15, 15, 0);
+  __hpvm__bindIn(spmv_node, 16, 16, 0);
 }
 
 int main(int argc, char **argv) {
@@ -261,7 +261,7 @@ int main(int argc, char **argv) {
   input_vec(parameters->inpFiles[1], h_x_vector, dim);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   memset(h_Ax_vector, 0, dim * sizeof(float));
@@ -271,14 +271,14 @@ int main(int argc, char **argv) {
 
   compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-  llvm_visc_track_mem(h_Ax_vector, dim * sizeof(float));
-  llvm_visc_track_mem(h_data, len * sizeof(float));
-  llvm_visc_track_mem(h_indices, len * sizeof(int));
-  llvm_visc_track_mem(h_perm, dim * sizeof(int));
-  llvm_visc_track_mem(h_x_vector, dim * sizeof(float));
-  llvm_visc_track_mem(h_ptr, depth * sizeof(int));
-  llvm_visc_track_mem(h_nzcnt, nzcnt_len * sizeof(int));
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(h_Ax_vector, dim * sizeof(float));
+  llvm_hpvm_track_mem(h_data, len * sizeof(float));
+  llvm_hpvm_track_mem(h_indices, len * sizeof(int));
+  llvm_hpvm_track_mem(h_perm, dim * sizeof(int));
+  llvm_hpvm_track_mem(h_x_vector, dim * sizeof(float));
+  llvm_hpvm_track_mem(h_ptr, depth * sizeof(int));
+  llvm_hpvm_track_mem(h_nzcnt, nzcnt_len * sizeof(int));
 
   // main execution
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
@@ -306,9 +306,9 @@ int main(int argc, char **argv) {
                             block,
                             (grid / block)};
     *(RootIn *)root_in = root_in_local;
-    void *spmvDFG = __visc__launch(0, spmvLvl3, root_in);
+    void *spmvDFG = __hpvm__launch(0, spmvLvl3, root_in);
 
-    __visc__wait(spmvDFG);
+    __hpvm__wait(spmvDFG);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
     /******************************* Issues *******************************
@@ -326,21 +326,21 @@ int main(int argc, char **argv) {
 
   // HtoD memory copy
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(h_Ax_vector, dim * sizeof(float));
+  llvm_hpvm_request_mem(h_Ax_vector, dim * sizeof(float));
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
 
-  llvm_visc_untrack_mem(h_Ax_vector);
-  llvm_visc_untrack_mem(h_data);
-  llvm_visc_untrack_mem(h_indices);
-  llvm_visc_untrack_mem(h_perm);
-  llvm_visc_untrack_mem(h_x_vector);
-  llvm_visc_untrack_mem(h_ptr);
-  llvm_visc_untrack_mem(h_nzcnt);
+  llvm_hpvm_untrack_mem(h_Ax_vector);
+  llvm_hpvm_untrack_mem(h_data);
+  llvm_hpvm_untrack_mem(h_indices);
+  llvm_hpvm_untrack_mem(h_perm);
+  llvm_hpvm_untrack_mem(h_x_vector);
+  llvm_hpvm_untrack_mem(h_ptr);
+  llvm_hpvm_untrack_mem(h_nzcnt);
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   pb_PrintTimerSet(&timers);
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc
deleted file mode 100644
index b804d14d16cff805c0c1850d1f5079ab6e973ecf..0000000000000000000000000000000000000000
Binary files a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.bc and /dev/null differ
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll
deleted file mode 100644
index 5604d70e8a005ee7e21c5ae9bf6dbf0dbac77d15..0000000000000000000000000000000000000000
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.visc.ll.kernels.ll
+++ /dev/null
@@ -1,138 +0,0 @@
-; ModuleID = 'build/visc_default/main.visc.ll.kernels.bc'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-target triple = "spir64-unknown-unknown"
-
-%rtype = type {}
-
-; Function Attrs: optsize zeroext
-define void @spmv_jds(float* %dst_vector, i64 %bytes_dst_vector, float* %d_data, i64 %bytes_d_data, i32* %d_index, i64 %bytes_d_index, i32* %d_perm, i64 %bytes_d_perm, float* %x_vec, i64 %bytes_x_vec, i32 %dim, i32* %jds_ptr_int, i64 %bytes_jds_ptr_int, i32* %sh_zcnt_int, i64 %bytes_sh_zcnt_int) #0 {
-entry:
-  ;%0 = call i64 @_Z12get_group_idj(i32 0)
-  ;%1 = trunc i64 %0 to i32
-  ;%2 = call i64 @_Z14get_local_sizej(i32 0)
-  ;%3 = trunc i64 %2 to i32
-  ;%4 = mul i32 %1, %3
-  ;%5 = call i64 @_Z12get_local_idj(i32 0)
-  ;%6 = trunc i64 %5 to i32
-  ;%7 = add i32 %4, %6
-  %0 = add i32 0, 0
-  %1 = add i32 0, 0
-  %2 = add i32 0, 0
-  %3 = add i32 0, 0
-  %4 = add i32 0, 0
-  %5 = add i32 0, 0
-  %6 = call i64 @_Z13get_global_idj(i32 0)
-  %7 = trunc i64 %6 to i32
-  %cmp = icmp slt i32 %7, %dim
-  br i1 %cmp, label %if.then, label %if.end38
-
-if.then:                                          ; preds = %entry
-  %shr = ashr i32 %7, 5
-  %idxprom = sext i32 %shr to i64
-  %arrayidx = getelementptr inbounds i32* %sh_zcnt_int, i64 %idxprom
-  %8 = load i32* %arrayidx, align 4, !tbaa !4
-  %9 = load i32* %jds_ptr_int, align 4, !tbaa !4
-  %add = add nsw i32 %9, %7
-  %idxprom3 = sext i32 %add to i64
-  %arrayidx4 = getelementptr inbounds float* %d_data, i64 %idxprom3
-  %10 = load float* %arrayidx4, align 4, !tbaa !8
-  %arrayidx6 = getelementptr inbounds i32* %d_index, i64 %idxprom3
-  %11 = load i32* %arrayidx6, align 4, !tbaa !4
-  %idxprom7 = sext i32 %11 to i64
-  %arrayidx8 = getelementptr inbounds float* %x_vec, i64 %idxprom7
-  %12 = load float* %arrayidx8, align 4, !tbaa !8
-  %cmp9 = icmp sgt i32 %8, 1
-  br i1 %cmp9, label %if.then10, label %if.end
-
-if.then10:                                        ; preds = %if.then
-  %arrayidx11 = getelementptr inbounds i32* %jds_ptr_int, i64 1
-  %.pn77 = load i32* %arrayidx11, align 4
-  %idxprom13.pn.in78 = add nsw i32 %.pn77, %7
-  %idxprom13.pn79 = sext i32 %idxprom13.pn.in78 to i64
-  %i.0.in80 = getelementptr inbounds i32* %d_index, i64 %idxprom13.pn79
-  %i.081 = load i32* %i.0.in80, align 4
-  %cmp1582 = icmp sgt i32 %8, 2
-  %arrayidx1783 = getelementptr inbounds float* %d_data, i64 %idxprom13.pn79
-  %13 = load float* %arrayidx1783, align 4, !tbaa !8
-  br i1 %cmp1582, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.body, %if.then10
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 2, %if.then10 ]
-  %14 = phi float [ %16, %for.body ], [ %13, %if.then10 ]
-  %i.088 = phi i32 [ %i.0, %for.body ], [ %i.081, %if.then10 ]
-  %sum.086 = phi float [ %add25, %for.body ], [ 0.000000e+00, %if.then10 ]
-  %t.085 = phi float [ %15, %for.body ], [ %12, %if.then10 ]
-  %d.084 = phi float [ %14, %for.body ], [ %10, %if.then10 ]
-  %arrayidx19 = getelementptr inbounds i32* %jds_ptr_int, i64 %indvars.iv
-  %idxprom23 = sext i32 %i.088 to i64
-  %arrayidx24 = getelementptr inbounds float* %x_vec, i64 %idxprom23
-  %15 = load float* %arrayidx24, align 4, !tbaa !8
-  %mul = fmul fast float %d.084, %t.085
-  %add25 = fadd fast float %sum.086, %mul
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %.pn = load i32* %arrayidx19, align 4
-  %idxprom13.pn.in = add nsw i32 %.pn, %7
-  %idxprom13.pn = sext i32 %idxprom13.pn.in to i64
-  %i.0.in = getelementptr inbounds i32* %d_index, i64 %idxprom13.pn
-  %i.0 = load i32* %i.0.in, align 4
-  %arrayidx17 = getelementptr inbounds float* %d_data, i64 %idxprom13.pn
-  %16 = load float* %arrayidx17, align 4, !tbaa !8
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %8
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %if.then10
-  %.lcssa = phi float [ %13, %if.then10 ], [ %16, %for.body ]
-  %i.0.lcssa = phi i32 [ %i.081, %if.then10 ], [ %i.0, %for.body ]
-  %sum.0.lcssa = phi float [ 0.000000e+00, %if.then10 ], [ %add25, %for.body ]
-  %t.0.lcssa = phi float [ %12, %if.then10 ], [ %15, %for.body ]
-  %d.0.lcssa = phi float [ %10, %if.then10 ], [ %14, %for.body ]
-  %idxprom28 = sext i32 %i.0.lcssa to i64
-  %arrayidx29 = getelementptr inbounds float* %x_vec, i64 %idxprom28
-  %17 = load float* %arrayidx29, align 4, !tbaa !8
-  %mul30 = fmul fast float %d.0.lcssa, %t.0.lcssa
-  %add31 = fadd fast float %sum.0.lcssa, %mul30
-  br label %if.end
-
-if.end:                                           ; preds = %for.end, %if.then
-  %d.1 = phi float [ %.lcssa, %for.end ], [ %10, %if.then ]
-  %t.1 = phi float [ %17, %for.end ], [ %12, %if.then ]
-  %sum.1 = phi float [ %add31, %for.end ], [ 0.000000e+00, %if.then ]
-  %mul32 = fmul fast float %d.1, %t.1
-  %add33 = fadd fast float %sum.1, %mul32
-  %idxprom34 = sext i32 %7 to i64
-  %arrayidx35 = getelementptr inbounds i32* %d_perm, i64 %idxprom34
-  %18 = load i32* %arrayidx35, align 4, !tbaa !4
-  %idxprom36 = sext i32 %18 to i64
-  %arrayidx37 = getelementptr inbounds float* %dst_vector, i64 %idxprom36
-  store float %add33, float* %arrayidx37, align 4, !tbaa !8
-  br label %if.end38
-
-if.end38:                                         ; preds = %if.end, %entry
-  ret void
-}
-
-declare i64 @_Z13get_global_idj(i32)
-
-declare i64 @_Z12get_group_idj(i32)
-
-declare i64 @_Z14get_local_sizej(i32)
-
-declare i64 @_Z12get_local_idj(i32)
-
-attributes #0 = { optsize zeroext "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-
-!visc_hint_gpu = !{}
-!visc_hint_cpu = !{!0, !1}
-!opencl.kernels = !{!2}
-
-!0 = metadata !{%rtype (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64, i32)* undef}
-!1 = metadata !{%rtype (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64, i32, i32)* undef}
-!2 = metadata !{void (float*, i64, float*, i64, i32*, i64, i32*, i64, float*, i64, i32, i32*, i64, i32*, i64)* @spmv_jds, metadata !3}
-!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"i64", metadata !"float*", metadata !"i64", metadata !"i32*", metadata !"i64", metadata !"i32*", metadata !"i64", metadata !"float*", metadata !"i64", metadata !"i32", metadata !"i32*", metadata !"i64", metadata !"i32*", metadata !"i64"}
-!4 = metadata !{metadata !5, metadata !5, i64 0}
-!5 = metadata !{metadata !"int", metadata !6}
-!6 = metadata !{metadata !"omnipotent char", metadata !7}
-!7 = metadata !{metadata !"Simple C/C++ TBAA"}
-!8 = metadata !{metadata !9, metadata !9, i64 0}
-!9 = metadata !{metadata !"float", metadata !6}
diff --git a/hpvm/test/parboil/benchmarks/stencil/Makefile b/hpvm/test/parboil/benchmarks/stencil/Makefile
index f144c079ba27e9c2600139073c226fddd266da04..8412e4b2e8d370dc9266bd2765a2341512911f92 100644
--- a/hpvm/test/parboil/benchmarks/stencil/Makefile
+++ b/hpvm/test/parboil/benchmarks/stencil/Makefile
@@ -1,9 +1,9 @@
 PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = stencil
 
-# Default compile visc
+# Default compile hpvm
 ifeq ($(VERSION),)
-  VERSION = visc
+  VERSION = hpvm
 endif
 
 # Default use small test case
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile
similarity index 80%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile
index cf61fb3a6c77e07bf8ccc67902bd1a1997902763..35b36dcf3c053da03017c72d442204590675ecb4 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=file.ll
-VISC_OBJS=stencil.visc.ll
+HPVM_OBJS=stencil.hpvm.ll
 APP_CUDALDFLAGS=-lm
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/common.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/common.h
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/common.h
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.cc
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.cc
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.h
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/file.h
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/file.h
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/kernel.cl b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/kernel.cl
similarity index 100%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/kernel.cl
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/kernel.cl
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp
similarity index 66%
rename from hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
rename to hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp
index 5672a3ee490917d1374783eae5ab0ba1956ef441..e5810fc8101bef72dd4636b0b6c11826a8b18318 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
+++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm/stencil.cpp
@@ -9,11 +9,11 @@
 
 #include "common.h"
 #include "file.h"
+#include <hpvm.h>
 #include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <visc.h>
 
 static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
   int s = 0;
@@ -42,23 +42,23 @@ typedef struct __attribute__((__packed__)) {
 
 void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
                   size_t bytes_Anext, int nx, int ny, int nz) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, A0, Anext, 1, Anext);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
 
-  void *thisNode = __visc__getNode();
-  void *parentNode = __visc__getParentNode(thisNode);
+  void *thisNode = __hpvm__getNode();
+  void *parentNode = __hpvm__getParentNode(thisNode);
 
-  int lx = __visc__getNodeInstanceID_x(thisNode);
-  int ly = __visc__getNodeInstanceID_y(thisNode);
-  int lz = __visc__getNodeInstanceID_z(thisNode);
+  int lx = __hpvm__getNodeInstanceID_x(thisNode);
+  int ly = __hpvm__getNodeInstanceID_y(thisNode);
+  int lz = __hpvm__getNodeInstanceID_z(thisNode);
 
-  int gx = __visc__getNodeInstanceID_x(parentNode);
-  int gy = __visc__getNodeInstanceID_y(parentNode);
-  int gz = __visc__getNodeInstanceID_z(parentNode);
+  int gx = __hpvm__getNodeInstanceID_x(parentNode);
+  int gy = __hpvm__getNodeInstanceID_y(parentNode);
+  int gz = __hpvm__getNodeInstanceID_z(parentNode);
 
-  int gridx = __visc__getNumNodeInstances_x(thisNode);
-  int gridy = __visc__getNumNodeInstances_y(thisNode);
-  int gridz = __visc__getNumNodeInstances_z(thisNode);
+  int gridx = __hpvm__getNumNodeInstances_x(thisNode);
+  int gridy = __hpvm__getNumNodeInstances_y(thisNode);
+  int gridz = __hpvm__getNumNodeInstances_z(thisNode);
 
   int i = gx * gridx + lx + 1;
   int j = gy * gridy + ly + 1;
@@ -78,65 +78,65 @@ void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
 void stencilLvl1(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
                  size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
                  size_t dim_Y1, size_t dim_Z1) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, A0, Anext, 1, Anext);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
   void *stencil_node =
-      __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1);
-  __visc__bindIn(stencil_node, 0, 0, 0);
-  __visc__bindIn(stencil_node, 1, 1, 0);
-  __visc__bindIn(stencil_node, 2, 2, 0);
-  __visc__bindIn(stencil_node, 3, 3, 0);
-  __visc__bindIn(stencil_node, 4, 4, 0);
-  __visc__bindIn(stencil_node, 5, 5, 0);
-  __visc__bindIn(stencil_node, 6, 6, 0);
-  __visc__bindIn(stencil_node, 7, 7, 0);
-  __visc__bindIn(stencil_node, 8, 8, 0);
+      __hpvm__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1);
+  __hpvm__bindIn(stencil_node, 0, 0, 0);
+  __hpvm__bindIn(stencil_node, 1, 1, 0);
+  __hpvm__bindIn(stencil_node, 2, 2, 0);
+  __hpvm__bindIn(stencil_node, 3, 3, 0);
+  __hpvm__bindIn(stencil_node, 4, 4, 0);
+  __hpvm__bindIn(stencil_node, 5, 5, 0);
+  __hpvm__bindIn(stencil_node, 6, 6, 0);
+  __hpvm__bindIn(stencil_node, 7, 7, 0);
+  __hpvm__bindIn(stencil_node, 8, 8, 0);
 }
 
 void stencilLvl2(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
                  size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
                  size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2,
                  size_t dim_Z2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, A0, Anext, 1, Anext);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
   void *stencil_node =
-      __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2);
-  __visc__bindIn(stencil_node, 0, 0, 0);
-  __visc__bindIn(stencil_node, 1, 1, 0);
-  __visc__bindIn(stencil_node, 2, 2, 0);
-  __visc__bindIn(stencil_node, 3, 3, 0);
-  __visc__bindIn(stencil_node, 4, 4, 0);
-  __visc__bindIn(stencil_node, 5, 5, 0);
-  __visc__bindIn(stencil_node, 6, 6, 0);
-  __visc__bindIn(stencil_node, 7, 7, 0);
-  __visc__bindIn(stencil_node, 8, 8, 0);
-  __visc__bindIn(stencil_node, 9, 9, 0);
-  __visc__bindIn(stencil_node, 10, 10, 0);
-  __visc__bindIn(stencil_node, 11, 11, 0);
+      __hpvm__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2);
+  __hpvm__bindIn(stencil_node, 0, 0, 0);
+  __hpvm__bindIn(stencil_node, 1, 1, 0);
+  __hpvm__bindIn(stencil_node, 2, 2, 0);
+  __hpvm__bindIn(stencil_node, 3, 3, 0);
+  __hpvm__bindIn(stencil_node, 4, 4, 0);
+  __hpvm__bindIn(stencil_node, 5, 5, 0);
+  __hpvm__bindIn(stencil_node, 6, 6, 0);
+  __hpvm__bindIn(stencil_node, 7, 7, 0);
+  __hpvm__bindIn(stencil_node, 8, 8, 0);
+  __hpvm__bindIn(stencil_node, 9, 9, 0);
+  __hpvm__bindIn(stencil_node, 10, 10, 0);
+  __hpvm__bindIn(stencil_node, 11, 11, 0);
 }
 
 void stencilLvl3(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
                  size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
                  size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2,
                  size_t dim_Z2) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, A0, Anext, 1, Anext);
-  void *stencil_node = __visc__createNodeND(0, stencilLvl2);
-  __visc__bindIn(stencil_node, 0, 0, 0);
-  __visc__bindIn(stencil_node, 1, 1, 0);
-  __visc__bindIn(stencil_node, 2, 2, 0);
-  __visc__bindIn(stencil_node, 3, 3, 0);
-  __visc__bindIn(stencil_node, 4, 4, 0);
-  __visc__bindIn(stencil_node, 5, 5, 0);
-  __visc__bindIn(stencil_node, 6, 6, 0);
-  __visc__bindIn(stencil_node, 7, 7, 0);
-  __visc__bindIn(stencil_node, 8, 8, 0);
-  __visc__bindIn(stencil_node, 9, 9, 0);
-  __visc__bindIn(stencil_node, 10, 10, 0);
-  __visc__bindIn(stencil_node, 11, 11, 0);
-  __visc__bindIn(stencil_node, 12, 12, 0);
-  __visc__bindIn(stencil_node, 13, 13, 0);
-  __visc__bindIn(stencil_node, 14, 14, 0);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
+  void *stencil_node = __hpvm__createNodeND(0, stencilLvl2);
+  __hpvm__bindIn(stencil_node, 0, 0, 0);
+  __hpvm__bindIn(stencil_node, 1, 1, 0);
+  __hpvm__bindIn(stencil_node, 2, 2, 0);
+  __hpvm__bindIn(stencil_node, 3, 3, 0);
+  __hpvm__bindIn(stencil_node, 4, 4, 0);
+  __hpvm__bindIn(stencil_node, 5, 5, 0);
+  __hpvm__bindIn(stencil_node, 6, 6, 0);
+  __hpvm__bindIn(stencil_node, 7, 7, 0);
+  __hpvm__bindIn(stencil_node, 8, 8, 0);
+  __hpvm__bindIn(stencil_node, 9, 9, 0);
+  __hpvm__bindIn(stencil_node, 10, 10, 0);
+  __hpvm__bindIn(stencil_node, 11, 11, 0);
+  __hpvm__bindIn(stencil_node, 12, 12, 0);
+  __hpvm__bindIn(stencil_node, 13, 13, 0);
+  __hpvm__bindIn(stencil_node, 14, 14, 0);
 }
 
 int main(int argc, char **argv) {
@@ -195,11 +195,11 @@ int main(int argc, char **argv) {
   fclose(fp);
 
   pb_InitializeTimerSet(&timers);
-  __visc__init();
+  __hpvm__init();
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-  llvm_visc_track_mem(h_A0, sizeof(float) * size);
-  llvm_visc_track_mem(h_Anext, sizeof(float) * size);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(h_A0, sizeof(float) * size);
+  llvm_hpvm_track_mem(h_Anext, sizeof(float) * size);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -241,9 +241,9 @@ int main(int argc, char **argv) {
                             grid[1] / block[1],
                             grid[2] / block[2]};
     *(RootIn *)root_in = root_in_local;
-    void *stencilDFG = __visc__launch(0, stencilLvl3, root_in);
+    void *stencilDFG = __hpvm__launch(0, stencilLvl3, root_in);
 
-    __visc__wait(stencilDFG);
+    __hpvm__wait(stencilDFG);
     // printf("iteration %d\n",t);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
     float *h_temp = h_A0;
@@ -255,19 +255,19 @@ int main(int argc, char **argv) {
   h_A0 = h_Anext;
   h_Anext = h_temp;
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  llvm_visc_request_mem(h_Anext, bytes);
+  llvm_hpvm_request_mem(h_Anext, bytes);
   printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]);
   printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]);
 
-  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
 
-  llvm_visc_untrack_mem(h_A0);
-  llvm_visc_untrack_mem(h_Anext);
+  llvm_hpvm_untrack_mem(h_A0);
+  llvm_hpvm_untrack_mem(h_Anext);
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..12a6d131c29067073fa79f09c4e6f91b8662969c
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/common.h
@@ -0,0 +1,15 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef _COMMON_H_
+#define _COMMON_H_
+// Linear offset of grid point (_i, _j, _k) for row length _nx and plane
+// height _ny; the trailing +3 skips the padding at the start of the buffer.
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + (_nx) * ((_j) + (_ny) * (_k)) + 3)
+#define TCF 4
+#endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c
new file mode 100644
index 0000000000000000000000000000000000000000..35c5ed960c2031b0b84124bbdd1aeb95042625ee
--- /dev/null
+++ b/hpvm/test/parboil/benchmarks/stencil/src/hpvm_vec/stencil.c
@@ -0,0 +1,176 @@
+
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include "common.h"
+#include "file.h"
+#include <hpvm.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Reads nx * ny * nz floats from fp into A0, in file order.
+ * Returns 0 once every value is read, -1 on a NULL argument or short read. */
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  if (A0 == NULL || fp == NULL)
+    return -1;
+  if (nx <= 0 || ny <= 0 || nz <= 0)
+    return 0; /* empty grid: nothing to read */
+  size_t count = (size_t)nx * (size_t)ny * (size_t)nz;
+  /* One bulk fread; a truncated file is reported instead of ignored. */
+  if (fread(A0, sizeof(float), count, fp) != count)
+    return -1;
+  return 0;
+}
+
+void naive_kernel(float c0, float c1, float *A0, float *Anext, int nx, int ny,
+                  int nz) {
+  __hpvm__attributes(2, A0, Anext, 1, Anext);
+  int i = get_global_id(0) + 1;
+  int j = get_global_id(1) + 1;
+  int k = get_global_id(2) + 1;
+  // Write Anext(i,j,k) from A0 and its six neighbours; only i is guarded.
+  if (i < nx - 1) {
+    Anext[Index3D(nx, ny, i, j, k)] = c1 * (A0[Index3D(nx, ny, i, j, k + 1)] +
+                                            A0[Index3D(nx, ny, i, j, k - 1)] +
+                                            A0[Index3D(nx, ny, i, j + 1, k)] +
+                                            A0[Index3D(nx, ny, i, j - 1, k)] +
+                                            A0[Index3D(nx, ny, i + 1, j, k)] +
+                                            A0[Index3D(nx, ny, i - 1, j, k)]) -
+                                      A0[Index3D(nx, ny, i, j, k)] * c0;
+  }
+}
+
+/* Host driver: parses "nx ny nz t", loads the input grid, runs t stencil
+ * iterations through __hpvm__node, and optionally writes the final grid.
+ * Returns 0 on success and -1 on bad arguments, allocation or I/O failure. */
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  int nx, ny, nz;
+  size_t size;
+  int iteration;
+  float c0 = 1.0 / 6.0;
+  float c1 = 1.0 / 6.0 / 6.0;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
+
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+
+  // Widen before multiplying so large grids cannot overflow int.
+  size = (size_t)nx * (size_t)ny * (size_t)nz;
+
+  // Padding in the beginning to get aligned loads and stores
+  size = size + 3;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  if (h_A0 == NULL || h_Anext == NULL) {
+    fprintf(stderr, "Cannot allocate %zu floats per grid\n", size);
+    return -1;
+  }
+
+  // load data from files
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  if (fp == NULL) {
+    perror("fopen");
+    return -1;
+  }
+  read_data(h_A0 + 3, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  __hpvm__init();
+
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_TRACK);
+  llvm_hpvm_track_mem(h_A0, sizeof(float) * size);
+  llvm_hpvm_track_mem(h_Anext, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  // only use 1D thread block
+  int tx = 256 / TCF;
+  int block[3] = {tx, 1, 1};
+  int grid[3] = {(nx - 2 + TCF * tx - 1) / (TCF * tx) * tx, ny - 2, nz - 2};
+
+  printf("grid(%d, %d, %d), block(%d, %d, %d)\n", grid[0], grid[1], grid[2],
+         block[0], block[1], block[2]);
+  // main execution
+  int t;
+  size_t bytes = size * sizeof(float);
+  printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126, 1, 1)]);
+  printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125, 1, 1)]);
+  for (t = 0; t < iteration; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+    unsigned stencilDFG = __hpvm__node(
+        naive_kernel, 2, 3, block[0], block[1], block[2], grid[0] / block[0],
+        grid[1] / block[1], grid[2] / block[2], 9, (float)c0, (float)c1, h_A0,
+        bytes, h_Anext, bytes, nx, ny, nz, 0);
+    __hpvm__wait(stencilDFG);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    float *h_temp = h_A0;
+    h_A0 = h_Anext;
+    h_Anext = h_temp;
+  }
+
+  float *h_temp = h_A0;
+  h_A0 = h_Anext;
+  h_Anext = h_temp;
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_hpvm_request_mem(h_Anext, bytes);
+  printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]);
+  printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]);
+
+  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
+  llvm_hpvm_untrack_mem(h_A0);
+  llvm_hpvm_untrack_mem(h_Anext);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+
+  __hpvm__cleanup();
+
+  if (parameters->outFile) {
+    outputData(parameters->outFile, h_Anext + 3, nx, ny, nz);
+  }
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
+
+  return 0;
+}
diff --git a/hpvm/test/parboil/common/include/parboil.h b/hpvm/test/parboil/common/include/parboil.h
index 30ad6721c3190610dd08ec131603b6fe622f897e..ba25726c027a5c67283c68a703216ad7ee785ef5 100644
--- a/hpvm/test/parboil/common/include/parboil.h
+++ b/hpvm/test/parboil/common/include/parboil.h
@@ -102,23 +102,23 @@ enum pb_TimerID {
                           * host activity: automatically filled in,
                           * not intended for direct usage */
   // GPU FUNCTION
-  visc_TimerID_INIT_CTX,
-  visc_TimerID_CLEAR_CTX,
-  visc_TimerID_COPY_SCALAR,
-  visc_TimerID_COPY_PTR,
-  visc_TimerID_MEM_FREE,
-  visc_TimerID_READ_OUTPUT,
-  visc_TimerID_SETUP,
-  visc_TimerID_MEM_TRACK,
-  visc_TimerID_MEM_UNTRACK,
-  visc_TimerID_MISC,
+  hpvm_TimerID_INIT_CTX,
+  hpvm_TimerID_CLEAR_CTX,
+  hpvm_TimerID_COPY_SCALAR,
+  hpvm_TimerID_COPY_PTR,
+  hpvm_TimerID_MEM_FREE,
+  hpvm_TimerID_READ_OUTPUT,
+  hpvm_TimerID_SETUP,
+  hpvm_TimerID_MEM_TRACK,
+  hpvm_TimerID_MEM_UNTRACK,
+  hpvm_TimerID_MISC,
   // LAUNCH FUNCTION
-  visc_TimerID_PTHREAD_CREATE,
-  visc_TimerID_ARG_PACK,
-  visc_TimerID_ARG_UNPACK,
-  visc_TimerID_COMPUTATION,
-  visc_TimerID_OUTPUT_PACK,
-  visc_TimerID_OUTPUT_UNPACK,
+  hpvm_TimerID_PTHREAD_CREATE,
+  hpvm_TimerID_ARG_PACK,
+  hpvm_TimerID_ARG_UNPACK,
+  hpvm_TimerID_COMPUTATION,
+  hpvm_TimerID_OUTPUT_PACK,
+  hpvm_TimerID_OUTPUT_UNPACK,
 
   pb_TimerID_LAST /* Number of timer IDs */
 };
diff --git a/hpvm/test/parboil/common/mk/visc.mk b/hpvm/test/parboil/common/mk/hpvm.mk
similarity index 83%
rename from hpvm/test/parboil/common/mk/visc.mk
rename to hpvm/test/parboil/common/mk/hpvm.mk
index 0a8984deeac5696557f4b6a220b4f0758f5aefcf..cbc4071be246517e9d0d70a7c5d220e04f48f427 100755
--- a/hpvm/test/parboil/common/mk/visc.mk
+++ b/hpvm/test/parboil/common/mk/hpvm.mk
@@ -9,37 +9,37 @@ CFLAGS=$(LANG_CFLAGS) $(PLATFORM_CFLAGS) $(APP_CFLAGS)
 CXXFLAGS=$(LANG_CXXFLAGS) $(PLATFORM_CXXFLAGS) $(APP_CXXFLAGS)
 LDFLAGS=$(LANG_LDFLAGS) $(PLATFORM_LDFLAGS) $(APP_LDFLAGS)
 
-# VISC
+# HPVM
 LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../libclc/built_libs
-VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/tools/hpvm/projects/visc-rt
+HPVM_RT_PATH = $(LLVM_SRC_ROOT)/../build/tools/hpvm/projects/hpvm-rt
 
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc
+HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.bc
 #LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx--nvidiacl.bc
 LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx64--nvidiacl.bc
 #LIBCLC_NVPTX_LIB = nvptx64--nvidiacl.bc
 
 LLVM_34_AS = /opt/llvm/bin/llvm-as
 
-TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
 KERNEL_GEN_FLAGS = -O3 -target nvptx64-nvidia-nvcl
 
 ifeq ($(TARGET),x86)
   DEVICE = SPIR_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
   CFLAGS += -DOPENCL_CPU
 else ifeq ($(TARGET),seq)
   DEVICE = CPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
 else ifeq ($(TARGET),seqx86)
   DEVICE = CPU_OR_SPIR_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
   CFLAGS += -DOPENCL_CPU
 else ifeq ($(TARGET),seqgpu)
   DEVICE = CPU_OR_GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
 else
   DEVICE = GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
 endif
 
 CFLAGS += -DDEVICE=$(DEVICE)
@@ -48,31 +48,31 @@ CXXFLAGS += -DDEVICE=$(DEVICE)
 HOST_LINKFLAGS =
 
 ifeq ($(TIMER),x86)
-  VISC_OPTFLAGS += -visc-timers-x86
+  HPVM_OPTFLAGS += -hpvm-timers-x86
 else ifeq ($(TIMER),ptx)
-  VISC_OPTFLAGS += -visc-timers-ptx
+  HPVM_OPTFLAGS += -hpvm-timers-ptx
 else ifeq ($(TIMER),gen)
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 else ifeq ($(TIMER),spir)
-  TESTGEN_OPTFLAGS += -visc-timers-spir
+  TESTGEN_OPTFLAGS += -hpvm-timers-spir
 else ifeq ($(TIMER),no)
 else
   ifeq ($(TARGET),x86)
-    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir
+    HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-spir
   else ifeq ($(TARGET),seq)
-    VISC_OPTFLAGS += -visc-timers-x86
+    HPVM_OPTFLAGS += -hpvm-timers-x86
   else ifeq ($(TARGET),seqx86)
-    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir
+    HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-spir
   else ifeq ($(TARGET),seqgpu)
-    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+    HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
   else
-    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+    HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
   endif
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 endif
 
 ifeq ($(DABSTRACTION),true)
-  VISC_OPTFLAGS += -visc-eda
+  HPVM_OPTFLAGS += -hpvm-eda
 endif
 
 # Rules common to all makefiles
@@ -120,7 +120,7 @@ endif
 ########################################
 
 OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
 PARBOIL_OBJS = $(call INBUILDDIR,parboil.ll)
 KERNEL = $(TEST_OBJS).kernels.ll
 KERNEL_OPT = $(BUILDDIR)/$(APP).kernels.opt.ll
@@ -181,11 +181,11 @@ $(KERNEL_OPT) : $(KERNEL)
 $(BIN) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
 
-$(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(VISC_RT_LIB)
+$(HOST_LINKED) : $(HOST) $(OBJS) $(BUILDDIR)/parboil.ll $(HPVM_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(HOST) $(KERNEL): $(BUILDDIR)/$(VISC_OBJS)
-	$(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST)
+$(HOST) $(KERNEL): $(BUILDDIR)/$(HPVM_OBJS)
+	$(OPT) $(HPVM_OPTFLAGS) -S $< -o $(HOST)
 
 $(RUNDIR) :
 	mkdir -p $(RUNDIR)
@@ -202,7 +202,7 @@ $(BUILDDIR)/%.ll : $(SRCDIR)/%.cc
 $(BUILDDIR)/%.ll : $(SRCDIR)/%.cpp
 	$(CXX) $(CXXFLAGS) -S -emit-llvm $< -o $@
 
-$(BUILDDIR)/%.visc.ll: $(BUILDDIR)/%.ll
+$(BUILDDIR)/%.hpvm.ll: $(BUILDDIR)/%.ll
 	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
 
 $(BUILDDIR)/%.o : $(SRCDIR)/%.c
diff --git a/hpvm/test/parboil/common/platform/visc.default.mk b/hpvm/test/parboil/common/platform/hpvm.default.mk
similarity index 61%
rename from hpvm/test/parboil/common/platform/visc.default.mk
rename to hpvm/test/parboil/common/platform/hpvm.default.mk
index 03a9b0874aa2b2617afab71b27470b97f5b1f4b0..ca90d453a38d0b63d16e850b57de5622cbd1f2e1 100644
--- a/hpvm/test/parboil/common/platform/visc.default.mk
+++ b/hpvm/test/parboil/common/platform/hpvm.default.mk
@@ -12,20 +12,20 @@
 #OPENCL_LIB_PATH=$(OPENCL_PATH)/lib/x86_64
 
 #build
-VISC_BUILD_DIR = $(LLVM_SRC_ROOT)/../build
+HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build
 # gcc (default)
-CC = $(VISC_BUILD_DIR)/bin/clang
-OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe
-PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -I../../../include
+CC = $(HPVM_BUILD_DIR)/bin/clang
+OCLBE = $(HPVM_BUILD_DIR)/bin/llvm-cbe
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -I../../../include
 
-CXX = $(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include -I../../../include
+CXX = $(HPVM_BUILD_DIR)/bin/clang++
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(HPVM_BUILD_DIR)/include -I../../../include
 
-LINKER = $(VISC_BUILD_DIR)/bin/clang++
+LINKER = $(HPVM_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lOpenCL
 
-LLVM_LIB_PATH = $(VISC_BUILD_DIR)/lib
-LLVM_BIN_PATH = $(VISC_BUILD_DIR)/bin
+LLVM_LIB_PATH = $(HPVM_BUILD_DIR)/lib
+LLVM_BIN_PATH = $(HPVM_BUILD_DIR)/bin
 
 OPT = $(LLVM_BIN_PATH)/opt
 LLVM_LINK = $(LLVM_BIN_PATH)/llvm-link
diff --git a/hpvm/test/pipeline/Makefile b/hpvm/test/pipeline/Makefile
index e3572ecdfc4322ecd12c25517880b87f94c0f9e1..c9a17c1634ab39b79ec903e889fcb8492eef0848 100644
--- a/hpvm/test/pipeline/Makefile
+++ b/hpvm/test/pipeline/Makefile
@@ -23,12 +23,12 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 EXE = pipeline-$(TARGET)
 
 INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR)
-INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
+INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include
 
 ## BEGIN HPVM MAKEFILE
 SRCDIR_OBJS= io.ll
 OBJS_SRC=src/io.cc
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP = $(EXE)
 APP_CFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
 APP_CXXFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
@@ -39,21 +39,21 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
 LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
 
-VISC_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/visc-rt
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc
+HPVM_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/hpvm-rt
+HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.bc
 
-TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
 
 ifeq ($(TARGET),seq)
   DEVICE = CPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86
 else
   DEVICE = GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
 endif
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 
 CFLAGS += -DDEVICE=$(DEVICE)
 CXXFLAGS += -DDEVICE=$(DEVICE)
@@ -64,7 +64,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
 .PRECIOUS: $(BUILD_DIR)/%.ll
 
 OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
 KERNEL = $(TEST_OBJS).kernels.ll
 
 ifeq ($(TARGET),seq)
@@ -91,11 +91,11 @@ $(KERNEL_OCL) : $(KERNEL)
 $(EXE) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
 
-$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
+$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
-	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS)
+	$(OPT) -debug $(HPVM_OPTFLAGS) -S $< -o $(HOST)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -106,7 +106,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.cc
 $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.cc
 	$(CC) $(CXXFLAGS) -emit-llvm -S -o $@ $<
 
-$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
-	$(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@
+$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll
+	$(OPT) -debug-only=genhpvm $(TESTGEN_OPTFLAGS) $< -S -o $@
 
 ## END HPVM MAKEFILE
diff --git a/hpvm/test/pipeline/copyToVersions.sh b/hpvm/test/pipeline/copyToVersions.sh
index 3b9c19bad6dd86de7eb9a82edc7f17b92265155e..67551aff2f1b47fb2ad9c69be44936e8145a68da 100755
--- a/hpvm/test/pipeline/copyToVersions.sh
+++ b/hpvm/test/pipeline/copyToVersions.sh
@@ -1,12 +1,12 @@
 
-declare -a versionList=("viscGPU" "viscVector" "viscScalar" "viscGPU-Scalar-MaxG" "viscVector-Scalar-MaxG" "viscGPU-Scalar-ZC" "viscVector-Scalar-ZC")
+declare -a versionList=("hpvmGPU" "hpvmVector" "hpvmScalar" "hpvmGPU-Scalar-MaxG" "hpvmVector-Scalar-MaxG" "hpvmGPU-Scalar-ZC" "hpvmVector-Scalar-ZC")
 declare -a fileList=("Makefile" "io.cc" "main.cc")
 
 for version in "${versionList[@]}"; do
   echo $version
   for filename in "${fileList[@]}"; do
-    echo cp ./src/visc_parallel/$filename ./src/$version/
-    cp ./src/visc_parallel/$filename ./src/$version/
+    echo cp ./src/hpvm_parallel/$filename ./src/$version/
+    cp ./src/hpvm_parallel/$filename ./src/$version/
   done
   echo
 done
diff --git a/hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll b/hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll
similarity index 95%
rename from hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll
rename to hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll
index 06ec055bb746c7cc0cd58f75ed1f8090e0afa459..8056cc12eed0e4d20d45e294bf674dfc689f6bb8 100644
--- a/hpvm/test/pipeline/gradient.visc.merged.experiments.notimer.ll
+++ b/hpvm/test/pipeline/gradient.hpvm.merged.experiments.notimer.ll
@@ -1,4 +1,4 @@
-; ModuleID = 'build/Gradient_default/main.visc.ll'
+; ModuleID = 'build/Gradient_default/main.hpvm.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -167,9 +167,9 @@ entry:
 ; Function Attrs: nounwind uwtable
 define %emptyStruct @squareRoot(float* nocapture in %Gx, i64 %bytesGx, float* nocapture in %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n, i32 %dummyH, i32 %dummyV) #2 {
 entry:
-  %call3 = tail call i8* @llvm.visc.getNode()
-  %call14 = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3)
-  %call25 = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3)
+  %call3 = tail call i8* @llvm.hpvm.getNode()
+  %call14 = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3)
+  %call25 = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3)
   %cmp = icmp slt i32 %call14, %n
   %cmp3 = icmp slt i32 %call25, %m
   %or.cond = and i1 %cmp, %cmp3
@@ -198,51 +198,51 @@ if.end:                                           ; preds = %if.then, %entry
 ; Function Attrs: nounwind uwtable
 define %emptyStruct.23 @WrapperSquareRoot(float* nocapture in %Gx, i64 %bytesGx, float* nocapture in %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n, i32 %dummyH, i32 %dummyV) #2 {
 entry:
-  %squareRoot.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot to i8*), i32 %m, i32 %n)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 6, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 7, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 8, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %squareRoot.node, i32 9, i32 9, i1 false)
+  %squareRoot.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot to i8*), i32 %m, i32 %n)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 6, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 7, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 8, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %squareRoot.node, i32 9, i32 9, i1 false)
   ret %emptyStruct.23 undef
 }
 
 ; Function Attrs: nounwind uwtable
 define %emptyStruct.24 @Gradient(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %Sx, i64 %bytesSx, float* nocapture in %Sy, i64 %bytesSy, float* nocapture out %Gx, i64 %bytesGx, float* nocapture out %Gy, i64 %bytesGy, float* nocapture out %G, i64 %bytesG, i32 %m, i32 %n) #2 {
 entry:
-  %WrapperHorizontal_WrapperVertical.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%WrapperHorizontal.WrapperVertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperHorizontal_WrapperVertical to i8*))
-  %WrapperSquareRoot.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperSquareRoot to i8*))
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 6, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 7, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 4, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 5, i32 11, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 8, i32 12, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 9, i32 13, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 14, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 15, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 6, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 7, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 8, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 9, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 10, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 11, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 12, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperSquareRoot.node, i32 13, i32 7, i1 false)
-  %output.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 0, i32 8, i1 false)
-  %output1.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 1, i32 9, i1 false)
+  %WrapperHorizontal_WrapperVertical.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%WrapperHorizontal.WrapperVertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperHorizontal_WrapperVertical to i8*))
+  %WrapperSquareRoot.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperSquareRoot to i8*))
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 6, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 7, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 0, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 1, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 4, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 5, i32 11, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 8, i32 12, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 9, i32 13, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 12, i32 14, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperHorizontal_WrapperVertical.node, i32 13, i32 15, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 6, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 7, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 8, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 9, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 10, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 11, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 12, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperSquareRoot.node, i32 13, i32 7, i1 false)
+  %output.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 0, i32 8, i1 false)
+  %output1.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperHorizontal_WrapperVertical.node, i8* %WrapperSquareRoot.node, i1 false, i32 1, i32 9, i1 false)
   ret %emptyStruct.24 undef
 }
 
@@ -866,7 +866,7 @@ cond.false:                                       ; preds = %land.lhs.true58, %l
 
 cond.end:                                         ; preds = %land.lhs.true58
   call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %103 = load i32** %p.i.i.i.i, align 8, !tbaa !5
   %104 = load i32* %103, align 4, !tbaa !9
   %arrayidx.i296 = getelementptr inbounds i32* %103, i64 1
@@ -1137,15 +1137,15 @@ cond.false87:                                     ; preds = %_Z12getNextFrameRN2
   unreachable
 
 cond.end88:                                       ; preds = %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit335
-  call void @llvm_visc_track_mem(i8* %150, i64 %mul65) #1
-  call void @llvm_visc_track_mem(i8* %106, i64 36) #1
-  call void @llvm_visc_track_mem(i8* %113, i64 36) #1
+  call void @llvm_hpvm_track_mem(i8* %150, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %106, i64 36) #1
+  call void @llvm_hpvm_track_mem(i8* %113, i64 36) #1
   %176 = load i8** %data73, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %176, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %176, i64 %mul65) #1
   %177 = load i8** %data74, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %177, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %177, i64 %mul65) #1
   %178 = load i8** %data75, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %178, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %178, i64 %mul65) #1
   %179 = load i8** %data, align 8, !tbaa !5
   %180 = bitcast i8* %179 to float*
   store float* %180, float** %I1.i, align 1, !tbaa !5
@@ -1154,8 +1154,8 @@ cond.end88:                                       ; preds = %_Z12getNextFrameRN2
 
 for.body:                                         ; preds = %for.body, %cond.end88
   %j.0480 = phi i32 [ 0, %cond.end88 ], [ %inc, %for.body ]
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @Gradient to i8*), i8* %call66, i1 false)
-  call void @llvm.visc.wait(i8* %graphID)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @Gradient to i8*), i8* %call66, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %inc = add i32 %j.0480, 1
   %exitcond = icmp eq i32 %inc, 2994
   br i1 %exitcond, label %for.end, label %for.body
@@ -1163,19 +1163,19 @@ for.body:                                         ; preds = %for.body, %cond.end
 for.end:                                          ; preds = %for.body
   call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
   %181 = load i8** %data75, align 8, !tbaa !5
-  call void @llvm_visc_request_mem(i8* %181, i64 %mul65) #1
+  call void @llvm_hpvm_request_mem(i8* %181, i64 %mul65) #1
   %182 = load i8** %data, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %182) #1
-  call void @llvm_visc_untrack_mem(i8* %106) #1
-  call void @llvm_visc_untrack_mem(i8* %113) #1
+  call void @llvm_hpvm_untrack_mem(i8* %182) #1
+  call void @llvm_hpvm_untrack_mem(i8* %106) #1
+  call void @llvm_hpvm_untrack_mem(i8* %113) #1
   %183 = load i8** %data73, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %183) #1
+  call void @llvm_hpvm_untrack_mem(i8* %183) #1
   %184 = load i8** %data74, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %184) #1
+  call void @llvm_hpvm_untrack_mem(i8* %184) #1
   %185 = load i8** %data75, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %185) #1
+  call void @llvm_hpvm_untrack_mem(i8* %185) #1
   call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   call void @pb_FreeParameters(%struct.pb_Parameters* %call3) #1
   %u.i.i.i342 = getelementptr inbounds %"class.cv::Mat"* %out, i64 0, i32 9
   %186 = load %"struct.cv::UMatData"** %u.i.i.i342, align 8, !tbaa !5
@@ -1647,13 +1647,13 @@ declare noalias i8* @malloc(i64) #5
 
 declare void @_ZN2cv12VideoCaptureD1Ev(%"class.cv::VideoCapture"*) #0
 
-declare void @llvm_visc_track_mem(i8*, i64) #0
+declare void @llvm_hpvm_track_mem(i8*, i64) #0
 
 declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
 
-declare void @llvm_visc_request_mem(i8*, i64) #0
+declare void @llvm_hpvm_request_mem(i8*, i64) #0
 
-declare void @llvm_visc_untrack_mem(i8*) #0
+declare void @llvm_hpvm_untrack_mem(i8*) #0
 
 declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
 
@@ -1713,50 +1713,50 @@ entry:
 declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
 
 ; Function Attrs: nounwind readnone
-declare i8* @llvm.visc.getNode() #7
+declare i8* @llvm.hpvm.getNode() #7
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #7
+declare i32 @llvm.hpvm.getNodeInstanceID.x(i8*) #7
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #7
+declare i32 @llvm.hpvm.getNodeInstanceID.y(i8*) #7
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
+declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #1
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #1
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #1
 
 ; Function Attrs: nounwind readonly
 declare float @llvm.sqrt.f32(float) #8
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #1
+declare i8* @llvm.hpvm.createNode(i8*) #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #1
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #1
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
+declare void @llvm.hpvm.wait(i8*) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
 define %horizontal.vertical.ty @horizontal_vertical(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_Sx, i64 %n1_bytesSx, float* nocapture out %n1_Gx, i64 %n1_bytesGx, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_Sy, i64 %n2_bytesSy, float* nocapture out %n2_Gy, i64 %n2_bytesGy, i32 %n2_m, i32 %n2_n) #1 {
 entry:
-  %call3.i = tail call i8* @llvm.visc.getNode() #1
-  %call14.i = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3.i) #1
-  %call25.i = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3.i) #1
+  %call3.i = tail call i8* @llvm.hpvm.getNode() #1
+  %call14.i = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3.i) #1
+  %call25.i = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3.i) #1
   %mul.i = mul nsw i32 %call25.i, %n1_n
   %add.i = add nsw i32 %mul.i, %call14.i
   %cmp.i = icmp slt i32 %call14.i, %n1_n
@@ -2139,25 +2139,25 @@ vertical.exit:                                    ; preds = %if.end42.2.i67.us,
 ; Function Attrs: nounwind
 define %WrapperHorizontal.WrapperVertical.ty @WrapperHorizontal_WrapperVertical(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_Sx, i64 %n1_bytesSx, float* nocapture out %n1_Gx, i64 %n1_bytesGx, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_Sy, i64 %n2_bytesSy, float* nocapture out %n2_Gy, i64 %n2_bytesGy, i32 %n2_m, i32 %n2_n) #1 {
 entry:
-  %horizontal_vertical.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical to i8*), i32 %n1_m, i32 %n1_n)
-  tail call void @llvm.visc.bind.output(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 7, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 6, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.output(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 15, i32 15, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 14, i32 14, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 13, i32 13, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 12, i32 12, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 11, i32 11, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 10, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 9, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %horizontal_vertical.node, i32 8, i32 8, i1 false)
+  %horizontal_vertical.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical to i8*), i32 %n1_m, i32 %n1_n)
+  tail call void @llvm.hpvm.bind.output(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 7, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 6, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.output(i8* %horizontal_vertical.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 15, i32 15, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 14, i32 14, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 13, i32 13, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 12, i32 12, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 11, i32 11, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 10, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 9, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %horizontal_vertical.node, i32 8, i32 8, i1 false)
   ret %WrapperHorizontal.WrapperVertical.ty undef
 }
 
@@ -2172,9 +2172,9 @@ attributes #7 = { nounwind readnone }
 attributes #8 = { nounwind readonly }
 attributes #9 = { noreturn nounwind }
 
-!visc_hint_gpu = !{!0, !1}
-!visc_hint_cpu = !{!2, !3, !4}
-!visc_hint_spir = !{}
+!hpvm_hint_gpu = !{!0, !1}
+!hpvm_hint_cpu = !{!2, !3, !4}
+!hpvm_hint_spir = !{}
 
 !0 = metadata !{%emptyStruct (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @squareRoot}
 !1 = metadata !{%horizontal.vertical.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @horizontal_vertical}
diff --git a/hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll b/hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll
similarity index 95%
rename from hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll
rename to hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll
index 4b0458625157e1c6535941ec5c663f8a16660c22..aa4a0d19a0ec80910b8d82b03de018ad41470a22 100644
--- a/hpvm/test/pipeline/laplacian.visc.merged.experiments.notimer.ll
+++ b/hpvm/test/pipeline/laplacian.hpvm.merged.experiments.notimer.ll
@@ -1,4 +1,4 @@
-; ModuleID = 'build/Laplacian_default/main.visc.ll'
+; ModuleID = 'build/Laplacian_default/main.hpvm.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -170,9 +170,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) #1
 ; Function Attrs: nounwind uwtable
 define %emptyStruct @lincomb(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %D, i64 %bytesD, float* nocapture in %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n, i32 %dummyD, i32 %dummyE) #2 {
 entry:
-  %call3 = tail call i8* @llvm.visc.getNode()
-  %call14 = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3)
-  %call25 = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3)
+  %call3 = tail call i8* @llvm.hpvm.getNode()
+  %call14 = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3)
+  %call25 = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3)
   %cmp = icmp slt i32 %call14, %n
   %cmp3 = icmp slt i32 %call25, %m
   %or.cond = and i1 %cmp, %cmp3
@@ -202,55 +202,55 @@ if.end:                                           ; preds = %if.then, %entry
 ; Function Attrs: nounwind uwtable
 define %emptyStruct.23 @WrapperLincomb(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %D, i64 %bytesD, float* nocapture in %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n, i32 %dummyD, i32 %dummyE) #2 {
 entry:
-  %lincomb.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb to i8*), i32 %m, i32 %n)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 6, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 7, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 8, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 9, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 10, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %lincomb.node, i32 11, i32 11, i1 false)
+  %lincomb.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb to i8*), i32 %m, i32 %n)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 6, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 7, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 8, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 9, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 10, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %lincomb.node, i32 11, i32 11, i1 false)
   ret %emptyStruct.23 undef
 }
 
 ; Function Attrs: nounwind uwtable
 define %emptyStruct.24 @LaplacianEstimate(float* nocapture in %Is, i64 %bytesIs, float* nocapture in %B, i64 %bytesB, float* nocapture out %D, i64 %bytesD, float* nocapture out %E, i64 %bytesE, float* nocapture out %L, i64 %bytesL, i32 %m, i32 %n) #2 {
 entry:
-  %WrapperDilate_WrapperErode.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%WrapperDilate.WrapperErode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperDilate_WrapperErode to i8*))
-  %WrapperLincomb.node = tail call i8* @llvm.visc.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperLincomb to i8*))
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 11, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 6, i32 12, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 7, i32 13, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 14, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 15, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 4, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 5, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 6, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 7, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 8, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 9, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 10, i32 8, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %WrapperLincomb.node, i32 11, i32 9, i1 false)
-  %output.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 0, i32 10, i1 false)
-  %output1.repl = tail call i8* @llvm.visc.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 1, i32 11, i1 false)
+  %WrapperDilate_WrapperErode.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%WrapperDilate.WrapperErode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @WrapperDilate_WrapperErode to i8*))
+  %WrapperLincomb.node = tail call i8* @llvm.hpvm.createNode(i8* bitcast (%emptyStruct.23 (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @WrapperLincomb to i8*))
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 0, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 1, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 2, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 3, i32 11, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 6, i32 12, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 7, i32 13, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 10, i32 14, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperDilate_WrapperErode.node, i32 11, i32 15, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 4, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 5, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 6, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 7, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 8, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 9, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 10, i32 8, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %WrapperLincomb.node, i32 11, i32 9, i1 false)
+  %output.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 0, i32 10, i1 false)
+  %output1.repl = tail call i8* @llvm.hpvm.createEdge(i8* %WrapperDilate_WrapperErode.node, i8* %WrapperLincomb.node, i1 false, i32 1, i32 11, i1 false)
   ret %emptyStruct.24 undef
 }
 
@@ -873,7 +873,7 @@ cond.false:                                       ; preds = %land.lhs.true58, %l
 
 cond.end:                                         ; preds = %land.lhs.true58
   call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %103 = load i32** %p.i.i.i.i, align 8, !tbaa !5
   %104 = load i32* %103, align 4, !tbaa !9
   %arrayidx.i290 = getelementptr inbounds i32* %103, i64 1
@@ -1062,18 +1062,18 @@ _Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332: ; preds = %if.then.i328,
   call void @llvm.lifetime.end(i64 24, i8* %134) #1
   %data = getelementptr inbounds %"class.cv::Mat"* %src, i64 0, i32 4
   %139 = load i8** %data, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %139, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %139, i64 %mul65) #1
   %arraydecay = getelementptr inbounds [9 x float]* %B, i64 0, i64 0
-  call void @llvm_visc_track_mem(i8* %106, i64 36) #1
+  call void @llvm_hpvm_track_mem(i8* %106, i64 36) #1
   %data81 = getelementptr inbounds %"class.cv::Mat"* %D, i64 0, i32 4
   %140 = load i8** %data81, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %140, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %140, i64 %mul65) #1
   %data82 = getelementptr inbounds %"class.cv::Mat"* %E, i64 0, i32 4
   %141 = load i8** %data82, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %141, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %141, i64 %mul65) #1
   %data83 = getelementptr inbounds %"class.cv::Mat"* %L, i64 0, i32 4
   %142 = load i8** %data83, align 8, !tbaa !5
-  call void @llvm_visc_track_mem(i8* %142, i64 %mul65) #1
+  call void @llvm_hpvm_track_mem(i8* %142, i64 %mul65) #1
   %143 = load i8** %data, align 8, !tbaa !5
   %144 = bitcast i8* %143 to float*
   %145 = load i8** %data81, align 8, !tbaa !5
@@ -1126,8 +1126,8 @@ _Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332: ; preds = %if.then.i328,
 
 for.body:                                         ; preds = %for.body, %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332
   %j.0474 = phi i32 [ 0, %_Z12getNextFrameRN2cv12VideoCaptureERNS_3MatE.exit332 ], [ %inc, %for.body ]
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @LaplacianEstimate to i8*), i8* %call66, i1 false)
-  call void @llvm.visc.wait(i8* %graphID)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%emptyStruct.24 (float*, i64, float*, i64, float*, i64, float*, i64, float*, i64, i32, i32)* @LaplacianEstimate to i8*), i8* %call66, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %inc = add nsw i32 %j.0474, 1
   %exitcond = icmp eq i32 %inc, 2994
   br i1 %exitcond, label %for.end, label %for.body
@@ -1135,18 +1135,18 @@ for.body:                                         ; preds = %for.body, %_Z12getN
 for.end:                                          ; preds = %for.body
   call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
   %165 = load i8** %data83, align 8, !tbaa !5
-  call void @llvm_visc_request_mem(i8* %165, i64 %mul65) #1
+  call void @llvm_hpvm_request_mem(i8* %165, i64 %mul65) #1
   %166 = load i8** %data, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %166) #1
-  call void @llvm_visc_untrack_mem(i8* %106) #1
+  call void @llvm_hpvm_untrack_mem(i8* %166) #1
+  call void @llvm_hpvm_untrack_mem(i8* %106) #1
   %167 = load i8** %data81, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %167) #1
+  call void @llvm_hpvm_untrack_mem(i8* %167) #1
   %168 = load i8** %data82, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %168) #1
+  call void @llvm_hpvm_untrack_mem(i8* %168) #1
   %169 = load i8** %data83, align 8, !tbaa !5
-  call void @llvm_visc_untrack_mem(i8* %169) #1
+  call void @llvm_hpvm_untrack_mem(i8* %169) #1
   call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   call void @pb_FreeParameters(%struct.pb_Parameters* %call3) #1
   %u.i.i.i336 = getelementptr inbounds %"class.cv::Mat"* %out, i64 0, i32 9
   %170 = load %"struct.cv::UMatData"** %u.i.i.i336, align 8, !tbaa !5
@@ -1614,13 +1614,13 @@ declare noalias i8* @malloc(i64) #5
 
 declare void @_ZN2cv12VideoCaptureD1Ev(%"class.cv::VideoCapture"*) #0
 
-declare void @llvm_visc_track_mem(i8*, i64) #0
+declare void @llvm_hpvm_track_mem(i8*, i64) #0
 
 declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
 
-declare void @llvm_visc_request_mem(i8*, i64) #0
+declare void @llvm_hpvm_request_mem(i8*, i64) #0
 
-declare void @llvm_visc_untrack_mem(i8*) #0
+declare void @llvm_hpvm_untrack_mem(i8*) #0
 
 declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
 
@@ -1677,47 +1677,47 @@ declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
 
 ; Function Attrs: nounwind readnone
-declare i8* @llvm.visc.getNode() #7
+declare i8* @llvm.hpvm.getNode() #7
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #7
+declare i32 @llvm.hpvm.getNodeInstanceID.x(i8*) #7
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #7
+declare i32 @llvm.hpvm.getNodeInstanceID.y(i8*) #7
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
+declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #1
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #1
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #1
+declare i8* @llvm.hpvm.createNode(i8*) #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #1
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #1
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #1
+declare void @llvm.hpvm.wait(i8*) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
 define %dilate.erode.ty @dilate_erode(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_B, i64 %n1_bytesB, float* nocapture out %n1_D, i64 %n1_bytesD, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_B, i64 %n2_bytesB, float* nocapture out %n2_E, i64 %n2_bytesE, i32 %n2_m, i32 %n2_n) #1 {
 entry:
-  %call3.i = tail call i8* @llvm.visc.getNode() #1
-  %call14.i = tail call i32 @llvm.visc.getNodeInstanceID.x(i8* %call3.i) #1
-  %call25.i = tail call i32 @llvm.visc.getNodeInstanceID.y(i8* %call3.i) #1
+  %call3.i = tail call i8* @llvm.hpvm.getNode() #1
+  %call14.i = tail call i32 @llvm.hpvm.getNodeInstanceID.x(i8* %call3.i) #1
+  %call25.i = tail call i32 @llvm.hpvm.getNodeInstanceID.y(i8* %call3.i) #1
   %cmp.i = icmp slt i32 %call14.i, %n1_n
   %cmp3.i = icmp slt i32 %call25.i, %n1_m
   %or.cond.i = and i1 %cmp.i, %cmp3.i
@@ -2070,25 +2070,25 @@ erode.exit:                                       ; preds = %dilate.exit, %cond.
 ; Function Attrs: nounwind
 define %WrapperDilate.WrapperErode.ty @WrapperDilate_WrapperErode(float* nocapture in %n1_Is, i64 %n1_bytesIs, float* nocapture in %n1_B, i64 %n1_bytesB, float* nocapture out %n1_D, i64 %n1_bytesD, i32 %n1_m, i32 %n1_n, float* nocapture in %n2_Is, i64 %n2_bytesIs, float* nocapture in %n2_B, i64 %n2_bytesB, float* nocapture out %n2_E, i64 %n2_bytesE, i32 %n2_m, i32 %n2_n) #1 {
 entry:
-  %dilate_erode.node = tail call i8* @llvm.visc.createNode2D(i8* bitcast (%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode to i8*), i32 %n1_m, i32 %n1_n)
-  tail call void @llvm.visc.bind.output(i8* %dilate_erode.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 7, i32 7, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 6, i32 6, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 5, i32 5, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 4, i32 4, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 3, i32 3, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 2, i32 2, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 0, i32 0, i1 false)
-  tail call void @llvm.visc.bind.output(i8* %dilate_erode.node, i32 1, i32 1, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 15, i32 15, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 14, i32 14, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 13, i32 13, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 12, i32 12, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 11, i32 11, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 10, i32 10, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 9, i32 9, i1 false)
-  tail call void @llvm.visc.bind.input(i8* %dilate_erode.node, i32 8, i32 8, i1 false)
+  %dilate_erode.node = tail call i8* @llvm.hpvm.createNode2D(i8* bitcast (%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode to i8*), i32 %n1_m, i32 %n1_n)
+  tail call void @llvm.hpvm.bind.output(i8* %dilate_erode.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 7, i32 7, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 6, i32 6, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 5, i32 5, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 4, i32 4, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 3, i32 3, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 2, i32 2, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 0, i32 0, i1 false)
+  tail call void @llvm.hpvm.bind.output(i8* %dilate_erode.node, i32 1, i32 1, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 15, i32 15, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 14, i32 14, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 13, i32 13, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 12, i32 12, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 11, i32 11, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 10, i32 10, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 9, i32 9, i1 false)
+  tail call void @llvm.hpvm.bind.input(i8* %dilate_erode.node, i32 8, i32 8, i1 false)
   ret %WrapperDilate.WrapperErode.ty undef
 }
 
@@ -2103,9 +2103,9 @@ attributes #7 = { nounwind readnone }
 attributes #8 = { noreturn nounwind }
 attributes #9 = { nounwind readonly }
 
-!visc_hint_gpu = !{!0, !1}
-!visc_hint_cpu = !{!2, !3, !4}
-!visc_hint_spir = !{}
+!hpvm_hint_gpu = !{!0, !1}
+!hpvm_hint_cpu = !{!2, !3, !4}
+!hpvm_hint_spir = !{}
 
 !0 = metadata !{%emptyStruct (float*, i64, float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @lincomb}
 !1 = metadata !{%dilate.erode.ty (float*, i64, float*, i64, float*, i64, i32, i32, float*, i64, float*, i64, float*, i64, i32, i32)* @dilate_erode}
diff --git a/hpvm/test/pipeline/run.sh b/hpvm/test/pipeline/run.sh
index 0c8435764bd87c92dd30ad51aa97011ddb07b339..5ac734026bf839c511dfdfb843b07382e6d8d4d6 100755
--- a/hpvm/test/pipeline/run.sh
+++ b/hpvm/test/pipeline/run.sh
@@ -4,7 +4,7 @@ echo Pipeline Script $1 $2
 version=$1
 pos=$2
 
-if [[ ($version == *"GPU"*) ||  ($version == "visc_parallel") ]]
+if [[ ($version == *"GPU"*) ||  ($version == "hpvm_parallel") ]]
 then
   target=""
 elif [[ $version == *"Vector"* ]]
diff --git a/hpvm/test/pipeline/runscript.sh b/hpvm/test/pipeline/runscript.sh
index 5a2933e78801993ee440ead6e19f84aae66b3577..c95af8f831eeeb7f5f464e4acbc90dd49fcb67a1 100755
--- a/hpvm/test/pipeline/runscript.sh
+++ b/hpvm/test/pipeline/runscript.sh
@@ -2,21 +2,21 @@
 echo Pipeline Script
 
 # Compile all version
-make VERSION=viscGPU clean
-make VERSION=viscVector TARGET=x86 clean
-make VERSION=viscScalar TARGET=seq clean
+make VERSION=hpvmGPU clean
+make VERSION=hpvmVector TARGET=x86 clean
+make VERSION=hpvmScalar TARGET=seq clean
 
 
-make VERSION=viscGPU
-make VERSION=viscVector TARGET=x86
-make VERSION=viscScalar TARGET=seq
+make VERSION=hpvmGPU
+make VERSION=hpvmVector TARGET=x86
+make VERSION=hpvmScalar TARGET=seq
 
 #Run all version
-make VERSION=viscGPU run &
+make VERSION=hpvmGPU run &
 ID_GPU=$!
-make VERSION=viscVector TARGET=x86 run &
+make VERSION=hpvmVector TARGET=x86 run &
 ID_Vector=$!
-make VERSION=viscScalar TARGET=seq run
+make VERSION=hpvmScalar TARGET=seq run
 ID_Scalar=$!
 
 #echo Wait 60 seconds
diff --git a/hpvm/test/pipeline/src/Makefile b/hpvm/test/pipeline/src/Makefile
index ec39b86f1cf71e2e8b6131b076c2953b566cbb56..55acb2e0982edc2a914340f2bfacbbfc1d06397f 100644
--- a/hpvm/test/pipeline/src/Makefile
+++ b/hpvm/test/pipeline/src/Makefile
@@ -1,8 +1,8 @@
 # (c) 2010 The Board of Trustees of the University of Illinois.
 
-LANGUAGE=visc
+LANGUAGE=hpvm
 SRCDIR_OBJS=io.ll #compute_gold.o
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
 APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
diff --git a/hpvm/test/pipeline/src/main.cc b/hpvm/test/pipeline/src/main.cc
index 9314833d25d0a3a25f13dfb24fb8a239b94956b1..ef9d8412c70813fcae123b0ef84de1850fa6b28c 100644
--- a/hpvm/test/pipeline/src/main.cc
+++ b/hpvm/test/pipeline/src/main.cc
@@ -13,6 +13,7 @@
 #include "opencv2/ocl/ocl.hpp"
 #include "opencv2/opencv.hpp"
 #include <cassert>
+#include <hpvm.h>
 #include <iostream>
 #include <malloc.h>
 #include <math.h>
@@ -20,7 +21,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <visc.h>
 
 #define NUM_RUNS 100
 #define DEPTH 3
@@ -147,12 +147,12 @@ void packData(struct InStruct *args, float *I, size_t bytesI, float *Is,
 void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs,
                        float *Is, size_t bytesIs, long m, long n) {
 
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, I, Gs, 1, Is);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, I, Gs, 1, Is);
 
-  void *thisNode = __visc__getNode();
-  long gx = __visc__getNodeInstanceID_x(thisNode);
-  long gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  long gx = __hpvm__getNodeInstanceID_x(thisNode);
+  long gy = __hpvm__getNodeInstanceID_y(thisNode);
 
   int gloc = gx + gy * n;
 
@@ -187,26 +187,26 @@ void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs,
 
     Is[gloc] = smoothedVal;
   }
-  __visc__return(2, bytesIs, bytesIs);
+  __hpvm__return(2, bytesIs, bytesIs);
 }
 
 void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs,
                               size_t bytesGs, float *Is, size_t bytesIs, long m,
                               long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, I, Gs, 1, Is);
-  void *GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n);
-  __visc__bindIn(GSNode, 0, 0, 0); // Bind I
-  __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI
-  __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs
-  __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs
-  __visc__bindIn(GSNode, 4, 4, 0); // Bind Is
-  __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs
-  __visc__bindIn(GSNode, 6, 6, 0); // Bind m
-  __visc__bindIn(GSNode, 7, 7, 0); // Bind n
-
-  __visc__bindOut(GSNode, 0, 0, 0); // bind output bytesIs
-  __visc__bindOut(GSNode, 1, 1, 0); // bind output bytesIs
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, I, Gs, 1, Is);
+  void *GSNode = __hpvm__createNodeND(2, gaussianSmoothing, m, n);
+  __hpvm__bindIn(GSNode, 0, 0, 0); // Bind I
+  __hpvm__bindIn(GSNode, 1, 1, 0); // Bind bytesI
+  __hpvm__bindIn(GSNode, 2, 2, 0); // Bind Gs
+  __hpvm__bindIn(GSNode, 3, 3, 0); // Bind bytesGs
+  __hpvm__bindIn(GSNode, 4, 4, 0); // Bind Is
+  __hpvm__bindIn(GSNode, 5, 5, 0); // Bind bytesIs
+  __hpvm__bindIn(GSNode, 6, 6, 0); // Bind m
+  __hpvm__bindIn(GSNode, 7, 7, 0); // Bind n
+
+  __hpvm__bindOut(GSNode, 0, 0, 0); // bind output bytesIs
+  __hpvm__bindOut(GSNode, 1, 1, 0); // bind output bytesIs
 }
 
 /* Compute a non-linear laplacian estimate of input image I of size m x n */
@@ -220,14 +220,14 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs,
 void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB,
                        float *L, size_t bytesL, long m, long n) {
 
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(2, Is, B, 1, L);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(2, Is, B, 1, L);
   // 3x3 image area
   float imageArea[SZB * SZB];
 
-  void *thisNode = __visc__getNode();
-  long gx = __visc__getNodeInstanceID_x(thisNode);
-  long gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  long gx = __hpvm__getNodeInstanceID_x(thisNode);
+  long gy = __hpvm__getNodeInstanceID_y(thisNode);
   int i, j;
 
   if ((gx < n) && (gy < m)) {
@@ -300,25 +300,25 @@ void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB,
     float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB + 1];
     L[gy * n + gx] = laplacian;
   }
-  __visc__return(1, bytesL);
+  __hpvm__return(1, bytesL);
 }
 
 void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B,
                               size_t bytesB, float *L, size_t bytesL, long m,
                               long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, Is, B, 1, L);
-  void *LNode = __visc__createNodeND(2, laplacianEstimate, m, n);
-  __visc__bindIn(LNode, 0, 0, 0); // Bind Is
-  __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs
-  __visc__bindIn(LNode, 2, 2, 0); // Bind B
-  __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB
-  __visc__bindIn(LNode, 4, 4, 0); // Bind L
-  __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL
-  __visc__bindIn(LNode, 6, 6, 0); // Bind m
-  __visc__bindIn(LNode, 7, 7, 0); // Bind n
-
-  __visc__bindOut(LNode, 0, 0, 0); // bind output bytesL
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, Is, B, 1, L);
+  void *LNode = __hpvm__createNodeND(2, laplacianEstimate, m, n);
+  __hpvm__bindIn(LNode, 0, 0, 0); // Bind Is
+  __hpvm__bindIn(LNode, 1, 1, 0); // Bind bytesIs
+  __hpvm__bindIn(LNode, 2, 2, 0); // Bind B
+  __hpvm__bindIn(LNode, 3, 3, 0); // Bind bytesB
+  __hpvm__bindIn(LNode, 4, 4, 0); // Bind L
+  __hpvm__bindIn(LNode, 5, 5, 0); // Bind bytesL
+  __hpvm__bindIn(LNode, 6, 6, 0); // Bind m
+  __hpvm__bindIn(LNode, 7, 7, 0); // Bind n
+
+  __hpvm__bindOut(LNode, 0, 0, 0); // bind output bytesL
 }
 
 /* Compute the zero crossings of input image L of size m x n */
@@ -331,16 +331,16 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B,
  */
 void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB,
                           float *S, size_t bytesS, long m, long n) {
-  __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, L, B, 1, S);
+  __hpvm__hint(hpvm::DEVICE);
+  //__hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, L, B, 1, S);
 
   // 3x3 image area
   float imageArea[SZB][SZB];
 
-  void *thisNode = __visc__getNode();
-  long gx = __visc__getNodeInstanceID_x(thisNode);
-  long gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  long gx = __hpvm__getNodeInstanceID_x(thisNode);
+  long gy = __hpvm__getNodeInstanceID_y(thisNode);
   int i, j;
 
   if ((gx < n) && (gy < m)) {
@@ -416,25 +416,25 @@ void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB,
     float pixelSign = dilatedPixel - erodedPixel;
     S[gy * n + gx] = pixelSign;
   }
-  __visc__return(1, bytesS);
+  __hpvm__return(1, bytesS);
 }
 
 void WrapperComputeZeroCrossings(float *L, size_t bytesL, float *B,
                                  size_t bytesB, float *S, size_t bytesS, long m,
                                  long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, L, B, 1, S);
-  void *ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n);
-  __visc__bindIn(ZCNode, 0, 0, 0); // Bind L
-  __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
-  __visc__bindIn(ZCNode, 2, 2, 0); // Bind B
-  __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB
-  __visc__bindIn(ZCNode, 4, 4, 0); // Bind S
-  __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS
-  __visc__bindIn(ZCNode, 6, 6, 0); // Bind m
-  __visc__bindIn(ZCNode, 7, 7, 0); // Bind n
-
-  __visc__bindOut(ZCNode, 0, 0, 0); // bind output bytesS
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, L, B, 1, S);
+  void *ZCNode = __hpvm__createNodeND(2, computeZeroCrossings, m, n);
+  __hpvm__bindIn(ZCNode, 0, 0, 0); // Bind L
+  __hpvm__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
+  __hpvm__bindIn(ZCNode, 2, 2, 0); // Bind B
+  __hpvm__bindIn(ZCNode, 3, 3, 0); // Bind bytesB
+  __hpvm__bindIn(ZCNode, 4, 4, 0); // Bind S
+  __hpvm__bindIn(ZCNode, 5, 5, 0); // Bind bytesS
+  __hpvm__bindIn(ZCNode, 6, 6, 0); // Bind m
+  __hpvm__bindIn(ZCNode, 7, 7, 0); // Bind n
+
+  __hpvm__bindOut(ZCNode, 0, 0, 0); // bind output bytesS
 }
 
 /*
@@ -458,12 +458,12 @@ void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx,
                      float *Sy, size_t bytesSy, float *G, size_t bytesG, long m,
                      long n) {
 
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(3, Is, Sx, Sy, 1, G);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(3, Is, Sx, Sy, 1, G);
 
-  void *thisNode = __visc__getNode();
-  long gx = __visc__getNodeInstanceID_x(thisNode);
-  long gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  long gx = __hpvm__getNodeInstanceID_x(thisNode);
+  long gy = __hpvm__getNodeInstanceID_y(thisNode);
 
   int gloc = gx + gy * n;
 
@@ -498,27 +498,27 @@ void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx,
 
     G[gloc] = sqrt(Gx * Gx + Gy * Gy);
   }
-  __visc__return(1, bytesG);
+  __hpvm__return(1, bytesG);
 }
 
 void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx,
                             size_t bytesSx, float *Sy, size_t bytesSy, float *G,
                             size_t bytesG, long m, long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, Is, Sx, Sy, 1, G);
-  void *CGNode = __visc__createNodeND(2, computeGradient, m, n);
-  __visc__bindIn(CGNode, 0, 0, 0); // Bind Is
-  __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
-  __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx
-  __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx
-  __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy
-  __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy
-  __visc__bindIn(CGNode, 6, 6, 0); // Bind G
-  __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG
-  __visc__bindIn(CGNode, 8, 8, 0); // Bind m
-  __visc__bindIn(CGNode, 9, 9, 0); // Bind n
-
-  __visc__bindOut(CGNode, 0, 0, 0); // bind output bytesG
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, Is, Sx, Sy, 1, G);
+  void *CGNode = __hpvm__createNodeND(2, computeGradient, m, n);
+  __hpvm__bindIn(CGNode, 0, 0, 0); // Bind Is
+  __hpvm__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
+  __hpvm__bindIn(CGNode, 2, 2, 0); // Bind Sx
+  __hpvm__bindIn(CGNode, 3, 3, 0); // Bind bytesSx
+  __hpvm__bindIn(CGNode, 4, 4, 0); // Bind Sy
+  __hpvm__bindIn(CGNode, 5, 5, 0); // Bind bytesSy
+  __hpvm__bindIn(CGNode, 6, 6, 0); // Bind G
+  __hpvm__bindIn(CGNode, 7, 7, 0); // Bind bytesG
+  __hpvm__bindIn(CGNode, 8, 8, 0); // Bind m
+  __hpvm__bindIn(CGNode, 9, 9, 0); // Bind n
+
+  __hpvm__bindOut(CGNode, 0, 0, 0); // bind output bytesG
 }
 
 /*
@@ -531,13 +531,13 @@ void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx,
 void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG,
                             size_t bytesMaxG, long m, long n) {
 
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(1, G, 1, maxG);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(1, G, 1, maxG);
 
-  void *thisNode = __visc__getNode();
+  void *thisNode = __hpvm__getNode();
 
-  long lx = __visc__getNodeInstanceID_x(thisNode);     // threadIdx.x
-  long dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x
+  long lx = __hpvm__getNodeInstanceID_x(thisNode);     // threadIdx.x
+  long dimx = __hpvm__getNumNodeInstances_x(thisNode); // blockDim.x
 
   // Assume a single thread block
   // Thread block iterates over all elements
@@ -556,39 +556,39 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG,
     *maxG = G[lx];
   }
 
-  __visc__return(1, bytesMaxG);
+  __hpvm__return(1, bytesMaxG);
 }
 
 void computeMaxGradientTB(float *G, size_t bytesG, float *maxG,
                           size_t bytesMaxG, long m, long n, long block_x) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, G, maxG, 1, maxG);
-  void *CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x);
-  __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
-  __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
-  __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
-  __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG
-  __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
-  __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
-
-  __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, G, maxG, 1, maxG);
+  void *CMGLeafNode = __hpvm__createNodeND(1, computeMaxGradientLeaf, block_x);
+  __hpvm__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
+  __hpvm__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
+  __hpvm__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
+  __hpvm__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG
+  __hpvm__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
+  __hpvm__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
+
+  __hpvm__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG
 }
 
 void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG,
                                size_t bytesMaxG, long m, long n, long block_x,
                                long grid_x) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(2, G, maxG, 1, maxG);
-  void *CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x);
-  __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G
-  __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
-  __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
-  __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG
-  __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m
-  __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n
-  __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
-
-  __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(2, G, maxG, 1, maxG);
+  void *CMGTBNode = __hpvm__createNodeND(1, computeMaxGradientTB, grid_x);
+  __hpvm__bindIn(CMGTBNode, 0, 0, 0); // Bind G
+  __hpvm__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
+  __hpvm__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
+  __hpvm__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG
+  __hpvm__bindIn(CMGTBNode, 4, 4, 0); // Bind m
+  __hpvm__bindIn(CMGTBNode, 5, 5, 0); // Bind n
+  __hpvm__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
+
+  __hpvm__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG
 }
 
 /* Reject the zero crossings where the gradient is below a threshold */
@@ -604,39 +604,39 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG,
 void rejectZeroCrossings(float *S, size_t bytesS, float *G, size_t bytesG,
                          float *maxG, size_t bytesMaxG, float *E, size_t bytesE,
                          long m, long n) {
-  __visc__hint(visc::DEVICE);
-  __visc__attributes(3, S, G, maxG, 1, E);
+  __hpvm__hint(hpvm::DEVICE);
+  __hpvm__attributes(3, S, G, maxG, 1, E);
 
-  void *thisNode = __visc__getNode();
-  int gx = __visc__getNodeInstanceID_x(thisNode);
-  int gy = __visc__getNodeInstanceID_y(thisNode);
+  void *thisNode = __hpvm__getNode();
+  int gx = __hpvm__getNodeInstanceID_x(thisNode);
+  int gy = __hpvm__getNodeInstanceID_y(thisNode);
 
   float mG = *maxG;
   if ((gx < n) && (gy < m)) {
     E[gy * n + gx] =
         ((S[gy * n + gx] > 0.0) && (G[gy * n + gx] > THETA * mG)) ? 1.0 : 0.0;
   }
-  __visc__return(1, bytesE);
+  __hpvm__return(1, bytesE);
 }
 
 void WrapperRejectZeroCrossings(float *S, size_t bytesS, float *G,
                                 size_t bytesG, float *maxG, size_t bytesMaxG,
                                 float *E, size_t bytesE, long m, long n) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, S, G, maxG, 1, E);
-  void *RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n);
-  __visc__bindIn(RZCNode, 0, 0, 0); // Bind S
-  __visc__bindIn(RZCNode, 1, 1, 0); // Bind bytesS
-  __visc__bindIn(RZCNode, 2, 2, 0); // Bind G
-  __visc__bindIn(RZCNode, 3, 3, 0); // Bind bytesG
-  __visc__bindIn(RZCNode, 4, 4, 0); // Bind maxG
-  __visc__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG
-  __visc__bindIn(RZCNode, 6, 6, 0); // Bind E
-  __visc__bindIn(RZCNode, 7, 7, 0); // Bind bytesE
-  __visc__bindIn(RZCNode, 8, 8, 0); // Bind m
-  __visc__bindIn(RZCNode, 9, 9, 0); // Bind n
-
-  __visc__bindOut(RZCNode, 0, 0, 0); // bind output bytesE
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes(3, S, G, maxG, 1, E);
+  void *RZCNode = __hpvm__createNodeND(2, rejectZeroCrossings, m, n);
+  __hpvm__bindIn(RZCNode, 0, 0, 0); // Bind S
+  __hpvm__bindIn(RZCNode, 1, 1, 0); // Bind bytesS
+  __hpvm__bindIn(RZCNode, 2, 2, 0); // Bind G
+  __hpvm__bindIn(RZCNode, 3, 3, 0); // Bind bytesG
+  __hpvm__bindIn(RZCNode, 4, 4, 0); // Bind maxG
+  __hpvm__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG
+  __hpvm__bindIn(RZCNode, 6, 6, 0); // Bind E
+  __hpvm__bindIn(RZCNode, 7, 7, 0); // Bind bytesE
+  __hpvm__bindIn(RZCNode, 8, 8, 0); // Bind m
+  __hpvm__bindIn(RZCNode, 9, 9, 0); // Bind n
+
+  __hpvm__bindOut(RZCNode, 0, 0, 0); // bind output bytesE
 }
 
 // Pipelined Root node
@@ -656,80 +656,80 @@ void edgeDetection(float *I, size_t bytesI,       // 0
                    long block_x,                  // 24
                    long grid_x                    // 25
 ) {
-  __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
-  __visc__hint(visc::CPU_TARGET);
-  void *GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing);
-  void *LNode = __visc__createNodeND(0, WrapperlaplacianEstimate);
-  void *CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings);
-  void *CGNode = __visc__createNodeND(0, WrapperComputeGradient);
-  void *CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient);
-  void *RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings);
+  __hpvm__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
+  __hpvm__hint(hpvm::CPU_TARGET);
+  void *GSNode = __hpvm__createNodeND(0, WrapperGaussianSmoothing);
+  void *LNode = __hpvm__createNodeND(0, WrapperlaplacianEstimate);
+  void *CZCNode = __hpvm__createNodeND(0, WrapperComputeZeroCrossings);
+  void *CGNode = __hpvm__createNodeND(0, WrapperComputeGradient);
+  void *CMGNode = __hpvm__createNodeND(0, WrapperComputeMaxGradient);
+  void *RZCNode = __hpvm__createNodeND(0, WrapperRejectZeroCrossings);
 
   // Gaussian Inputs
-  __visc__bindIn(GSNode, 0, 0, 1);  // Bind I
-  __visc__bindIn(GSNode, 1, 1, 1);  // Bind bytesI
-  __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs
-  __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs
-  __visc__bindIn(GSNode, 2, 4, 1);  // Bind Is
-  __visc__bindIn(GSNode, 3, 5, 1);  // Bind bytesIs
-  __visc__bindIn(GSNode, 22, 6, 1); // Bind m
-  __visc__bindIn(GSNode, 23, 7, 1); // Bind n
+  __hpvm__bindIn(GSNode, 0, 0, 1);  // Bind I
+  __hpvm__bindIn(GSNode, 1, 1, 1);  // Bind bytesI
+  __hpvm__bindIn(GSNode, 14, 2, 1); // Bind Gs
+  __hpvm__bindIn(GSNode, 15, 3, 1); // Bind bytesGs
+  __hpvm__bindIn(GSNode, 2, 4, 1);  // Bind Is
+  __hpvm__bindIn(GSNode, 3, 5, 1);  // Bind bytesIs
+  __hpvm__bindIn(GSNode, 22, 6, 1); // Bind m
+  __hpvm__bindIn(GSNode, 23, 7, 1); // Bind n
 
   // Laplacian Inputs
-  __visc__bindIn(LNode, 2, 0, 1);          // Bind Is
-  __visc__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs
-  __visc__bindIn(LNode, 16, 2, 1);         // Bind B
-  __visc__bindIn(LNode, 17, 3, 1);         // Bind bytesB
-  __visc__bindIn(LNode, 4, 4, 1);          // Bind L
-  __visc__bindIn(LNode, 5, 5, 1);          // Bind bytesL
-  __visc__bindIn(LNode, 22, 6, 1);         // Bind m
-  __visc__bindIn(LNode, 23, 7, 1);         // Bind n
+  __hpvm__bindIn(LNode, 2, 0, 1);          // Bind Is
+  __hpvm__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs
+  __hpvm__bindIn(LNode, 16, 2, 1);         // Bind B
+  __hpvm__bindIn(LNode, 17, 3, 1);         // Bind bytesB
+  __hpvm__bindIn(LNode, 4, 4, 1);          // Bind L
+  __hpvm__bindIn(LNode, 5, 5, 1);          // Bind bytesL
+  __hpvm__bindIn(LNode, 22, 6, 1);         // Bind m
+  __hpvm__bindIn(LNode, 23, 7, 1);         // Bind n
 
   // Compute ZC Inputs
-  __visc__bindIn(CZCNode, 4, 0, 1);         // Bind L
-  __visc__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL
-  __visc__bindIn(CZCNode, 16, 2, 1);        // Bind B
-  __visc__bindIn(CZCNode, 17, 3, 1);        // Bind bytesB
-  __visc__bindIn(CZCNode, 6, 4, 1);         // Bind S
-  __visc__bindIn(CZCNode, 7, 5, 1);         // Bind bytesS
-  __visc__bindIn(CZCNode, 22, 6, 1);        // Bind m
-  __visc__bindIn(CZCNode, 23, 7, 1);        // Bind n
+  __hpvm__bindIn(CZCNode, 4, 0, 1);         // Bind L
+  __hpvm__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL
+  __hpvm__bindIn(CZCNode, 16, 2, 1);        // Bind B
+  __hpvm__bindIn(CZCNode, 17, 3, 1);        // Bind bytesB
+  __hpvm__bindIn(CZCNode, 6, 4, 1);         // Bind S
+  __hpvm__bindIn(CZCNode, 7, 5, 1);         // Bind bytesS
+  __hpvm__bindIn(CZCNode, 22, 6, 1);        // Bind m
+  __hpvm__bindIn(CZCNode, 23, 7, 1);        // Bind n
 
   // Gradient Inputs
-  __visc__bindIn(CGNode, 2, 0, 1);          // Bind Is
-  __visc__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs
-  __visc__bindIn(CGNode, 18, 2, 1);         // Bind Sx
-  __visc__bindIn(CGNode, 19, 3, 1);         // Bind bytesSx
-  __visc__bindIn(CGNode, 20, 4, 1);         // Bind Sy
-  __visc__bindIn(CGNode, 21, 5, 1);         // Bind bytesSy
-  __visc__bindIn(CGNode, 8, 6, 1);          // Bind G
-  __visc__bindIn(CGNode, 9, 7, 1);          // Bind bytesG
-  __visc__bindIn(CGNode, 22, 8, 1);         // Bind m
-  __visc__bindIn(CGNode, 23, 9, 1);         // Bind n
+  __hpvm__bindIn(CGNode, 2, 0, 1);          // Bind Is
+  __hpvm__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs
+  __hpvm__bindIn(CGNode, 18, 2, 1);         // Bind Sx
+  __hpvm__bindIn(CGNode, 19, 3, 1);         // Bind bytesSx
+  __hpvm__bindIn(CGNode, 20, 4, 1);         // Bind Sy
+  __hpvm__bindIn(CGNode, 21, 5, 1);         // Bind bytesSy
+  __hpvm__bindIn(CGNode, 8, 6, 1);          // Bind G
+  __hpvm__bindIn(CGNode, 9, 7, 1);          // Bind bytesG
+  __hpvm__bindIn(CGNode, 22, 8, 1);         // Bind m
+  __hpvm__bindIn(CGNode, 23, 9, 1);         // Bind n
 
   // Max Gradient Inputs
-  __visc__bindIn(CMGNode, 8, 0, 1);          // Bind G
-  __visc__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG
-  __visc__bindIn(CMGNode, 10, 2, 1);         // Bind maxG
-  __visc__bindIn(CMGNode, 11, 3, 1);         // Bind bytesMaxG
-  __visc__bindIn(CMGNode, 22, 4, 1);         // Bind m
-  __visc__bindIn(CMGNode, 23, 5, 1);         // Bind n
-  __visc__bindIn(CMGNode, 24, 6, 1);         // Bind block_x
-  __visc__bindIn(CMGNode, 25, 7, 1);         // Bind grid_x
+  __hpvm__bindIn(CMGNode, 8, 0, 1);          // Bind G
+  __hpvm__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG
+  __hpvm__bindIn(CMGNode, 10, 2, 1);         // Bind maxG
+  __hpvm__bindIn(CMGNode, 11, 3, 1);         // Bind bytesMaxG
+  __hpvm__bindIn(CMGNode, 22, 4, 1);         // Bind m
+  __hpvm__bindIn(CMGNode, 23, 5, 1);         // Bind n
+  __hpvm__bindIn(CMGNode, 24, 6, 1);         // Bind block_x
+  __hpvm__bindIn(CMGNode, 25, 7, 1);         // Bind grid_x
 
   // Reject ZC Inputs
-  __visc__bindIn(RZCNode, 6, 0, 1);           // Bind S
-  __visc__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS
-  __visc__bindIn(RZCNode, 8, 2, 1);           // Bind G
-  __visc__bindIn(RZCNode, 9, 3, 1);           // Bind bytesG
-  __visc__bindIn(RZCNode, 10, 4, 1);          // Bind maxG
-  __visc__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG
-  __visc__bindIn(RZCNode, 12, 6, 1);          // Bind E
-  __visc__bindIn(RZCNode, 13, 7, 1);          // Bind bytesE
-  __visc__bindIn(RZCNode, 22, 8, 1);          // Bind m
-  __visc__bindIn(RZCNode, 23, 9, 1);          // Bind n
-
-  __visc__bindOut(RZCNode, 0, 0, 1); // Bind output
+  __hpvm__bindIn(RZCNode, 6, 0, 1);           // Bind S
+  __hpvm__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS
+  __hpvm__bindIn(RZCNode, 8, 2, 1);           // Bind G
+  __hpvm__bindIn(RZCNode, 9, 3, 1);           // Bind bytesG
+  __hpvm__bindIn(RZCNode, 10, 4, 1);          // Bind maxG
+  __hpvm__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG
+  __hpvm__bindIn(RZCNode, 12, 6, 1);          // Bind E
+  __hpvm__bindIn(RZCNode, 13, 7, 1);          // Bind bytesE
+  __hpvm__bindIn(RZCNode, 22, 8, 1);          // Bind m
+  __hpvm__bindIn(RZCNode, 23, 9, 1);          // Bind n
+
+  __hpvm__bindOut(RZCNode, 0, 0, 1); // Bind output
 }
 }
 
@@ -796,7 +796,7 @@ int main(int argc, char *argv[]) {
   assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() &&
          S.isContinuous() && G.isContinuous() && E.isContinuous());
 
-  __visc__init();
+  __hpvm__init();
 
   // copy A to device memory
   I_sz = src.size[0] * src.size[1] * sizeof(float);
@@ -843,7 +843,7 @@ int main(int argc, char *argv[]) {
 
   for (unsigned j = 0; j < NUM_RUNS; j++) {
     std::cout << "Run: " << j << "\n";
-    void *DFG = __visc__launch(1, edgeDetection, (void *)args);
+    void *DFG = __hpvm__launch(1, edgeDetection, (void *)args);
 
     cap = VideoCapture(inFile);
     getNextFrame(cap, src);
@@ -855,25 +855,25 @@ int main(int argc, char *argv[]) {
 
         *maxG = 0.0;
 
-        llvm_visc_track_mem(src.data, I_sz);
-        llvm_visc_track_mem(Is.data, I_sz);
-        llvm_visc_track_mem(L.data, I_sz);
-        llvm_visc_track_mem(S.data, I_sz);
-        llvm_visc_track_mem(G.data, I_sz);
-        llvm_visc_track_mem(maxG, bytesMaxG);
-        llvm_visc_track_mem(E.data, I_sz);
-        llvm_visc_track_mem(Gs, bytesGs);
-        llvm_visc_track_mem(B, bytesB);
-        llvm_visc_track_mem(Sx, bytesSx);
-        llvm_visc_track_mem(Sy, bytesSy);
-
-        __visc__push(DFG, args);
-        void *ret = __visc__pop(DFG);
+        llvm_hpvm_track_mem(src.data, I_sz);
+        llvm_hpvm_track_mem(Is.data, I_sz);
+        llvm_hpvm_track_mem(L.data, I_sz);
+        llvm_hpvm_track_mem(S.data, I_sz);
+        llvm_hpvm_track_mem(G.data, I_sz);
+        llvm_hpvm_track_mem(maxG, bytesMaxG);
+        llvm_hpvm_track_mem(E.data, I_sz);
+        llvm_hpvm_track_mem(Gs, bytesGs);
+        llvm_hpvm_track_mem(B, bytesB);
+        llvm_hpvm_track_mem(Sx, bytesSx);
+        llvm_hpvm_track_mem(Sy, bytesSy);
+
+        __hpvm__push(DFG, args);
+        void *ret = __hpvm__pop(DFG);
         std::cout << "Returned size: " << *(size_t *)ret << " expected " << I_sz
                   << '\n';
 
-        llvm_visc_request_mem(maxG, bytesMaxG);
-        llvm_visc_request_mem(E.data, I_sz);
+        llvm_hpvm_request_mem(maxG, bytesMaxG);
+        llvm_hpvm_request_mem(E.data, I_sz);
 
         Mat in, out;
         resize(src, in, Size(HEIGHT, WIDTH));
@@ -882,26 +882,26 @@ int main(int argc, char *argv[]) {
         imshow(input_window, in);
         waitKey(1);
 
-        llvm_visc_untrack_mem(src.data);
-        llvm_visc_untrack_mem(Is.data);
-        llvm_visc_untrack_mem(L.data);
-        llvm_visc_untrack_mem(S.data);
-        llvm_visc_untrack_mem(G.data);
-        llvm_visc_untrack_mem(maxG);
-        llvm_visc_untrack_mem(E.data);
-        llvm_visc_untrack_mem(Gs);
-        llvm_visc_untrack_mem(B);
-        llvm_visc_untrack_mem(Sx);
-        llvm_visc_untrack_mem(Sy);
+        llvm_hpvm_untrack_mem(src.data);
+        llvm_hpvm_untrack_mem(Is.data);
+        llvm_hpvm_untrack_mem(L.data);
+        llvm_hpvm_untrack_mem(S.data);
+        llvm_hpvm_untrack_mem(G.data);
+        llvm_hpvm_untrack_mem(maxG);
+        llvm_hpvm_untrack_mem(E.data);
+        llvm_hpvm_untrack_mem(Gs);
+        llvm_hpvm_untrack_mem(B);
+        llvm_hpvm_untrack_mem(Sx);
+        llvm_hpvm_untrack_mem(Sy);
 
         getNextFrame(cap, src);
       }
     } else {
-      __visc__push(DFG, args);
-      __visc__pop(DFG);
+      __hpvm__push(DFG, args);
+      __hpvm__pop(DFG);
     }
-    __visc__wait(DFG);
+    __hpvm__wait(DFG);
   }
-  __visc__cleanup();
+  __hpvm__cleanup();
   return 0;
 }
diff --git a/hpvm/test/template/Makefile b/hpvm/test/template/Makefile
index 3aa4bd1d6f2c3f7bb2be07ba5e662c5b6faf1655..d3344887e1cb516b09c4eb92bcad8f9b646d94a3 100644
--- a/hpvm/test/template/Makefile
+++ b/hpvm/test/template/Makefile
@@ -25,12 +25,12 @@ CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 EXE = $(EXE_NAME)-$(TARGET)
 
 INCLUDES += -I$(SRC_DIR)
-INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(VISC_BUILD_DIR)/include
+INCLUDES += -I$(LLVM_SRC_ROOT)/include -I../include -I$(HPVM_BUILD_DIR)/include
 
 ## BEGIN HPVM MAKEFILE
 SRCDIR_OBJS= io.ll
 OBJS_SRC=src/io.cc
-VISC_OBJS=main.visc.ll
+HPVM_OBJS=main.hpvm.ll
 APP = $(EXE)
 APP_CFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
 APP_CXXFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
@@ -41,21 +41,21 @@ OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
 LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
 
-VISC_RT_PATH = $(LLVM_BUILD_ROOT)/tools/hpvm/projects/visc-rt
-VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.bc
+HPVM_RT_PATH = $(LLVM_BUILD_ROOT)/tools/hpvm/projects/hpvm-rt
+HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.bc
 
-TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
 
 ifeq ($(TARGET),seq)
   DEVICE = CPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86
 else
   DEVICE = GPU_TARGET
-  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
-  VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  HPVM_OPTFLAGS += -hpvm-timers-x86 -hpvm-timers-ptx
 endif
-  TESTGEN_OPTFLAGS += -visc-timers-gen
+  TESTGEN_OPTFLAGS += -hpvm-timers-gen
 
 CFLAGS += -DDEVICE=$(DEVICE)
 CXXFLAGS += -DDEVICE=$(DEVICE)
@@ -66,7 +66,7 @@ INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
 .PRECIOUS: $(BUILD_DIR)/%.ll
 
 OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
 KERNEL = $(TEST_OBJS).kernels.ll
 
 ifeq ($(TARGET),gpu)
@@ -91,11 +91,11 @@ $(KERNEL_OCL) : $(KERNEL)
 $(EXE) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@
 
-$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
+$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB)
 	$(LLVM_LINK) $^ -S -o $@
 
-$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
-	$(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST)
+$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS)
+	$(OPT) $(HPVM_OPTFLAGS) -S $< -o $(HOST)
 
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
@@ -106,7 +106,7 @@ $(BUILD_DIR)/%.ll : $(SRC_DIR)/%.cc
 $(BUILD_DIR)/main.ll : $(SRC_DIR)/main.cc
 	$(CC) $(CXXFLAGS) -emit-llvm -S -o $@ $<
 
-$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
+$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll
 	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
 
 ## END HPVM MAKEFILE
diff --git a/hpvm/test/template/README.md b/hpvm/test/template/README.md
index 198604817d5a8463e555451a1188b426ec4e31cd..ca51cbb90018e316f7a7f775223c0741b7328841 100644
--- a/hpvm/test/template/README.md
+++ b/hpvm/test/template/README.md
@@ -6,19 +6,19 @@ Let's look at the compilation of the `pipeline` test for gpu as an example.
 /.../hpvm/build/bin/clang -Isrc/ -I -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -DDEVICE=GPU_TARGET -emit-llvm -S -o build/main.ll src/main.cc
 ```
 
-`opt` is used to invoke the GenVISC pass, which converts the HPVM function calls to LLVM intrinsics.
+`opt` is used to invoke the GenHPVM pass, which converts the HPVM function calls to LLVM intrinsics.
 ```
-/.../hpvm/build/bin/opt -debug-only=genvisc -load LLVMGenVISC.so -genvisc -globaldce -visc-timers-gen build/main.ll -S -o build/main.visc.ll
+/.../hpvm/build/bin/opt -debug-only=genhpvm -load LLVMGenHPVM.so -genhpvm -globaldce -hpvm-timers-gen build/main.ll -S -o build/main.hpvm.ll
 ```
 
 `opt` is used again to invoke the BuildDFG pass, which converts the textual representation to the internal HPVM representation.
 ```
-/.../hpvm/build/bin/opt -debug -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -visc-timers-x86 -visc-timers-ptx -S build/main.visc.ll -o build/pipeline-gpu.host.ll
+/.../hpvm/build/bin/opt -debug -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -hpvm-timers-x86 -hpvm-timers-ptx -S build/main.hpvm.ll -o build/pipeline-gpu.host.ll
 ```
 
 `llvm-cbe` is a C backend for LLVM. It is used here to create the OpenCL kernel.
 ```
-/.../hpvm/build/bin/llvm-cbe -debug build/gpu/main.visc.ll.kernels.ll -o build/gpu/main.visc.ll.kernels.cl
+/.../hpvm/build/bin/llvm-cbe -debug build/gpu/main.hpvm.ll.kernels.ll -o build/gpu/main.hpvm.ll.kernels.cl
 ```
 
 `clang` is used again to compile a separate source file that contains I/O code.
@@ -26,9 +26,9 @@ Let's look at the compilation of the `pipeline` test for gpu as an example.
 /.../hpvm/build/bin/clang -Isrc/ -I -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize -I/.../hpvm/llvm/include -I../include -I/.../hpvm/build/include -emit-llvm -S -o build/gpu/io.ll src/io.cc
 ```
 
-`llvm-link` is used to link against the VISC runtime.
+`llvm-link` is used to link against the HPVM runtime.
 ```
-/.../hpvm/build/bin/llvm-link build/gpu/pipeline-gpu.host.ll build/gpu/io.ll /.../hpvm/llvm/tools/hpvm/projects/visc-rt/visc-rt.ll -S -o build/gpu/pipeline-gpu.linked.ll
+/.../hpvm/build/bin/llvm-link build/gpu/pipeline-gpu.host.ll build/gpu/io.ll /.../hpvm/llvm/tools/hpvm/projects/hpvm-rt/hpvm-rt.ll -S -o build/gpu/pipeline-gpu.linked.ll
 ```
 
 `clang++` is used to do the final linking against OpenCL and emit the binary.
diff --git a/hpvm/test/unitTests/CreateNodeAndEdge.c b/hpvm/test/unitTests/CreateNodeAndEdge.c
index 1b6b1cff211d5af5a909065af988aadbe979f2ec..c3f58c95d631b5c49a47de1cbe41ed5ea871f5f4 100644
--- a/hpvm/test/unitTests/CreateNodeAndEdge.c
+++ b/hpvm/test/unitTests/CreateNodeAndEdge.c
@@ -1,4 +1,4 @@
-#include "visc.h"
+#include "hpvm.h"
 #include <stdio.h>
 
 struct Root {
@@ -7,33 +7,33 @@ struct Root {
 };
 
 void Func1(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(1, In, 1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(1, In, 1, Out);
 
-  __visc__return(1, Out);
+  __hpvm__return(1, Out);
 }
 
 void Func2(int *BindIn, int *SrcIn, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(2, BindIn, SrcIn, 1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(2, BindIn, SrcIn, 1, Out);
 
-  __visc__return(1, Out);
+  __hpvm__return(1, Out);
 }
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
+  __hpvm__hint(CPU_TARGET);
 
-  __visc__attributes(1, In, 1, Out);
+  __hpvm__attributes(1, In, 1, Out);
 
-  void *SrcNode = __visc__createNodeND(0, Func1);
-  void *DestNode = __visc__createNodeND(0, Func2);
+  void *SrcNode = __hpvm__createNodeND(0, Func1);
+  void *DestNode = __hpvm__createNodeND(0, Func2);
 
-  __visc__bindIn(SrcNode, 0, 0, 0);
+  __hpvm__bindIn(SrcNode, 0, 0, 0);
 
-  __visc__bindIn(DestNode, 0, 0, 0);
-  __visc__edge(SrcNode, DestNode, 1, 0, 1, 0);
+  __hpvm__bindIn(DestNode, 0, 0, 0);
+  __hpvm__edge(SrcNode, DestNode, 1, 0, 1, 0);
 
-  __visc__bindOut(SrcNode, 0, 0, 0);
+  __hpvm__bindOut(SrcNode, 0, 0, 0);
 }
 
 int main(void) {
@@ -41,10 +41,10 @@ int main(void) {
   int Out = 0;
   struct Root RootArgs = {(int *)&In, (int *)&Out};
 
-  __visc__init();
-  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)&RootArgs);
-  __visc__wait(PipeDFG);
-  __visc__cleanup();
+  __hpvm__init();
+  void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)&RootArgs);
+  __hpvm__wait(PipeDFG);
+  __hpvm__cleanup();
 
   return 0;
 }
diff --git a/hpvm/test/unitTests/Makefile b/hpvm/test/unitTests/Makefile
index 539ee5e8fbf010d33663c98470b245bb2710eeea..15580e9300a119f55e4a828b645c27dd00b62ff8 100644
--- a/hpvm/test/unitTests/Makefile
+++ b/hpvm/test/unitTests/Makefile
@@ -2,8 +2,8 @@ PASSES :=
 
 .PHONY: clean
 
-LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
-LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
+LLVM_INSTALL:=/home/psrivas2/Hetero/HPVM/Code/trunk/llvm-install
+LIBCLC:=/home/psrivas2/Hetero/HPVM/Code/trunk/libclc
 HOST:=gemm_opencl
 KERNELS:=matrixMul
 LLVM_CC:=$(LLVM_INSTALL)/bin/clang
diff --git a/hpvm/test/unitTests/MallocIntrinsic.c b/hpvm/test/unitTests/MallocIntrinsic.c
index cfd041a991d976c24b372a81b35842598b571d89..173f6b3b16d1090a98242d345cefa330910d862d 100644
--- a/hpvm/test/unitTests/MallocIntrinsic.c
+++ b/hpvm/test/unitTests/MallocIntrinsic.c
@@ -1,4 +1,4 @@
-#include "visc.h"
+#include "hpvm.h"
 #include <stdlib.h>
 
 struct Root {
@@ -7,12 +7,12 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(1, In, 1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(1, In, 1, Out);
 
-  Out = (int *)__visc__malloc(*In);
+  Out = (int *)__hpvm__malloc(*In);
 
-  __visc__return(1, Out);
+  __hpvm__return(1, Out);
 }
 
 int main(void) {
@@ -26,12 +26,12 @@ int main(void) {
   RootArgs->input = (int *)&In;
   RootArgs->output = (int *)&Out;
 
-  __visc__init();
+  __hpvm__init();
 
-  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
-  __visc__wait(PipeDFG);
+  void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs);
+  __hpvm__wait(PipeDFG);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   return 0;
 }
diff --git a/hpvm/test/unitTests/PipelineIntrinsics.c b/hpvm/test/unitTests/PipelineIntrinsics.c
index 2a9bf83402891beddf13d96c6346e8fed924d17e..43ba0ef56cf160acb1fab6ea334732e56e0359d2 100644
--- a/hpvm/test/unitTests/PipelineIntrinsics.c
+++ b/hpvm/test/unitTests/PipelineIntrinsics.c
@@ -1,4 +1,4 @@
-#include "visc.h"
+#include "hpvm.h"
 #include <stdlib.h>
 
 struct Root {
@@ -7,9 +7,9 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(1, In, 1, Out);
-  __visc__return(1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(1, In, 1, Out);
+  __hpvm__return(1, Out);
 }
 
 int main(void) {
@@ -23,12 +23,12 @@ int main(void) {
   RootArgs->input = (int *)&In;
   RootArgs->output = (int *)&Out;
 
-  __visc__init();
+  __hpvm__init();
 
-  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
-  __visc__wait(PipeDFG);
+  void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs);
+  __hpvm__wait(PipeDFG);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   return 0;
 }
diff --git a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
index 36fc02d22b066025be4a57695265779d8e55652a..c2deed98679bf794316f283acef8e3c1db9ffa88 100644
--- a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
+++ b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
@@ -1,4 +1,4 @@
-#include "visc.h"
+#include "hpvm.h"
 #include <stdlib.h>
 
 struct Root {
@@ -7,24 +7,24 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint(CPU_TARGET);
-  __visc__attributes(1, In, 1, Out);
-  __visc__return(1, Out);
+  __hpvm__hint(CPU_TARGET);
+  __hpvm__attributes(1, In, 1, Out);
+  __hpvm__return(1, Out);
 }
 
 int main(void) {
   int In, Out;
 
-  __visc__init();
+  __hpvm__init();
 
   struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root));
   RootArgs->input = (int *)&In;
   RootArgs->output = (int *)&Out;
 
-  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
-  __visc__wait(PipeDFG);
+  void *PipeDFG = __hpvm__launch(0, PipeRoot, (void *)RootArgs);
+  __hpvm__wait(PipeDFG);
 
-  __visc__cleanup();
+  __hpvm__cleanup();
 
   return 0;
 }
diff --git a/hpvm/test/unitTests/temp/3level.ll b/hpvm/test/unitTests/temp/3level.ll
index 168e7b42322c8f7fa4be83a64cbd06d44dd9e428..2e3753f1400798d0989e2a01be78ab338205a291 100644
--- a/hpvm/test/unitTests/temp/3level.ll
+++ b/hpvm/test/unitTests/temp/3level.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/3level.ll'
@@ -13,31 +13,31 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
@@ -47,18 +47,18 @@ entry:
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output1 = extractvalue %rtype %outputstruct, 0
   %output2 = extractvalue %rtype %outputstruct, 1
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output1) #0
   %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output2) #0
   ret i32 0
@@ -83,21 +83,21 @@ define %rtype_internal @foo(i32 %id) {
 }
 
 define %rtype_internal @subNode(i32 %id) {
-  %foo_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @foo to i8*))
-  call void @llvm.visc.bind.input(i8* %foo_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %foo_node, i32 0, i32 0)
+  %foo_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @foo to i8*))
+  call void @llvm.hpvm.bind.input(i8* %foo_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %foo_node, i32 0, i32 0)
   ret %rtype_internal zeroinitializer
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @consumer to i8*))
-  %sub_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype_internal (i32)* @subNode to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %sub_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %sub_node, i32 0, i32 1)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @consumer to i8*))
+  %sub_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype_internal (i32)* @subNode to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %sub_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %sub_node, i32 0, i32 1)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/Makefile b/hpvm/test/unitTests/temp/Makefile
index 539ee5e8fbf010d33663c98470b245bb2710eeea..15580e9300a119f55e4a828b645c27dd00b62ff8 100644
--- a/hpvm/test/unitTests/temp/Makefile
+++ b/hpvm/test/unitTests/temp/Makefile
@@ -2,8 +2,8 @@ PASSES :=
 
 .PHONY: clean
 
-LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
-LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
+LLVM_INSTALL:=/home/psrivas2/Hetero/HPVM/Code/trunk/llvm-install
+LIBCLC:=/home/psrivas2/Hetero/HPVM/Code/trunk/libclc
 HOST:=gemm_opencl
 KERNELS:=matrixMul
 LLVM_CC:=$(LLVM_INSTALL)/bin/clang
diff --git a/hpvm/test/unitTests/temp/query2D.ll b/hpvm/test/unitTests/temp/query2D.ll
index c994c2a3ff5b166b2f192f4b900982b3b7afc508..48358a3527553c8f4a31ff89454010289d02c072 100644
--- a/hpvm/test/unitTests/temp/query2D.ll
+++ b/hpvm/test/unitTests/temp/query2D.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/query2D.ll'
@@ -12,46 +12,46 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
+declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
+declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
+declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
@@ -61,25 +61,25 @@ entry:
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %dim = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %dim = call i32 @llvm.hpvm.getNumNodeInstances.x(i8* %this_node)
   %sum2 = add i32 %sum, %dim
   %output = insertvalue %rtype undef, i32 %sum2, 0
   ret %rtype %output
@@ -92,11 +92,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %dimension) {
-  %p_node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 %dimension)
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode2D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 %dimension)
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/query3D.ll b/hpvm/test/unitTests/temp/query3D.ll
index 438fe60a3bc6c2dfe718da76d55041addc47367f..d2ff16ef56628752b997577891c44fd904be4405 100644
--- a/hpvm/test/unitTests/temp/query3D.ll
+++ b/hpvm/test/unitTests/temp/query3D.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/query3D.ll'
@@ -12,57 +12,57 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
+declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
+declare i8* @llvm.hpvm.createNode2D(i8*, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
+declare i8* @llvm.hpvm.createNode3D(i8*, i32, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
+declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0
+declare i32 @llvm.hpvm.getNumNodeInstances.y(i8*) #0
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -71,21 +71,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %dim = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %dim = call i32 @llvm.hpvm.getNumNodeInstances.y(i8* %this_node)
   %sum2 = add i32 %sum, %dim
   %output = insertvalue %rtype undef, i32 %sum2, 0
   ret %rtype %output
@@ -98,11 +98,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %dimension) {
-  %p_node = call i8* @llvm.visc.createNode3D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 10, i32 30)
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode3D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension, i32 10, i32 30)
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/queryNodeInst.ll b/hpvm/test/unitTests/temp/queryNodeInst.ll
index 24d6a3f0d30e6661c0f1396e082f889d54dc50be..4e3dd7553045d466199c726416db220a6be2d1aa 100644
--- a/hpvm/test/unitTests/temp/queryNodeInst.ll
+++ b/hpvm/test/unitTests/temp/queryNodeInst.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
@@ -12,40 +12,40 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -54,21 +54,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node)
   %sum2 = add i32 %sum, %numDim
   %output = insertvalue %rtype undef, i32 %sum, 0
   ret %rtype %output
@@ -81,11 +81,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/queryNumDim.ll b/hpvm/test/unitTests/temp/queryNumDim.ll
index 500e2ff41bd52f29a56cfd49563927bf6323482b..caa0978dabab0bf6295853e35f23e3ed68f00840 100644
--- a/hpvm/test/unitTests/temp/queryNumDim.ll
+++ b/hpvm/test/unitTests/temp/queryNumDim.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
@@ -12,42 +12,42 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -56,21 +56,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node)
   %sum2 = add i32 %sum, %numDim
   %output = insertvalue %rtype undef, i32 %sum, 0
   ret %rtype %output
@@ -83,11 +83,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/queryNumNodeInst.ll b/hpvm/test/unitTests/temp/queryNumNodeInst.ll
index 48add92f16125bdf33c9691896a8b7259339fe78..07418ff725c277e2e8adbe6a39d8831e2b77bc59 100644
--- a/hpvm/test/unitTests/temp/queryNumNodeInst.ll
+++ b/hpvm/test/unitTests/temp/queryNumNodeInst.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
@@ -12,48 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
+declare i8* @llvm.hpvm.createNode1D(i8*, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
+declare i32 @llvm.hpvm.getNumNodeInstances.x(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -62,21 +62,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %dim = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %dim = call i32 @llvm.hpvm.getNumNodeInstances.x(i8* %this_node)
   %sum2 = add i32 %sum, %dim
   %output = insertvalue %rtype undef, i32 %sum2, 0
   ret %rtype %output
@@ -89,11 +89,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %dimension) {
-  %p_node = call i8* @llvm.visc.createNode1D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension)
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode1D(i8* bitcast (%rtype (i32)* @producer to i8*), i32 %dimension)
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/singleNode.ll b/hpvm/test/unitTests/temp/singleNode.ll
index 20713e955fb457acec2e2968d1b4a2ae61396fe0..99e53181317a6b27a83916682bcf1457895c0bfc 100644
--- a/hpvm/test/unitTests/temp/singleNode.ll
+++ b/hpvm/test/unitTests/temp/singleNode.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/singleNode.ll'
@@ -12,43 +12,43 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -59,8 +59,8 @@ define %rtype @foo() {
 }
 
 define %rtype @Root() {
-  %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo to i8*))
-  call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0)
+  %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo to i8*))
+  call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/singleNodeStream.ll b/hpvm/test/unitTests/temp/singleNodeStream.ll
index fce75df6714240286e9a676e40e37c3f14e537a6..aa0243603c420a21f51f9842d467f9da814f1814 100644
--- a/hpvm/test/unitTests/temp/singleNodeStream.ll
+++ b/hpvm/test/unitTests/temp/singleNodeStream.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll'
@@ -14,39 +14,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.push(i8*, i8*) #0
+declare void @llvm.hpvm.push(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.pop(i8*) #0
+declare i8* @llvm.hpvm.pop(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32, i1)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %num = alloca i32
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
@@ -60,27 +60,27 @@ entry:
   %args = bitcast %struct.arg* %in.addr to i8*
 
   ; Launch the pipeline
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rptype (i32*, i64)* @Root to i8*), i8* %args, i1 1)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rptype (i32*, i64)* @Root to i8*), i8* %args, i1 1)
 
   ; Push arguments into the pipeline
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
 
   ; Pop out arguments and read the output
-  %graph_output = call i8* @llvm.visc.pop(i8* %graphID)
-  %graph_output1 = call i8* @llvm.visc.pop(i8* %graphID)
-  %graph_output2 = call i8* @llvm.visc.pop(i8* %graphID)
-  %graph_output3 = call i8* @llvm.visc.pop(i8* %graphID)
+  %graph_output = call i8* @llvm.hpvm.pop(i8* %graphID)
+  %graph_output1 = call i8* @llvm.hpvm.pop(i8* %graphID)
+  %graph_output2 = call i8* @llvm.hpvm.pop(i8* %graphID)
+  %graph_output3 = call i8* @llvm.hpvm.pop(i8* %graphID)
   %output.addr = bitcast i8* %graph_output to %rptype*
   %outputstruct = load %rptype* %output.addr
   %output = extractvalue %rptype %outputstruct, 0
   %output_val = load i32* %output
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @out.str, i64 0, i64 0), i32 %output_val) #0
 
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -95,11 +95,11 @@ define %rptype @producer(i32* %id, i64 %size) {
 }
 
 define %rptype @Root(i32* %id, i64 %size) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*))
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0, i1 1)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 1, i32 1, i1 1)
-  call void @llvm.visc.bind.output(i8* %p_node, i32 0, i32 0, i1 1)
-  call void @llvm.visc.bind.output(i8* %p_node, i32 1, i32 1, i1 1)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*))
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0, i1 1)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 1, i32 1, i1 1)
+  call void @llvm.hpvm.bind.output(i8* %p_node, i32 0, i32 0, i1 1)
+  call void @llvm.hpvm.bind.output(i8* %p_node, i32 1, i32 1, i1 1)
   ret %rptype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoLaunch.ll b/hpvm/test/unitTests/temp/twoLaunch.ll
index 48c973a7e6f1cc5422fffd8d9e4ae0a0e1a06bf9..ee602f58d82f004a7b19bf54e55e1c0759c17bef 100644
--- a/hpvm/test/unitTests/temp/twoLaunch.ll
+++ b/hpvm/test/unitTests/temp/twoLaunch.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/singleNode.ll'
@@ -12,33 +12,33 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr_1 = alloca %struct.arg
   %in.addr_2= alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
@@ -47,12 +47,12 @@ entry:
   %conv.i = trunc i64 %call.i to i32
   %args_1 = bitcast %struct.arg* %in.addr_1 to i8*
   %args_2 = bitcast %struct.arg* %in.addr_2 to i8*
-  %graphID_1 = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root_1 to i8*), i8* %args_1)
-  %graphID_2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root_2 to i8*), i8* %args_2)
+  %graphID_1 = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root_1 to i8*), i8* %args_1)
+  %graphID_2 = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype ()* @Root_2 to i8*), i8* %args_2)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID_1)
-  call void @llvm.visc.wait(i8* %graphID_2)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID_1)
+  call void @llvm.hpvm.wait(i8* %graphID_2)
+  call void @llvm.hpvm.cleanup()
 
   ret i32 0
 }
@@ -70,14 +70,14 @@ define %rtype @foo_2() {
 }
 
 define %rtype @Root_1() {
-  %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo_1 to i8*))
-  call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0)
+  %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo_1 to i8*))
+  call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
 define %rtype @Root_2() {
-  %node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @foo_2 to i8*))
-  call void @llvm.visc.bind.output(i8* %node, i32 0, i32 0)
+  %node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @foo_2 to i8*))
+  call void @llvm.hpvm.bind.output(i8* %node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoNode.ll b/hpvm/test/unitTests/temp/twoNode.ll
index 5e2899830b835ff50c9d2d8e4157451d4bd26f7f..74e4c64d599f7204b375743687c6da2b7ed8c9f6 100644
--- a/hpvm/test/unitTests/temp/twoNode.ll
+++ b/hpvm/test/unitTests/temp/twoNode.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
@@ -11,33 +11,33 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -46,10 +46,10 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -66,10 +66,10 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype ()* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype ()* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoNodeConnect.ll b/hpvm/test/unitTests/temp/twoNodeConnect.ll
index 06652b94e02c2cac66ab4a07e88dec0a04da49f8..6b23ad691bacb42c39fe681967d4c584179644f1 100644
--- a/hpvm/test/unitTests/temp/twoNodeConnect.ll
+++ b/hpvm/test/unitTests/temp/twoNodeConnect.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll'
@@ -11,33 +11,33 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -46,14 +46,14 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -70,11 +70,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoNodeQuery.ll b/hpvm/test/unitTests/temp/twoNodeQuery.ll
index 2e1ea0dba4659d92b9c1b0600732748c87571671..247d1830dadff69ac5380b939d26c5f850bc08ac 100644
--- a/hpvm/test/unitTests/temp/twoNodeQuery.ll
+++ b/hpvm/test/unitTests/temp/twoNodeQuery.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeQuery.ll'
@@ -11,42 +11,42 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32)
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
+declare i8* @llvm.hpvm.getNode() #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
+declare i8* @llvm.hpvm.getParentNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
+declare i32 @llvm.hpvm.getNumDims(i8*) #0
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
@@ -55,21 +55,21 @@ entry:
   %1 = bitcast %struct.arg* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  call void @llvm.visc.wait(i8* %graphID)
+  call void @llvm.hpvm.wait(i8* %graphID)
   %2 = getelementptr %struct.arg* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output = extractvalue %rtype %outputstruct, 0
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
 define %rtype @producer(i32 %id) {
   %sum = add i32 4, %id
-  %this_node = call i8* @llvm.visc.getNode()
-  %numDim = call i32 @llvm.visc.getNumDims(i8* %this_node)
+  %this_node = call i8* @llvm.hpvm.getNode()
+  %numDim = call i32 @llvm.hpvm.getNumDims(i8* %this_node)
   %sum2 = add i32 %sum, %numDim
   %output = insertvalue %rtype undef, i32 %sum, 0
   ret %rtype %output
@@ -82,11 +82,11 @@ define %rtype @consumer(i32 %id) {
 }
 
 define %rtype @Root(i32 %id) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rtype (i32)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0)
   ret %rtype zeroinitializer
 }
 
diff --git a/hpvm/test/unitTests/temp/twoNodeStream.ll b/hpvm/test/unitTests/temp/twoNodeStream.ll
index 6e9925951884775e7ba60bb396a97fd9bc0ef52d..f9820abd19eb7b329b2c7184719d9699b15891e6 100644
--- a/hpvm/test/unitTests/temp/twoNodeStream.ll
+++ b/hpvm/test/unitTests/temp/twoNodeStream.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: llvm-link %t.ll ~/current-src/projects/hpvm-rt/hpvm-rt.ll -S -o %t.linked.ll
 ; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll'
@@ -14,39 +14,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
+declare void @llvm.hpvm.init() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
+declare void @llvm.hpvm.cleanup() #1
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
+declare i8* @llvm.hpvm.createNode(i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #0
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i1) #0
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.push(i8*, i8*) #0
+declare void @llvm.hpvm.push(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.pop(i8*) #0
+declare i8* @llvm.hpvm.pop(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
+declare void @llvm.hpvm.wait(i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32, i1)
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1)
 
 ; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32, i1)
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1)
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  call void @llvm.visc.init()
+  call void @llvm.hpvm.init()
   %in.addr = alloca %struct.arg
   %num = alloca i32
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
@@ -60,21 +60,21 @@ entry:
   %args = bitcast %struct.arg* %in.addr to i8*
 
   ; Launch the pipeline
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rctype (i32*, i64)* @Root to i8*), i8* %args, i1 1)
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%rctype (i32*, i64)* @Root to i8*), i8* %args, i1 1)
 
   ; Push arguments into the pipeline
-  call void @llvm.visc.push(i8* %graphID, i8* %args)
+  call void @llvm.hpvm.push(i8* %graphID, i8* %args)
 
   ; Pop out arguments and read the output
-  %graph_output = call i8* @llvm.visc.pop(i8* %graphID)
+  %graph_output = call i8* @llvm.hpvm.pop(i8* %graphID)
   %output.addr = bitcast i8* %graph_output to %rctype*
   %outputstruct = load %rctype* %output.addr
   %output = extractvalue %rctype %outputstruct, 0
   %output_val = load i32* %output
   %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([18 x i8]* @out.str, i64 0, i64 0), i32 %output_val) #0
 
-  call void @llvm.visc.wait(i8* %graphID)
-  call void @llvm.visc.cleanup()
+  call void @llvm.hpvm.wait(i8* %graphID)
+  call void @llvm.hpvm.cleanup()
   ret i32 0
 }
 
@@ -97,14 +97,14 @@ define %rctype @consumer(i32* %id, i64 %size) {
 }
 
 define %rctype @Root(i32* %id, i64 %size) {
-  %p_node = call i8* @llvm.visc.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*))
-  %c_node = call i8* @llvm.visc.createNode(i8* bitcast (%rctype (i32*, i64)* @consumer to i8*))
-  %edge = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0, i1 1)
-  %edge2 = call i8* @llvm.visc.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 1, i32 1, i1 1)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 0, i32 0, i1 1)
-  call void @llvm.visc.bind.input(i8* %p_node, i32 1, i32 1, i1 0)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 0, i32 0, i1 1)
-  call void @llvm.visc.bind.output(i8* %c_node, i32 1, i32 1, i1 1)
+  %p_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rptype (i32*, i64)* @producer to i8*))
+  %c_node = call i8* @llvm.hpvm.createNode(i8* bitcast (%rctype (i32*, i64)* @consumer to i8*))
+  %edge = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 0, i32 0, i1 1)
+  %edge2 = call i8* @llvm.hpvm.createEdge(i8* %p_node, i8* %c_node, i1 false, i32 1, i32 1, i1 1)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 0, i32 0, i1 1)
+  call void @llvm.hpvm.bind.input(i8* %p_node, i32 1, i32 1, i1 0)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 0, i32 0, i1 1)
+  call void @llvm.hpvm.bind.output(i8* %c_node, i32 1, i32 1, i1 1)
   ret %rctype zeroinitializer
 }