diff --git a/README.md b/README.md
index 4970eccbacb5ecd9fb30f2dcde065d43b672d6db..3f054d92b3965ec7cecec1af391c179608e1e719 100644
--- a/README.md
+++ b/README.md
@@ -7,24 +7,42 @@ This repository contains miscellaneous supporting materals for HPVM.
 [PPoPP'18 paper](http://rsim.cs.illinois.edu/Pubs/17-PPOPP-HPVM.pdf)
 
 ## Dependencies
-You would need to download and install the following components for using AVX and NVIDIA GPUs to speed up your programs
+You would need to download and install the following components for using NVIDIA GPUs to speed up your programs
 
 * Intel OpenCL SDK for Linux from [software.intel.com/sdk/opencl](software.intel.com/sdk/opencl). Follow the installation instructions (no special requirements).
 * CUDA
 
-## Build
+## Getting source code and building HPVM
 
-Switch to hpvm-reorg-9
+Checkout HPVM:
 ```shell
-git checkout hpvm-reorg-9
+git clone https://gitlab.engr.illinois.edu/llvm/hpvm.git
+git checkout hpvm-reorg-9 (this step may not be needed once code is mirrored on Github)
 ```
 
-Build hpvm
+HPVM installer script can be used to dowwnload, configure and build HPMV along with LLVM and other subprojects including Clang. 
 ```shell
 bash install.sh
 ```
+Specifically, the HPVM installer downloads the LLVM, Clang, compiler-rt, libcxxabi and lld, copies HPVM source into 
+llvm/tools and build the entire tree. LLVM C-Backend is also built as a part of HPVM and is currently used to perform 
+code generation in OpenCL for GPUs.
 
-Build hpvm runtime 
+Alternatively, CMake can be run manually.
+```shell
+cd hpvm/build
+cmake ../llvm [options]
+```
+Some common options that can be used with CMake are:
+
+* -DCMAKE_INSTALL_PREFIX=directory --- Specify for directory the full pathname of where you want the HPVM tools and libraries to be installed.
+
+* -DCMAKE_BUILD_TYPE=type --- Valid options for type are Debug, Release, RelWithDebInfo, and MinSizeRel. Default is Debug.
+
+* -DLLVM_ENABLE_ASSERTIONS=On --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types).
+
+## Building hpvm runtime
+HPVM also includes a runtime library which comprises of low-level, target-specific wrappers required by HPVM's code generation.
 ```shell
 cd projects/visc-rt
 make
@@ -40,14 +58,3 @@ export LLVM_SRC_ROOT=<full path to hpvm>/llvm
 Benchmark suites have been migrated to the LLVM 4.0 build. They are located
 in [VISC](/llvm/test/VISC/parboil/benchmarks).
 
-### Running an example (sgemm in parboil)
-```shell
-cd llvm/test/VISC/parboil/benchmarks/sgemm
-make 
-make run
-```
-
-### Other Old Components
-
-Search this repository for "visc", case-insensitive.
-
diff --git a/hpvm/include/SupportVISC/DFGraph.h b/hpvm/include/SupportVISC/DFGraph.h
index de0f5f9945038287b671642e385d5d466e466bce..ccc78a3357b3abc48d12524e05cd8b37e91f0a1d 100644
--- a/hpvm/include/SupportVISC/DFGraph.h
+++ b/hpvm/include/SupportVISC/DFGraph.h
@@ -403,25 +403,16 @@ public:
 
   void setHasX86FuncForTarget(visc::Target T, bool isX86Func) {
     switch (T) {
+      case visc::None:
+	return; // Do nothing.
       case visc::CPU_TARGET:
         GenFuncInfo.cpu_hasX86Func = isX86Func;
         break;
       case visc::GPU_TARGET:
         GenFuncInfo.gpu_hasX86Func = isX86Func;
         break;
-      case visc::SPIR_TARGET:
-        GenFuncInfo.spir_hasX86Func = isX86Func;
-        break;
-      case visc::CUDNN_TARGET:
-        GenFuncInfo.cudnn_hasX86Func = isX86Func;
-        break;
-      case visc::PROMISE_TARGET:
-        GenFuncInfo.promise_hasX86Func = isX86Func;
-        break;
       case visc::CPU_OR_GPU_TARGET:
-      case visc::CPU_OR_SPIR_TARGET:
-        assert(false && "Single target expected (CPU/GPU/SPIR)\n");
-        break;
+      	break;
       default:
        assert(false && "Unknown target\n");
         break;
@@ -431,25 +422,18 @@ public:
 
   bool hasX86GenFuncForTarget(visc::Target T) const {
     switch (T) {
+      case visc::None:
+	return false;
       case visc::CPU_TARGET:
         return GenFuncInfo.cpu_hasX86Func;
       case visc::GPU_TARGET:
         return GenFuncInfo.gpu_hasX86Func;
-      case visc::SPIR_TARGET:
-        return GenFuncInfo.spir_hasX86Func;
-      case visc::CUDNN_TARGET:
-        return GenFuncInfo.cudnn_hasX86Func;
-      case visc::PROMISE_TARGET:
-        return GenFuncInfo.promise_hasX86Func;
       case visc::CPU_OR_GPU_TARGET:
-      case visc::CPU_OR_SPIR_TARGET:
         assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n");
-        return false;
       default:
        assert(false && "Unknown target\n");
-       return false;
     }
-
+   return false;
   }
 
   void addGenFunc(Function* F, visc::Target T, bool isX86Func) {
@@ -471,39 +455,12 @@ public:
         GenFuncs.GPUGenFunc = F;
         GenFuncInfo.gpu_hasX86Func = isX86Func;
         break;
-      case visc::SPIR_TARGET:
-        if (GenFuncs.SPIRGenFunc != NULL) {
-          errs() << "Warning: Second generated SPIR function for node "
-                 << FuncPointer->getName() << "\n";
-        }
-        GenFuncs.SPIRGenFunc = F;
-        GenFuncInfo.spir_hasX86Func = isX86Func;
-        break;
-      case visc::CUDNN_TARGET:
-        if (GenFuncs.CUDNNGenFunc != NULL) {
-          errs() << "Warning: Second generated CUDNN function for node "
-                 << FuncPointer->getName() << "\n";
-        }
-        GenFuncs.CUDNNGenFunc = F;
-        GenFuncInfo.cudnn_hasX86Func = isX86Func;
-        break;
-      case visc::PROMISE_TARGET:
-        if (GenFuncs.PROMISEGenFunc != NULL) {
-          errs() << "Warning: Second generated PROMISE function for node "
-                 << FuncPointer->getName() << "\n";
-        }
-        GenFuncs.PROMISEGenFunc = F;
-        GenFuncInfo.promise_hasX86Func = isX86Func;
-        break;
       case visc::CPU_OR_GPU_TARGET:
-      case visc::CPU_OR_SPIR_TARGET:
         assert(false &&
                "A node function should be set with a tag specifying its \
                 type, not the node hint itself\n");
-        break;
       default:
         assert(false && "Unknown target for generated function\n");
-        break;
     }
 
     Tag = viscUtils::getUpdatedTag(Tag,T);
@@ -511,30 +468,26 @@ public:
 
   Function* getGenFuncForTarget(visc::Target T)  const {
     switch (T) {
+      case visc::None:
+	return NULL;
       case visc::CPU_TARGET:
         return GenFuncs.CPUGenFunc;
       case visc::GPU_TARGET:
         return GenFuncs.GPUGenFunc;
-      case visc::SPIR_TARGET:
-        return GenFuncs.SPIRGenFunc;
-      case visc::CUDNN_TARGET:
-        return GenFuncs.CUDNNGenFunc;
-      case visc::PROMISE_TARGET:
-        return GenFuncs.PROMISEGenFunc;
       case visc::CPU_OR_GPU_TARGET:
-      case visc::CPU_OR_SPIR_TARGET:
         assert(false &&
                "Requesting genarated node function with dual tag instead of \
                 CPU/GPU/SPIR/CUDNN/PROMISE\n");
-        return NULL;
       default:
         assert(false && "Unknown target for generated function\n");
-        return NULL;
     }
+    return NULL;
   }
 
   void removeGenFuncForTarget(visc::Target T) {
     switch (T) {
+      case visc::None:
+	return;
       case visc::CPU_TARGET:
         GenFuncs.CPUGenFunc = NULL;
         GenFuncInfo.cpu_hasX86Func = false;
@@ -543,27 +496,12 @@ public:
         GenFuncs.GPUGenFunc = NULL;
         GenFuncInfo.gpu_hasX86Func = false;
         break;
-      case visc::SPIR_TARGET:
-        GenFuncs.SPIRGenFunc = NULL;
-        GenFuncInfo.spir_hasX86Func = false;
-        break;
-      case visc::CUDNN_TARGET:
-        GenFuncs.CUDNNGenFunc = NULL;
-        GenFuncInfo.cudnn_hasX86Func = false;
-        break;
-      case visc::PROMISE_TARGET:
-        GenFuncs.PROMISEGenFunc = NULL;
-        GenFuncInfo.promise_hasX86Func = false;
-        break;
       case visc::CPU_OR_GPU_TARGET:
-      case visc::CPU_OR_SPIR_TARGET:
         assert(false &&
                "Removing genarated node function with dual tag instead of \
                 CPU/GPU/SPIR/CUDNN/PROMISE\n");
-        break;
       default:
         assert(false && "Unknown target for generated function\n");
-        break;
     }
     return;
   }
diff --git a/hpvm/include/SupportVISC/VISCUtils.h b/hpvm/include/SupportVISC/VISCUtils.h
index cfd93467922dc4c7039168f247deb16bc26a5248..76c80ed94d83b0dd98ed9d896914ab78e142c93b 100644
--- a/hpvm/include/SupportVISC/VISCUtils.h
+++ b/hpvm/include/SupportVISC/VISCUtils.h
@@ -120,12 +120,8 @@ void fixHintMetadata(Module &M, Function* F, Function* G) {
     };
 
     FixHint("visc_hint_gpu");
-    FixHint("visc_hint_spir");
-    FixHint("visc_hint_cudnn");
-    FixHint("visc_hint_promise");
     FixHint("visc_hint_cpu");
     FixHint("visc_hint_cpu_gpu");
-    FixHint("visc_hint_cpu_spir");
 }
 
 // Assuming that the changed function is a node function, it is only used as a
@@ -294,72 +290,6 @@ Function* cloneFunction(Function* F, FunctionType* newFT,
   return newF;
 }
 
-// Create new function F' as a copy of old function F with a new signature.
-// The following two most used cases are handled by this function.
-// 1. When some extra arguments need to be added to this function
-//    - Here we can map the old function arguments to
-//      new ones
-// 2. When each pointer argument needs an additional size argument
-//    - Here, in the absence of VMap, we map the arguments in order, skipping
-//      over extra pointer arguments.
-// The function returns the list of return instructions to the caller to fix in
-// case the return type is also changed.
-//Function* cloneFunction(Function* F, FunctionType* newFT, bool
-//    isAddingPtrSizeArg, SmallVectorImpl<ReturnInst*>* Returns = NULL) {
-//
-//  DEBUG(errs() << "Cloning Function: " << F->getName() << "\n");
-//  DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n");
-//  DEBUG(errs() << "New Function Type: " << *newFT << "\n");
-//
-//  assert(F->getFunctionType()->getNumParams() <= newFT->getNumParams()
-//      && "This function assumes that the new function has more arguments than the old function!");
-//
-//  // Create Function of specified type
-//  Function* newF = Function::Create(newFT, F->getLinkage(), F->getName()+"_cloned", F->getParent());
-//  DEBUG(errs() << "Old Function name: " << F->getName() << "\n");
-//  DEBUG(errs() << "New Function name: " << newF->getName() << "\n");
-//  ValueToValueMapTy VMap;
-//  DEBUG(errs() << "No value map provided. Creating default value map\n");
-//  if(isAddingPtrSizeArg) {
-//    DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in new function\n");
-//    Function::arg_iterator new_ai = newF->arg_begin();
-//    for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-//        ai != ae; ++ai) {
-//      DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
-//      assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
-//      VMap[&*ai] = &*new_ai;
-//      new_ai->takeName(&*ai);
-//      if(ai->getType()->isPointerTy()) {
-//        std::string oldName = new_ai->getName();
-//        // If the current argument is pointer type, the next argument in new
-//        // function would be an i64 type containing the data size of this
-//        // argument. Hence, skip the next arguement in new function.
-//        ++new_ai;
-//        new_ai->setName("bytes_"+oldName);
-//      }
-//      ++new_ai;
-//    }
-//  }
-//  else {
-//    DEBUG(errs() << "Case 2: Extra arguments are added at the end of old function\n");
-//    Function::arg_iterator new_ai = newF->arg_begin();
-//    for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-//        ai != ae; ++ai, ++new_ai) {
-//      DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
-//      assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
-//      VMap[&*ai] = &*new_ai;
-//      new_ai->takeName(&*ai);
-//    }
-//  }
-//
-//  // Clone function
-//  if (Returns == NULL)
-//    Returns = new SmallVector<ReturnInst*, 8>();
-//  CloneFunctionInto(newF, F, VMap, false, *Returns);
-//
-//  return newF;
-//}
-
 // Overloaded version of cloneFunction
 Function *cloneFunction(Function *F, Function *newF,
     bool isAddingPtrSizeArg,
@@ -435,30 +365,12 @@ bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
       if (T == visc::GPU_TARGET)
         return true;
       return false;
-    case visc::SPIR_TARGET:
-      if (T == visc::SPIR_TARGET)
-        return true;
-      return false;
-    case visc::CUDNN_TARGET:
-      if (T == visc::CUDNN_TARGET)
-        return true;
-      return false;
-    case visc::PROMISE_TARGET:
-      if (T == visc::PROMISE_TARGET)
-        return true;
-      return false;
     case visc::CPU_OR_GPU_TARGET:
       if ((T == visc::CPU_TARGET) ||
           (T == visc::GPU_TARGET) ||
           (T == visc::CPU_OR_GPU_TARGET))
         return true;
       return false;
-    case visc::CPU_OR_SPIR_TARGET:
-      if ((T == visc::CPU_TARGET) ||
-          (T == visc::SPIR_TARGET) ||
-          (T == visc::CPU_OR_SPIR_TARGET))
-        return true;
-      return false;
     default:
       assert(false && "Unknown Target\n");
   }
@@ -466,62 +378,32 @@ bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
 
 bool isSingleTargetTag(visc::Target T) {
   return ((T == visc::CPU_TARGET)    ||
-      (T == visc::GPU_TARGET)    ||
-      (T == visc::SPIR_TARGET)   ||
-      (T == visc::CUDNN_TARGET)  ||
-      (T == visc::PROMISE_TARGET));
+      (T == visc::GPU_TARGET));
 }
 
 // Add the specified target to the given tag
 visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) {
   assert(((T == visc::CPU_TARGET)    ||
-        (T == visc::GPU_TARGET)    ||
-        (T == visc::SPIR_TARGET)   ||
-        (T == visc::CUDNN_TARGET)  ||
-        (T == visc::PROMISE_TARGET)) &&
+        (T == visc::GPU_TARGET)) &&
       "The target is only allowed to be a single target: CPU, GPU, SPIR, CUDNN, PROMISE\n");
 
   switch (Tag) {
     case visc::None:
       return T;
     case visc::CPU_TARGET:
-      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
-          "Unsupported target combination\n");
       if (T == visc::CPU_TARGET)
         return visc::CPU_TARGET;
       if (T == visc::GPU_TARGET)
         return visc::CPU_OR_GPU_TARGET;
-      if (T == visc::SPIR_TARGET)
-        return visc::CPU_OR_SPIR_TARGET;
       return T;
     case visc::GPU_TARGET:
-      assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n");
-      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
-          "Unsupported target combination\n");
       if (T == visc::CPU_TARGET)
         return visc::CPU_OR_GPU_TARGET;
       if (T == visc::GPU_TARGET)
         return visc::GPU_TARGET;
       return T;
-    case visc::SPIR_TARGET:
-      assert((T != visc::GPU_TARGET) && "Unsupported target combination\n");
-      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
-          "Unsupported target combination\n");
-      if (T == visc::CPU_TARGET)
-        return visc::CPU_OR_SPIR_TARGET;
-      if (T == visc::SPIR_TARGET)
-        return visc::SPIR_TARGET;
-      return T;
     case visc::CPU_OR_GPU_TARGET:
-      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
-          "Unsupported target combination\n");
-      assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n");
       return visc::CPU_OR_GPU_TARGET;
-    case visc::CPU_OR_SPIR_TARGET:
-      assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) &&
-          "Unsupported target combination\n");
-      assert((T != visc::GPU_TARGET) && "Unsupported target combination\n");
-      return visc::CPU_OR_SPIR_TARGET;
     default:
       assert(false && "Unknown Target\n");
   }
@@ -541,18 +423,6 @@ void addHint(Function* F, visc::Target T) {
       DEBUG(errs() << "GPU Target\n");
       HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
       break;
-    case visc::SPIR_TARGET:
-      DEBUG(errs() << "SPIR Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
-      break;
-    case visc::CUDNN_TARGET:
-      DEBUG(errs() << "CUDNN Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn");
-      break;
-    case visc::PROMISE_TARGET:
-      DEBUG(errs() << "PROMISE Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_promise");
-      break;
     case visc::CPU_TARGET:
       DEBUG(errs() << "CPU Target\n");
       HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
@@ -561,10 +431,6 @@ void addHint(Function* F, visc::Target T) {
       DEBUG(errs() << "CPU or GPU Target\n");
       HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
       break;
-    case visc::CPU_OR_SPIR_TARGET:
-      DEBUG(errs() << "CPU or SPIR Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir");
-      break;
     default:
       llvm_unreachable("Unsupported Target Hint!");
       break;
@@ -587,21 +453,9 @@ void removeHint(Function* F, visc::Target T) {
     case visc::GPU_TARGET:
       HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
       break;
-    case visc::SPIR_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
-      break;
-    case visc::CUDNN_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn");
-      break;
-    case visc::PROMISE_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_promise");
-      break;
     case visc::CPU_OR_GPU_TARGET:
       HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
       break;
-    case visc::CPU_OR_SPIR_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir");
-      break;
     case visc::CPU_TARGET:
       HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
       break;
@@ -648,11 +502,7 @@ visc::Target getPreferredTarget(Function* F) {
 
   if(FoundPrefTarget("visc_hint_cpu")) return visc::CPU_TARGET;
   if(FoundPrefTarget("visc_hint_gpu")) return visc::GPU_TARGET;
-  if(FoundPrefTarget("visc_hint_spir")) return visc::SPIR_TARGET;
-  if(FoundPrefTarget("visc_hint_cudnn")) return visc::CUDNN_TARGET;
-  if(FoundPrefTarget("visc_hint_promise")) return visc::PROMISE_TARGET;
   if(FoundPrefTarget("visc_hint_cpu_gpu")) return visc::CPU_OR_GPU_TARGET;
-  if(FoundPrefTarget("visc_hint_cpu_spir")) return visc::CPU_OR_SPIR_TARGET;
 
   return visc::None;
 }
diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 06e4e79183d726bb80113264f3cc7da0a4701ecf..ca5495f06d3887c506ecf52cccb839d8fdba1555 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -1466,11 +1466,9 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   // For now, use node function name and change it later
   Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
   Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-  Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
 
   bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
   bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-  bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
 
   errs() << "Node: " << N->getFuncPointer()->getName()
                      << " with tag " << N->getTag() << "\n";
@@ -1478,8 +1476,6 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
   errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
   errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
-  errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
-  errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
 
 
   if (N->getTag() == visc::None) {
@@ -1503,24 +1499,12 @@ void CGT_X86::codeGen(DFInternalNode* N) {
         assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
         assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
         assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
         break;
       case visc::GPU_TARGET:
         assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
         assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
         assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
         assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
-        assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
-        break;
-      case visc::SPIR_TARGET:
-        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && "");
-        assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && "");
         break;
       default:
         assert(false && "Unreachable: we checked that tag was single target!\n");
@@ -1536,9 +1520,6 @@ void CGT_X86::codeGen(DFInternalNode* N) {
         case visc::GPU_TARGET:
           NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
           break;
-        case visc::SPIR_TARGET:
-          NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET);
-          break;
         default:
           break;
       }
@@ -1558,7 +1539,6 @@ void CGT_X86::codeGen(DFInternalNode* N) {
 
     Function *Ftmp = N->getGenFuncForTarget(N->getTag());
     N->removeGenFuncForTarget(visc::GPU_TARGET);
-    N->removeGenFuncForTarget(visc::SPIR_TARGET);
     N->setTag(visc::None);
     N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
     N->setTag(visc::CPU_TARGET);
@@ -1566,11 +1546,9 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     // Sanity checks - to be removed TODO
     CF = N->getGenFuncForTarget(visc::CPU_TARGET);
     GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-    SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
 
     CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
     GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-    SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
 
     errs() << "After editing\n";
     errs() << "Node: " << N->getFuncPointer()->getName()
@@ -1579,8 +1557,6 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n";
     errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n";
     errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n";
-    errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n";
-    errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n";
 
     //  assert(false && "got to the point where we have to select\n");
   } else {
@@ -1591,26 +1567,21 @@ void CGT_X86::codeGen(DFInternalNode* N) {
 
     Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET);
     Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET);
-    Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET);
 
     bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
     bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
-    bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET);
 
     // These assertions express what we can support with the current runtime.
     // Code generation works the same way even for other target combinations.
     // For now, we want either CPU and GPU, or CPU and SPIR
-    assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n");
-    assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) &&
+    assert((CF && (GF || !GF)) && "Invalid target selection\n");
+    assert((CFx86 && (GFx86 || !GFx86)) &&
            "Generated functions without appropriate x86 wrapper\n");
 
     FunctionType *FT = CF->getFunctionType();
     if (GF)
       assert(FT == GF->getFunctionType() &&
              "Type mismatch between generated functions for GPU and CPU targets.\n");
-    if (SF)
-      assert(FT == SF->getFunctionType() &&
-             "Type mismatch between generated functions for SPIR and CPU targets.\n");
 
     // Code generation of wrapper function
     Function *F_wrapper;
@@ -1700,28 +1671,28 @@ void CGT_X86::codeGen(DFInternalNode* N) {
 
     // Switch basic block pointers
     BBcurrent = BBfalse;
-    if (SF) {
+   // if (SF) {
       // We have a GPU version. Generate policy check and call
-      CmpConst =
-         ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
-      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-                             RTFInst, CmpConst, "", BBcurrent);
-      BBtrue =  BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
-      BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
-      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+   //   CmpConst =
+     //    ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
+      //CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+        //                     RTFInst, CmpConst, "", BBcurrent);
+      //BBtrue =  BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
+     // BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
+     // BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
       
-      GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
-      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+     // GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
+     // RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
 
-      if (DeviceAbstraction) {
+     // if (DeviceAbstraction) {
         // Prepare arguments and function for call to wait for device runtime call
-        std::vector<Value *> Args; // TODO: add the device type as argument?
-        FunctionCallee RTF =
-          M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
-          runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType());
-        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
-      }
-    }
+      //  std::vector<Value *> Args; // TODO: add the device type as argument?
+       // FunctionCallee RTF =
+        //  M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+         // runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType());
+       // CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+     // }
+   // }
 
     RI = ReturnInst::Create(M.getContext(),
                             UndefValue::get(FT->getReturnType()), BBfalse);
@@ -1730,7 +1701,6 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     // Remove all other versions and update the tag
     N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
     N->removeGenFuncForTarget(visc::GPU_TARGET);
-    N->removeGenFuncForTarget(visc::SPIR_TARGET);
     N->setTag(visc::CPU_TARGET);
 
     // assert(false && "got to the point where we have to combine\n");
@@ -1774,11 +1744,6 @@ void CGT_X86::codeGen(DFLeafNode* N) {
          // by design of DFG2LLVM_NVPTX backend
          assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
          break;
-       case visc::SPIR_TARGET:
-         // A leaf node should not have an x86 function for SPIR
-         // by design of DFG2LLVM_SPIR backend
-         assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
-         break;
        default:
          break;
     }
diff --git a/hpvm/test/parboil/benchmarks/lbm/Makefile b/hpvm/test/parboil/benchmarks/lbm/Makefile
index ee7bcb33b4c22f2d47bada51a9d89b0cddf270b2..4ebf6fc0af2f05cd10f6d556e0b52bee186540d8 100644
--- a/hpvm/test/parboil/benchmarks/lbm/Makefile
+++ b/hpvm/test/parboil/benchmarks/lbm/Makefile
@@ -1,4 +1,4 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/../test/parboil
+PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = lbm
 
 ifeq ($(NUM_CORES),)
diff --git a/hpvm/test/parboil/benchmarks/pipeline/Makefile b/hpvm/test/parboil/benchmarks/pipeline/Makefile
index a83cacc2cbd0b95ba6101c2dfbcb33929171dcf3..5f0fd5b73066b1ebfc9211fa628d7f326c88ca5e 100644
--- a/hpvm/test/parboil/benchmarks/pipeline/Makefile
+++ b/hpvm/test/parboil/benchmarks/pipeline/Makefile
@@ -31,7 +31,7 @@ BIN = $(addsuffix -$(VERSION), $(APP))
 
 SRCDIR = src/$(VERSION)
 BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
+DATASET_DIR ?= $(PARBOIL_ROOT)/datasets/$(APP)
 
 IMAGE = $(DATASET_DIR)/$(TEST)/input/edgetest_10.png
 VIDEO1 = $(DATASET_DIR)/$(TEST)/input/taxi/taxi01.pgm
diff --git a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
index 6dc39540f209760564005b51d7fcbdd046eaac4b..ea6b44788d0831221b2d289d1904d35f56240615 100644
--- a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
+++ b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
@@ -4,9 +4,11 @@ LANGUAGE=visc
 SRCDIR_OBJS=io.ll #compute_gold.o
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
-APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
-APP_LDFLAGS=`pkg-config ${OpenCV_DIR}/lib/pkgconfig/opencv.pc --libs`
+APP_CFLAGS+=-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize  #-I/shared/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize #-I/shared/opencv/include
+OpenCV_DIR=/shared/opencv
+APP_LDFLAGS=`pkg-config opencv --libs`
+#APP_LDFLAGS=`pkg-config ${OpenCV_DIR}/lib/pkgconfig/opencv.pc --libs`
 #APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
 
 #OpenCV link flags all
diff --git a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
index 12b3d42a66c369e97fbd3a08a0d179334f245153..892cdabd090412c80f3a2d26ffd3e7c183650ade 100644
--- a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
+++ b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
@@ -11,7 +11,7 @@
  */
 
 #include "opencv2/opencv.hpp"
-#include "opencv2/core/ocl.hpp"
+#include "opencv2/ocl/ocl.hpp"
 #include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
@@ -201,7 +201,7 @@ void gaussianSmoothing(float *I, size_t bytesI,
   
     Is[gloc] = smoothedVal;
   }
-  __visc__return(m, n);
+  __visc__return(2, m, n);
 }
 
 void WrapperGaussianSmoothing(float *I, size_t bytesI,
@@ -210,7 +210,7 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI,
                        long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, I, Gs, 1, Is);
-  void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n);
+  void* GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n);
   __visc__bindIn(GSNode, 0, 0, 0); // Bind I
   __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI
   __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs
@@ -241,7 +241,7 @@ void laplacianEstimate(float *Is, size_t bytesIs,
   __visc__hint(visc::DEVICE);
   __visc__attributes(2, Is, B, 1, L);
   // 3x3 image area
-  float imageArea[SZB][SZB];
+  float imageArea[SZB*SZB];
 
   //int gx = get_global_id(0);
   //int gy = get_global_id(1);
@@ -255,64 +255,64 @@ void laplacianEstimate(float *Is, size_t bytesIs,
 
   if ((gx < n) && (gy < m)) {
     // Data copy for dilation filter
-    imageArea[1][1] = Is[gy * n + gx];
+    imageArea[1 * SZB +1] = Is[gy * n + gx];
 
     if (gx == 0) {
-      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
+      imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MIN_BR;
     } else {
-      imageArea[1][0] = Is[gy * n + gx - 1];
-      imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
-      imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
+      imageArea[1 * SZB +0] = Is[gy * n + gx - 1];
+      imageArea[0 * SZB +0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
+      imageArea[2 * SZB +0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
     }
 
     if (gx == n - 1) {
-      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
+      imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MIN_BR;
     } else {
-      imageArea[1][2] = Is[gy * n + gx + 1];
-      imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
-      imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
+      imageArea[1 * SZB +2] = Is[gy * n + gx + 1];
+      imageArea[0 * SZB +2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
+      imageArea[2 * SZB +2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
     }
 
-    imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
-    imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
+    imageArea[0 * SZB +1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
+    imageArea[2 * SZB +1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
 
     // Compute pixel of dilated image
     float dilatedPixel = MIN_BR;
     for (i = 0; i < SZB; i++)
       for (j = 0; j < SZB; j++)
-        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i * SZB +j] * B[i*SZB + j]);
 
     // Data copy for erotion filter - only change the boundary conditions
     if (gx == 0) {
-      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
+      imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MAX_BR;
     } else {
-      if (gy == 0) imageArea[0][0] = MAX_BR;
-      if (gy == m-1) imageArea[2][0] = MAX_BR;
+      if (gy == 0) imageArea[0 * SZB +0] = MAX_BR;
+      if (gy == m-1) imageArea[2 * SZB +0] = MAX_BR;
     }
 
     if (gx == n - 1) {
-      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
+      imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MAX_BR;
     } else {
-      if (gy == 0) imageArea[0][2] = MAX_BR;
-      if (gy == m-1) imageArea[2][2] = MAX_BR;
+      if (gy == 0) imageArea[0 * SZB +2] = MAX_BR;
+      if (gy == m-1) imageArea[2 * SZB +2] = MAX_BR;
     }
 
-    if (gy == 0) imageArea[0][1] = MAX_BR;
-    if (gy == m-1) imageArea[2][1] = MAX_BR;
+    if (gy == 0) imageArea[0 * SZB +1] = MAX_BR;
+    if (gy == m-1) imageArea[2 * SZB +1] = MAX_BR;
 
     // Compute pixel of eroded image
     float erodedPixel = MAX_BR;
     for (i = 0; i < SZB; i++)
       for (j = 0; j < SZB; j++)
-        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+        erodedPixel = _MIN(erodedPixel, imageArea[i * SZB +j] * B[i*SZB + j]);
 
-    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1];
+    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB +1];
     L[gy*n+gx] = laplacian;
   }
   //OutStruct output = {bytesB, bytesL};
   //if(gx == m-1 && gy == n-1)
     //std::cout << "Exit laplacian\n";
-  __visc__return(m);
+  __visc__return(1, m);
 }
 
 void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
@@ -321,7 +321,7 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
                           long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, Is, B, 1, L);
-  void* LNode = __visc__createNode2D(laplacianEstimate, m, n);
+  void* LNode = __visc__createNodeND(2, laplacianEstimate, m, n);
   __visc__bindIn(LNode, 0, 0, 0); // Bind Is
   __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs
   __visc__bindIn(LNode, 2, 2, 0); // Bind B
@@ -434,7 +434,7 @@ void computeZeroCrossings(float *L, size_t bytesL,
   //OutStruct output = {bytesB, bytesS};
   //if(gx == n-1 && gy == n-1)
     //std::cout << "Exit ZC\n";
-  __visc__return(m); 
+  __visc__return(1, m); 
 }
 
 void WrapperComputeZeroCrossings(float *L, size_t bytesL,
@@ -443,7 +443,7 @@ void WrapperComputeZeroCrossings(float *L, size_t bytesL,
                           long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, L, B, 1, S);
-  void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n);
+  void* ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n);
   __visc__bindIn(ZCNode, 0, 0, 0); // Bind L
   __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
   __visc__bindIn(ZCNode, 2, 2, 0); // Bind B
@@ -518,10 +518,10 @@ void computeGradient(float *Is, size_t bytesIs,
         Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
       }
 
-    G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy);
+    G[gloc] = sqrt(Gx*Gx + Gy*Gy);
     //G[gloc] = Gx*Gx + Gy*Gy;
   }
-  __visc__return(n);
+  __visc__return(1, n);
 }
 
 void WrapperComputeGradient(float *Is, size_t bytesIs,
@@ -531,7 +531,7 @@ void WrapperComputeGradient(float *Is, size_t bytesIs,
                                  long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, Is, Sx, Sy, 1, G);
-  void* CGNode = __visc__createNode2D(computeGradient, m, n);
+  void* CGNode = __visc__createNodeND(2, computeGradient, m, n);
   __visc__bindIn(CGNode, 0, 0, 0); // Bind Is
   __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
   __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx
@@ -557,7 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
                             float *maxG, size_t bytesMaxG,
                             long m, long n) {
 
-  __visc__hint(visc::DEVICE);
+  __visc__hint(visc::CPU_TARGET);
   //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, G, 1, maxG);
 
@@ -575,15 +575,16 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
   }
 
   // First thread iterates over all elements of the thread block
-  if (lx == 0) {
-    for (int i = 1; (i < dimx) && (i < m*n); i++)
+  long bounds = dimx < m*n ? dimx : m*n;
+	if (lx == 0) {
+    for (int i = 1; i < bounds; i++)
       if (G[lx] < G[i])
         G[lx] = G[i];
 
     *maxG = G[lx];
   }
 
-  __visc__return(n);
+  __visc__return(1, n);
 }
 
 /* 
@@ -626,7 +627,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
   //if (lx == 0)
     //__visc__atomic_max(maxG,G[gid]);
 
-  //__visc__return(m);
+  //__visc__return(1, m);
 //}
 
 void computeMaxGradientTB(float *G, size_t bytesG,
@@ -636,7 +637,7 @@ void computeMaxGradientTB(float *G, size_t bytesG,
   //__visc__hint(visc::DEVICE);
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
-  void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
+  void* CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x);
   __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
   __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
   __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
@@ -653,7 +654,7 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG,
                            long block_x, long grid_x) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
-  void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x);
+  void* CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x);
   __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G
   __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
   __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
@@ -693,7 +694,7 @@ void rejectZeroCrossings(float *S, size_t bytesS,
   if ((gx < n) && (gy < m)) {
     E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ;
   }
-  __visc__return(m);
+  __visc__return(1, m);
 }
 
 void WrapperRejectZeroCrossings(float *S, size_t bytesS,
@@ -703,7 +704,7 @@ void WrapperRejectZeroCrossings(float *S, size_t bytesS,
                          long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, S, G, maxG, 1, E);
-  void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n);
+  void* RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n);
   __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S
   __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS
   __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G
@@ -739,12 +740,12 @@ void edgeDetection(float *I, size_t bytesI, // 0
                    ) { 
   __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
   __visc__hint(visc::CPU_TARGET);
-  void* GSNode = __visc__createNode(WrapperGaussianSmoothing);
-  void* LNode = __visc__createNode(WrapperlaplacianEstimate);
-  void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings);
-  void* CGNode = __visc__createNode(WrapperComputeGradient);
-  void* CMGNode = __visc__createNode(WrapperComputeMaxGradient);
-  void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings);
+  void* GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing);
+  void* LNode = __visc__createNodeND(0, WrapperlaplacianEstimate);
+  void* CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings);
+  void* CGNode = __visc__createNodeND(0, WrapperComputeGradient);
+  void* CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient);
+  void* RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings);
 
   // Gaussian Inputs
   __visc__bindIn(GSNode, 0 , 0, 1); // Bind I
@@ -873,7 +874,7 @@ int main (int argc, char *argv[]) {
     }
 
     int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT);
-    //NUM_FRAMES = 5;
+    NUM_FRAMES = 600;
     std::cout << "Number of frames = " << NUM_FRAMES << "\n";
 
     // Used to store time after each frame computation is completed
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile b/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile
index f74ee8921a534b6963ba06d089398114571d070b..d1f6c96d0c279bc2f2e3e70313369d49881b62b8 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/Makefile
@@ -4,5 +4,5 @@ LANGUAGE=visc
 SRCDIR_OBJS=io.ll #compute_gold.o
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3
-APP_CXXFLAGS=-ffast-math -O3
+APP_CFLAGS=-ffast-math -O1
+APP_CXXFLAGS=-ffast-math -O1
diff --git a/hpvm/test/parboil/common/include/visc.h b/hpvm/test/parboil/common/include/visc.h
index a27d0d003e45ecbff1efffc7c3e1226833894cd7..b0a0f141e575b104f0f3934416956cf9cd1f1904 100644
--- a/hpvm/test/parboil/common/include/visc.h
+++ b/hpvm/test/parboil/common/include/visc.h
@@ -14,92 +14,92 @@
 
 #ifdef __cplusplus
 extern "C" {
-void __visc__hint(visc::Target);
+void __visc__hint(visc::Target) noexcept;
 #else
-void __visc__hint(enum Target);
+void __visc__hint(enum Target) noexcept;
 #endif
 
 #ifdef __cplusplus
-void* __visc__node(...);
-//void* __visc__createNode(...);
-//void* __visc__createNode1D(...);
-//void* __visc__createNode2D(...);
-//void* __visc__createNode3D(...);
-//void __visc__return(...);
+void* __visc__node(...) noexcept;
+//void* __visc__createNode(...) noexcept;
+//void* __visc__createNode1D(...) noexcept;
+//void* __visc__createNode2D(...) noexcept;
+//void* __visc__createNode3D(...) noexcept;
+//void __visc__return(...) noexcept;
 #endif
-void* __visc__createNodeND(unsigned, ...);
-void __visc__return(unsigned, ...);
-
-void __visc__attributes(unsigned, ...);
-void __visc__init();
-void __visc__cleanup();
-
-void __visc__bindIn(void*, unsigned, unsigned, unsigned);
-void __visc__bindOut(void*, unsigned, unsigned, unsigned);
-void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned);
-void __visc__push(void*, void*);
-void* __visc__pop(void*);
-void* __visc__launch(unsigned, ...);
-void __visc__wait(void*);
-
-void* __visc__getNode();
-void* __visc__getParentNode(void*);
-void __visc__barrier();
-void* __visc__malloc(long);
-long __visc__getNodeInstanceID_x(void*);
-long __visc__getNodeInstanceID_y(void*);
-long __visc__getNodeInstanceID_z(void*);
-long __visc__getNumNodeInstances_x(void*);
-long __visc__getNumNodeInstances_y(void*);
-long __visc__getNumNodeInstances_z(void*);
+void* __visc__createNodeND(unsigned, ...) noexcept;
+void __visc__return(unsigned, ...) noexcept;
+
+void __visc__attributes(unsigned, ...) noexcept;
+void __visc__init() noexcept;
+void __visc__cleanup() noexcept;
+
+void __visc__bindIn(void*, unsigned, unsigned, unsigned) noexcept;
+void __visc__bindOut(void*, unsigned, unsigned, unsigned) noexcept;
+void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned) noexcept;
+void __visc__push(void*, void*) noexcept;
+void* __visc__pop(void*) noexcept;
+void* __visc__launch(unsigned, ...) noexcept;
+void __visc__wait(void*) noexcept;
+
+void* __visc__getNode() noexcept;
+void* __visc__getParentNode(void*) noexcept;
+void __visc__barrier() noexcept;
+void* __visc__malloc(long) noexcept;
+long __visc__getNodeInstanceID_x(void*) noexcept;
+long __visc__getNodeInstanceID_y(void*) noexcept;
+long __visc__getNodeInstanceID_z(void*) noexcept;
+long __visc__getNumNodeInstances_x(void*) noexcept;
+long __visc__getNumNodeInstances_y(void*) noexcept;
+long __visc__getNumNodeInstances_z(void*) noexcept;
 
 // Atomic
 // signed int
-int __visc__atomic_cmpxchg(int*, int, int);
-int __visc__atomic_add(int*, int);
-int __visc__atomic_sub(int*, int);
-int __visc__atomic_xchg(int*, int);
-int __visc__atomic_inc(int*);
-int __visc__atomic_dec(int*);
-int __visc__atomic_min(int*, int);
-int __visc__atomic_max(int*, int);
-int __visc__atomic_umax(int*, int);
-int __visc__atomic_umin(int*, int);
-int __visc__atomic_and(int*, int);
-int __visc__atomic_or(int*, int);
-int __visc__atomic_xor(int*, int);
+int __visc__atomic_cmpxchg(int*, int, int) noexcept;
+int __visc__atomic_add(int*, int) noexcept;
+int __visc__atomic_sub(int*, int) noexcept;
+int __visc__atomic_xchg(int*, int) noexcept;
+int __visc__atomic_inc(int*) noexcept;
+int __visc__atomic_dec(int*) noexcept;
+int __visc__atomic_min(int*, int) noexcept;
+int __visc__atomic_max(int*, int) noexcept;
+int __visc__atomic_umax(int*, int) noexcept;
+int __visc__atomic_umin(int*, int) noexcept;
+int __visc__atomic_and(int*, int) noexcept;
+int __visc__atomic_or(int*, int) noexcept;
+int __visc__atomic_xor(int*, int) noexcept;
 
 // Special Func
-float __visc__floor(float);
-float __visc__rsqrt(float);
-float __visc__sqrt(float);
-float __visc__sin(float);
-float __visc__cos(float);
+float __visc__floor(float) noexcept;
+float __visc__rsqrt(float) noexcept;
+float __visc__sqrt(float) noexcept;
+float __visc__sin(float) noexcept;
+float __visc__cos(float) noexcept;
 // unsigned int
-//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
-//unsigned __visc__atomic_add(unsigned*, unsigned);
-//unsigned __visc__atomic_sub(unsigned*, unsigned);
-//unsigned __visc__atomic_xchg(unsigned*, unsigned);
-//unsigned __visc__atomic_inc(unsigned*);
-//unsigned __visc__atomic_dec(unsigned*);
-//unsigned __visc__atomic_min(unsigned*, unsigned);
-//unsigned __visc__atomic_max(unsigned*, unsigned);
-//unsigned __visc__atomic_and(unsigned*, unsigned);
-//unsigned __visc__atomic_or(unsigned*, unsigned);
-//unsigned __visc__atomic_xor(unsigned*, unsigned);
+//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned) noexcept;
+//unsigned __visc__atomic_add(unsigned*, unsigned) noexcept;
+//unsigned __visc__atomic_sub(unsigned*, unsigned) noexcept;
+//unsigned __visc__atomic_xchg(unsigned*, unsigned) noexcept;
+//unsigned __visc__atomic_inc(unsigned*) noexcept;
+//unsigned __visc__atomic_dec(unsigned*) noexcept;
+//unsigned __visc__atomic_min(unsigned*, unsigned) noexcept;
+//unsigned __visc__atomic_max(unsigned*, unsigned) noexcept;
+//unsigned __visc__atomic_and(unsigned*, unsigned) noexcept;
+//unsigned __visc__atomic_or(unsigned*, unsigned) noexcept;
+//unsigned __visc__atomic_xor(unsigned*, unsigned) noexcept;
 
 
 #include <unistd.h>
 
-long get_global_id(int);
-long get_group_id(int);
-long get_local_id(int);
-long get_local_size(int);
+long get_global_id(int) noexcept;
+long get_group_id(int) noexcept;
+long get_local_id(int) noexcept;
+long get_local_size(int) noexcept;
 
 
-void llvm_visc_track_mem(void*, size_t);
-void llvm_visc_untrack_mem(void*);
-void llvm_visc_request_mem(void*, size_t);
+void llvm_visc_track_mem(void*, size_t) noexcept;
+void llvm_visc_untrack_mem(void*) noexcept;
+void llvm_visc_request_mem(void*, size_t) noexcept;
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/common/mk/visc.mk b/hpvm/test/parboil/common/mk/visc.mk
index 781e3601469387a7954309d7b55ecda37c6d11a5..eb11371ccdb931d5160e5143af907a308215eb54 100755
--- a/hpvm/test/parboil/common/mk/visc.mk
+++ b/hpvm/test/parboil/common/mk/visc.mk
@@ -2,7 +2,7 @@
 
 # Default language wide options
 LANG_CFLAGS=-I$(PARBOIL_ROOT)/common/include
-LANG_CXXFLAGS=$(LANG_CFLAGS) -fno-exceptions
+LANG_CXXFLAGS=$(LANG_CFLAGS)  
 LANG_LDFLAGS=-lOpenCL -L$(OPENCL_LIB_PATH) -lrt -L$(CUDA_LIB_PATH) -lcudart
 
 CFLAGS=$(LANG_CFLAGS) $(PLATFORM_CFLAGS) $(APP_CFLAGS)
@@ -197,7 +197,7 @@ $(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
 	make -C $(LLVM_LIB_PATH)
 
 $(HOST) $(KERNEL): $(BUILDDIR)/$(VISC_OBJS)
-	$(OPT) $(VISC_OPTFLAGS) -S $< -o $(HOST)
+	$(OPT) --debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
 
 $(RUNDIR) :
 	mkdir -p $(RUNDIR)