diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index e9660aa6e7a4aad995c6003fce9697a8304f3ccf..5f2e483cee789a63fc77d1e99723c77a530747ff 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -807,8 +807,7 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
   if(KernelLaunchNode == NULL)
     errs () << "No kernel launch node\n";
   else {
-    errs() << "KernelLaunchNode is not null: "<< KernelLaunchNode<<"\n";
-    errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
+    errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
   }
 
   if (!KernelLaunchNode) {
@@ -1285,9 +1284,9 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       break;
       case Intrinsic::visc_barrier:
       {
-        errs() << F_nvptx->getName() << "\t: Handling barrier\n";
-        errs() << "Substitute with barrier()\n";
-        errs() << *II << "\n";
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n");
+        DEBUG(errs() << "Substitute with barrier()\n");
+        DEBUG(errs() << *II << "\n");
         FunctionType* FT = FunctionType::get(Type::getVoidTy(getGlobalContext() /*KernelM.getContext()*/),
                                              std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                                              false);
@@ -1315,7 +1314,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         //case Intrinsic::visc_atomic_inc:
         //case Intrinsic::visc_atomic_dec:
       {
-        errs() << *II << "\n";
+        DEBUG(errs() << *II << "\n");
         // Only have support for i32 atomic intrinsics
         assert(II->getType() == Type::getInt32Ty(II->getContext())
                && "Only support i32 atomic intrinsics for now");
@@ -1332,7 +1331,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         AtomicRMWInst* AtomicInst = new AtomicRMWInst(getAtomicOp(II->getIntrinsicID()),
             Ptr, Val, llvm::SequentiallyConsistent, llvm::CrossThread, II);
         AtomicInst->setVolatile(true);
-        errs() << "Substitute with: " << *AtomicInst << "\n";
+        DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n");
         II->replaceAllUsesWith(AtomicInst);
         IItoRemove.push_back(II);
       }
@@ -1348,7 +1347,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
       if(calleeF->isDeclaration()) {
         // Add the declaration to kernel module
-        errs() << "Adding declaration to Kernel module: " << *calleeF << "\n";
+        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
         KernelM.getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
         if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
           // Now handle a few specific intrinsics
@@ -1357,7 +1356,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           case Intrinsic::sin:
           case Intrinsic::cos:
           {
-            errs() << "Found sincos: " << *II << "\n";
+            DEBUG(errs() << "Found sincos: " << *II << "\n");
             // Get the libclc function
             // libclc uses mangled name for sin cos
             assert(II->getType()->isFloatTy()
@@ -1381,7 +1380,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           }
           case Intrinsic::floor:
           {
-            errs() << "Found floor intrinsic\n";
+            DEBUG(errs() << "Found floor intrinsic\n");
             F = Intrinsic::getDeclaration(&KernelM, Intrinsic::nvvm_floor_f);
             FunctionType* FTy = F->getFunctionType();
             DEBUG(errs() << *F << "\n");
@@ -1434,7 +1433,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // their subsequent uses are still around
   for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
        re = IItoRemove.rend(); ri != re; ++ri) {
-    errs() << "Erasing: " << **ri << "\n";
+    DEBUG(errs() << "Erasing: " << **ri << "\n");
     (*ri)->eraseFromParent();
   }
 
@@ -1709,7 +1708,7 @@ static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector
     }
     else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
       // found a store in use chain
-      errs() << "Found store/atomicrmw instruction: " << *I << "\n";
+      DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
       return true;
     }
     else if(BuildDFG::isViscIntrinsic(I)) {
@@ -1720,7 +1719,7 @@ static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector
       return true;
     }
     else {
-      errs() << "\tTraverse use chain of: " << *I << "\n";
+      DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
       if(findLoadStoreUses(I, UseList, VisitedList))
         return true;
     }
@@ -1770,7 +1769,7 @@ static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*Dependence
 
 // Function to check if argument arg can be changed to a constant memory pointer
 static bool canBePromoted(Argument* arg, Function* F) {
-  errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n";
+  DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
   std::vector<Value*> UseList;
   std::vector<Value*> VisitedList;
   // recursively traverse use chain
@@ -1788,7 +1787,7 @@ static bool canBePromoted(Argument* arg, Function* F) {
     if(isDependentOnNodeInstanceID(U, &DependenceList))
       return false;
   }
-  errs() << "\tYes, Promotable to Constant Memory\n";
+  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
   return true;
 }
 
@@ -1853,7 +1852,7 @@ static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMa
   // Iterate over the number of dimensions and store the global work group
   // size in that dimension
   for(unsigned i=0; i < WGSize.size(); i++) {
-    errs() << *WGSize[i] << "\n";
+    DEBUG(errs() << *WGSize[i] << "\n");
     assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
 
     if(WGSize[i]->getType() != Int64Ty) {
diff --git a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
index 31103c218bca660f11563f98093c93b2cda2b353..32612a85c1cd69cbbd1fb67a7920da8571b480b8 100644
--- a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
@@ -814,7 +814,6 @@ void CGT_SPIR::codeGen(DFInternalNode* N) {
   if(KernelLaunchNode == NULL)
     errs () << "No kernel launch node\n";
   else {
-    errs() << "KernelLaunchNode is not null: "<< KernelLaunchNode<<"\n";
     errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
   }
 
@@ -1309,9 +1308,9 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
       break;
       case Intrinsic::visc_barrier:
       {
-        errs() << F_spir->getName() << "\t: Handling barrier\n";
-        errs() << "Substitute with barrier()\n";
-        errs() << *II << "\n";
+        DEBUG(errs() << F_spir->getName() << "\t: Handling barrier\n");
+        DEBUG(errs() << "Substitute with barrier()\n");
+        DEBUG(errs() << *II << "\n");
         FunctionType* FT = FunctionType::get(Type::getVoidTy(getGlobalContext() /*KernelM.getContext()*/),
                               std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                               false);
@@ -1338,7 +1337,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
       case Intrinsic::visc_atomic_inc:
       case Intrinsic::visc_atomic_dec:
       {
-        errs() << *II << "\n";
+        DEBUG(errs() << *II << "\n");
         // Only have support for i32 atomic intrinsics
         assert(II->getType() == Type::getInt32Ty(II->getContext())
             && "Only support i32 atomic intrinsics for now");
@@ -1371,7 +1370,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
                                                ArrayRef<Value*>(atomicArgs, 2),
                                                "", II);
 
-        errs() << "Substitute with: " << *AtomicInst << "\n";
+        DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n");
         II->replaceAllUsesWith(AtomicInst);
         IItoRemove.push_back(II);
       }
@@ -1387,7 +1386,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
       Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
       if(calleeF->isDeclaration()) {
         // Add the declaration to kernel module
-        errs() << "Adding declaration to Kernel module: " << *calleeF << "\n";
+        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
         KernelM.getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
         if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
           // Now handle a few specific intrinsics
@@ -1399,7 +1398,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
           case Intrinsic::floor:
           case Intrinsic::nvvm_rsqrt_approx_f:
           {
-            errs() << "Found math function: " << *II << "\n";
+            DEBUG(errs() << "Found math function: " << *II << "\n");
             // Get the builtin function
             // SPIR uses mangled name for builtin math functions
             assert(II->getType()->isFloatTy()
@@ -1418,7 +1417,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
             break;
           }
           default:
-            errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ;
+            DEBUG(errs() << "[WARNING] Found Intrinsic: " << *II << "\n" );
           }
         }
 
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 75280bd50d674be701b7b59f36f75125ceed23e0..e609bb63a0e5ecdaa130e939334b905839f59a34 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -55,6 +55,7 @@ private:
   // VISC Runtime API
   Constant* llvm_visc_x86_launch;
   Constant* llvm_visc_x86_wait;
+  Constant* llvm_visc_x86_argument_ptr;
 
   Constant* llvm_visc_streamLaunch;
   Constant* llvm_visc_streamPush;
@@ -175,6 +176,7 @@ void CGT_X86::initRuntimeAPI() {
   DECLARE(llvm_visc_x86_launch);
   DECLARE(malloc);
   DECLARE(llvm_visc_x86_wait);
+  DECLARE(llvm_visc_x86_argument_ptr);
   DECLARE(llvm_visc_streamLaunch);
   DECLARE(llvm_visc_streamPush);
   DECLARE(llvm_visc_streamPop);
@@ -1078,6 +1080,12 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
                               i->getName()+".addr",
                               RI);
       }
+      else if(i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn,
+                                    i->getType(),
+                                    i->getName()+".addr",
+                                    RI);
+      }
       else {
         BI = CastInst::CreateIntegerCast(bufferIn,
                                          i->getType(),
@@ -1092,7 +1100,7 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   }
   /* Add a call to the generated function of the child node */
   DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
-  errs() << "Type: " << *C->getGenFunc()->getType() << "\n";
+  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
   CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
                                   C->getGenFunc()->getName()+".output", RI);
 
@@ -1281,6 +1289,36 @@ void CGT_X86::codeGen(DFLeafNode* N) {
   if(!N->getParent()->isChildGraphStreaming())
     addIdxDimArgs(F_X86);
 
+  // Go through the arguments, and any pointer arguments with in attribute need
+  // to have x86_argument_ptr call to get the x86 ptr of the argument
+  // Insert these calls in a new BB which would dominate all other BBs
+  // Create new BB
+  BasicBlock* EntryBB = F_X86->begin();
+  BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
+  BranchInst* Terminator = BranchInst::Create(EntryBB, BB);
+  // Insert calls
+  for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
+        ai != ae; ++ai) {
+    if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) {
+      assert(ai->getType()->isPointerTy()
+          && "Only pointer arguments can have visc in/out attributes ");
+      Argument* size = ai->getNextNode();
+      assert(size->getType() == Type::getInt64Ty(M.getContext())
+          && "Next argument after a pointer should be an i64 type");
+      CastInst* BI = BitCastInst::CreatePointerCast(ai,
+                                                    Type::getInt8PtrTy(M.getContext()),
+                                                    ai->getName()+".i8ptr",
+                                                    Terminator);
+      Value* ArgPtrCallArgs[] = {BI, size};
+      CallInst::Create(llvm_visc_x86_argument_ptr,
+                                              ArrayRef<Value*>(ArgPtrCallArgs, 2),
+                                              "",
+                                              Terminator);
+
+    }
+  }
+  errs() << *BB << "\n";
+
   // Go through all the instructions
   for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
     Instruction *I = &(*i);
diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
index 4cbe09538c9126021fed6653fe71c1e7b34df8e7..3b4c82b1cf6143de01121968a80b9274747004af 100755
--- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -1082,17 +1082,17 @@ bool GenVISC::runOnModule(Module &M) {
         // FIXME: What if the child node function has not been visited already.
         // i.e., it's return type has not been fixed.
         Function* F = I->getParent()->getParent();
-        errs() << F->getName() << "\n";
+        DEBUG(errs() << F->getName() << "\n";);
         IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0));
-        errs() << "Node intrinsic: " << *NodeIntrinsic << "\n";
+        DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
         Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
-        errs() << ChildF->getName() << "\n";
+        DEBUG(errs() << ChildF->getName() << "\n";);
         int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
         int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
         StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType());
         Type* ReturnType = F->getReturnType();
 
-        errs() << *ReturnType << "\n";
+        DEBUG(errs() << *ReturnType << "\n";);
         assert(ReturnType->isVoidTy() || isa<StructType>(ReturnType)
             && "Return type should either be a struct or void type!");
 
diff --git a/llvm/lib/Transforms/LocalMem/LocalMem.cpp b/llvm/lib/Transforms/LocalMem/LocalMem.cpp
index 0de36b51cbbe11da3c050d90ddf061544468a69a..a6faa3ef23f6b0275a4f1af9f940993941076f9b 100644
--- a/llvm/lib/Transforms/LocalMem/LocalMem.cpp
+++ b/llvm/lib/Transforms/LocalMem/LocalMem.cpp
@@ -29,7 +29,6 @@ namespace {
 // Helper Functions
 
 static AllocationNodeProperty* isAllocationNode(DFLeafNode* N);
-static Value* getOutValueAt(DFLeafNode*N, unsigned i);
 
 // LocalMem - The first implementation.
 struct LocalMem : public ModulePass {
@@ -104,12 +103,12 @@ bool LocalMem::runOnModule(Module &M) {
 }
 
 void AT_OCL::codeGen(DFInternalNode* N) {
-  errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n";
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
 }
 
 // Code generation for leaf nodes
 void AT_OCL::codeGen(DFLeafNode* N) {
-  errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n";
+  DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
   // Skip code generation if it is a dummy node
   if(N->isDummyNode()) {
     DEBUG(errs() << "Skipping dummy node\n");
@@ -123,10 +122,10 @@ void AT_OCL::codeGen(DFLeafNode* N) {
     N->setProperty(DFNode::Allocation, ANP);
     AllocationNodeProperty* anp = (AllocationNodeProperty*) N->getProperty(DFNode::Allocation);
     AllocationNodeProperty::AllocationListType AL = anp->getAllocationList();
-    errs() << "Total allocations = " << AL.size() << "\n";
+    DEBUG(errs() << "Total allocations = " << AL.size() << "\n");
     for(auto P: AL) {
-      errs() << " EdgePort: " << P.first->getDestPosition();
-      errs() << " Size: " << *P.second << "\n";
+      DEBUG(errs() << " EdgePort: " << P.first->getDestPosition());
+      DEBUG(errs() << " Size: " << *P.second << "\n");
     } 
 
   }
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
old mode 100755
new mode 100644
index d78b7529c4affa864b58c86fc6f220a944a608f8..18c15167a472f6ae4e2251cf19377bdf201dac6a
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -183,6 +183,10 @@ static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL
   return d_input;
 }
 
+void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) {
+  return llvm_visc_request_mem(ptr, size);
+}
+
 void* llvm_visc_request_mem(void* ptr, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n");
@@ -1537,6 +1541,7 @@ void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) {
   Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
 
+  DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n");
   //free(clDevices);
   free(programSource);
 
diff --git a/llvm/projects/visc-rt/visc-rt.h b/llvm/projects/visc-rt/visc-rt.h
old mode 100755
new mode 100644
index 379ac3bf08ff3b2f55399a56edb9840b45e3589f..a5fd214194d29191fb90a8e0de203a10d0eac776
--- a/llvm/projects/visc-rt/visc-rt.h
+++ b/llvm/projects/visc-rt/visc-rt.h
@@ -160,6 +160,8 @@ void* llvm_visc_x86_launch(void* (void*), void*);
 void llvm_visc_x86_wait(void*);
 void* llvm_visc_ocl_initContext(enum visc::Target);
 
+void* llvm_visc_x86_argument_ptr(void*, size_t);
+
 void llvm_visc_ocl_clearContext(void*);
 void llvm_visc_ocl_argument_shared(void*, int, size_t);
 void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t);
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile
index 3a265f737f93433d5a83a5a6a503a6c879a09d8a..af1e1ac8d24d3e0468e78b294acefa85086e4381 100644
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile
@@ -21,9 +21,9 @@ ifeq ($(POS),)
 endif
 
 ifeq ($(POS),middle)
-  APP_CXXFLAGS+=-DMIDDLE
+  APP_CXXFLAGS+=-DMIDDLE -DNAME=$(NAME)
 else ifeq ($(POS),right)
-  APP_CXXFLAGS+=-DRIGHT
+  APP_CXXFLAGS+=-DRIGHT -DNAME=$(NAME)
 endif
 
 BIN = $(addsuffix -$(VERSION), $(APP))
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/copyToVersions.sh b/llvm/test/VISC/parboil/benchmarks/pipeline/copyToVersions.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3b9c19bad6dd86de7eb9a82edc7f17b92265155e
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/copyToVersions.sh
@@ -0,0 +1,12 @@
+
+declare -a versionList=("viscGPU" "viscVector" "viscScalar" "viscGPU-Scalar-MaxG" "viscVector-Scalar-MaxG" "viscGPU-Scalar-ZC" "viscVector-Scalar-ZC")
+declare -a fileList=("Makefile" "io.cc" "main.cc")
+
+for version in "${versionList[@]}"; do
+  echo $version
+  for filename in "${fileList[@]}"; do
+    echo cp ./src/visc_parallel/$filename ./src/$version/
+    cp ./src/visc_parallel/$filename ./src/$version/
+  done
+  echo
+done
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/run.sh b/llvm/test/VISC/parboil/benchmarks/pipeline/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0c8435764bd87c92dd30ad51aa97011ddb07b339
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/run.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+echo Pipeline Script $1 $2
+
+version=$1
+pos=$2
+
+if [[ ($version == *"GPU"*) ||  ($version == "visc_parallel") ]]
+then
+  target=""
+elif [[ $version == *"Vector"* ]]
+then 
+  target="x86"
+else 
+  target="seq"
+fi
+
+make VERSION=$version TARGET=$target clean
+make VERSION=$version TARGET=$target POS=$pos
+make VERSION=$version TARGET=$target POS=$pos run
+
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc
index b501269f0aa55cb808b71a72a445ceb0d1bf67eb..7d609066b0ac6570af7152ca84822c8e9dff816f 100755
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc
@@ -415,8 +415,8 @@ void edgeDetection(float *I, size_t bytesI,
 }
 
 }
-#define NUM_RUNS 1
-#define NUM_FRAMES 20
+#define NUM_RUNS 5
+#define NUM_FRAMES 2000
 using namespace cv;
 
 int main (int argc, char *argv[]) {
@@ -441,7 +441,7 @@ int main (int argc, char *argv[]) {
         exit(-1);
     }
 
-    /* Read in data */
+    //Read in data 
     //std::cout << "Reading video file: " << params->inpFiles[0] << "\n";
     //VideoCapture cap(params->inpFiles[0]);
     //if(!cap.isOpened()) {
@@ -489,6 +489,9 @@ int main (int argc, char *argv[]) {
       exit(-1);
     }
 
+    // Read data from matrix file
+    //readColMajorMatrixFile(params->inpFiles[0],
+                           //matIrow, matIcol, matI);
     pb_InitializeTimerSet(&timers);
     __visc__init();
 
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/Makefile
similarity index 96%
rename from llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/Makefile
rename to llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/Makefile
index f6c7ebfede0b947aad50dec89b2ecee55c1a36cd..f87cb91102c01826ecf87c2e698822d7caaaef5e 100644
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/Makefile
@@ -4,8 +4,8 @@ LANGUAGE=visc
 SRCDIR_OBJS=io.ll #compute_gold.o
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3 -I/opt/opencv/include
-APP_CXXFLAGS=-ffast-math -O3 -I/opt/opencv/include
+APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
 APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
 
 #OpenCV link flags all
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/io.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/io.cc
similarity index 100%
rename from llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/io.cc
rename to llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/io.cc
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/main.cc
new file mode 100755
index 0000000000000000000000000000000000000000..836cf1a01a56e6548f0fd11b3f11df0f303b3a8e
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-MaxG/main.cc
@@ -0,0 +1,1117 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include "opencv2/opencv.hpp"
+#include "opencv2/core/ocl.hpp"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <iostream>
+#include <cassert>
+#include <parboil.h>
+#include <visc.h>
+
+
+#define NUM_RUNS 10000 
+#define DEPTH 3
+#define HEIGHT 640
+#define WIDTH 480
+std::string input_window = "GPU-CPU(No Vector)-MaxG Pipeline - Input Video";
+std::string output_window = "GPU-CPU(No Vector)-MaxG Pipeline - Edge Mapping";
+
+
+#ifdef MIDDLE
+  #define POSX_IN     640
+  #define POSY_IN     0
+  #define POSX_OUT    640
+  #define POSY_OUT    540
+
+#elif RIGHT
+  #define POSX_IN     1280
+  #define POSY_IN     0
+  #define POSX_OUT    1280
+  #define POSY_OUT    540
+
+#else // LEFT
+  #define POSX_IN     0
+  #define POSY_IN     0
+  #define POSX_OUT    0
+  #define POSY_OUT    540
+#endif
+
+
+//#define NUM_FRAMES 20
+
+
+
+// Definitions of sizes for edge detection kernels
+
+#define MIN_BR 0.0f
+#define MAX_BR 1.0f
+
+// Code needs to be changed for this to vary
+#define SZB 3
+
+#define REDUCTION_TILE_SZ 1024
+
+#define _MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define _MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+
+extern "C" {
+
+struct __attribute__((__packed__)) InStruct {
+  float* I ;
+  size_t bytesI;
+  float* Is ;
+  size_t bytesIs;
+  float* L;
+  size_t bytesL;
+  float* S;
+  size_t bytesS;
+  float* G;
+  size_t bytesG;
+  float* maxG;
+  size_t bytesMaxG;
+  float* E;
+  size_t bytesE;
+  float* Gs;
+  size_t bytesGs;
+  float* B;
+  size_t bytesB;
+  float* Sx;
+  size_t bytesSx;
+  float* Sy;
+  size_t bytesSy;
+  int m;
+  int n;
+  int block_x;
+  int grid_x;
+};
+
+
+void packData(struct InStruct* args, float* I, size_t bytesI,
+                                     float* Is, size_t bytesIs,
+                                     float* L, size_t bytesL,
+                                     float* S, size_t bytesS,
+                                     float* G, size_t bytesG,
+                                     float* maxG, size_t bytesMaxG,
+                                     float* E, size_t bytesE,
+                                     float* Gs, size_t bytesGs,
+                                     float* B, size_t bytesB,
+                                     float* Sx, size_t bytesSx,
+                                     float* Sy, size_t bytesSy,
+                                     int m, int n,
+                                     int block_x, int grid_x) {
+  args->I = I;
+  args->bytesI = bytesI;
+  args->Is = Is;
+  args->bytesIs = bytesIs;
+  args->L = L;
+  args->bytesL = bytesL;
+  args->S = S;
+  args->bytesS = bytesS;
+  args->G = G;
+  args->bytesG = bytesG;
+  args->maxG = maxG;
+  args->bytesMaxG = bytesMaxG;
+  args->E = E;
+  args->bytesE = bytesE;
+  args->Gs = Gs;
+  args->bytesGs = bytesGs;
+  args->B = B;
+  args->bytesB = bytesB;
+  args->Sx = Sx;
+  args->bytesSx = bytesSx;
+  args->Sy = Sy;
+  args->bytesSy = bytesSy;
+  args->m = m;
+  args->n = n;
+  args->block_x = block_x;
+  args->grid_x = grid_x;
+}
+
+/* 
+ * Gaussian smoothing of image I of size m x n
+ * I : input image
+ * Gs : gaussian filter
+ * Is: output (smoothed image)
+ * m, n : dimensions
+ * 
+ * Need 2D grid, a thread per pixel
+ * No use of separable algorithm because we need to do this in one kernel
+ * No use of shared memory because
+ * - we don't handle it in the X86 pass
+ */
+
+#define GAUSSIAN_SIZE 7
+#define GAUSSIAN_RADIUS (GAUSSIAN_SIZE / 2)
+void gaussianSmoothing(float *I, size_t bytesI,
+                       float *Gs, size_t bytesGs,
+                       float *Is, size_t bytesIs,
+                       int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, I, Gs, 1, Is);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  int gloc = gx + gy*n;
+
+  float smoothedVal = 0;
+  float gval;
+  int loadOffset;
+
+  if ((gx < n) && (gy < m)) {
+    for (int i = -GAUSSIAN_RADIUS; i <= GAUSSIAN_RADIUS; i++)
+      for (int j = -GAUSSIAN_RADIUS; j <= GAUSSIAN_RADIUS; j++) {
+
+        loadOffset = gloc + i*n + j;
+      
+        if ((gy + i) < 0) // top contour
+          loadOffset = gx + j;
+        else if ((gy + i) > m-1 ) // bottom contour
+          loadOffset = (m-1)*n + gx + j;
+        else 
+          loadOffset = gloc + i*n + j; // within image vertically
+
+        // Adjust so we are within image horizonally
+        if ((gx + j) < 0) // left contour
+          loadOffset -= (gx+j);
+        else if ((gx + j) > n-1 ) // right contour
+          loadOffset = loadOffset - gx - j + n - 1;
+
+        gval = I[loadOffset];
+        smoothedVal += gval * Gs[(GAUSSIAN_RADIUS + i)*GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j];
+      }
+  
+    Is[gloc] = smoothedVal;
+  }
+  __visc__return(m, n);
+}
+
+void WrapperGaussianSmoothing(float *I, size_t bytesI,
+                       float *Gs, size_t bytesGs,
+                       float *Is, size_t bytesIs,
+                       int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, I, Gs, 1, Is);
+  void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n);
+  __visc__bindIn(GSNode, 0, 0, 0); // Bind I
+  __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI
+  __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs
+  __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs
+  __visc__bindIn(GSNode, 4, 4, 0); // Bind Is
+  __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs
+  __visc__bindIn(GSNode, 6, 6, 0); // Bind m
+  __visc__bindIn(GSNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(GSNode, 0, 0, 0); // bind output m
+  __visc__bindOut(GSNode, 1, 1, 0); // bind output n
+}
+
+
+/* Compute a non-linear laplacian estimate of input image I of size m x n */
+/*
+ * Is   : blurred imput image
+ * m, n : dimensions
+ * B    : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0])
+ * L    : output (laplacian of the image)
+ * Need 2D grid, a thread per pixel
+*/
+void laplacianEstimate(float *Is, size_t bytesIs,
+                            float *B, size_t bytesB,
+                            float *L, size_t bytesL,
+                            int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, Is, B, 1, L);
+  // 3x3 image area
+  float imageArea[SZB][SZB];
+
+  //int gx = get_global_id(0);
+  //int gy = get_global_id(1);
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+  //if(gx == 0 && gy == 0)
+    //std::cout << "Entered laplacian\n";
+
+  int i, j;
+
+  if ((gx < n) && (gy < m)) {
+    // Data copy for dilation filter
+    imageArea[1][1] = Is[gy * n + gx];
+
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
+    } else {
+      imageArea[1][0] = Is[gy * n + gx - 1];
+      imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
+      imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
+    } else {
+      imageArea[1][2] = Is[gy * n + gx + 1];
+      imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
+      imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
+    }
+
+    imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
+    imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
+
+    // Compute pixel of dilated image
+    float dilatedPixel = MIN_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    // Data copy for erotion filter - only change the boundary conditions
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][0] = MAX_BR;
+      if (gy == m-1) imageArea[2][0] = MAX_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][2] = MAX_BR;
+      if (gy == m-1) imageArea[2][2] = MAX_BR;
+    }
+
+    if (gy == 0) imageArea[0][1] = MAX_BR;
+    if (gy == m-1) imageArea[2][1] = MAX_BR;
+
+    // Compute pixel of eroded image
+    float erodedPixel = MAX_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1];
+    L[gy*n+gx] = laplacian;
+  }
+  //OutStruct output = {bytesB, bytesL};
+  //if(gx == m-1 && gy == n-1)
+    //std::cout << "Exit laplacian\n";
+  __visc__return(m);
+}
+
+void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
+                          float *B, size_t bytesB,
+                          float *L, size_t bytesL,
+                          int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, Is, B, 1, L);
+  void* LNode = __visc__createNode2D(laplacianEstimate, m, n);
+  __visc__bindIn(LNode, 0, 0, 0); // Bind Is
+  __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs
+  __visc__bindIn(LNode, 2, 2, 0); // Bind B
+  __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB
+  __visc__bindIn(LNode, 4, 4, 0); // Bind L
+  __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL
+  __visc__bindIn(LNode, 6, 6, 0); // Bind m
+  __visc__bindIn(LNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(LNode, 0, 0, 0); // bind output m
+
+}
+
+/* Compute the zero crossings of input image L of size m x n */
+/*
+ * L    : imput image (computed Laplacian)
+ * m, n : dimensions
+ * B    : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0])
+ * S    : output (sign of the image)
+ * Need 2D grid, a thread per pixel
+ */
+void computeZeroCrossings(float *L, size_t bytesL,
+                          float *B, size_t bytesB,
+                          float *S, size_t bytesS,
+                          int m, int n) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, L, B, 1, S);
+
+  // 3x3 image area
+  float imageArea[SZB][SZB];
+
+  //int gx = get_global_id(0);
+  //int gy = get_global_id(1);
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+  int i, j;
+
+  //if(gx == 0 && gy == 0)
+    //std::cout << "Entered ZC\n";
+  if ((gx < n) && (gy < m)) {
+    // Data copy for dilation filter
+    imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR;
+
+    if (gx == 0) { // left most line
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
+    } else { 
+      imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR;
+      imageArea[0][0] = (gy > 0) ?
+                            (L[(gy - 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+      imageArea[2][0] = (gy < m - 1) ?
+                            (L[(gy + 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
+    } else {
+      imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR;
+      imageArea[0][2] = (gy > 0) ?
+                            (L[(gy - 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+      imageArea[2][2] = (gy < m - 1) ?
+                            (L[(gy + 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+    }
+
+    imageArea[0][1] = (gy > 0) ?
+                          (L[(gy - 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR)
+                          : MIN_BR;
+    imageArea[2][1] = (gy < m - 1)?
+                          (L[(gy + 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR)
+                          : MIN_BR;
+
+    // Compute pixel of dilated image
+    float dilatedPixel = MIN_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    // Data copy for erotion filter - only change the boundary conditions
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][0] = MAX_BR;
+      if (gy == m-1) imageArea[2][0] = MAX_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][2] = MAX_BR;
+      if (gy == m-1) imageArea[2][2] = MAX_BR;
+    }
+
+    if (gy == 0) imageArea[0][1] = MAX_BR;
+    if (gy == m-1) imageArea[2][1] = MAX_BR;
+
+    // Compute pixel of eroded image
+    float erodedPixel = MAX_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    float pixelSign = dilatedPixel - erodedPixel;
+    S[gy*n+gx] = pixelSign;
+  }
+  //OutStruct output = {bytesB, bytesS};
+  //if(gx == n-1 && gy == n-1)
+    //std::cout << "Exit ZC\n";
+  __visc__return(m); 
+}
+
+void WrapperComputeZeroCrossings(float *L, size_t bytesL,
+                          float *B, size_t bytesB,
+                          float *S, size_t bytesS,
+                          int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, L, B, 1, S);
+  void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n);
+  __visc__bindIn(ZCNode, 0, 0, 0); // Bind L
+  __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
+  __visc__bindIn(ZCNode, 2, 2, 0); // Bind B
+  __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB
+  __visc__bindIn(ZCNode, 4, 4, 0); // Bind S
+  __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS
+  __visc__bindIn(ZCNode, 6, 6, 0); // Bind m
+  __visc__bindIn(ZCNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(ZCNode, 0, 0, 0); // bind output m
+
+}
+
+/*
+ * Gradient computation using Sobel filters
+ * Is   : input (smoothed image)
+ * Sx, Sy: Sobel operators
+ * - Sx = [-1  0  1 ; -2 0 2 ; -1 0 1 ]
+ * - Sy = [-1 -2 -1 ;  0 0 0 ;  1 2 1 ]
+ * m, n : dimensions
+ * G: output, gradient magnitude : sqrt(Gx^2+Gy^2)
+ * Need 2D grid, a thread per pixel
+ * No use of separable algorithm because we need to do this in one kernel
+ * No use of shared memory because
+ * - we don't handle it in the X86 pass
+ */
+
+#define SOBEL_SIZE 3
+#define SOBEL_RADIUS (SOBEL_SIZE / 2)
+
+void computeGradient(float *Is, size_t bytesIs,
+              float *Sx, size_t bytesSx,
+              float *Sy, size_t bytesSy,
+              float *G, size_t bytesG,
+              int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(3, Is, Sx, Sy, 1, G);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  int gloc = gx + gy*n;
+
+  float Gx = 0;
+  float Gy = 0;
+  float gval;
+  int loadOffset;
+
+  if ((gx < n) && (gy < m)) {
+    for (int i = -SOBEL_RADIUS; i <= SOBEL_RADIUS; i++)
+      for (int j = -SOBEL_RADIUS; j <= SOBEL_RADIUS; j++) {
+
+        loadOffset = gloc + i*n + j;
+      
+        if ((gy + i) < 0) // top contour
+          loadOffset = gx + j;
+        else if ((gy + i) > m-1 ) // bottom contour
+          loadOffset = (m-1)*n + gx + j;
+        else 
+          loadOffset = gloc + i*n + j; // within image vertically
+
+        // Adjust so we are within image horizonally
+        if ((gx + j) < 0) // left contour
+          loadOffset -= (gx+j);
+        else if ((gx + j) > n-1 ) // right contour
+          loadOffset = loadOffset - gx - j + n - 1;
+
+        gval = Is[loadOffset];
+        Gx += gval * Sx[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
+        Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
+      }
+
+    G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy);
+    //G[gloc] = Gx*Gx + Gy*Gy;
+  }
+  __visc__return(n);
+}
+
+void WrapperComputeGradient(float *Is, size_t bytesIs,
+                                 float *Sx, size_t bytesSx,
+                                 float *Sy, size_t bytesSy,
+                                 float *G, size_t bytesG,
+                                 int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, Is, Sx, Sy, 1, G);
+  void* CGNode = __visc__createNode2D(computeGradient, m, n);
+  __visc__bindIn(CGNode, 0, 0, 0); // Bind Is
+  __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
+  __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx
+  __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx
+  __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy
+  __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy
+  __visc__bindIn(CGNode, 6, 6, 0); // Bind G
+  __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG
+  __visc__bindIn(CGNode, 8, 8, 0); // Bind m
+  __visc__bindIn(CGNode, 9, 9, 0); // Bind n
+
+  __visc__bindOut(CGNode, 0, 0, 0); // bind output m
+}
+
+/* 
+ * Reduction
+ * G : input
+ * maxG: output
+ * m, n: input size
+ * Needs a single thread block
+ */
+void computeMaxGradientLeaf(float *G, size_t bytesG,
+                            float *maxG, size_t bytesMaxG,
+                            int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(1, G, 1, maxG);
+
+  void* thisNode = __visc__getNode();
+
+  int lx = __visc__getNodeInstanceID_x(thisNode);       // threadIdx.x
+  int dimx = __visc__getNumNodeInstances_x(thisNode);   // blockDim.x
+
+
+  // Assume a single thread block
+  // Thread block iterates over all elements
+  for (int i = lx + dimx; i < m*n; i+= dimx) {
+    if (G[lx] < G[i])
+      G[lx] = G[i];
+  }
+
+  // First thread iterates over all elements of the thread block
+  if (lx == 0) {
+    for (int i = 1; (i < dimx) && (i < m*n); i++)
+      if (G[lx] < G[i])
+        G[lx] = G[i];
+
+    *maxG = G[lx];
+  }
+
+  __visc__return(n);
+}
+
+/* 
+ * Reduction
+ * G : input
+ * maxG: output
+ * Each static node processes 2*nodeDim elements
+ * Need 1D grid, a thread per 2 pixels
+ */
+//void computeMaxGradientLeaf(float *G, size_t bytesG,
+                            //float *maxG, size_t bytesMaxG,
+                            //int m, int n) {
+
+  //__visc__hint(visc::DEVICE);
+  //TODO: maxG should be initialized to zero (MIN_BR) every time
+  //__visc__attributes(2, G, maxG, 1, maxG);
+
+  //void* thisNode = __visc__getNode();
+  //void* parentNode = __visc__getParentNode(thisNode);
+
+  //int lx = __visc__getNodeInstanceID_x(thisNode);
+  //int px = __visc__getNodeInstanceID_x(parentNode);
+  //int dimx = __visc__getNumNodeInstances_x(thisNode);
+
+  //int gid = lx + 2*px*dimx;
+  
+  //for (unsigned stride = dimx; stride > 32; stride >>= 1) {
+    //if ((gid + stride < m*n) && (lx < stride))
+      //if (G[gid + stride] > G[gid])
+        //G[gid] = G[gid + stride];
+    //__visc__barrier();
+  //}
+
+  //for (unsigned stride = 32; stride >= 1; stride >>= 1) {
+    //if ((gid + stride < m*n) && (lx < stride))
+      //if (G[gid + stride] > G[gid])
+        //G[gid] = G[gid + stride];
+  //}
+
+  //if (lx == 0)
+    //__visc__atomic_max(maxG,G[gid]);
+
+  //__visc__return(m);
+//}
+
+void computeMaxGradientTB(float *G, size_t bytesG,
+                           float *maxG, size_t bytesMaxG,
+                           int m, int n,
+                           int block_x) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, G, maxG, 1, maxG);
+  void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
+  __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
+  __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
+  __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
+  __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG
+  __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
+  __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
+
+  __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output m
+}
+
+void WrapperComputeMaxGradient(float *G, size_t bytesG,
+                           float *maxG, size_t bytesMaxG,
+                           int m, int n,
+                           int block_x, int grid_x) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, G, maxG, 1, maxG);
+  void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x);
+  __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G
+  __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
+  __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
+  __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG
+  __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m
+  __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n
+  __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
+
+  __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output m
+}
+
+/* Reject the zero crossings where the gradient is below a threshold */
+/*
+ * S    : input (computed zero crossings)
+ * m, n : dimensions
+ * G: gradient of (smoothed) image
+ * E    : output (edges of the image)
+ * Need 2D grid, a thread per pixel
+ */
+
+#define THETA 0.1
+void rejectZeroCrossings(float *S, size_t bytesS,
+                         float *G, size_t bytesG,
+                         float *maxG, size_t bytesMaxG,
+                         float *E, size_t bytesE,
+                         int m, int n) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, S, G, maxG, 1, E);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  float mG = *maxG;
+  //float mG = 1.39203;
+  if ((gx < n) && (gy < m)) {
+    E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ;
+  }
+  __visc__return(m);
+}
+
+void WrapperRejectZeroCrossings(float *S, size_t bytesS,
+                         float *G, size_t bytesG,
+                         float *maxG, size_t bytesMaxG,
+                         float *E, size_t bytesE,
+                         int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, S, G, maxG, 1, E);
+  void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n);
+  __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S
+  __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS
+  __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G
+  __visc__bindIn(RZCNode, 3, 3 , 0); // Bind bytesG
+  __visc__bindIn(RZCNode, 4, 4 , 0); // Bind maxG
+  __visc__bindIn(RZCNode, 5, 5 , 0); // Bind bytesMaxG
+  __visc__bindIn(RZCNode, 6, 6 , 0); // Bind E
+  __visc__bindIn(RZCNode, 7, 7 , 0); // Bind bytesE
+  __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m
+  __visc__bindIn(RZCNode, 9, 9, 0); // Bind n
+
+  __visc__bindOut(RZCNode, 0, 0, 0); // bind output m
+}
+
+
+
+// Pipelined Root node
+void edgeDetection(float *I, size_t bytesI, // 0
+                   float *Is, size_t bytesIs, // 2
+                   float *L, size_t bytesL, // 4
+                   float *S, size_t bytesS, // 6
+                   float *G, size_t bytesG, // 8
+                   float *maxG, size_t bytesMaxG, // 10
+                   float *E, size_t bytesE, // 12
+                   float *Gs, size_t bytesGs, // 14
+                   float *B, size_t bytesB, // 16
+                   float *Sx, size_t bytesSx, // 18
+                   float *Sy, size_t bytesSy, // 20
+                   int m, // 22
+                   int n, // 23
+                   int block_x, // 24
+                   int grid_x // 25
+                   ) { 
+  __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
+  __visc__hint(visc::CPU_TARGET);
+  void* GSNode = __visc__createNode(WrapperGaussianSmoothing);
+  void* LNode = __visc__createNode(WrapperlaplacianEstimate);
+  void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings);
+  void* CGNode = __visc__createNode(WrapperComputeGradient);
+  void* CMGNode = __visc__createNode(WrapperComputeMaxGradient);
+  void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings);
+
+  // Gaussian Inputs
+  __visc__bindIn(GSNode, 0 , 0, 1); // Bind I
+  __visc__bindIn(GSNode, 1 , 1, 1); // Bind bytesI
+  __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs
+  __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs
+  __visc__bindIn(GSNode, 2 , 4, 1); // Bind Is
+  __visc__bindIn(GSNode, 3 , 5, 1); // Bind bytesIs
+  __visc__bindIn(GSNode, 22, 6, 1); // Bind m
+  __visc__bindIn(GSNode, 23, 7, 1); // Bind n
+
+  // Laplacian Inputs
+  __visc__bindIn(LNode, 2 , 0, 1); // Bind Is
+  __visc__bindIn(LNode, 3 , 1, 1); // Bind bytesIs
+  __visc__bindIn(LNode, 16, 2, 1); // Bind B
+  __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB
+  __visc__bindIn(LNode, 4 , 4, 1); // Bind L
+  __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL
+//  __visc__bindIn(LNode, 22, 6, 1); // Bind m
+  __visc__edge(GSNode, LNode, 0, 6, 1); // Get m
+  __visc__bindIn(LNode, 23, 7, 1); // Bind n
+
+  // Compute ZC Inputs
+  __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L
+  __visc__bindIn(CZCNode, 5 , 1, 1); // Bind bytesL
+  __visc__bindIn(CZCNode, 16, 2, 1); // Bind B
+  __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB
+  __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S
+  __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS
+  //__visc__bindIn(CZCNode, 22, 6, 1); // Bind m
+  __visc__edge(LNode, CZCNode, 0, 6, 1); // Get m
+  __visc__bindIn(CZCNode, 23, 7, 1); // Bind n
+
+  // Gradient Inputs
+  __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is
+  __visc__bindIn(CGNode, 3 , 1, 1); // Bind bytesIs
+  __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx
+  __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx
+  __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy
+  __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy
+  __visc__bindIn(CGNode, 8 , 6, 1); // Bind G
+  __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG
+ __visc__bindIn(CGNode, 22, 8, 1); // Bind m
+  //__visc__edge(CZCNode, CGNode, 0, 8, 1); // Get m
+  //__visc__bindIn(CGNode, 23, 9, 1); // Bind n
+  __visc__edge(GSNode, CGNode, 1, 9, 1); // Get n
+
+  // Max Gradient Inputs
+  __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G
+  __visc__bindIn(CMGNode, 9 , 1, 1); // Bind bytesG
+  __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG
+  __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG
+ __visc__bindIn(CMGNode, 22, 4, 1);  // Bind m
+  //__visc__edge(CGNode, CMGNode, 0, 4, 1); // Get m
+  //__visc__bindIn(CMGNode, 23, 5, 1); // Bind n
+  __visc__edge(CGNode, CMGNode, 0, 5, 1); // Get n
+  __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x
+  __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x
+
+  // Reject ZC Inputs
+  __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S
+  __visc__bindIn(RZCNode, 7 , 1, 1); // Bind bytesS
+  __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G
+  __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG
+  __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG
+  __visc__bindIn(RZCNode, 11, 5, 1); // Bind bytesMaxG
+  __visc__bindIn(RZCNode, 12, 6, 1); // Bind E
+  __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE
+  //__visc__bindIn(RZCNode, 22, 8, 1); // Bind m
+  __visc__edge(CZCNode, RZCNode, 0, 8, 1); // Get m
+  //__visc__bindIn(RZCNode, 23, 9, 1); // Bind n
+  __visc__edge(CMGNode, RZCNode, 0, 9, 1); // Get n
+
+  __visc__bindOut(RZCNode, 0, 0, 1); // dummy bind output to get pipeline functionality
+}
+
+}
+
+using namespace cv;
+
+void getNextFrame(VideoCapture& VC, Mat& F) {
+  VC >> F;
+  /// Convert the image to grayscale if image colored
+  if(F.channels() == 3)
+    cvtColor( F, F, CV_BGR2GRAY );
+
+  F.convertTo(F, CV_32F, 1.0/255.0);
+
+}
+
+//void showInOut(Mat& Input, Mat& Output) {
+    //Mat in, out;
+    //resize(Input, in, Size(512, 768));
+    //resize(Output, out, Size(512, 768));
+    //imshow(input_window, in);
+    //imshow(output_window, out);
+//}
+
+
+int main (int argc, char *argv[]) {
+
+    struct pb_Parameters *params;
+    struct pb_TimerSet timers;
+
+    size_t I_sz;
+    int block_x, grid_x;
+
+    std::cout << "Using OpenCV" << CV_VERSION << "\n";
+
+    /* Read command line. Expect 3 inputs: A, B and B^T
+       in column-major layout*/
+    params = pb_ReadParameters(&argc, argv);
+    if ((params->inpFiles[0] == NULL)
+            || (params->inpFiles[1] != NULL))
+    {
+        fprintf(stderr, "Expecting input image filename\n");
+        exit(-1);
+    }
+
+    /* Read in data */
+    std::cout << "Reading video file: " << params->inpFiles[0] << "\n";
+    VideoCapture cap(params->inpFiles[0]);
+    if(!cap.isOpened()) {
+      std::cout << "Could not open video file" << "\n";
+      return -1;
+    }
+
+    int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT);
+    std::cout << "Number of frames = " << NUM_FRAMES << "\n";
+    namedWindow(input_window, CV_WINDOW_AUTOSIZE);
+    namedWindow(output_window, CV_WINDOW_AUTOSIZE);
+    moveWindow(input_window, POSX_IN, POSY_IN);
+    moveWindow(output_window, POSX_OUT, POSY_OUT);
+
+    Mat src, Is, L, S, G, E;
+
+    getNextFrame(cap, src);
+
+    std::cout << "Image dimension = " << src.size() << "\n";
+    if(!src.isContinuous()) {
+      std::cout << "Expecting contiguous storage of image in memory!\n";
+      exit(-1);
+    }
+
+    Is = Mat(src.size[0], src.size[1], CV_32F);
+    L = Mat(src.size[0], src.size[1], CV_32F);
+    S = Mat(src.size[0], src.size[1], CV_32F);
+    G = Mat(src.size[0], src.size[1], CV_32F);
+    E = Mat(src.size[0], src.size[1], CV_32F);
+
+    // All these matrices need to have their data array contiguous in memory
+    assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous());
+
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    // copy A to device memory
+    I_sz = src.size[0]*src.size[1]*sizeof(float);
+
+    size_t bytesMaxG = sizeof(float);
+    float* maxG = (float*)malloc(bytesMaxG);
+
+    float B[] = { 1, 1, 1,
+                  1, 1, 1,
+                  1, 1, 1 };
+    size_t bytesB = 9*sizeof(float);
+    //Sx = [-1  0  1 ; -2 0 2 ; -1 0 1 ]
+    //Sy = [-1 -2 -1 ;  0 0 0 ;  1 2 1 ]
+    float Sx[] = {  -1, 0, 1,
+                    -2, 0, 2,
+                    -1, 0, 1  };
+    size_t bytesSx = 9*sizeof(float);
+    float Sy[] = {  -1, -2, -1,
+                     0,  0,  0,
+                     1,  2,  1  };
+    size_t bytesSy = 9*sizeof(float);
+
+    float Gs [] = {
+            0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036,
+            0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
+            0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
+            0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291,
+            0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
+            0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
+            0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036 };
+    size_t bytesGs = 7*7*sizeof(float);
+
+    block_x = 256;
+    // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs
+    grid_x = 1;
+
+    // Copy A and B^T into device memory
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    //showInOut(src, E);
+    Mat in, out;
+    resize(src, in, Size(HEIGHT, WIDTH));
+    resize(E, out, Size(HEIGHT, WIDTH));
+    imshow(input_window, in);
+    imshow(output_window, out);
+    waitKey(0);
+
+    //NUM_FRAMES = 20;
+    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
+    packData(args, (float*)src.data, I_sz,
+                   (float*)Is.data, I_sz,
+                   (float*)L.data, I_sz,
+                   (float*)S.data, I_sz,
+                   (float*)G.data, I_sz,
+                   maxG, bytesMaxG,
+                   (float*)E.data, I_sz,
+                   Gs, bytesGs,
+                   B, bytesB,
+                   Sx, bytesSx,
+                   Sy, bytesSy,
+                   src.size[0], src.size[1],
+                   block_x, grid_x);
+
+    // Check if the total elements is a multiple of block size
+    assert(src.size[0]*src.size[1] % block_x == 0);
+
+    //imshow(input_window, src);
+    //imshow(output_window, E);
+    //waitKey(0);
+    for(unsigned j=0; j<NUM_RUNS; j++) {
+      std::cout << "Run: " << j << "\n";
+      void* DFG = __visc__launch(1, edgeDetection, (void*)args);
+
+      cap = VideoCapture(params->inpFiles[0]);
+      getNextFrame(cap, src);
+      
+      //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
+
+      if(NUM_FRAMES >=2) {
+        //__visc__push(DFG, args);
+        //__visc__push(DFG, args);
+        for(int i=0; i<NUM_FRAMES; i++) {
+          //std::cout << "Frame " << i << "\n";
+          args->I = (float*) src.data;
+
+          *maxG = 0.0;
+ 
+          llvm_visc_track_mem(src.data, I_sz);
+          llvm_visc_track_mem(Is.data, I_sz);
+          llvm_visc_track_mem(L.data, I_sz);
+          llvm_visc_track_mem(S.data, I_sz);
+          llvm_visc_track_mem(G.data, I_sz);
+          llvm_visc_track_mem(maxG, bytesMaxG);
+          llvm_visc_track_mem(E.data, I_sz);
+          llvm_visc_track_mem(Gs, bytesGs);
+          llvm_visc_track_mem(B, bytesB);
+          llvm_visc_track_mem(Sx, bytesSx);
+          llvm_visc_track_mem(Sy, bytesSy);
+
+          __visc__push(DFG, args);
+          __visc__pop(DFG);
+
+          //llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "Show E" << "\n";
+          //imshow(window_name, E);
+          //waitKey(0);
+          //llvm_visc_request_mem(src.data, I_sz);
+          //llvm_visc_request_mem(Is.data, I_sz);
+          //llvm_visc_request_mem(L.data, I_sz);
+          //llvm_visc_request_mem(S.data, I_sz);
+          //llvm_visc_request_mem(G.data, I_sz);
+          llvm_visc_request_mem(maxG, bytesMaxG);
+          llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "src.data = " << (float*)src.data << "\n";
+          //std::cout << "Is.data = " << (float*)Is.data << "\n";
+          //std::cout << "L.data = " << (float*)L.data << "\n";
+          //std::cout << "S.data = " << (float*)S.data << "\n";
+          //std::cout << "G.data = " << (float*)G.data << "\n";
+          //std::cout << "E.data = " << (float*)E.data << "\n";
+          //std::cout << "Max G = " << *maxG << "\n";
+          
+          Mat in, out;
+          resize(src, in, Size(HEIGHT, WIDTH));
+          //std::cout << "Show E\n";
+          resize(E, out, Size(HEIGHT, WIDTH));
+          imshow(output_window, out);
+          imshow(input_window, in);
+          waitKey(1);
+          //waitKey(0);
+          //std::cout << "Show Is\n";
+          //resize(Is, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show L\n";
+          //resize(L, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show S\n";
+          //resize(S, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show G\n";
+          //resize(G, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+
+          llvm_visc_untrack_mem(src.data);
+          llvm_visc_untrack_mem(Is.data);
+          llvm_visc_untrack_mem(L.data);
+          llvm_visc_untrack_mem(S.data);
+          llvm_visc_untrack_mem(G.data);
+          llvm_visc_untrack_mem(maxG);
+          llvm_visc_untrack_mem(E.data);
+          llvm_visc_untrack_mem(Gs);
+          llvm_visc_untrack_mem(B);
+          llvm_visc_untrack_mem(Sx);
+          llvm_visc_untrack_mem(Sy);
+
+          getNextFrame(cap, src);
+
+        }
+        //__visc__pop(DFG);
+        //__visc__pop(DFG);
+      }
+      else {
+          __visc__push(DFG, args);
+          __visc__pop(DFG);
+      }
+
+     
+      __visc__wait(DFG);
+    }
+
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+
+    pb_PrintTimerSet(&timers);
+    __visc__cleanup();
+
+    //if (params->outFile) {
+
+        /* Write C to file */
+        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        //writeColMajorMatrixFile(params->outFile,
+                                //src.size[0], src.size[1], matE);
+    //}
+   
+    //std::cout << "Show Is" << "\n";
+    //Mat output(src.size[0], src.size[1], CV_32F);
+    //imshow(output_window, Is);
+    //waitKey(0);
+
+    //std::cout << "Show G" << "\n";
+    //imshow(output_window, L);
+    //waitKey(0);
+
+    //std::cout << "Show L" << "\n";
+    //imshow(output_window, S);
+    //waitKey(0);
+    
+    //std::cout << "Show S" << "\n";
+    //imshow(output_window, G);
+    //waitKey(0);
+    
+    //std::cout << "Show E" << "\n";
+    //imshow(output_window, E);
+    //waitKey(0);
+
+    //double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+    //std::cout<< "GFLOPs = " << 2.* src.size[0] * src.size[1] * src.size[1]/GPUtime/1e9 << std::endl;
+    pb_FreeParameters(params);
+
+    return 0;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f87cb91102c01826ecf87c2e698822d7caaaef5e
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/Makefile
@@ -0,0 +1,12 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=visc
+SRCDIR_OBJS=io.ll #compute_gold.o
+VISC_OBJS=main.visc.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
+
+#OpenCV link flags all
+#/usr/bin/c++   -std=c++0x   CMakeFiles/EdgeDetect.dir/EdgeDetect.cpp.o  -o EdgeDetect  -L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_cudev.so.3.0.0 /opt/opencv/lib/libopencv_cudawarping.so.3.0.0 /opt/opencv/lib/libopencv_cudastereo.so.3.0.0 /opt/opencv/lib/libopencv_cudaoptflow.so.3.0.0 /opt/opencv/lib/libopencv_cudaobjdetect.so.3.0.0 /opt/opencv/lib/libopencv_cudalegacy.so.3.0.0 /opt/opencv/lib/libopencv_cudaimgproc.so.3.0.0 /opt/opencv/lib/libopencv_cudafilters.so.3.0.0 /opt/opencv/lib/libopencv_cudafeatures2d.so.3.0.0 /opt/opencv/lib/libopencv_cudacodec.so.3.0.0 /opt/opencv/lib/libopencv_cudabgsegm.so.3.0.0 /opt/opencv/lib/libopencv_cudaarithm.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -lcudart -lnpp -lcufft -lcudart -lnpp -lcufft -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/io.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/io.cc
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+char* readFile(const char* fileName)
+{
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	f.seekg(0,std::ios::end);
+	int length = f.tellg();
+	f.seekg(0,std::ios::beg);
+
+	char* buffer;
+
+	if(length>0)
+	{
+		buffer = new char[length];
+		f.read(buffer,length);
+		buffer[length-1]=0;
+	}
+	else
+	{
+		buffer = new char;
+		buffer[0] = 0;
+	}
+	
+	f.close();
+
+	return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f >> nr_row;
+  f >> nr_col;
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  while (f.good() ) {
+    f >> data;
+    v.push_back(data);
+  }
+  v.pop_back(); // remove the duplicated last element
+  return true;
+
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  for (int i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/main.cc
similarity index 94%
rename from llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/main.cc
rename to llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/main.cc
index e67683e36e7f0fbe44f1f48c2eaa27d4cf821842..c77934beaedb52c020fc1d41aca3686671ab75e2 100755
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU-Scalar-ZC/main.cc
@@ -23,6 +23,39 @@
 #include <parboil.h>
 #include <visc.h>
 
+
+#define NUM_RUNS 10000 
+#define DEPTH 3
+#define HEIGHT 640
+#define WIDTH 480
+std::string input_window = "GPU-CPU(No Vector)-ZC Pipeline - Input Video";
+std::string output_window = "GPU-CPU(No Vector)-ZC Pipeline - Edge Mapping";
+
+
+#ifdef MIDDLE
+  #define POSX_IN     640
+  #define POSY_IN     0
+  #define POSX_OUT    640
+  #define POSY_OUT    540
+
+#elif RIGHT
+  #define POSX_IN     1280
+  #define POSY_IN     0
+  #define POSX_OUT    1280
+  #define POSY_OUT    540
+
+#else // LEFT
+  #define POSX_IN     0
+  #define POSY_IN     0
+  #define POSX_OUT    0
+  #define POSY_OUT    540
+#endif
+
+
+//#define NUM_FRAMES 20
+
+
+
 // Definitions of sizes for edge detection kernels
 
 #define MIN_BR 0.0f
@@ -205,7 +238,6 @@ void laplacianEstimate(float *Is, size_t bytesIs,
                             int m, int n) {
 
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::GPU_TARGET);
   __visc__attributes(2, Is, B, 1, L);
   // 3x3 image area
   float imageArea[SZB][SZB];
@@ -315,7 +347,7 @@ void computeZeroCrossings(float *L, size_t bytesL,
                           float *S, size_t bytesS,
                           int m, int n) {
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::SPIR_TARGET);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, L, B, 1, S);
 
   // 3x3 image area
@@ -525,6 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
                             int m, int n) {
 
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, G, 1, maxG);
 
   void* thisNode = __visc__getNode();
@@ -600,6 +633,7 @@ void computeMaxGradientTB(float *G, size_t bytesG,
                            int m, int n,
                            int block_x) {
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
   void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
   __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
@@ -645,7 +679,8 @@ void rejectZeroCrossings(float *S, size_t bytesS,
                          float *maxG, size_t bytesMaxG,
                          float *E, size_t bytesE,
                          int m, int n) {
-  __visc__hint(visc::CPU_TARGET);
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, S, G, maxG, 1, E);
 
   void* thisNode = __visc__getNode();
@@ -786,18 +821,7 @@ void edgeDetection(float *I, size_t bytesI, // 0
 }
 
 }
-#define NUM_RUNS 1
-#define DEPTH 3
-#define HEIGHT 640
-#define WIDTH 480
-std::string input_window = "GPU Pipeline - Input Video";
-std::string output_window = "GPU Pipeline - Edge Mapping";
-#define POSX_IN     0
-#define POSY_IN     0
-#define POSX_OUT    0
-#define POSY_OUT    540
 
-//#define NUM_FRAMES 20
 using namespace cv;
 
 void getNextFrame(VideoCapture& VC, Mat& F) {
@@ -826,8 +850,6 @@ int main (int argc, char *argv[]) {
 
     size_t I_sz;
     int block_x, grid_x;
-    std::string input_window = "Input Frame";
-    std::string output_window = "Edge Map";
 
     std::cout << "Using OpenCV" << CV_VERSION << "\n";
 
@@ -878,7 +900,7 @@ int main (int argc, char *argv[]) {
     pb_InitializeTimerSet(&timers);
     __visc__init();
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
     // copy A to device memory
     I_sz = src.size[0]*src.size[1]*sizeof(float);
 
@@ -915,17 +937,19 @@ int main (int argc, char *argv[]) {
     grid_x = 1;
 
     // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
-    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
-    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
-    Mat in, out;
     //showInOut(src, E);
+    Mat in, out;
     resize(src, in, Size(HEIGHT, WIDTH));
     resize(E, out, Size(HEIGHT, WIDTH));
     imshow(input_window, in);
     imshow(output_window, out);
     waitKey(0);
+
+    //NUM_FRAMES = 20;
+    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
     packData(args, (float*)src.data, I_sz,
                    (float*)Is.data, I_sz,
                    (float*)L.data, I_sz,
@@ -952,7 +976,9 @@ int main (int argc, char *argv[]) {
 
       cap = VideoCapture(params->inpFiles[0]);
       getNextFrame(cap, src);
+      
       //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
+
       if(NUM_FRAMES >=2) {
         //__visc__push(DFG, args);
         //__visc__push(DFG, args);
@@ -961,6 +987,7 @@ int main (int argc, char *argv[]) {
           args->I = (float*) src.data;
 
           *maxG = 0.0;
+ 
           llvm_visc_track_mem(src.data, I_sz);
           llvm_visc_track_mem(Is.data, I_sz);
           llvm_visc_track_mem(L.data, I_sz);
@@ -972,7 +999,7 @@ int main (int argc, char *argv[]) {
           llvm_visc_track_mem(B, bytesB);
           llvm_visc_track_mem(Sx, bytesSx);
           llvm_visc_track_mem(Sy, bytesSy);
- 
+
           __visc__push(DFG, args);
           __visc__pop(DFG);
 
@@ -981,38 +1008,44 @@ int main (int argc, char *argv[]) {
           //imshow(window_name, E);
           //waitKey(0);
           //llvm_visc_request_mem(src.data, I_sz);
-          llvm_visc_request_mem(Is.data, I_sz);
-          llvm_visc_request_mem(L.data, I_sz);
-          llvm_visc_request_mem(S.data, I_sz);
-          llvm_visc_request_mem(G.data, I_sz);
+          //llvm_visc_request_mem(Is.data, I_sz);
+          //llvm_visc_request_mem(L.data, I_sz);
+          //llvm_visc_request_mem(S.data, I_sz);
+          //llvm_visc_request_mem(G.data, I_sz);
           llvm_visc_request_mem(maxG, bytesMaxG);
           llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "src.data = " << (float*)src.data << "\n";
+          //std::cout << "Is.data = " << (float*)Is.data << "\n";
+          //std::cout << "L.data = " << (float*)L.data << "\n";
+          //std::cout << "S.data = " << (float*)S.data << "\n";
+          //std::cout << "G.data = " << (float*)G.data << "\n";
+          //std::cout << "E.data = " << (float*)E.data << "\n";
           //std::cout << "Max G = " << *maxG << "\n";
           
           Mat in, out;
           resize(src, in, Size(HEIGHT, WIDTH));
-          std::cout << "Show E\n";
+          //std::cout << "Show E\n";
           resize(E, out, Size(HEIGHT, WIDTH));
-          imshow(input_window, in);
-          imshow(output_window, out);
-          waitKey(0);
-          std::cout << "Show Is\n";
-          resize(Is, out, Size(HEIGHT, WIDTH));
-          imshow(output_window, out);
-          waitKey(0);
-          std::cout << "Show L\n";
-          resize(L, out, Size(HEIGHT, WIDTH));
-          imshow(output_window, out);
-          waitKey(0);
-          std::cout << "Show S\n";
-          resize(S, out, Size(HEIGHT, WIDTH));
           imshow(output_window, out);
-          waitKey(0);
-          std::cout << "Show G\n";
-          resize(G, out, Size(HEIGHT, WIDTH));
-          imshow(output_window, out);
-          waitKey(0);
-          getNextFrame(cap, src);
+          imshow(input_window, in);
+          waitKey(1);
+          //waitKey(0);
+          //std::cout << "Show Is\n";
+          //resize(Is, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show L\n";
+          //resize(L, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show S\n";
+          //resize(S, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show G\n";
+          //resize(G, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
 
           llvm_visc_untrack_mem(src.data);
           llvm_visc_untrack_mem(Is.data);
@@ -1026,7 +1059,7 @@ int main (int argc, char *argv[]) {
           llvm_visc_untrack_mem(Sx);
           llvm_visc_untrack_mem(Sy);
 
-
+          getNextFrame(cap, src);
 
         }
         //__visc__pop(DFG);
@@ -1036,7 +1069,8 @@ int main (int argc, char *argv[]) {
           __visc__push(DFG, args);
           __visc__pop(DFG);
       }
-      
+
+     
       __visc__wait(DFG);
     }
 
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/Makefile
index f6c7ebfede0b947aad50dec89b2ecee55c1a36cd..f87cb91102c01826ecf87c2e698822d7caaaef5e 100644
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/Makefile
@@ -4,8 +4,8 @@ LANGUAGE=visc
 SRCDIR_OBJS=io.ll #compute_gold.o
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3 -I/opt/opencv/include
-APP_CXXFLAGS=-ffast-math -O3 -I/opt/opencv/include
+APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
 APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
 
 #OpenCV link flags all
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/main.cc
index 0da2c030aa83aee13481f2c160d2cc99f7acb2e7..8f5c8451b85e2b02c8e4879e53d871002fc3b171 100755
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscGPU/main.cc
@@ -23,6 +23,39 @@
 #include <parboil.h>
 #include <visc.h>
 
+
+#define NUM_RUNS 10000 
+#define DEPTH 3
+#define HEIGHT 640
+#define WIDTH 480
+std::string input_window = "GPU Pipeline - Input Video";
+std::string output_window = "GPU Pipeline - Edge Mapping";
+
+
+#ifdef MIDDLE
+  #define POSX_IN     640
+  #define POSY_IN     0
+  #define POSX_OUT    640
+  #define POSY_OUT    540
+
+#elif RIGHT
+  #define POSX_IN     1280
+  #define POSY_IN     0
+  #define POSX_OUT    1280
+  #define POSY_OUT    540
+
+#else // LEFT
+  #define POSX_IN     0
+  #define POSY_IN     0
+  #define POSX_OUT    0
+  #define POSY_OUT    540
+#endif
+
+
+//#define NUM_FRAMES 20
+
+
+
 // Definitions of sizes for edge detection kernels
 
 #define MIN_BR 0.0f
@@ -205,7 +238,6 @@ void laplacianEstimate(float *Is, size_t bytesIs,
                             int m, int n) {
 
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::GPU_TARGET);
   __visc__attributes(2, Is, B, 1, L);
   // 3x3 image area
   float imageArea[SZB][SZB];
@@ -315,7 +347,7 @@ void computeZeroCrossings(float *L, size_t bytesL,
                           float *S, size_t bytesS,
                           int m, int n) {
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::SPIR_TARGET);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, L, B, 1, S);
 
   // 3x3 image area
@@ -525,6 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
                             int m, int n) {
 
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, G, 1, maxG);
 
   void* thisNode = __visc__getNode();
@@ -600,6 +633,7 @@ void computeMaxGradientTB(float *G, size_t bytesG,
                            int m, int n,
                            int block_x) {
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
   void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
   __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
@@ -646,6 +680,7 @@ void rejectZeroCrossings(float *S, size_t bytesS,
                          float *E, size_t bytesE,
                          int m, int n) {
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, S, G, maxG, 1, E);
 
   void* thisNode = __visc__getNode();
@@ -786,18 +821,7 @@ void edgeDetection(float *I, size_t bytesI, // 0
 }
 
 }
-#define NUM_RUNS 100
-#define DEPTH 3
-#define HEIGHT 640
-#define WIDTH 480
-std::string input_window = "GPU Pipeline - Input Video";
-std::string output_window = "GPU Pipeline - Edge Mapping";
-#define POSX_IN     0
-#define POSY_IN     0
-#define POSX_OUT    0
-#define POSY_OUT    540
 
-//#define NUM_FRAMES 20
 using namespace cv;
 
 void getNextFrame(VideoCapture& VC, Mat& F) {
@@ -826,8 +850,6 @@ int main (int argc, char *argv[]) {
 
     size_t I_sz;
     int block_x, grid_x;
-    std::string input_window = "Input Frame";
-    std::string output_window = "Edge Map";
 
     std::cout << "Using OpenCV" << CV_VERSION << "\n";
 
@@ -878,7 +900,7 @@ int main (int argc, char *argv[]) {
     pb_InitializeTimerSet(&timers);
     __visc__init();
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
     // copy A to device memory
     I_sz = src.size[0]*src.size[1]*sizeof(float);
 
@@ -915,17 +937,19 @@ int main (int argc, char *argv[]) {
     grid_x = 1;
 
     // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
-    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
-    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
-    Mat in, out;
     //showInOut(src, E);
+    Mat in, out;
     resize(src, in, Size(HEIGHT, WIDTH));
     resize(E, out, Size(HEIGHT, WIDTH));
     imshow(input_window, in);
     imshow(output_window, out);
     waitKey(0);
+
+    //NUM_FRAMES = 20;
+    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
     packData(args, (float*)src.data, I_sz,
                    (float*)Is.data, I_sz,
                    (float*)L.data, I_sz,
@@ -952,7 +976,9 @@ int main (int argc, char *argv[]) {
 
       cap = VideoCapture(params->inpFiles[0]);
       getNextFrame(cap, src);
+      
       //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
+
       if(NUM_FRAMES >=2) {
         //__visc__push(DFG, args);
         //__visc__push(DFG, args);
@@ -961,6 +987,7 @@ int main (int argc, char *argv[]) {
           args->I = (float*) src.data;
 
           *maxG = 0.0;
+ 
           llvm_visc_track_mem(src.data, I_sz);
           llvm_visc_track_mem(Is.data, I_sz);
           llvm_visc_track_mem(L.data, I_sz);
@@ -972,7 +999,7 @@ int main (int argc, char *argv[]) {
           llvm_visc_track_mem(B, bytesB);
           llvm_visc_track_mem(Sx, bytesSx);
           llvm_visc_track_mem(Sy, bytesSy);
- 
+
           __visc__push(DFG, args);
           __visc__pop(DFG);
 
@@ -987,15 +1014,38 @@ int main (int argc, char *argv[]) {
           //llvm_visc_request_mem(G.data, I_sz);
           llvm_visc_request_mem(maxG, bytesMaxG);
           llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "src.data = " << (float*)src.data << "\n";
+          //std::cout << "Is.data = " << (float*)Is.data << "\n";
+          //std::cout << "L.data = " << (float*)L.data << "\n";
+          //std::cout << "S.data = " << (float*)S.data << "\n";
+          //std::cout << "G.data = " << (float*)G.data << "\n";
+          //std::cout << "E.data = " << (float*)E.data << "\n";
           //std::cout << "Max G = " << *maxG << "\n";
           
           Mat in, out;
           resize(src, in, Size(HEIGHT, WIDTH));
+          //std::cout << "Show E\n";
           resize(E, out, Size(HEIGHT, WIDTH));
           imshow(output_window, out);
           imshow(input_window, in);
           waitKey(1);
-          getNextFrame(cap, src);
+          //waitKey(0);
+          //std::cout << "Show Is\n";
+          //resize(Is, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show L\n";
+          //resize(L, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show S\n";
+          //resize(S, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show G\n";
+          //resize(G, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
 
           llvm_visc_untrack_mem(src.data);
           llvm_visc_untrack_mem(Is.data);
@@ -1009,7 +1059,7 @@ int main (int argc, char *argv[]) {
           llvm_visc_untrack_mem(Sx);
           llvm_visc_untrack_mem(Sy);
 
-
+          getNextFrame(cap, src);
 
         }
         //__visc__pop(DFG);
@@ -1019,7 +1069,8 @@ int main (int argc, char *argv[]) {
           __visc__push(DFG, args);
           __visc__pop(DFG);
       }
-      
+
+     
       __visc__wait(DFG);
     }
 
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/Makefile
index f6c7ebfede0b947aad50dec89b2ecee55c1a36cd..f87cb91102c01826ecf87c2e698822d7caaaef5e 100644
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/Makefile
@@ -4,8 +4,8 @@ LANGUAGE=visc
 SRCDIR_OBJS=io.ll #compute_gold.o
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3 -I/opt/opencv/include
-APP_CXXFLAGS=-ffast-math -O3 -I/opt/opencv/include
+APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
 APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
 
 #OpenCV link flags all
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/main.cc
index 717255cc9f9f3300566601cdc9a060503487d8b4..6d4b6e4304496ed555b52e630d80ff016a57d8e1 100755
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscScalar/main.cc
@@ -23,6 +23,39 @@
 #include <parboil.h>
 #include <visc.h>
 
+
+#define NUM_RUNS 10000 
+#define DEPTH 3
+#define HEIGHT 640
+#define WIDTH 480
+std::string input_window = "CPU(No vector) Pipeline - Input Video";
+std::string output_window = "CPU(No vector) Pipeline - Edge Mapping";
+
+
+#ifdef MIDDLE
+  #define POSX_IN     640
+  #define POSY_IN     0
+  #define POSX_OUT    640
+  #define POSY_OUT    540
+
+#elif RIGHT
+  #define POSX_IN     1280
+  #define POSY_IN     0
+  #define POSX_OUT    1280
+  #define POSY_OUT    540
+
+#else // LEFT
+  #define POSX_IN     0
+  #define POSY_IN     0
+  #define POSX_OUT    0
+  #define POSY_OUT    540
+#endif
+
+
+//#define NUM_FRAMES 20
+
+
+
 // Definitions of sizes for edge detection kernels
 
 #define MIN_BR 0.0f
@@ -205,7 +238,6 @@ void laplacianEstimate(float *Is, size_t bytesIs,
                             int m, int n) {
 
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::GPU_TARGET);
   __visc__attributes(2, Is, B, 1, L);
   // 3x3 image area
   float imageArea[SZB][SZB];
@@ -315,7 +347,7 @@ void computeZeroCrossings(float *L, size_t bytesL,
                           float *S, size_t bytesS,
                           int m, int n) {
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::SPIR_TARGET);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, L, B, 1, S);
 
   // 3x3 image area
@@ -525,6 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
                             int m, int n) {
 
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, G, 1, maxG);
 
   void* thisNode = __visc__getNode();
@@ -600,6 +633,7 @@ void computeMaxGradientTB(float *G, size_t bytesG,
                            int m, int n,
                            int block_x) {
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
   void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
   __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
@@ -646,6 +680,7 @@ void rejectZeroCrossings(float *S, size_t bytesS,
                          float *E, size_t bytesE,
                          int m, int n) {
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, S, G, maxG, 1, E);
 
   void* thisNode = __visc__getNode();
@@ -786,18 +821,7 @@ void edgeDetection(float *I, size_t bytesI, // 0
 }
 
 }
-#define NUM_RUNS 100
-#define DEPTH 3
-#define HEIGHT 640
-#define WIDTH 480
-std::string input_window = "Scalar Pipeline - Input Video";
-std::string output_window = "Scalar Pipeline - Edge Mapping";
-#define POSX_IN     1280
-#define POSY_IN     0
-#define POSX_OUT    1280
-#define POSY_OUT    540
 
-//#define NUM_FRAMES 20
 using namespace cv;
 
 void getNextFrame(VideoCapture& VC, Mat& F) {
@@ -826,8 +850,6 @@ int main (int argc, char *argv[]) {
 
     size_t I_sz;
     int block_x, grid_x;
-    std::string input_window = "Input Frame";
-    std::string output_window = "Edge Map";
 
     std::cout << "Using OpenCV" << CV_VERSION << "\n";
 
@@ -878,7 +900,7 @@ int main (int argc, char *argv[]) {
     pb_InitializeTimerSet(&timers);
     __visc__init();
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
     // copy A to device memory
     I_sz = src.size[0]*src.size[1]*sizeof(float);
 
@@ -915,17 +937,19 @@ int main (int argc, char *argv[]) {
     grid_x = 1;
 
     // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
-    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
-    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
-    Mat in, out;
     //showInOut(src, E);
+    Mat in, out;
     resize(src, in, Size(HEIGHT, WIDTH));
     resize(E, out, Size(HEIGHT, WIDTH));
     imshow(input_window, in);
     imshow(output_window, out);
     waitKey(0);
+
+    //NUM_FRAMES = 20;
+    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
     packData(args, (float*)src.data, I_sz,
                    (float*)Is.data, I_sz,
                    (float*)L.data, I_sz,
@@ -952,7 +976,9 @@ int main (int argc, char *argv[]) {
 
       cap = VideoCapture(params->inpFiles[0]);
       getNextFrame(cap, src);
+      
       //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
+
       if(NUM_FRAMES >=2) {
         //__visc__push(DFG, args);
         //__visc__push(DFG, args);
@@ -961,6 +987,7 @@ int main (int argc, char *argv[]) {
           args->I = (float*) src.data;
 
           *maxG = 0.0;
+ 
           llvm_visc_track_mem(src.data, I_sz);
           llvm_visc_track_mem(Is.data, I_sz);
           llvm_visc_track_mem(L.data, I_sz);
@@ -972,7 +999,7 @@ int main (int argc, char *argv[]) {
           llvm_visc_track_mem(B, bytesB);
           llvm_visc_track_mem(Sx, bytesSx);
           llvm_visc_track_mem(Sy, bytesSy);
- 
+
           __visc__push(DFG, args);
           __visc__pop(DFG);
 
@@ -987,15 +1014,38 @@ int main (int argc, char *argv[]) {
           //llvm_visc_request_mem(G.data, I_sz);
           llvm_visc_request_mem(maxG, bytesMaxG);
           llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "src.data = " << (float*)src.data << "\n";
+          //std::cout << "Is.data = " << (float*)Is.data << "\n";
+          //std::cout << "L.data = " << (float*)L.data << "\n";
+          //std::cout << "S.data = " << (float*)S.data << "\n";
+          //std::cout << "G.data = " << (float*)G.data << "\n";
+          //std::cout << "E.data = " << (float*)E.data << "\n";
           //std::cout << "Max G = " << *maxG << "\n";
           
           Mat in, out;
           resize(src, in, Size(HEIGHT, WIDTH));
+          //std::cout << "Show E\n";
           resize(E, out, Size(HEIGHT, WIDTH));
           imshow(output_window, out);
           imshow(input_window, in);
           waitKey(1);
-          getNextFrame(cap, src);
+          //waitKey(0);
+          //std::cout << "Show Is\n";
+          //resize(Is, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show L\n";
+          //resize(L, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show S\n";
+          //resize(S, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show G\n";
+          //resize(G, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
 
           llvm_visc_untrack_mem(src.data);
           llvm_visc_untrack_mem(Is.data);
@@ -1009,7 +1059,7 @@ int main (int argc, char *argv[]) {
           llvm_visc_untrack_mem(Sx);
           llvm_visc_untrack_mem(Sy);
 
-
+          getNextFrame(cap, src);
 
         }
         //__visc__pop(DFG);
@@ -1019,7 +1069,8 @@ int main (int argc, char *argv[]) {
           __visc__push(DFG, args);
           __visc__pop(DFG);
       }
-      
+
+     
       __visc__wait(DFG);
     }
 
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f87cb91102c01826ecf87c2e698822d7caaaef5e
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/Makefile
@@ -0,0 +1,12 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=visc
+SRCDIR_OBJS=io.ll #compute_gold.o
+VISC_OBJS=main.visc.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
+
+#OpenCV link flags all
+#/usr/bin/c++   -std=c++0x   CMakeFiles/EdgeDetect.dir/EdgeDetect.cpp.o  -o EdgeDetect  -L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_cudev.so.3.0.0 /opt/opencv/lib/libopencv_cudawarping.so.3.0.0 /opt/opencv/lib/libopencv_cudastereo.so.3.0.0 /opt/opencv/lib/libopencv_cudaoptflow.so.3.0.0 /opt/opencv/lib/libopencv_cudaobjdetect.so.3.0.0 /opt/opencv/lib/libopencv_cudalegacy.so.3.0.0 /opt/opencv/lib/libopencv_cudaimgproc.so.3.0.0 /opt/opencv/lib/libopencv_cudafilters.so.3.0.0 /opt/opencv/lib/libopencv_cudafeatures2d.so.3.0.0 /opt/opencv/lib/libopencv_cudacodec.so.3.0.0 /opt/opencv/lib/libopencv_cudabgsegm.so.3.0.0 /opt/opencv/lib/libopencv_cudaarithm.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -lcudart -lnpp -lcufft -lcudart -lnpp -lcufft -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/io.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/io.cc
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+char* readFile(const char* fileName)
+{
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	f.seekg(0,std::ios::end);
+	int length = f.tellg();
+	f.seekg(0,std::ios::beg);
+
+	char* buffer;
+
+	if(length>0)
+	{
+		buffer = new char[length];
+		f.read(buffer,length);
+		buffer[length-1]=0;
+	}
+	else
+	{
+		buffer = new char;
+		buffer[0] = 0;
+	}
+	
+	f.close();
+
+	return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f >> nr_row;
+  f >> nr_col;
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  while (f.good() ) {
+    f >> data;
+    v.push_back(data);
+  }
+  v.pop_back(); // remove the duplicated last element
+  return true;
+
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  for (int i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/main.cc
new file mode 100755
index 0000000000000000000000000000000000000000..22197ad6fb4905660fedef106210d27f31543709
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-MaxG/main.cc
@@ -0,0 +1,1117 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include "opencv2/opencv.hpp"
+#include "opencv2/core/ocl.hpp"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <iostream>
+#include <cassert>
+#include <parboil.h>
+#include <visc.h>
+
+
+#define NUM_RUNS 10000 
+#define DEPTH 3
+#define HEIGHT 640
+#define WIDTH 480
+std::string input_window = "Vector-CPU(No Vector)-MaxG Pipeline - Input Video";
+std::string output_window = "Vector-CPU(No Vector)-MaxG Pipeline - Edge Mapping";
+
+
+#ifdef MIDDLE
+  #define POSX_IN     640
+  #define POSY_IN     0
+  #define POSX_OUT    640
+  #define POSY_OUT    540
+
+#elif RIGHT
+  #define POSX_IN     1280
+  #define POSY_IN     0
+  #define POSX_OUT    1280
+  #define POSY_OUT    540
+
+#else // LEFT
+  #define POSX_IN     0
+  #define POSY_IN     0
+  #define POSX_OUT    0
+  #define POSY_OUT    540
+#endif
+
+
+//#define NUM_FRAMES 20
+
+
+
+// Definitions of sizes for edge detection kernels
+
+#define MIN_BR 0.0f
+#define MAX_BR 1.0f
+
+// Code needs to be changed for this to vary
+#define SZB 3
+
+#define REDUCTION_TILE_SZ 1024
+
+#define _MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define _MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+
+extern "C" {
+
+struct __attribute__((__packed__)) InStruct {
+  float* I ;
+  size_t bytesI;
+  float* Is ;
+  size_t bytesIs;
+  float* L;
+  size_t bytesL;
+  float* S;
+  size_t bytesS;
+  float* G;
+  size_t bytesG;
+  float* maxG;
+  size_t bytesMaxG;
+  float* E;
+  size_t bytesE;
+  float* Gs;
+  size_t bytesGs;
+  float* B;
+  size_t bytesB;
+  float* Sx;
+  size_t bytesSx;
+  float* Sy;
+  size_t bytesSy;
+  int m;
+  int n;
+  int block_x;
+  int grid_x;
+};
+
+
+void packData(struct InStruct* args, float* I, size_t bytesI,
+                                     float* Is, size_t bytesIs,
+                                     float* L, size_t bytesL,
+                                     float* S, size_t bytesS,
+                                     float* G, size_t bytesG,
+                                     float* maxG, size_t bytesMaxG,
+                                     float* E, size_t bytesE,
+                                     float* Gs, size_t bytesGs,
+                                     float* B, size_t bytesB,
+                                     float* Sx, size_t bytesSx,
+                                     float* Sy, size_t bytesSy,
+                                     int m, int n,
+                                     int block_x, int grid_x) {
+  args->I = I;
+  args->bytesI = bytesI;
+  args->Is = Is;
+  args->bytesIs = bytesIs;
+  args->L = L;
+  args->bytesL = bytesL;
+  args->S = S;
+  args->bytesS = bytesS;
+  args->G = G;
+  args->bytesG = bytesG;
+  args->maxG = maxG;
+  args->bytesMaxG = bytesMaxG;
+  args->E = E;
+  args->bytesE = bytesE;
+  args->Gs = Gs;
+  args->bytesGs = bytesGs;
+  args->B = B;
+  args->bytesB = bytesB;
+  args->Sx = Sx;
+  args->bytesSx = bytesSx;
+  args->Sy = Sy;
+  args->bytesSy = bytesSy;
+  args->m = m;
+  args->n = n;
+  args->block_x = block_x;
+  args->grid_x = grid_x;
+}
+
+/* 
+ * Gaussian smoothing of image I of size m x n
+ * I : input image
+ * Gs : gaussian filter
+ * Is: output (smoothed image)
+ * m, n : dimensions
+ * 
+ * Need 2D grid, a thread per pixel
+ * No use of separable algorithm because we need to do this in one kernel
+ * No use of shared memory because
+ * - we don't handle it in the X86 pass
+ */
+
+#define GAUSSIAN_SIZE 7
+#define GAUSSIAN_RADIUS (GAUSSIAN_SIZE / 2)
+void gaussianSmoothing(float *I, size_t bytesI,
+                       float *Gs, size_t bytesGs,
+                       float *Is, size_t bytesIs,
+                       int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, I, Gs, 1, Is);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  int gloc = gx + gy*n;
+
+  float smoothedVal = 0;
+  float gval;
+  int loadOffset;
+
+  if ((gx < n) && (gy < m)) {
+    for (int i = -GAUSSIAN_RADIUS; i <= GAUSSIAN_RADIUS; i++)
+      for (int j = -GAUSSIAN_RADIUS; j <= GAUSSIAN_RADIUS; j++) {
+
+        loadOffset = gloc + i*n + j;
+      
+        if ((gy + i) < 0) // top contour
+          loadOffset = gx + j;
+        else if ((gy + i) > m-1 ) // bottom contour
+          loadOffset = (m-1)*n + gx + j;
+        else 
+          loadOffset = gloc + i*n + j; // within image vertically
+
+        // Adjust so we are within image horizonally
+        if ((gx + j) < 0) // left contour
+          loadOffset -= (gx+j);
+        else if ((gx + j) > n-1 ) // right contour
+          loadOffset = loadOffset - gx - j + n - 1;
+
+        gval = I[loadOffset];
+        smoothedVal += gval * Gs[(GAUSSIAN_RADIUS + i)*GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j];
+      }
+  
+    Is[gloc] = smoothedVal;
+  }
+  __visc__return(m, n);
+}
+
+void WrapperGaussianSmoothing(float *I, size_t bytesI,
+                       float *Gs, size_t bytesGs,
+                       float *Is, size_t bytesIs,
+                       int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, I, Gs, 1, Is);
+  void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n);
+  __visc__bindIn(GSNode, 0, 0, 0); // Bind I
+  __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI
+  __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs
+  __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs
+  __visc__bindIn(GSNode, 4, 4, 0); // Bind Is
+  __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs
+  __visc__bindIn(GSNode, 6, 6, 0); // Bind m
+  __visc__bindIn(GSNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(GSNode, 0, 0, 0); // bind output m
+  __visc__bindOut(GSNode, 1, 1, 0); // bind output n
+}
+
+
+/* Compute a non-linear laplacian estimate of input image I of size m x n */
+/*
+ * Is   : blurred imput image
+ * m, n : dimensions
+ * B    : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0])
+ * L    : output (laplacian of the image)
+ * Need 2D grid, a thread per pixel
+*/
+void laplacianEstimate(float *Is, size_t bytesIs,
+                            float *B, size_t bytesB,
+                            float *L, size_t bytesL,
+                            int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, Is, B, 1, L);
+  // 3x3 image area
+  float imageArea[SZB][SZB];
+
+  //int gx = get_global_id(0);
+  //int gy = get_global_id(1);
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+  //if(gx == 0 && gy == 0)
+    //std::cout << "Entered laplacian\n";
+
+  int i, j;
+
+  if ((gx < n) && (gy < m)) {
+    // Data copy for dilation filter
+    imageArea[1][1] = Is[gy * n + gx];
+
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
+    } else {
+      imageArea[1][0] = Is[gy * n + gx - 1];
+      imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
+      imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
+    } else {
+      imageArea[1][2] = Is[gy * n + gx + 1];
+      imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
+      imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
+    }
+
+    imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
+    imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
+
+    // Compute pixel of dilated image
+    float dilatedPixel = MIN_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    // Data copy for erotion filter - only change the boundary conditions
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][0] = MAX_BR;
+      if (gy == m-1) imageArea[2][0] = MAX_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][2] = MAX_BR;
+      if (gy == m-1) imageArea[2][2] = MAX_BR;
+    }
+
+    if (gy == 0) imageArea[0][1] = MAX_BR;
+    if (gy == m-1) imageArea[2][1] = MAX_BR;
+
+    // Compute pixel of eroded image
+    float erodedPixel = MAX_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1];
+    L[gy*n+gx] = laplacian;
+  }
+  //OutStruct output = {bytesB, bytesL};
+  //if(gx == m-1 && gy == n-1)
+    //std::cout << "Exit laplacian\n";
+  __visc__return(m);
+}
+
+void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
+                          float *B, size_t bytesB,
+                          float *L, size_t bytesL,
+                          int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, Is, B, 1, L);
+  void* LNode = __visc__createNode2D(laplacianEstimate, m, n);
+  __visc__bindIn(LNode, 0, 0, 0); // Bind Is
+  __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs
+  __visc__bindIn(LNode, 2, 2, 0); // Bind B
+  __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB
+  __visc__bindIn(LNode, 4, 4, 0); // Bind L
+  __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL
+  __visc__bindIn(LNode, 6, 6, 0); // Bind m
+  __visc__bindIn(LNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(LNode, 0, 0, 0); // bind output m
+
+}
+
+/* Compute the zero crossings of input image L of size m x n */
+/*
+ * L    : imput image (computed Laplacian)
+ * m, n : dimensions
+ * B    : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0])
+ * S    : output (sign of the image)
+ * Need 2D grid, a thread per pixel
+ */
+void computeZeroCrossings(float *L, size_t bytesL,
+                          float *B, size_t bytesB,
+                          float *S, size_t bytesS,
+                          int m, int n) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, L, B, 1, S);
+
+  // 3x3 image area
+  float imageArea[SZB][SZB];
+
+  //int gx = get_global_id(0);
+  //int gy = get_global_id(1);
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+  int i, j;
+
+  //if(gx == 0 && gy == 0)
+    //std::cout << "Entered ZC\n";
+  if ((gx < n) && (gy < m)) {
+    // Data copy for dilation filter
+    imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR;
+
+    if (gx == 0) { // left most line
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
+    } else { 
+      imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR;
+      imageArea[0][0] = (gy > 0) ?
+                            (L[(gy - 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+      imageArea[2][0] = (gy < m - 1) ?
+                            (L[(gy + 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
+    } else {
+      imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR;
+      imageArea[0][2] = (gy > 0) ?
+                            (L[(gy - 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+      imageArea[2][2] = (gy < m - 1) ?
+                            (L[(gy + 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+    }
+
+    imageArea[0][1] = (gy > 0) ?
+                          (L[(gy - 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR)
+                          : MIN_BR;
+    imageArea[2][1] = (gy < m - 1)?
+                          (L[(gy + 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR)
+                          : MIN_BR;
+
+    // Compute pixel of dilated image
+    float dilatedPixel = MIN_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    // Data copy for erotion filter - only change the boundary conditions
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][0] = MAX_BR;
+      if (gy == m-1) imageArea[2][0] = MAX_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][2] = MAX_BR;
+      if (gy == m-1) imageArea[2][2] = MAX_BR;
+    }
+
+    if (gy == 0) imageArea[0][1] = MAX_BR;
+    if (gy == m-1) imageArea[2][1] = MAX_BR;
+
+    // Compute pixel of eroded image
+    float erodedPixel = MAX_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    float pixelSign = dilatedPixel - erodedPixel;
+    S[gy*n+gx] = pixelSign;
+  }
+  //OutStruct output = {bytesB, bytesS};
+  //if(gx == n-1 && gy == n-1)
+    //std::cout << "Exit ZC\n";
+  __visc__return(m); 
+}
+
+void WrapperComputeZeroCrossings(float *L, size_t bytesL,
+                          float *B, size_t bytesB,
+                          float *S, size_t bytesS,
+                          int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, L, B, 1, S);
+  void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n);
+  __visc__bindIn(ZCNode, 0, 0, 0); // Bind L
+  __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
+  __visc__bindIn(ZCNode, 2, 2, 0); // Bind B
+  __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB
+  __visc__bindIn(ZCNode, 4, 4, 0); // Bind S
+  __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS
+  __visc__bindIn(ZCNode, 6, 6, 0); // Bind m
+  __visc__bindIn(ZCNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(ZCNode, 0, 0, 0); // bind output m
+
+}
+
+/*
+ * Gradient computation using Sobel filters
+ * Is   : input (smoothed image)
+ * Sx, Sy: Sobel operators
+ * - Sx = [-1  0  1 ; -2 0 2 ; -1 0 1 ]
+ * - Sy = [-1 -2 -1 ;  0 0 0 ;  1 2 1 ]
+ * m, n : dimensions
+ * G: output, gradient magnitude : sqrt(Gx^2+Gy^2)
+ * Need 2D grid, a thread per pixel
+ * No use of separable algorithm because we need to do this in one kernel
+ * No use of shared memory because
+ * - we don't handle it in the X86 pass
+ */
+
+#define SOBEL_SIZE 3
+#define SOBEL_RADIUS (SOBEL_SIZE / 2)
+
+void computeGradient(float *Is, size_t bytesIs,
+              float *Sx, size_t bytesSx,
+              float *Sy, size_t bytesSy,
+              float *G, size_t bytesG,
+              int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(3, Is, Sx, Sy, 1, G);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  int gloc = gx + gy*n;
+
+  float Gx = 0;
+  float Gy = 0;
+  float gval;
+  int loadOffset;
+
+  if ((gx < n) && (gy < m)) {
+    for (int i = -SOBEL_RADIUS; i <= SOBEL_RADIUS; i++)
+      for (int j = -SOBEL_RADIUS; j <= SOBEL_RADIUS; j++) {
+
+        loadOffset = gloc + i*n + j;
+      
+        if ((gy + i) < 0) // top contour
+          loadOffset = gx + j;
+        else if ((gy + i) > m-1 ) // bottom contour
+          loadOffset = (m-1)*n + gx + j;
+        else 
+          loadOffset = gloc + i*n + j; // within image vertically
+
+        // Adjust so we are within image horizonally
+        if ((gx + j) < 0) // left contour
+          loadOffset -= (gx+j);
+        else if ((gx + j) > n-1 ) // right contour
+          loadOffset = loadOffset - gx - j + n - 1;
+
+        gval = Is[loadOffset];
+        Gx += gval * Sx[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
+        Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
+      }
+
+    G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy);
+    //G[gloc] = Gx*Gx + Gy*Gy;
+  }
+  __visc__return(n);
+}
+
+void WrapperComputeGradient(float *Is, size_t bytesIs,
+                                 float *Sx, size_t bytesSx,
+                                 float *Sy, size_t bytesSy,
+                                 float *G, size_t bytesG,
+                                 int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, Is, Sx, Sy, 1, G);
+  void* CGNode = __visc__createNode2D(computeGradient, m, n);
+  __visc__bindIn(CGNode, 0, 0, 0); // Bind Is
+  __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
+  __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx
+  __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx
+  __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy
+  __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy
+  __visc__bindIn(CGNode, 6, 6, 0); // Bind G
+  __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG
+  __visc__bindIn(CGNode, 8, 8, 0); // Bind m
+  __visc__bindIn(CGNode, 9, 9, 0); // Bind n
+
+  __visc__bindOut(CGNode, 0, 0, 0); // bind output m
+}
+
+/* 
+ * Reduction
+ * G : input
+ * maxG: output
+ * m, n: input size
+ * Needs a single thread block
+ */
+void computeMaxGradientLeaf(float *G, size_t bytesG,
+                            float *maxG, size_t bytesMaxG,
+                            int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(1, G, 1, maxG);
+
+  void* thisNode = __visc__getNode();
+
+  int lx = __visc__getNodeInstanceID_x(thisNode);       // threadIdx.x
+  int dimx = __visc__getNumNodeInstances_x(thisNode);   // blockDim.x
+
+
+  // Assume a single thread block
+  // Thread block iterates over all elements
+  for (int i = lx + dimx; i < m*n; i+= dimx) {
+    if (G[lx] < G[i])
+      G[lx] = G[i];
+  }
+
+  // First thread iterates over all elements of the thread block
+  if (lx == 0) {
+    for (int i = 1; (i < dimx) && (i < m*n); i++)
+      if (G[lx] < G[i])
+        G[lx] = G[i];
+
+    *maxG = G[lx];
+  }
+
+  __visc__return(n);
+}
+
+/* 
+ * Reduction
+ * G : input
+ * maxG: output
+ * Each static node processes 2*nodeDim elements
+ * Need 1D grid, a thread per 2 pixels
+ */
+//void computeMaxGradientLeaf(float *G, size_t bytesG,
+                            //float *maxG, size_t bytesMaxG,
+                            //int m, int n) {
+
+  //__visc__hint(visc::DEVICE);
+  //TODO: maxG should be initialized to zero (MIN_BR) every time
+  //__visc__attributes(2, G, maxG, 1, maxG);
+
+  //void* thisNode = __visc__getNode();
+  //void* parentNode = __visc__getParentNode(thisNode);
+
+  //int lx = __visc__getNodeInstanceID_x(thisNode);
+  //int px = __visc__getNodeInstanceID_x(parentNode);
+  //int dimx = __visc__getNumNodeInstances_x(thisNode);
+
+  //int gid = lx + 2*px*dimx;
+  
+  //for (unsigned stride = dimx; stride > 32; stride >>= 1) {
+    //if ((gid + stride < m*n) && (lx < stride))
+      //if (G[gid + stride] > G[gid])
+        //G[gid] = G[gid + stride];
+    //__visc__barrier();
+  //}
+
+  //for (unsigned stride = 32; stride >= 1; stride >>= 1) {
+    //if ((gid + stride < m*n) && (lx < stride))
+      //if (G[gid + stride] > G[gid])
+        //G[gid] = G[gid + stride];
+  //}
+
+  //if (lx == 0)
+    //__visc__atomic_max(maxG,G[gid]);
+
+  //__visc__return(m);
+//}
+
+void computeMaxGradientTB(float *G, size_t bytesG,
+                           float *maxG, size_t bytesMaxG,
+                           int m, int n,
+                           int block_x) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, G, maxG, 1, maxG);
+  void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
+  __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
+  __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
+  __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
+  __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG
+  __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
+  __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
+
+  __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output m
+}
+
+void WrapperComputeMaxGradient(float *G, size_t bytesG,
+                           float *maxG, size_t bytesMaxG,
+                           int m, int n,
+                           int block_x, int grid_x) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, G, maxG, 1, maxG);
+  void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x);
+  __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G
+  __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
+  __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
+  __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG
+  __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m
+  __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n
+  __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
+
+  __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output m
+}
+
+/* Reject the zero crossings where the gradient is below a threshold */
+/*
+ * S    : input (computed zero crossings)
+ * m, n : dimensions
+ * G: gradient of (smoothed) image
+ * E    : output (edges of the image)
+ * Need 2D grid, a thread per pixel
+ */
+
+#define THETA 0.1
+void rejectZeroCrossings(float *S, size_t bytesS,
+                         float *G, size_t bytesG,
+                         float *maxG, size_t bytesMaxG,
+                         float *E, size_t bytesE,
+                         int m, int n) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, S, G, maxG, 1, E);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  float mG = *maxG;
+  //float mG = 1.39203;
+  if ((gx < n) && (gy < m)) {
+    E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ;
+  }
+  __visc__return(m);
+}
+
+void WrapperRejectZeroCrossings(float *S, size_t bytesS,
+                         float *G, size_t bytesG,
+                         float *maxG, size_t bytesMaxG,
+                         float *E, size_t bytesE,
+                         int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, S, G, maxG, 1, E);
+  void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n);
+  __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S
+  __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS
+  __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G
+  __visc__bindIn(RZCNode, 3, 3 , 0); // Bind bytesG
+  __visc__bindIn(RZCNode, 4, 4 , 0); // Bind maxG
+  __visc__bindIn(RZCNode, 5, 5 , 0); // Bind bytesMaxG
+  __visc__bindIn(RZCNode, 6, 6 , 0); // Bind E
+  __visc__bindIn(RZCNode, 7, 7 , 0); // Bind bytesE
+  __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m
+  __visc__bindIn(RZCNode, 9, 9, 0); // Bind n
+
+  __visc__bindOut(RZCNode, 0, 0, 0); // bind output m
+}
+
+
+
+// Pipelined Root node
+void edgeDetection(float *I, size_t bytesI, // 0
+                   float *Is, size_t bytesIs, // 2
+                   float *L, size_t bytesL, // 4
+                   float *S, size_t bytesS, // 6
+                   float *G, size_t bytesG, // 8
+                   float *maxG, size_t bytesMaxG, // 10
+                   float *E, size_t bytesE, // 12
+                   float *Gs, size_t bytesGs, // 14
+                   float *B, size_t bytesB, // 16
+                   float *Sx, size_t bytesSx, // 18
+                   float *Sy, size_t bytesSy, // 20
+                   int m, // 22
+                   int n, // 23
+                   int block_x, // 24
+                   int grid_x // 25
+                   ) { 
+  __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
+  __visc__hint(visc::CPU_TARGET);
+  void* GSNode = __visc__createNode(WrapperGaussianSmoothing);
+  void* LNode = __visc__createNode(WrapperlaplacianEstimate);
+  void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings);
+  void* CGNode = __visc__createNode(WrapperComputeGradient);
+  void* CMGNode = __visc__createNode(WrapperComputeMaxGradient);
+  void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings);
+
+  // Gaussian Inputs
+  __visc__bindIn(GSNode, 0 , 0, 1); // Bind I
+  __visc__bindIn(GSNode, 1 , 1, 1); // Bind bytesI
+  __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs
+  __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs
+  __visc__bindIn(GSNode, 2 , 4, 1); // Bind Is
+  __visc__bindIn(GSNode, 3 , 5, 1); // Bind bytesIs
+  __visc__bindIn(GSNode, 22, 6, 1); // Bind m
+  __visc__bindIn(GSNode, 23, 7, 1); // Bind n
+
+  // Laplacian Inputs
+  __visc__bindIn(LNode, 2 , 0, 1); // Bind Is
+  __visc__bindIn(LNode, 3 , 1, 1); // Bind bytesIs
+  __visc__bindIn(LNode, 16, 2, 1); // Bind B
+  __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB
+  __visc__bindIn(LNode, 4 , 4, 1); // Bind L
+  __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL
+//  __visc__bindIn(LNode, 22, 6, 1); // Bind m
+  __visc__edge(GSNode, LNode, 0, 6, 1); // Get m
+  __visc__bindIn(LNode, 23, 7, 1); // Bind n
+
+  // Compute ZC Inputs
+  __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L
+  __visc__bindIn(CZCNode, 5 , 1, 1); // Bind bytesL
+  __visc__bindIn(CZCNode, 16, 2, 1); // Bind B
+  __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB
+  __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S
+  __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS
+  //__visc__bindIn(CZCNode, 22, 6, 1); // Bind m
+  __visc__edge(LNode, CZCNode, 0, 6, 1); // Get m
+  __visc__bindIn(CZCNode, 23, 7, 1); // Bind n
+
+  // Gradient Inputs
+  __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is
+  __visc__bindIn(CGNode, 3 , 1, 1); // Bind bytesIs
+  __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx
+  __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx
+  __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy
+  __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy
+  __visc__bindIn(CGNode, 8 , 6, 1); // Bind G
+  __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG
+ __visc__bindIn(CGNode, 22, 8, 1); // Bind m
+  //__visc__edge(CZCNode, CGNode, 0, 8, 1); // Get m
+  //__visc__bindIn(CGNode, 23, 9, 1); // Bind n
+  __visc__edge(GSNode, CGNode, 1, 9, 1); // Get n
+
+  // Max Gradient Inputs
+  __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G
+  __visc__bindIn(CMGNode, 9 , 1, 1); // Bind bytesG
+  __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG
+  __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG
+ __visc__bindIn(CMGNode, 22, 4, 1);  // Bind m
+  //__visc__edge(CGNode, CMGNode, 0, 4, 1); // Get m
+  //__visc__bindIn(CMGNode, 23, 5, 1); // Bind n
+  __visc__edge(CGNode, CMGNode, 0, 5, 1); // Get n
+  __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x
+  __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x
+
+  // Reject ZC Inputs
+  __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S
+  __visc__bindIn(RZCNode, 7 , 1, 1); // Bind bytesS
+  __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G
+  __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG
+  __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG
+  __visc__bindIn(RZCNode, 11, 5, 1); // Bind bytesMaxG
+  __visc__bindIn(RZCNode, 12, 6, 1); // Bind E
+  __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE
+  //__visc__bindIn(RZCNode, 22, 8, 1); // Bind m
+  __visc__edge(CZCNode, RZCNode, 0, 8, 1); // Get m
+  //__visc__bindIn(RZCNode, 23, 9, 1); // Bind n
+  __visc__edge(CMGNode, RZCNode, 0, 9, 1); // Get n
+
+  __visc__bindOut(RZCNode, 0, 0, 1); // dummy bind output to get pipeline functionality
+}
+
+}
+
+using namespace cv;
+
+void getNextFrame(VideoCapture& VC, Mat& F) {
+  VC >> F;
+  /// Convert the image to grayscale if image colored
+  if(F.channels() == 3)
+    cvtColor( F, F, CV_BGR2GRAY );
+
+  F.convertTo(F, CV_32F, 1.0/255.0);
+
+}
+
+//void showInOut(Mat& Input, Mat& Output) {
+    //Mat in, out;
+    //resize(Input, in, Size(512, 768));
+    //resize(Output, out, Size(512, 768));
+    //imshow(input_window, in);
+    //imshow(output_window, out);
+//}
+
+
+int main (int argc, char *argv[]) {
+
+    struct pb_Parameters *params;
+    struct pb_TimerSet timers;
+
+    size_t I_sz;
+    int block_x, grid_x;
+
+    std::cout << "Using OpenCV" << CV_VERSION << "\n";
+
+    /* Read command line. Expect 3 inputs: A, B and B^T
+       in column-major layout*/
+    params = pb_ReadParameters(&argc, argv);
+    if ((params->inpFiles[0] == NULL)
+            || (params->inpFiles[1] != NULL))
+    {
+        fprintf(stderr, "Expecting input image filename\n");
+        exit(-1);
+    }
+
+    /* Read in data */
+    std::cout << "Reading video file: " << params->inpFiles[0] << "\n";
+    VideoCapture cap(params->inpFiles[0]);
+    if(!cap.isOpened()) {
+      std::cout << "Could not open video file" << "\n";
+      return -1;
+    }
+
+    int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT);
+    std::cout << "Number of frames = " << NUM_FRAMES << "\n";
+    namedWindow(input_window, CV_WINDOW_AUTOSIZE);
+    namedWindow(output_window, CV_WINDOW_AUTOSIZE);
+    moveWindow(input_window, POSX_IN, POSY_IN);
+    moveWindow(output_window, POSX_OUT, POSY_OUT);
+
+    Mat src, Is, L, S, G, E;
+
+    getNextFrame(cap, src);
+
+    std::cout << "Image dimension = " << src.size() << "\n";
+    if(!src.isContinuous()) {
+      std::cout << "Expecting contiguous storage of image in memory!\n";
+      exit(-1);
+    }
+
+    Is = Mat(src.size[0], src.size[1], CV_32F);
+    L = Mat(src.size[0], src.size[1], CV_32F);
+    S = Mat(src.size[0], src.size[1], CV_32F);
+    G = Mat(src.size[0], src.size[1], CV_32F);
+    E = Mat(src.size[0], src.size[1], CV_32F);
+
+    // All these matrices need to have their data array contiguous in memory
+    assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous());
+
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    // copy A to device memory
+    I_sz = src.size[0]*src.size[1]*sizeof(float);
+
+    size_t bytesMaxG = sizeof(float);
+    float* maxG = (float*)malloc(bytesMaxG);
+
+    float B[] = { 1, 1, 1,
+                  1, 1, 1,
+                  1, 1, 1 };
+    size_t bytesB = 9*sizeof(float);
+    //Sx = [-1  0  1 ; -2 0 2 ; -1 0 1 ]
+    //Sy = [-1 -2 -1 ;  0 0 0 ;  1 2 1 ]
+    float Sx[] = {  -1, 0, 1,
+                    -2, 0, 2,
+                    -1, 0, 1  };
+    size_t bytesSx = 9*sizeof(float);
+    float Sy[] = {  -1, -2, -1,
+                     0,  0,  0,
+                     1,  2,  1  };
+    size_t bytesSy = 9*sizeof(float);
+
+    float Gs [] = {
+            0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036,
+            0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
+            0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
+            0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291,
+            0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
+            0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
+            0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036 };
+    size_t bytesGs = 7*7*sizeof(float);
+
+    block_x = 256;
+    // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs
+    grid_x = 1;
+
+    // Copy A and B^T into device memory
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    //showInOut(src, E);
+    Mat in, out;
+    resize(src, in, Size(HEIGHT, WIDTH));
+    resize(E, out, Size(HEIGHT, WIDTH));
+    imshow(input_window, in);
+    imshow(output_window, out);
+    waitKey(0);
+
+    //NUM_FRAMES = 20;
+    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
+    packData(args, (float*)src.data, I_sz,
+                   (float*)Is.data, I_sz,
+                   (float*)L.data, I_sz,
+                   (float*)S.data, I_sz,
+                   (float*)G.data, I_sz,
+                   maxG, bytesMaxG,
+                   (float*)E.data, I_sz,
+                   Gs, bytesGs,
+                   B, bytesB,
+                   Sx, bytesSx,
+                   Sy, bytesSy,
+                   src.size[0], src.size[1],
+                   block_x, grid_x);
+
+    // Check if the total elements is a multiple of block size
+    assert(src.size[0]*src.size[1] % block_x == 0);
+
+    //imshow(input_window, src);
+    //imshow(output_window, E);
+    //waitKey(0);
+    for(unsigned j=0; j<NUM_RUNS; j++) {
+      std::cout << "Run: " << j << "\n";
+      void* DFG = __visc__launch(1, edgeDetection, (void*)args);
+
+      cap = VideoCapture(params->inpFiles[0]);
+      getNextFrame(cap, src);
+      
+      //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
+
+      if(NUM_FRAMES >=2) {
+        //__visc__push(DFG, args);
+        //__visc__push(DFG, args);
+        for(int i=0; i<NUM_FRAMES; i++) {
+          //std::cout << "Frame " << i << "\n";
+          args->I = (float*) src.data;
+
+          *maxG = 0.0;
+ 
+          llvm_visc_track_mem(src.data, I_sz);
+          llvm_visc_track_mem(Is.data, I_sz);
+          llvm_visc_track_mem(L.data, I_sz);
+          llvm_visc_track_mem(S.data, I_sz);
+          llvm_visc_track_mem(G.data, I_sz);
+          llvm_visc_track_mem(maxG, bytesMaxG);
+          llvm_visc_track_mem(E.data, I_sz);
+          llvm_visc_track_mem(Gs, bytesGs);
+          llvm_visc_track_mem(B, bytesB);
+          llvm_visc_track_mem(Sx, bytesSx);
+          llvm_visc_track_mem(Sy, bytesSy);
+
+          __visc__push(DFG, args);
+          __visc__pop(DFG);
+
+          //llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "Show E" << "\n";
+          //imshow(window_name, E);
+          //waitKey(0);
+          //llvm_visc_request_mem(src.data, I_sz);
+          //llvm_visc_request_mem(Is.data, I_sz);
+          //llvm_visc_request_mem(L.data, I_sz);
+          //llvm_visc_request_mem(S.data, I_sz);
+          //llvm_visc_request_mem(G.data, I_sz);
+          llvm_visc_request_mem(maxG, bytesMaxG);
+          llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "src.data = " << (float*)src.data << "\n";
+          //std::cout << "Is.data = " << (float*)Is.data << "\n";
+          //std::cout << "L.data = " << (float*)L.data << "\n";
+          //std::cout << "S.data = " << (float*)S.data << "\n";
+          //std::cout << "G.data = " << (float*)G.data << "\n";
+          //std::cout << "E.data = " << (float*)E.data << "\n";
+          //std::cout << "Max G = " << *maxG << "\n";
+          
+          Mat in, out;
+          resize(src, in, Size(HEIGHT, WIDTH));
+          //std::cout << "Show E\n";
+          resize(E, out, Size(HEIGHT, WIDTH));
+          imshow(output_window, out);
+          imshow(input_window, in);
+          waitKey(1);
+          //waitKey(0);
+          //std::cout << "Show Is\n";
+          //resize(Is, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show L\n";
+          //resize(L, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show S\n";
+          //resize(S, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show G\n";
+          //resize(G, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+
+          llvm_visc_untrack_mem(src.data);
+          llvm_visc_untrack_mem(Is.data);
+          llvm_visc_untrack_mem(L.data);
+          llvm_visc_untrack_mem(S.data);
+          llvm_visc_untrack_mem(G.data);
+          llvm_visc_untrack_mem(maxG);
+          llvm_visc_untrack_mem(E.data);
+          llvm_visc_untrack_mem(Gs);
+          llvm_visc_untrack_mem(B);
+          llvm_visc_untrack_mem(Sx);
+          llvm_visc_untrack_mem(Sy);
+
+          getNextFrame(cap, src);
+
+        }
+        //__visc__pop(DFG);
+        //__visc__pop(DFG);
+      }
+      else {
+          __visc__push(DFG, args);
+          __visc__pop(DFG);
+      }
+
+     
+      __visc__wait(DFG);
+    }
+
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+
+    pb_PrintTimerSet(&timers);
+    __visc__cleanup();
+
+    //if (params->outFile) {
+
+        /* Write C to file */
+        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        //writeColMajorMatrixFile(params->outFile,
+                                //src.size[0], src.size[1], matE);
+    //}
+   
+    //std::cout << "Show Is" << "\n";
+    //Mat output(src.size[0], src.size[1], CV_32F);
+    //imshow(output_window, Is);
+    //waitKey(0);
+
+    //std::cout << "Show G" << "\n";
+    //imshow(output_window, L);
+    //waitKey(0);
+
+    //std::cout << "Show L" << "\n";
+    //imshow(output_window, S);
+    //waitKey(0);
+    
+    //std::cout << "Show S" << "\n";
+    //imshow(output_window, G);
+    //waitKey(0);
+    
+    //std::cout << "Show E" << "\n";
+    //imshow(output_window, E);
+    //waitKey(0);
+
+    //double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+    //std::cout<< "GFLOPs = " << 2.* src.size[0] * src.size[1] * src.size[1]/GPUtime/1e9 << std::endl;
+    pb_FreeParameters(params);
+
+    return 0;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f87cb91102c01826ecf87c2e698822d7caaaef5e
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/Makefile
@@ -0,0 +1,12 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=visc
+SRCDIR_OBJS=io.ll #compute_gold.o
+VISC_OBJS=main.visc.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
+
+#OpenCV link flags all
+#/usr/bin/c++   -std=c++0x   CMakeFiles/EdgeDetect.dir/EdgeDetect.cpp.o  -o EdgeDetect  -L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_cudev.so.3.0.0 /opt/opencv/lib/libopencv_cudawarping.so.3.0.0 /opt/opencv/lib/libopencv_cudastereo.so.3.0.0 /opt/opencv/lib/libopencv_cudaoptflow.so.3.0.0 /opt/opencv/lib/libopencv_cudaobjdetect.so.3.0.0 /opt/opencv/lib/libopencv_cudalegacy.so.3.0.0 /opt/opencv/lib/libopencv_cudaimgproc.so.3.0.0 /opt/opencv/lib/libopencv_cudafilters.so.3.0.0 /opt/opencv/lib/libopencv_cudafeatures2d.so.3.0.0 /opt/opencv/lib/libopencv_cudacodec.so.3.0.0 /opt/opencv/lib/libopencv_cudabgsegm.so.3.0.0 /opt/opencv/lib/libopencv_cudaarithm.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -lcudart -lnpp -lcufft -lcudart -lnpp -lcufft -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/io.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/io.cc
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+char* readFile(const char* fileName)
+{
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	f.seekg(0,std::ios::end);
+	int length = f.tellg();
+	f.seekg(0,std::ios::beg);
+
+	char* buffer;
+
+	if(length>0)
+	{
+		buffer = new char[length];
+		f.read(buffer,length);
+		buffer[length-1]=0;
+	}
+	else
+	{
+		buffer = new char;
+		buffer[0] = 0;
+	}
+	
+	f.close();
+
+	return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f >> nr_row;
+  f >> nr_col;
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  while (f.good() ) {
+    f >> data;
+    v.push_back(data);
+  }
+  v.pop_back(); // remove the duplicated last element
+  return true;
+
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  for (int i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/main.cc
new file mode 100755
index 0000000000000000000000000000000000000000..a061fc03e9e63d9dd12b2628ca3d72eeac9f8a6f
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector-Scalar-ZC/main.cc
@@ -0,0 +1,1117 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include "opencv2/opencv.hpp"
+#include "opencv2/core/ocl.hpp"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <iostream>
+#include <cassert>
+#include <parboil.h>
+#include <visc.h>
+
+
+#define NUM_RUNS 10000 
+#define DEPTH 3
+#define HEIGHT 640
+#define WIDTH 480
+std::string input_window = "Vector-CPU(No Vector)-ZC Pipeline - Input Video";
+std::string output_window = "Vector-CPU(No Vector)-ZC Pipeline - Edge Mapping";
+
+
+#ifdef MIDDLE
+  #define POSX_IN     640
+  #define POSY_IN     0
+  #define POSX_OUT    640
+  #define POSY_OUT    540
+
+#elif RIGHT
+  #define POSX_IN     1280
+  #define POSY_IN     0
+  #define POSX_OUT    1280
+  #define POSY_OUT    540
+
+#else // LEFT
+  #define POSX_IN     0
+  #define POSY_IN     0
+  #define POSX_OUT    0
+  #define POSY_OUT    540
+#endif
+
+
+//#define NUM_FRAMES 20
+
+
+
+// Definitions of sizes for edge detection kernels
+
+#define MIN_BR 0.0f
+#define MAX_BR 1.0f
+
+// Code needs to be changed for this to vary
+#define SZB 3
+
+#define REDUCTION_TILE_SZ 1024
+
+#define _MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define _MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+
+extern "C" {
+
+struct __attribute__((__packed__)) InStruct {
+  float* I ;
+  size_t bytesI;
+  float* Is ;
+  size_t bytesIs;
+  float* L;
+  size_t bytesL;
+  float* S;
+  size_t bytesS;
+  float* G;
+  size_t bytesG;
+  float* maxG;
+  size_t bytesMaxG;
+  float* E;
+  size_t bytesE;
+  float* Gs;
+  size_t bytesGs;
+  float* B;
+  size_t bytesB;
+  float* Sx;
+  size_t bytesSx;
+  float* Sy;
+  size_t bytesSy;
+  int m;
+  int n;
+  int block_x;
+  int grid_x;
+};
+
+
+void packData(struct InStruct* args, float* I, size_t bytesI,
+                                     float* Is, size_t bytesIs,
+                                     float* L, size_t bytesL,
+                                     float* S, size_t bytesS,
+                                     float* G, size_t bytesG,
+                                     float* maxG, size_t bytesMaxG,
+                                     float* E, size_t bytesE,
+                                     float* Gs, size_t bytesGs,
+                                     float* B, size_t bytesB,
+                                     float* Sx, size_t bytesSx,
+                                     float* Sy, size_t bytesSy,
+                                     int m, int n,
+                                     int block_x, int grid_x) {
+  args->I = I;
+  args->bytesI = bytesI;
+  args->Is = Is;
+  args->bytesIs = bytesIs;
+  args->L = L;
+  args->bytesL = bytesL;
+  args->S = S;
+  args->bytesS = bytesS;
+  args->G = G;
+  args->bytesG = bytesG;
+  args->maxG = maxG;
+  args->bytesMaxG = bytesMaxG;
+  args->E = E;
+  args->bytesE = bytesE;
+  args->Gs = Gs;
+  args->bytesGs = bytesGs;
+  args->B = B;
+  args->bytesB = bytesB;
+  args->Sx = Sx;
+  args->bytesSx = bytesSx;
+  args->Sy = Sy;
+  args->bytesSy = bytesSy;
+  args->m = m;
+  args->n = n;
+  args->block_x = block_x;
+  args->grid_x = grid_x;
+}
+
+/* 
+ * Gaussian smoothing of image I of size m x n
+ * I : input image
+ * Gs : gaussian filter
+ * Is: output (smoothed image)
+ * m, n : dimensions
+ * 
+ * Need 2D grid, a thread per pixel
+ * No use of separable algorithm because we need to do this in one kernel
+ * No use of shared memory because
+ * - we don't handle it in the X86 pass
+ */
+
+#define GAUSSIAN_SIZE 7
+#define GAUSSIAN_RADIUS (GAUSSIAN_SIZE / 2)
+void gaussianSmoothing(float *I, size_t bytesI,
+                       float *Gs, size_t bytesGs,
+                       float *Is, size_t bytesIs,
+                       int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, I, Gs, 1, Is);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  int gloc = gx + gy*n;
+
+  float smoothedVal = 0;
+  float gval;
+  int loadOffset;
+
+  if ((gx < n) && (gy < m)) {
+    for (int i = -GAUSSIAN_RADIUS; i <= GAUSSIAN_RADIUS; i++)
+      for (int j = -GAUSSIAN_RADIUS; j <= GAUSSIAN_RADIUS; j++) {
+
+        loadOffset = gloc + i*n + j;
+      
+        if ((gy + i) < 0) // top contour
+          loadOffset = gx + j;
+        else if ((gy + i) > m-1 ) // bottom contour
+          loadOffset = (m-1)*n + gx + j;
+        else 
+          loadOffset = gloc + i*n + j; // within image vertically
+
+        // Adjust so we are within image horizonally
+        if ((gx + j) < 0) // left contour
+          loadOffset -= (gx+j);
+        else if ((gx + j) > n-1 ) // right contour
+          loadOffset = loadOffset - gx - j + n - 1;
+
+        gval = I[loadOffset];
+        smoothedVal += gval * Gs[(GAUSSIAN_RADIUS + i)*GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j];
+      }
+  
+    Is[gloc] = smoothedVal;
+  }
+  __visc__return(m, n);
+}
+
+void WrapperGaussianSmoothing(float *I, size_t bytesI,
+                       float *Gs, size_t bytesGs,
+                       float *Is, size_t bytesIs,
+                       int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, I, Gs, 1, Is);
+  void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n);
+  __visc__bindIn(GSNode, 0, 0, 0); // Bind I
+  __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI
+  __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs
+  __visc__bindIn(GSNode, 3, 3, 0); // Bind bytesGs
+  __visc__bindIn(GSNode, 4, 4, 0); // Bind Is
+  __visc__bindIn(GSNode, 5, 5, 0); // Bind bytesIs
+  __visc__bindIn(GSNode, 6, 6, 0); // Bind m
+  __visc__bindIn(GSNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(GSNode, 0, 0, 0); // bind output m
+  __visc__bindOut(GSNode, 1, 1, 0); // bind output n
+}
+
+
+/* Compute a non-linear laplacian estimate of input image I of size m x n */
+/*
+ * Is   : blurred imput image
+ * m, n : dimensions
+ * B    : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0])
+ * L    : output (laplacian of the image)
+ * Need 2D grid, a thread per pixel
+*/
+void laplacianEstimate(float *Is, size_t bytesIs,
+                            float *B, size_t bytesB,
+                            float *L, size_t bytesL,
+                            int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, Is, B, 1, L);
+  // 3x3 image area
+  float imageArea[SZB][SZB];
+
+  //int gx = get_global_id(0);
+  //int gy = get_global_id(1);
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+  //if(gx == 0 && gy == 0)
+    //std::cout << "Entered laplacian\n";
+
+  int i, j;
+
+  if ((gx < n) && (gy < m)) {
+    // Data copy for dilation filter
+    imageArea[1][1] = Is[gy * n + gx];
+
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
+    } else {
+      imageArea[1][0] = Is[gy * n + gx - 1];
+      imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
+      imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
+    } else {
+      imageArea[1][2] = Is[gy * n + gx + 1];
+      imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
+      imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
+    }
+
+    imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
+    imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
+
+    // Compute pixel of dilated image
+    float dilatedPixel = MIN_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    // Data copy for erotion filter - only change the boundary conditions
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][0] = MAX_BR;
+      if (gy == m-1) imageArea[2][0] = MAX_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][2] = MAX_BR;
+      if (gy == m-1) imageArea[2][2] = MAX_BR;
+    }
+
+    if (gy == 0) imageArea[0][1] = MAX_BR;
+    if (gy == m-1) imageArea[2][1] = MAX_BR;
+
+    // Compute pixel of eroded image
+    float erodedPixel = MAX_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1];
+    L[gy*n+gx] = laplacian;
+  }
+  //OutStruct output = {bytesB, bytesL};
+  //if(gx == m-1 && gy == n-1)
+    //std::cout << "Exit laplacian\n";
+  __visc__return(m);
+}
+
+void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
+                          float *B, size_t bytesB,
+                          float *L, size_t bytesL,
+                          int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, Is, B, 1, L);
+  void* LNode = __visc__createNode2D(laplacianEstimate, m, n);
+  __visc__bindIn(LNode, 0, 0, 0); // Bind Is
+  __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs
+  __visc__bindIn(LNode, 2, 2, 0); // Bind B
+  __visc__bindIn(LNode, 3, 3, 0); // Bind bytesB
+  __visc__bindIn(LNode, 4, 4, 0); // Bind L
+  __visc__bindIn(LNode, 5, 5, 0); // Bind bytesL
+  __visc__bindIn(LNode, 6, 6, 0); // Bind m
+  __visc__bindIn(LNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(LNode, 0, 0, 0); // bind output m
+
+}
+
+/* Compute the zero crossings of input image L of size m x n */
+/*
+ * L    : imput image (computed Laplacian)
+ * m, n : dimensions
+ * B    : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0])
+ * S    : output (sign of the image)
+ * Need 2D grid, a thread per pixel
+ */
+void computeZeroCrossings(float *L, size_t bytesL,
+                          float *B, size_t bytesB,
+                          float *S, size_t bytesS,
+                          int m, int n) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, L, B, 1, S);
+
+  // 3x3 image area
+  float imageArea[SZB][SZB];
+
+  //int gx = get_global_id(0);
+  //int gy = get_global_id(1);
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+  int i, j;
+
+  //if(gx == 0 && gy == 0)
+    //std::cout << "Entered ZC\n";
+  if ((gx < n) && (gy < m)) {
+    // Data copy for dilation filter
+    imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR;
+
+    if (gx == 0) { // left most line
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
+    } else { 
+      imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR;
+      imageArea[0][0] = (gy > 0) ?
+                            (L[(gy - 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+      imageArea[2][0] = (gy < m - 1) ?
+                            (L[(gy + 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
+    } else {
+      imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR;
+      imageArea[0][2] = (gy > 0) ?
+                            (L[(gy - 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+      imageArea[2][2] = (gy < m - 1) ?
+                            (L[(gy + 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR)
+                            : MIN_BR;
+    }
+
+    imageArea[0][1] = (gy > 0) ?
+                          (L[(gy - 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR)
+                          : MIN_BR;
+    imageArea[2][1] = (gy < m - 1)?
+                          (L[(gy + 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR)
+                          : MIN_BR;
+
+    // Compute pixel of dilated image
+    float dilatedPixel = MIN_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    // Data copy for erotion filter - only change the boundary conditions
+    if (gx == 0) {
+      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][0] = MAX_BR;
+      if (gy == m-1) imageArea[2][0] = MAX_BR;
+    }
+
+    if (gx == n - 1) {
+      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
+    } else {
+      if (gy == 0) imageArea[0][2] = MAX_BR;
+      if (gy == m-1) imageArea[2][2] = MAX_BR;
+    }
+
+    if (gy == 0) imageArea[0][1] = MAX_BR;
+    if (gy == m-1) imageArea[2][1] = MAX_BR;
+
+    // Compute pixel of eroded image
+    float erodedPixel = MAX_BR;
+    for (i = 0; i < SZB; i++)
+      for (j = 0; j < SZB; j++)
+        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+
+    float pixelSign = dilatedPixel - erodedPixel;
+    S[gy*n+gx] = pixelSign;
+  }
+  //OutStruct output = {bytesB, bytesS};
+  //if(gx == n-1 && gy == n-1)
+    //std::cout << "Exit ZC\n";
+  __visc__return(m); 
+}
+
+void WrapperComputeZeroCrossings(float *L, size_t bytesL,
+                          float *B, size_t bytesB,
+                          float *S, size_t bytesS,
+                          int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, L, B, 1, S);
+  void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n);
+  __visc__bindIn(ZCNode, 0, 0, 0); // Bind L
+  __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
+  __visc__bindIn(ZCNode, 2, 2, 0); // Bind B
+  __visc__bindIn(ZCNode, 3, 3, 0); // Bind bytesB
+  __visc__bindIn(ZCNode, 4, 4, 0); // Bind S
+  __visc__bindIn(ZCNode, 5, 5, 0); // Bind bytesS
+  __visc__bindIn(ZCNode, 6, 6, 0); // Bind m
+  __visc__bindIn(ZCNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(ZCNode, 0, 0, 0); // bind output m
+
+}
+
+/*
+ * Gradient computation using Sobel filters
+ * Is   : input (smoothed image)
+ * Sx, Sy: Sobel operators
+ * - Sx = [-1  0  1 ; -2 0 2 ; -1 0 1 ]
+ * - Sy = [-1 -2 -1 ;  0 0 0 ;  1 2 1 ]
+ * m, n : dimensions
+ * G: output, gradient magnitude : sqrt(Gx^2+Gy^2)
+ * Need 2D grid, a thread per pixel
+ * No use of separable algorithm because we need to do this in one kernel
+ * No use of shared memory because
+ * - we don't handle it in the X86 pass
+ */
+
+#define SOBEL_SIZE 3
+#define SOBEL_RADIUS (SOBEL_SIZE / 2)
+
+void computeGradient(float *Is, size_t bytesIs,
+              float *Sx, size_t bytesSx,
+              float *Sy, size_t bytesSy,
+              float *G, size_t bytesG,
+              int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(3, Is, Sx, Sy, 1, G);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  int gloc = gx + gy*n;
+
+  float Gx = 0;
+  float Gy = 0;
+  float gval;
+  int loadOffset;
+
+  if ((gx < n) && (gy < m)) {
+    for (int i = -SOBEL_RADIUS; i <= SOBEL_RADIUS; i++)
+      for (int j = -SOBEL_RADIUS; j <= SOBEL_RADIUS; j++) {
+
+        loadOffset = gloc + i*n + j;
+      
+        if ((gy + i) < 0) // top contour
+          loadOffset = gx + j;
+        else if ((gy + i) > m-1 ) // bottom contour
+          loadOffset = (m-1)*n + gx + j;
+        else 
+          loadOffset = gloc + i*n + j; // within image vertically
+
+        // Adjust so we are within image horizonally
+        if ((gx + j) < 0) // left contour
+          loadOffset -= (gx+j);
+        else if ((gx + j) > n-1 ) // right contour
+          loadOffset = loadOffset - gx - j + n - 1;
+
+        gval = Is[loadOffset];
+        Gx += gval * Sx[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
+        Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
+      }
+
+    G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy);
+    //G[gloc] = Gx*Gx + Gy*Gy;
+  }
+  __visc__return(n);
+}
+
+void WrapperComputeGradient(float *Is, size_t bytesIs,
+                                 float *Sx, size_t bytesSx,
+                                 float *Sy, size_t bytesSy,
+                                 float *G, size_t bytesG,
+                                 int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, Is, Sx, Sy, 1, G);
+  void* CGNode = __visc__createNode2D(computeGradient, m, n);
+  __visc__bindIn(CGNode, 0, 0, 0); // Bind Is
+  __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
+  __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx
+  __visc__bindIn(CGNode, 3, 3, 0); // Bind bytesSx
+  __visc__bindIn(CGNode, 4, 4, 0); // Bind Sy
+  __visc__bindIn(CGNode, 5, 5, 0); // Bind bytesSy
+  __visc__bindIn(CGNode, 6, 6, 0); // Bind G
+  __visc__bindIn(CGNode, 7, 7, 0); // Bind bytesG
+  __visc__bindIn(CGNode, 8, 8, 0); // Bind m
+  __visc__bindIn(CGNode, 9, 9, 0); // Bind n
+
+  __visc__bindOut(CGNode, 0, 0, 0); // bind output m
+}
+
+/* 
+ * Reduction
+ * G : input
+ * maxG: output
+ * m, n: input size
+ * Needs a single thread block
+ */
+void computeMaxGradientLeaf(float *G, size_t bytesG,
+                            float *maxG, size_t bytesMaxG,
+                            int m, int n) {
+
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(1, G, 1, maxG);
+
+  void* thisNode = __visc__getNode();
+
+  int lx = __visc__getNodeInstanceID_x(thisNode);       // threadIdx.x
+  int dimx = __visc__getNumNodeInstances_x(thisNode);   // blockDim.x
+
+
+  // Assume a single thread block
+  // Thread block iterates over all elements
+  for (int i = lx + dimx; i < m*n; i+= dimx) {
+    if (G[lx] < G[i])
+      G[lx] = G[i];
+  }
+
+  // First thread iterates over all elements of the thread block
+  if (lx == 0) {
+    for (int i = 1; (i < dimx) && (i < m*n); i++)
+      if (G[lx] < G[i])
+        G[lx] = G[i];
+
+    *maxG = G[lx];
+  }
+
+  __visc__return(n);
+}
+
+/* 
+ * Reduction
+ * G : input
+ * maxG: output
+ * Each static node processes 2*nodeDim elements
+ * Need 1D grid, a thread per 2 pixels
+ */
+//void computeMaxGradientLeaf(float *G, size_t bytesG,
+                            //float *maxG, size_t bytesMaxG,
+                            //int m, int n) {
+
+  //__visc__hint(visc::DEVICE);
+  //TODO: maxG should be initialized to zero (MIN_BR) every time
+  //__visc__attributes(2, G, maxG, 1, maxG);
+
+  //void* thisNode = __visc__getNode();
+  //void* parentNode = __visc__getParentNode(thisNode);
+
+  //int lx = __visc__getNodeInstanceID_x(thisNode);
+  //int px = __visc__getNodeInstanceID_x(parentNode);
+  //int dimx = __visc__getNumNodeInstances_x(thisNode);
+
+  //int gid = lx + 2*px*dimx;
+  
+  //for (unsigned stride = dimx; stride > 32; stride >>= 1) {
+    //if ((gid + stride < m*n) && (lx < stride))
+      //if (G[gid + stride] > G[gid])
+        //G[gid] = G[gid + stride];
+    //__visc__barrier();
+  //}
+
+  //for (unsigned stride = 32; stride >= 1; stride >>= 1) {
+    //if ((gid + stride < m*n) && (lx < stride))
+      //if (G[gid + stride] > G[gid])
+        //G[gid] = G[gid + stride];
+  //}
+
+  //if (lx == 0)
+    //__visc__atomic_max(maxG,G[gid]);
+
+  //__visc__return(m);
+//}
+
+void computeMaxGradientTB(float *G, size_t bytesG,
+                           float *maxG, size_t bytesMaxG,
+                           int m, int n,
+                           int block_x) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, G, maxG, 1, maxG);
+  void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
+  __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
+  __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
+  __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
+  __visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG
+  __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
+  __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
+
+  __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output m
+}
+
+void WrapperComputeMaxGradient(float *G, size_t bytesG,
+                           float *maxG, size_t bytesMaxG,
+                           int m, int n,
+                           int block_x, int grid_x) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, G, maxG, 1, maxG);
+  void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x);
+  __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G
+  __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
+  __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
+  __visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG
+  __visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m
+  __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n
+  __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
+
+  __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output m
+}
+
+/* Reject the zero crossings where the gradient is below a threshold */
+/*
+ * S    : input (computed zero crossings)
+ * m, n : dimensions
+ * G: gradient of (smoothed) image
+ * E    : output (edges of the image)
+ * Need 2D grid, a thread per pixel
+ */
+
+#define THETA 0.1
+void rejectZeroCrossings(float *S, size_t bytesS,
+                         float *G, size_t bytesG,
+                         float *maxG, size_t bytesMaxG,
+                         float *E, size_t bytesE,
+                         int m, int n) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, S, G, maxG, 1, E);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+
+  float mG = *maxG;
+  //float mG = 1.39203;
+  if ((gx < n) && (gy < m)) {
+    E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ;
+  }
+  __visc__return(m);
+}
+
+void WrapperRejectZeroCrossings(float *S, size_t bytesS,
+                         float *G, size_t bytesG,
+                         float *maxG, size_t bytesMaxG,
+                         float *E, size_t bytesE,
+                         int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, S, G, maxG, 1, E);
+  void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n);
+  __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S
+  __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS
+  __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G
+  __visc__bindIn(RZCNode, 3, 3 , 0); // Bind bytesG
+  __visc__bindIn(RZCNode, 4, 4 , 0); // Bind maxG
+  __visc__bindIn(RZCNode, 5, 5 , 0); // Bind bytesMaxG
+  __visc__bindIn(RZCNode, 6, 6 , 0); // Bind E
+  __visc__bindIn(RZCNode, 7, 7 , 0); // Bind bytesE
+  __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m
+  __visc__bindIn(RZCNode, 9, 9, 0); // Bind n
+
+  __visc__bindOut(RZCNode, 0, 0, 0); // bind output m
+}
+
+
+
+// Pipelined Root node
+void edgeDetection(float *I, size_t bytesI, // 0
+                   float *Is, size_t bytesIs, // 2
+                   float *L, size_t bytesL, // 4
+                   float *S, size_t bytesS, // 6
+                   float *G, size_t bytesG, // 8
+                   float *maxG, size_t bytesMaxG, // 10
+                   float *E, size_t bytesE, // 12
+                   float *Gs, size_t bytesGs, // 14
+                   float *B, size_t bytesB, // 16
+                   float *Sx, size_t bytesSx, // 18
+                   float *Sy, size_t bytesSy, // 20
+                   int m, // 22
+                   int n, // 23
+                   int block_x, // 24
+                   int grid_x // 25
+                   ) { 
+  __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
+  __visc__hint(visc::CPU_TARGET);
+  void* GSNode = __visc__createNode(WrapperGaussianSmoothing);
+  void* LNode = __visc__createNode(WrapperlaplacianEstimate);
+  void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings);
+  void* CGNode = __visc__createNode(WrapperComputeGradient);
+  void* CMGNode = __visc__createNode(WrapperComputeMaxGradient);
+  void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings);
+
+  // Gaussian Inputs
+  __visc__bindIn(GSNode, 0 , 0, 1); // Bind I
+  __visc__bindIn(GSNode, 1 , 1, 1); // Bind bytesI
+  __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs
+  __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs
+  __visc__bindIn(GSNode, 2 , 4, 1); // Bind Is
+  __visc__bindIn(GSNode, 3 , 5, 1); // Bind bytesIs
+  __visc__bindIn(GSNode, 22, 6, 1); // Bind m
+  __visc__bindIn(GSNode, 23, 7, 1); // Bind n
+
+  // Laplacian Inputs
+  __visc__bindIn(LNode, 2 , 0, 1); // Bind Is
+  __visc__bindIn(LNode, 3 , 1, 1); // Bind bytesIs
+  __visc__bindIn(LNode, 16, 2, 1); // Bind B
+  __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB
+  __visc__bindIn(LNode, 4 , 4, 1); // Bind L
+  __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL
+//  __visc__bindIn(LNode, 22, 6, 1); // Bind m
+  __visc__edge(GSNode, LNode, 0, 6, 1); // Get m
+  __visc__bindIn(LNode, 23, 7, 1); // Bind n
+
+  // Compute ZC Inputs
+  __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L
+  __visc__bindIn(CZCNode, 5 , 1, 1); // Bind bytesL
+  __visc__bindIn(CZCNode, 16, 2, 1); // Bind B
+  __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB
+  __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S
+  __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS
+  //__visc__bindIn(CZCNode, 22, 6, 1); // Bind m
+  __visc__edge(LNode, CZCNode, 0, 6, 1); // Get m
+  __visc__bindIn(CZCNode, 23, 7, 1); // Bind n
+
+  // Gradient Inputs
+  __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is
+  __visc__bindIn(CGNode, 3 , 1, 1); // Bind bytesIs
+  __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx
+  __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx
+  __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy
+  __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy
+  __visc__bindIn(CGNode, 8 , 6, 1); // Bind G
+  __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG
+ __visc__bindIn(CGNode, 22, 8, 1); // Bind m
+  //__visc__edge(CZCNode, CGNode, 0, 8, 1); // Get m
+  //__visc__bindIn(CGNode, 23, 9, 1); // Bind n
+  __visc__edge(GSNode, CGNode, 1, 9, 1); // Get n
+
+  // Max Gradient Inputs
+  __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G
+  __visc__bindIn(CMGNode, 9 , 1, 1); // Bind bytesG
+  __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG
+  __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG
+ __visc__bindIn(CMGNode, 22, 4, 1);  // Bind m
+  //__visc__edge(CGNode, CMGNode, 0, 4, 1); // Get m
+  //__visc__bindIn(CMGNode, 23, 5, 1); // Bind n
+  __visc__edge(CGNode, CMGNode, 0, 5, 1); // Get n
+  __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x
+  __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x
+
+  // Reject ZC Inputs
+  __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S
+  __visc__bindIn(RZCNode, 7 , 1, 1); // Bind bytesS
+  __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G
+  __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG
+  __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG
+  __visc__bindIn(RZCNode, 11, 5, 1); // Bind bytesMaxG
+  __visc__bindIn(RZCNode, 12, 6, 1); // Bind E
+  __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE
+  //__visc__bindIn(RZCNode, 22, 8, 1); // Bind m
+  __visc__edge(CZCNode, RZCNode, 0, 8, 1); // Get m
+  //__visc__bindIn(RZCNode, 23, 9, 1); // Bind n
+  __visc__edge(CMGNode, RZCNode, 0, 9, 1); // Get n
+
+  __visc__bindOut(RZCNode, 0, 0, 1); // dummy bind output to get pipeline functionality
+}
+
+}
+
+using namespace cv;
+
+void getNextFrame(VideoCapture& VC, Mat& F) {
+  VC >> F;
+  /// Convert the image to grayscale if image colored
+  if(F.channels() == 3)
+    cvtColor( F, F, CV_BGR2GRAY );
+
+  F.convertTo(F, CV_32F, 1.0/255.0);
+
+}
+
+//void showInOut(Mat& Input, Mat& Output) {
+    //Mat in, out;
+    //resize(Input, in, Size(512, 768));
+    //resize(Output, out, Size(512, 768));
+    //imshow(input_window, in);
+    //imshow(output_window, out);
+//}
+
+
+int main (int argc, char *argv[]) {
+
+    struct pb_Parameters *params;
+    struct pb_TimerSet timers;
+
+    size_t I_sz;
+    int block_x, grid_x;
+
+    std::cout << "Using OpenCV" << CV_VERSION << "\n";
+
+    /* Read command line. Expect 3 inputs: A, B and B^T
+       in column-major layout*/
+    params = pb_ReadParameters(&argc, argv);
+    if ((params->inpFiles[0] == NULL)
+            || (params->inpFiles[1] != NULL))
+    {
+        fprintf(stderr, "Expecting input image filename\n");
+        exit(-1);
+    }
+
+    /* Read in data */
+    std::cout << "Reading video file: " << params->inpFiles[0] << "\n";
+    VideoCapture cap(params->inpFiles[0]);
+    if(!cap.isOpened()) {
+      std::cout << "Could not open video file" << "\n";
+      return -1;
+    }
+
+    int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT);
+    std::cout << "Number of frames = " << NUM_FRAMES << "\n";
+    namedWindow(input_window, CV_WINDOW_AUTOSIZE);
+    namedWindow(output_window, CV_WINDOW_AUTOSIZE);
+    moveWindow(input_window, POSX_IN, POSY_IN);
+    moveWindow(output_window, POSX_OUT, POSY_OUT);
+
+    Mat src, Is, L, S, G, E;
+
+    getNextFrame(cap, src);
+
+    std::cout << "Image dimension = " << src.size() << "\n";
+    if(!src.isContinuous()) {
+      std::cout << "Expecting contiguous storage of image in memory!\n";
+      exit(-1);
+    }
+
+    Is = Mat(src.size[0], src.size[1], CV_32F);
+    L = Mat(src.size[0], src.size[1], CV_32F);
+    S = Mat(src.size[0], src.size[1], CV_32F);
+    G = Mat(src.size[0], src.size[1], CV_32F);
+    E = Mat(src.size[0], src.size[1], CV_32F);
+
+    // All these matrices need to have their data array contiguous in memory
+    assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous());
+
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    // copy A to device memory
+    I_sz = src.size[0]*src.size[1]*sizeof(float);
+
+    size_t bytesMaxG = sizeof(float);
+    float* maxG = (float*)malloc(bytesMaxG);
+
+    float B[] = { 1, 1, 1,
+                  1, 1, 1,
+                  1, 1, 1 };
+    size_t bytesB = 9*sizeof(float);
+    //Sx = [-1  0  1 ; -2 0 2 ; -1 0 1 ]
+    //Sy = [-1 -2 -1 ;  0 0 0 ;  1 2 1 ]
+    float Sx[] = {  -1, 0, 1,
+                    -2, 0, 2,
+                    -1, 0, 1  };
+    size_t bytesSx = 9*sizeof(float);
+    float Sy[] = {  -1, -2, -1,
+                     0,  0,  0,
+                     1,  2,  1  };
+    size_t bytesSy = 9*sizeof(float);
+
+    float Gs [] = {
+            0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036,
+            0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
+            0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
+            0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291,
+            0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
+            0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
+            0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036 };
+    size_t bytesGs = 7*7*sizeof(float);
+
+    block_x = 256;
+    // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs
+    grid_x = 1;
+
+    // Copy A and B^T into device memory
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    //showInOut(src, E);
+    Mat in, out;
+    resize(src, in, Size(HEIGHT, WIDTH));
+    resize(E, out, Size(HEIGHT, WIDTH));
+    imshow(input_window, in);
+    imshow(output_window, out);
+    waitKey(0);
+
+    //NUM_FRAMES = 20;
+    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
+    packData(args, (float*)src.data, I_sz,
+                   (float*)Is.data, I_sz,
+                   (float*)L.data, I_sz,
+                   (float*)S.data, I_sz,
+                   (float*)G.data, I_sz,
+                   maxG, bytesMaxG,
+                   (float*)E.data, I_sz,
+                   Gs, bytesGs,
+                   B, bytesB,
+                   Sx, bytesSx,
+                   Sy, bytesSy,
+                   src.size[0], src.size[1],
+                   block_x, grid_x);
+
+    // Check if the total elements is a multiple of block size
+    assert(src.size[0]*src.size[1] % block_x == 0);
+
+    //imshow(input_window, src);
+    //imshow(output_window, E);
+    //waitKey(0);
+    for(unsigned j=0; j<NUM_RUNS; j++) {
+      std::cout << "Run: " << j << "\n";
+      void* DFG = __visc__launch(1, edgeDetection, (void*)args);
+
+      cap = VideoCapture(params->inpFiles[0]);
+      getNextFrame(cap, src);
+      
+      //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
+
+      if(NUM_FRAMES >=2) {
+        //__visc__push(DFG, args);
+        //__visc__push(DFG, args);
+        for(int i=0; i<NUM_FRAMES; i++) {
+          //std::cout << "Frame " << i << "\n";
+          args->I = (float*) src.data;
+
+          *maxG = 0.0;
+ 
+          llvm_visc_track_mem(src.data, I_sz);
+          llvm_visc_track_mem(Is.data, I_sz);
+          llvm_visc_track_mem(L.data, I_sz);
+          llvm_visc_track_mem(S.data, I_sz);
+          llvm_visc_track_mem(G.data, I_sz);
+          llvm_visc_track_mem(maxG, bytesMaxG);
+          llvm_visc_track_mem(E.data, I_sz);
+          llvm_visc_track_mem(Gs, bytesGs);
+          llvm_visc_track_mem(B, bytesB);
+          llvm_visc_track_mem(Sx, bytesSx);
+          llvm_visc_track_mem(Sy, bytesSy);
+
+          __visc__push(DFG, args);
+          __visc__pop(DFG);
+
+          //llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "Show E" << "\n";
+          //imshow(window_name, E);
+          //waitKey(0);
+          //llvm_visc_request_mem(src.data, I_sz);
+          //llvm_visc_request_mem(Is.data, I_sz);
+          //llvm_visc_request_mem(L.data, I_sz);
+          //llvm_visc_request_mem(S.data, I_sz);
+          //llvm_visc_request_mem(G.data, I_sz);
+          llvm_visc_request_mem(maxG, bytesMaxG);
+          llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "src.data = " << (float*)src.data << "\n";
+          //std::cout << "Is.data = " << (float*)Is.data << "\n";
+          //std::cout << "L.data = " << (float*)L.data << "\n";
+          //std::cout << "S.data = " << (float*)S.data << "\n";
+          //std::cout << "G.data = " << (float*)G.data << "\n";
+          //std::cout << "E.data = " << (float*)E.data << "\n";
+          //std::cout << "Max G = " << *maxG << "\n";
+          
+          Mat in, out;
+          resize(src, in, Size(HEIGHT, WIDTH));
+          //std::cout << "Show E\n";
+          resize(E, out, Size(HEIGHT, WIDTH));
+          imshow(output_window, out);
+          imshow(input_window, in);
+          waitKey(1);
+          //waitKey(0);
+          //std::cout << "Show Is\n";
+          //resize(Is, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show L\n";
+          //resize(L, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show S\n";
+          //resize(S, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show G\n";
+          //resize(G, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+
+          llvm_visc_untrack_mem(src.data);
+          llvm_visc_untrack_mem(Is.data);
+          llvm_visc_untrack_mem(L.data);
+          llvm_visc_untrack_mem(S.data);
+          llvm_visc_untrack_mem(G.data);
+          llvm_visc_untrack_mem(maxG);
+          llvm_visc_untrack_mem(E.data);
+          llvm_visc_untrack_mem(Gs);
+          llvm_visc_untrack_mem(B);
+          llvm_visc_untrack_mem(Sx);
+          llvm_visc_untrack_mem(Sy);
+
+          getNextFrame(cap, src);
+
+        }
+        //__visc__pop(DFG);
+        //__visc__pop(DFG);
+      }
+      else {
+          __visc__push(DFG, args);
+          __visc__pop(DFG);
+      }
+
+     
+      __visc__wait(DFG);
+    }
+
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+
+    pb_PrintTimerSet(&timers);
+    __visc__cleanup();
+
+    //if (params->outFile) {
+
+        /* Write C to file */
+        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        //writeColMajorMatrixFile(params->outFile,
+                                //src.size[0], src.size[1], matE);
+    //}
+   
+    //std::cout << "Show Is" << "\n";
+    //Mat output(src.size[0], src.size[1], CV_32F);
+    //imshow(output_window, Is);
+    //waitKey(0);
+
+    //std::cout << "Show G" << "\n";
+    //imshow(output_window, L);
+    //waitKey(0);
+
+    //std::cout << "Show L" << "\n";
+    //imshow(output_window, S);
+    //waitKey(0);
+    
+    //std::cout << "Show S" << "\n";
+    //imshow(output_window, G);
+    //waitKey(0);
+    
+    //std::cout << "Show E" << "\n";
+    //imshow(output_window, E);
+    //waitKey(0);
+
+    //double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+    //std::cout<< "GFLOPs = " << 2.* src.size[0] * src.size[1] * src.size[1]/GPUtime/1e9 << std::endl;
+    pb_FreeParameters(params);
+
+    return 0;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/Makefile b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/Makefile
index f6c7ebfede0b947aad50dec89b2ecee55c1a36cd..f87cb91102c01826ecf87c2e698822d7caaaef5e 100644
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/Makefile
@@ -4,8 +4,8 @@ LANGUAGE=visc
 SRCDIR_OBJS=io.ll #compute_gold.o
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS=-ffast-math -O3 -I/opt/opencv/include
-APP_CXXFLAGS=-ffast-math -O3 -I/opt/opencv/include
+APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
 APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
 
 #OpenCV link flags all
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/main.cc
index b1ff17bec86167e5b44ff3aa16a6af01f41dd8cc..9cbcf57b3d384fd96dacc7844bdff7412d8c24a3 100755
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/viscVector/main.cc
@@ -23,6 +23,39 @@
 #include <parboil.h>
 #include <visc.h>
 
+
+#define NUM_RUNS 10000 
+#define DEPTH 3
+#define HEIGHT 640
+#define WIDTH 480
+std::string input_window = "Vector Pipeline - Input Video";
+std::string output_window = "Vector Pipeline - Edge Mapping";
+
+
+#ifdef MIDDLE
+  #define POSX_IN     640
+  #define POSY_IN     0
+  #define POSX_OUT    640
+  #define POSY_OUT    540
+
+#elif RIGHT
+  #define POSX_IN     1280
+  #define POSY_IN     0
+  #define POSX_OUT    1280
+  #define POSY_OUT    540
+
+#else // LEFT
+  #define POSX_IN     0
+  #define POSY_IN     0
+  #define POSX_OUT    0
+  #define POSY_OUT    540
+#endif
+
+
+//#define NUM_FRAMES 20
+
+
+
 // Definitions of sizes for edge detection kernels
 
 #define MIN_BR 0.0f
@@ -205,7 +238,6 @@ void laplacianEstimate(float *Is, size_t bytesIs,
                             int m, int n) {
 
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::GPU_TARGET);
   __visc__attributes(2, Is, B, 1, L);
   // 3x3 image area
   float imageArea[SZB][SZB];
@@ -315,7 +347,7 @@ void computeZeroCrossings(float *L, size_t bytesL,
                           float *S, size_t bytesS,
                           int m, int n) {
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::SPIR_TARGET);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, L, B, 1, S);
 
   // 3x3 image area
@@ -525,6 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
                             int m, int n) {
 
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, G, 1, maxG);
 
   void* thisNode = __visc__getNode();
@@ -600,6 +633,7 @@ void computeMaxGradientTB(float *G, size_t bytesG,
                            int m, int n,
                            int block_x) {
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
   void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
   __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
@@ -646,6 +680,7 @@ void rejectZeroCrossings(float *S, size_t bytesS,
                          float *E, size_t bytesE,
                          int m, int n) {
   __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, S, G, maxG, 1, E);
 
   void* thisNode = __visc__getNode();
@@ -786,18 +821,7 @@ void edgeDetection(float *I, size_t bytesI, // 0
 }
 
 }
-#define NUM_RUNS 100
-#define DEPTH 3
-#define HEIGHT 640
-#define WIDTH 480
-std::string input_window = "Vector Pipeline - Input Video";
-std::string output_window = "Vector Pipeline - Edge Mapping";
-#define POSX_IN     640
-#define POSY_IN     0
-#define POSX_OUT    640
-#define POSY_OUT    540
 
-//#define NUM_FRAMES 20
 using namespace cv;
 
 void getNextFrame(VideoCapture& VC, Mat& F) {
@@ -826,8 +850,6 @@ int main (int argc, char *argv[]) {
 
     size_t I_sz;
     int block_x, grid_x;
-    std::string input_window = "Input Frame";
-    std::string output_window = "Edge Map";
 
     std::cout << "Using OpenCV" << CV_VERSION << "\n";
 
@@ -878,7 +900,7 @@ int main (int argc, char *argv[]) {
     pb_InitializeTimerSet(&timers);
     __visc__init();
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
     // copy A to device memory
     I_sz = src.size[0]*src.size[1]*sizeof(float);
 
@@ -915,17 +937,19 @@ int main (int argc, char *argv[]) {
     grid_x = 1;
 
     // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
-    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
-    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
-    Mat in, out;
     //showInOut(src, E);
+    Mat in, out;
     resize(src, in, Size(HEIGHT, WIDTH));
     resize(E, out, Size(HEIGHT, WIDTH));
     imshow(input_window, in);
     imshow(output_window, out);
     waitKey(0);
+
+    //NUM_FRAMES = 20;
+    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
     packData(args, (float*)src.data, I_sz,
                    (float*)Is.data, I_sz,
                    (float*)L.data, I_sz,
@@ -952,7 +976,9 @@ int main (int argc, char *argv[]) {
 
       cap = VideoCapture(params->inpFiles[0]);
       getNextFrame(cap, src);
+      
       //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
+
       if(NUM_FRAMES >=2) {
         //__visc__push(DFG, args);
         //__visc__push(DFG, args);
@@ -961,6 +987,7 @@ int main (int argc, char *argv[]) {
           args->I = (float*) src.data;
 
           *maxG = 0.0;
+ 
           llvm_visc_track_mem(src.data, I_sz);
           llvm_visc_track_mem(Is.data, I_sz);
           llvm_visc_track_mem(L.data, I_sz);
@@ -972,7 +999,7 @@ int main (int argc, char *argv[]) {
           llvm_visc_track_mem(B, bytesB);
           llvm_visc_track_mem(Sx, bytesSx);
           llvm_visc_track_mem(Sy, bytesSy);
- 
+
           __visc__push(DFG, args);
           __visc__pop(DFG);
 
@@ -987,15 +1014,38 @@ int main (int argc, char *argv[]) {
           //llvm_visc_request_mem(G.data, I_sz);
           llvm_visc_request_mem(maxG, bytesMaxG);
           llvm_visc_request_mem(E.data, I_sz);
+          //std::cout << "src.data = " << (float*)src.data << "\n";
+          //std::cout << "Is.data = " << (float*)Is.data << "\n";
+          //std::cout << "L.data = " << (float*)L.data << "\n";
+          //std::cout << "S.data = " << (float*)S.data << "\n";
+          //std::cout << "G.data = " << (float*)G.data << "\n";
+          //std::cout << "E.data = " << (float*)E.data << "\n";
           //std::cout << "Max G = " << *maxG << "\n";
           
           Mat in, out;
           resize(src, in, Size(HEIGHT, WIDTH));
+          //std::cout << "Show E\n";
           resize(E, out, Size(HEIGHT, WIDTH));
           imshow(output_window, out);
           imshow(input_window, in);
           waitKey(1);
-          getNextFrame(cap, src);
+          //waitKey(0);
+          //std::cout << "Show Is\n";
+          //resize(Is, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show L\n";
+          //resize(L, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show S\n";
+          //resize(S, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
+          //std::cout << "Show G\n";
+          //resize(G, out, Size(HEIGHT, WIDTH));
+          //imshow(output_window, out);
+          //waitKey(0);
 
           llvm_visc_untrack_mem(src.data);
           llvm_visc_untrack_mem(Is.data);
@@ -1009,7 +1059,7 @@ int main (int argc, char *argv[]) {
           llvm_visc_untrack_mem(Sx);
           llvm_visc_untrack_mem(Sy);
 
-
+          getNextFrame(cap, src);
 
         }
         //__visc__pop(DFG);
@@ -1019,7 +1069,8 @@ int main (int argc, char *argv[]) {
           __visc__push(DFG, args);
           __visc__pop(DFG);
       }
-      
+
+     
       __visc__wait(DFG);
     }
 
diff --git a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc_parallel/main.cc b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
index 838d461e247df0f58e9c7d701e9714f4065048dd..5953021cf2fe8f27e0899464ca493bf47bef876f 100755
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
@@ -24,11 +24,12 @@
 #include <visc.h>
 
 
-#define NUM_RUNS 5 
+#define NUM_RUNS 10000 
 #define DEPTH 3
 #define HEIGHT 640
 #define WIDTH 480
-std::string input_window = "GPU Pipeline - Input Video";
+
+std::string input_window =  "GPU Pipeline - Input Video";
 std::string output_window = "GPU Pipeline - Edge Mapping";