diff --git a/hpvm/projects/visc-rt/visc-rt.cpp b/hpvm/projects/visc-rt/visc-rt.cpp
index 2f3fc6282ab6f31ae9b0af043df2fcd0a2c06e57..eff618548f3405b668249791738015043a537f17 100644
--- a/hpvm/projects/visc-rt/visc-rt.cpp
+++ b/hpvm/projects/visc-rt/visc-rt.cpp
@@ -74,7 +74,8 @@ void llvm_visc_policy_init() {
 //  policy = new NodePolicy();
 //  policy = new IterationPolicy();
 //  policy = new DeviceStatusPolicy();
-  policy = new InteractivePolicy();
+  // policy = new InteractivePolicy();
+  policy = new ConstPolicy(0);
   cout << "DONE: Initializing policy object.\n";
 }
 
diff --git a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
index ea6b44788d0831221b2d289d1904d35f56240615..ec39b86f1cf71e2e8b6131b076c2953b566cbb56 100644
--- a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
+++ b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
@@ -4,11 +4,9 @@ LANGUAGE=visc
 SRCDIR_OBJS=io.ll #compute_gold.o
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS+=-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize  #-I/shared/opencv/include
-APP_CXXFLAGS+=-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize #-I/shared/opencv/include
-OpenCV_DIR=/shared/opencv
+APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
 APP_LDFLAGS=`pkg-config opencv --libs`
-#APP_LDFLAGS=`pkg-config ${OpenCV_DIR}/lib/pkgconfig/opencv.pc --libs`
 #APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
 
 #OpenCV link flags all
diff --git a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
index 892cdabd090412c80f3a2d26ffd3e7c183650ade..b516f860a8c6b8317b11860127b8dc210c66b246 100644
--- a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
+++ b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
@@ -11,7 +11,7 @@
  */
 
 #include "opencv2/opencv.hpp"
-#include "opencv2/ocl/ocl.hpp"
+#include "opencv2/core/ocl.hpp"
 #include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
@@ -201,7 +201,7 @@ void gaussianSmoothing(float *I, size_t bytesI,
   
     Is[gloc] = smoothedVal;
   }
-  __visc__return(2, m, n);
+  __visc__return(2, bytesIs, bytesIs);
 }
 
 void WrapperGaussianSmoothing(float *I, size_t bytesI,
@@ -220,8 +220,8 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI,
   __visc__bindIn(GSNode, 6, 6, 0); // Bind m
   __visc__bindIn(GSNode, 7, 7, 0); // Bind n
 
-  __visc__bindOut(GSNode, 0, 0, 0); // bind output m
-  __visc__bindOut(GSNode, 1, 1, 0); // bind output n
+  __visc__bindOut(GSNode, 0, 0, 0); // bind output bytesIs
+  __visc__bindOut(GSNode, 1, 1, 0); // bind output bytesIs
 }
 
 
@@ -243,14 +243,9 @@ void laplacianEstimate(float *Is, size_t bytesIs,
   // 3x3 image area
   float imageArea[SZB*SZB];
 
-  //int gx = get_global_id(0);
-  //int gy = get_global_id(1);
   void* thisNode = __visc__getNode();
   long gx = __visc__getNodeInstanceID_x(thisNode);
   long gy = __visc__getNodeInstanceID_y(thisNode);
-  //if(gx == 0 && gy == 0)
-    //std::cout << "Entered laplacian\n";
-
   int i, j;
 
   if ((gx < n) && (gy < m)) {
@@ -309,10 +304,7 @@ void laplacianEstimate(float *Is, size_t bytesIs,
     float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB +1];
     L[gy*n+gx] = laplacian;
   }
-  //OutStruct output = {bytesB, bytesL};
-  //if(gx == m-1 && gy == n-1)
-    //std::cout << "Exit laplacian\n";
-  __visc__return(1, m);
+  __visc__return(1, bytesL);
 }
 
 void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
@@ -331,7 +323,7 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
   __visc__bindIn(LNode, 6, 6, 0); // Bind m
   __visc__bindIn(LNode, 7, 7, 0); // Bind n
 
-  __visc__bindOut(LNode, 0, 0, 0); // bind output m
+  __visc__bindOut(LNode, 0, 0, 0); // bind output bytesL
 
 }
 
@@ -354,15 +346,11 @@ void computeZeroCrossings(float *L, size_t bytesL,
   // 3x3 image area
   float imageArea[SZB][SZB];
 
-  //int gx = get_global_id(0);
-  //int gy = get_global_id(1);
   void* thisNode = __visc__getNode();
   long gx = __visc__getNodeInstanceID_x(thisNode);
   long gy = __visc__getNodeInstanceID_y(thisNode);
   int i, j;
 
-  //if(gx == 0 && gy == 0)
-    //std::cout << "Entered ZC\n";
   if ((gx < n) && (gy < m)) {
     // Data copy for dilation filter
     imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR;
@@ -431,10 +419,7 @@ void computeZeroCrossings(float *L, size_t bytesL,
     float pixelSign = dilatedPixel - erodedPixel;
     S[gy*n+gx] = pixelSign;
   }
-  //OutStruct output = {bytesB, bytesS};
-  //if(gx == n-1 && gy == n-1)
-    //std::cout << "Exit ZC\n";
-  __visc__return(1, m); 
+  __visc__return(1, bytesS); 
 }
 
 void WrapperComputeZeroCrossings(float *L, size_t bytesL,
@@ -453,7 +438,7 @@ void WrapperComputeZeroCrossings(float *L, size_t bytesL,
   __visc__bindIn(ZCNode, 6, 6, 0); // Bind m
   __visc__bindIn(ZCNode, 7, 7, 0); // Bind n
 
-  __visc__bindOut(ZCNode, 0, 0, 0); // bind output m
+  __visc__bindOut(ZCNode, 0, 0, 0); // bind output bytesS
 
 }
 
@@ -519,9 +504,8 @@ void computeGradient(float *Is, size_t bytesIs,
       }
 
     G[gloc] = sqrt(Gx*Gx + Gy*Gy);
-    //G[gloc] = Gx*Gx + Gy*Gy;
   }
-  __visc__return(1, n);
+  __visc__return(1, bytesG);
 }
 
 void WrapperComputeGradient(float *Is, size_t bytesIs,
@@ -543,7 +527,7 @@ void WrapperComputeGradient(float *Is, size_t bytesIs,
   __visc__bindIn(CGNode, 8, 8, 0); // Bind m
   __visc__bindIn(CGNode, 9, 9, 0); // Bind n
 
-  __visc__bindOut(CGNode, 0, 0, 0); // bind output m
+  __visc__bindOut(CGNode, 0, 0, 0); // bind output bytesG
 }
 
 /* 
@@ -558,7 +542,6 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
                             long m, long n) {
 
   __visc__hint(visc::CPU_TARGET);
-  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, G, 1, maxG);
 
   void* thisNode = __visc__getNode();
@@ -566,7 +549,6 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
   long lx = __visc__getNodeInstanceID_x(thisNode);       // threadIdx.x
   long dimx = __visc__getNumNodeInstances_x(thisNode);   // blockDim.x
 
-
   // Assume a single thread block
   // Thread block iterates over all elements
   for (int i = lx + dimx; i < m*n; i+= dimx) {
@@ -584,57 +566,13 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
     *maxG = G[lx];
   }
 
-  __visc__return(1, n);
+  __visc__return(1, bytesMaxG);
 }
 
-/* 
- * Reduction
- * G : input
- * maxG: output
- * Each static node processes 2*nodeDim elements
- * Need 1D grid, a thread per 2 pixels
- */
-//void computeMaxGradientLeaf(float *G, size_t bytesG,
-                            //float *maxG, size_t bytesMaxG,
-                            //int m, int n) {
-
-  //__visc__hint(visc::DEVICE);
-  //TODO: maxG should be initialized to zero (MIN_BR) every time
-  //__visc__attributes(2, G, maxG, 1, maxG);
-
-  //void* thisNode = __visc__getNode();
-  //void* parentNode = __visc__getParentNode(thisNode);
-
-  //int lx = __visc__getNodeInstanceID_x(thisNode);
-  //int px = __visc__getNodeInstanceID_x(parentNode);
-  //int dimx = __visc__getNumNodeInstances_x(thisNode);
-
-  //int gid = lx + 2*px*dimx;
-  
-  //for (unsigned stride = dimx; stride > 32; stride >>= 1) {
-    //if ((gid + stride < m*n) && (lx < stride))
-      //if (G[gid + stride] > G[gid])
-        //G[gid] = G[gid + stride];
-    //__visc__barrier();
-  //}
-
-  //for (unsigned stride = 32; stride >= 1; stride >>= 1) {
-    //if ((gid + stride < m*n) && (lx < stride))
-      //if (G[gid + stride] > G[gid])
-        //G[gid] = G[gid + stride];
-  //}
-
-  //if (lx == 0)
-    //__visc__atomic_max(maxG,G[gid]);
-
-  //__visc__return(1, m);
-//}
-
 void computeMaxGradientTB(float *G, size_t bytesG,
                            float *maxG, size_t bytesMaxG,
                            long m, long n,
                            long block_x) {
-  //__visc__hint(visc::DEVICE);
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
   void* CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x);
@@ -645,7 +583,7 @@ void computeMaxGradientTB(float *G, size_t bytesG,
   __visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
   __visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
 
-  __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output m
+  __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG
 }
 
 void WrapperComputeMaxGradient(float *G, size_t bytesG,
@@ -663,7 +601,7 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG,
   __visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n
   __visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
 
-  __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output m
+  __visc__bindOut(CMGTBNode, 0, 0, 0); // bind output bytesMaxG
 }
 
 /* Reject the zero crossings where the gradient is below a threshold */
@@ -682,7 +620,6 @@ void rejectZeroCrossings(float *S, size_t bytesS,
                          float *E, size_t bytesE,
                          long m, long n) {
   __visc__hint(visc::DEVICE);
-  //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, S, G, maxG, 1, E);
 
   void* thisNode = __visc__getNode();
@@ -690,11 +627,10 @@ void rejectZeroCrossings(float *S, size_t bytesS,
   int gy = __visc__getNodeInstanceID_y(thisNode);
 
   float mG = *maxG;
-  //float mG = 1.39203;
   if ((gx < n) && (gy < m)) {
     E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ;
   }
-  __visc__return(1, m);
+  __visc__return(1, bytesE);
 }
 
 void WrapperRejectZeroCrossings(float *S, size_t bytesS,
@@ -716,7 +652,7 @@ void WrapperRejectZeroCrossings(float *S, size_t bytesS,
   __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m
   __visc__bindIn(RZCNode, 9, 9, 0); // Bind n
 
-  __visc__bindOut(RZCNode, 0, 0, 0); // bind output m
+  __visc__bindOut(RZCNode, 0, 0, 0); // bind output bytesE
 }
 
 
@@ -759,67 +695,59 @@ void edgeDetection(float *I, size_t bytesI, // 0
 
   // Laplacian Inputs
   __visc__bindIn(LNode, 2 , 0, 1); // Bind Is
-  __visc__bindIn(LNode, 3 , 1, 1); // Bind bytesIs
+  __visc__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs
   __visc__bindIn(LNode, 16, 2, 1); // Bind B
   __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB
   __visc__bindIn(LNode, 4 , 4, 1); // Bind L
   __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL
-//  __visc__bindIn(LNode, 22, 6, 1); // Bind m
-  __visc__edge(GSNode, LNode, 1, 0, 6, 1); // Get m
+  __visc__bindIn(LNode, 22, 6, 1); // Bind m
   __visc__bindIn(LNode, 23, 7, 1); // Bind n
 
   // Compute ZC Inputs
   __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L
-  __visc__bindIn(CZCNode, 5 , 1, 1); // Bind bytesL
+  __visc__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL
   __visc__bindIn(CZCNode, 16, 2, 1); // Bind B
   __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB
   __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S
   __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS
-  //__visc__bindIn(CZCNode, 22, 6, 1); // Bind m
-  __visc__edge(LNode, CZCNode, 1, 0, 6, 1); // Get m
+  __visc__bindIn(CZCNode, 22, 6, 1); // Bind m
   __visc__bindIn(CZCNode, 23, 7, 1); // Bind n
 
   // Gradient Inputs
   __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is
-  __visc__bindIn(CGNode, 3 , 1, 1); // Bind bytesIs
+  __visc__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs
   __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx
   __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx
   __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy
   __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy
   __visc__bindIn(CGNode, 8 , 6, 1); // Bind G
   __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG
- __visc__bindIn(CGNode, 22, 8, 1); // Bind m
-  //__visc__edge(CZCNode, CGNode, 1, 0, 8, 1); // Get m
-  //__visc__bindIn(CGNode, 23, 9, 1); // Bind n
-  __visc__edge(GSNode, CGNode, 1, 1, 9, 1); // Get n
+  __visc__bindIn(CGNode, 22, 8, 1); // Bind m
+  __visc__bindIn(CGNode, 23, 9, 1); // Bind n
 
   // Max Gradient Inputs
   __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G
-  __visc__bindIn(CMGNode, 9 , 1, 1); // Bind bytesG
+  __visc__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG
   __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG
   __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG
- __visc__bindIn(CMGNode, 22, 4, 1);  // Bind m
-  //__visc__edge(CGNode, CMGNode, 1, 0, 4, 1); // Get m
-  //__visc__bindIn(CMGNode, 23, 5, 1); // Bind n
-  __visc__edge(CGNode, CMGNode, 1, 0, 5, 1); // Get n
+  __visc__bindIn(CMGNode, 22, 4, 1);  // Bind m
+  __visc__bindIn(CMGNode, 23, 5, 1); // Bind n
   __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x
   __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x
 
   // Reject ZC Inputs
   __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S
-  __visc__bindIn(RZCNode, 7 , 1, 1); // Bind bytesS
+  __visc__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS
   __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G
   __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG
   __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG
-  __visc__bindIn(RZCNode, 11, 5, 1); // Bind bytesMaxG
+  __visc__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG
   __visc__bindIn(RZCNode, 12, 6, 1); // Bind E
   __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE
-  //__visc__bindIn(RZCNode, 22, 8, 1); // Bind m
-  __visc__edge(CZCNode, RZCNode, 1, 0, 8, 1); // Get m
-  //__visc__bindIn(RZCNode, 23, 9, 1); // Bind n
-  __visc__edge(CMGNode, RZCNode, 1, 0, 9, 1); // Get n
+  __visc__bindIn(RZCNode, 22, 8, 1); // Bind m
+  __visc__bindIn(RZCNode, 23, 9, 1); // Bind n
 
-  __visc__bindOut(RZCNode, 0, 0, 1); // dummy bind output to get pipeline functionality
+  __visc__bindOut(RZCNode, 0, 0, 1); // Bind output
 }
 
 }
@@ -833,18 +761,8 @@ void getNextFrame(VideoCapture& VC, Mat& F) {
     cvtColor( F, F, CV_BGR2GRAY );
 
   F.convertTo(F, CV_32F, 1.0/255.0);
-
 }
 
-//void showInOut(Mat& Input, Mat& Output) {
-    //Mat in, out;
-    //resize(Input, in, Size(512, 768));
-    //resize(Output, out, Size(512, 768));
-    //imshow(input_window, in);
-    //imshow(output_window, out);
-//}
-
-
 int main (int argc, char *argv[]) {
 
     struct pb_Parameters *params;
@@ -877,11 +795,6 @@ int main (int argc, char *argv[]) {
     NUM_FRAMES = 600;
     std::cout << "Number of frames = " << NUM_FRAMES << "\n";
 
-    // Used to store time after each frame computation is completed
-    //timeval *timeStamps = (timeval *)malloc(NUM_RUNS*NUM_FRAMES*sizeof(timeval));
-    //struct timeval tv_start;
-    //unsigned long cnt = 0; // Counter, for the video frames
-
     namedWindow(input_window, CV_WINDOW_AUTOSIZE);
     namedWindow(output_window, CV_WINDOW_AUTOSIZE);
     moveWindow(input_window, POSX_IN, POSY_IN);
@@ -909,7 +822,6 @@ int main (int argc, char *argv[]) {
     pb_InitializeTimerSet(&timers);
     __visc__init();
 
-    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
     // copy A to device memory
     I_sz = src.size[0]*src.size[1]*sizeof(float);
 
@@ -920,8 +832,6 @@ int main (int argc, char *argv[]) {
                   1, 1, 1,
                   1, 1, 1 };
     size_t bytesB = 9*sizeof(float);
-    //Sx = [-1  0  1 ; -2 0 2 ; -1 0 1 ]
-    //Sy = [-1 -2 -1 ;  0 0 0 ;  1 2 1 ]
     float Sx[] = {  -1, 0, 1,
                     -2, 0, 2,
                     -1, 0, 1  };
@@ -945,10 +855,6 @@ int main (int argc, char *argv[]) {
     // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs
     grid_x = 1;
 
-    // Copy A and B^T into device memory
-    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-    //showInOut(src, E);
     Mat in, out;
     resize(src, in, Size(HEIGHT, WIDTH));
     resize(E, out, Size(HEIGHT, WIDTH));
@@ -956,7 +862,6 @@ int main (int argc, char *argv[]) {
     imshow(output_window, out);
     waitKey(0);
 
-    //NUM_FRAMES = 20;
     pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
     struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
     packData(args, (float*)src.data, I_sz,
@@ -976,13 +881,6 @@ int main (int argc, char *argv[]) {
     // Check if the total elements is a multiple of block size
     assert(src.size[0]*src.size[1] % block_x == 0);
 
-    //imshow(input_window, src);
-    //imshow(output_window, E);
-    //waitKey(0);
-
-    // Get the time just before computation starts
-    //gettimeofday(&tv_start,NULL);
-
     for(unsigned j=0; j<NUM_RUNS; j++) {
       std::cout << "Run: " << j << "\n";
       void* DFG = __visc__launch(1, edgeDetection, (void*)args);
@@ -990,13 +888,9 @@ int main (int argc, char *argv[]) {
       cap = VideoCapture(params->inpFiles[0]);
       getNextFrame(cap, src);
       
-      //packData(args, A.data, BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
-
       if(NUM_FRAMES >=2) {
-        //__visc__push(DFG, args);
-        //__visc__push(DFG, args);
         for(int i=0; i<NUM_FRAMES; i++) {
-          //std::cout << "Frame " << i << "\n";
+          std::cout << "Frame " << i << "\n";
           args->I = (float*) src.data;
 
           *maxG = 0.0;
@@ -1014,53 +908,19 @@ int main (int argc, char *argv[]) {
           llvm_visc_track_mem(Sy, bytesSy);
 
           __visc__push(DFG, args);
-          __visc__pop(DFG);
+          void *ret = __visc__pop(DFG);
+          std::cout << "Returned size: " << *(size_t *)ret
+                    << " expected " << I_sz << '\n';
 
-          //llvm_visc_request_mem(E.data, I_sz);
-          //std::cout << "Show E" << "\n";
-          //imshow(window_name, E);
-          //waitKey(0);
-          //llvm_visc_request_mem(src.data, I_sz);
-          //llvm_visc_request_mem(Is.data, I_sz);
-          //llvm_visc_request_mem(L.data, I_sz);
-          //llvm_visc_request_mem(S.data, I_sz);
-          //llvm_visc_request_mem(G.data, I_sz);
           llvm_visc_request_mem(maxG, bytesMaxG);
           llvm_visc_request_mem(E.data, I_sz);
-          //std::cout << "src.data = " << (float*)src.data << "\n";
-          //std::cout << "Is.data = " << (float*)Is.data << "\n";
-          //std::cout << "L.data = " << (float*)L.data << "\n";
-          //std::cout << "S.data = " << (float*)S.data << "\n";
-          //std::cout << "G.data = " << (float*)G.data << "\n";
-          //std::cout << "E.data = " << (float*)E.data << "\n";
-          //std::cout << "Max G = " << *maxG << "\n";
-
-          //gettimeofday(&timeStamps[cnt], NULL);
-          //cnt++;          
+
           Mat in, out;
           resize(src, in, Size(HEIGHT, WIDTH));
-          //std::cout << "Show E\n";
           resize(E, out, Size(HEIGHT, WIDTH));
           imshow(output_window, out);
           imshow(input_window, in);
           waitKey(1);
-          //waitKey(0);
-          //std::cout << "Show Is\n";
-          //resize(Is, out, Size(HEIGHT, WIDTH));
-          //imshow(output_window, out);
-          //waitKey(0);
-          //std::cout << "Show L\n";
-          //resize(L, out, Size(HEIGHT, WIDTH));
-          //imshow(output_window, out);
-          //waitKey(0);
-          //std::cout << "Show S\n";
-          //resize(S, out, Size(HEIGHT, WIDTH));
-          //imshow(output_window, out);
-          //waitKey(0);
-          //std::cout << "Show G\n";
-          //resize(G, out, Size(HEIGHT, WIDTH));
-          //imshow(output_window, out);
-          //waitKey(0);
 
           llvm_visc_untrack_mem(src.data);
           llvm_visc_untrack_mem(Is.data);
@@ -1075,72 +935,17 @@ int main (int argc, char *argv[]) {
           llvm_visc_untrack_mem(Sy);
 
           getNextFrame(cap, src);
-
         }
-        //__visc__pop(DFG);
-        //__visc__pop(DFG);
       }
       else {
           __visc__push(DFG, args);
           __visc__pop(DFG);
       }
-
-     
       __visc__wait(DFG);
     }
-
     pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-
     pb_PrintTimerSet(&timers);
-
-    //const char *fn = "timestamps.txt";
-    //std::ofstream outfile;
-    //outfile.open(fn);
-    //if (!outfile.is_open()) {
-      //std::cout << "Failed to open " << fn << " for writing\n"; 
-    //}
-    //for (unsigned long i = 0; i < cnt; i++) {
-      //double elapsed = (timeStamps[i].tv_sec - tv_start.tv_sec) + 
-                       //((timeStamps[i].tv_usec - tv_start.tv_usec)/1000000.0);
-      //outfile << elapsed << "\n";
-    //}
-    //free(timeStamps);
-    //outfile.close();
     __visc__cleanup();
-
-    //if (params->outFile) {
-
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        //writeColMajorMatrixFile(params->outFile,
-                                //src.size[0], src.size[1], matE);
-    //}
-   
-    //std::cout << "Show Is" << "\n";
-    //Mat output(src.size[0], src.size[1], CV_32F);
-    //imshow(output_window, Is);
-    //waitKey(0);
-
-    //std::cout << "Show G" << "\n";
-    //imshow(output_window, L);
-    //waitKey(0);
-
-    //std::cout << "Show L" << "\n";
-    //imshow(output_window, S);
-    //waitKey(0);
-    
-    //std::cout << "Show S" << "\n";
-    //imshow(output_window, G);
-    //waitKey(0);
-    
-    //std::cout << "Show E" << "\n";
-    //imshow(output_window, E);
-    //waitKey(0);
-
-    //double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    //std::cout<< "GFLOPs = " << 2.* src.size[0] * src.size[1] * src.size[1]/GPUtime/1e9 << std::endl;
     pb_FreeParameters(params);
-
     return 0;
 }