From c76be1e26e5cf7be5bcd25bc0ac92a2308e1be37 Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <aejjeh@hpvmfpga1.cs.illinois.edu>
Date: Mon, 20 Jan 2020 09:18:15 -0600
Subject: [PATCH] adding final working pipeline. All kernels work on GPU except
 computeMaxGradient.

---
 .../test/parboil/benchmarks/pipeline/Makefile |  2 +-
 .../pipeline/src/visc_parallel/Makefile       |  8 +-
 .../pipeline/src/visc_parallel/main.cc        | 99 ++++++++++---------
 3 files changed, 56 insertions(+), 53 deletions(-)

diff --git a/hpvm/test/parboil/benchmarks/pipeline/Makefile b/hpvm/test/parboil/benchmarks/pipeline/Makefile
index a83cacc2cb..5f0fd5b730 100644
--- a/hpvm/test/parboil/benchmarks/pipeline/Makefile
+++ b/hpvm/test/parboil/benchmarks/pipeline/Makefile
@@ -31,7 +31,7 @@ BIN = $(addsuffix -$(VERSION), $(APP))
 
 SRCDIR = src/$(VERSION)
 BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
+DATASET_DIR ?= $(PARBOIL_ROOT)/datasets/$(APP)
 
 IMAGE = $(DATASET_DIR)/$(TEST)/input/edgetest_10.png
 VIDEO1 = $(DATASET_DIR)/$(TEST)/input/taxi/taxi01.pgm
diff --git a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
index 6dc39540f2..ea6b44788d 100644
--- a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
+++ b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile
@@ -4,9 +4,11 @@ LANGUAGE=visc
 SRCDIR_OBJS=io.ll #compute_gold.o
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include
-APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include
-APP_LDFLAGS=`pkg-config ${OpenCV_DIR}/lib/pkgconfig/opencv.pc --libs`
+APP_CFLAGS+=-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize  #-I/shared/opencv/include
+APP_CXXFLAGS+=-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize #-I/shared/opencv/include
+OpenCV_DIR=/shared/opencv
+APP_LDFLAGS=`pkg-config opencv --libs`
+#APP_LDFLAGS=`pkg-config ${OpenCV_DIR}/lib/pkgconfig/opencv.pc --libs`
 #APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib 
 
 #OpenCV link flags all
diff --git a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
index 12b3d42a66..892cdabd09 100644
--- a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
+++ b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc
@@ -11,7 +11,7 @@
  */
 
 #include "opencv2/opencv.hpp"
-#include "opencv2/core/ocl.hpp"
+#include "opencv2/ocl/ocl.hpp"
 #include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
@@ -201,7 +201,7 @@ void gaussianSmoothing(float *I, size_t bytesI,
   
     Is[gloc] = smoothedVal;
   }
-  __visc__return(m, n);
+  __visc__return(2, m, n);
 }
 
 void WrapperGaussianSmoothing(float *I, size_t bytesI,
@@ -210,7 +210,7 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI,
                        long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, I, Gs, 1, Is);
-  void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n);
+  void* GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n);
   __visc__bindIn(GSNode, 0, 0, 0); // Bind I
   __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI
   __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs
@@ -241,7 +241,7 @@ void laplacianEstimate(float *Is, size_t bytesIs,
   __visc__hint(visc::DEVICE);
   __visc__attributes(2, Is, B, 1, L);
   // 3x3 image area
-  float imageArea[SZB][SZB];
+  float imageArea[SZB*SZB];
 
   //int gx = get_global_id(0);
   //int gy = get_global_id(1);
@@ -255,64 +255,64 @@ void laplacianEstimate(float *Is, size_t bytesIs,
 
   if ((gx < n) && (gy < m)) {
     // Data copy for dilation filter
-    imageArea[1][1] = Is[gy * n + gx];
+    imageArea[1 * SZB +1] = Is[gy * n + gx];
 
     if (gx == 0) {
-      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
+      imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MIN_BR;
     } else {
-      imageArea[1][0] = Is[gy * n + gx - 1];
-      imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
-      imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
+      imageArea[1 * SZB +0] = Is[gy * n + gx - 1];
+      imageArea[0 * SZB +0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
+      imageArea[2 * SZB +0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
     }
 
     if (gx == n - 1) {
-      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
+      imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MIN_BR;
     } else {
-      imageArea[1][2] = Is[gy * n + gx + 1];
-      imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
-      imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
+      imageArea[1 * SZB +2] = Is[gy * n + gx + 1];
+      imageArea[0 * SZB +2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
+      imageArea[2 * SZB +2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
     }
 
-    imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
-    imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
+    imageArea[0 * SZB +1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
+    imageArea[2 * SZB +1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
 
     // Compute pixel of dilated image
     float dilatedPixel = MIN_BR;
     for (i = 0; i < SZB; i++)
       for (j = 0; j < SZB; j++)
-        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i * SZB +j] * B[i*SZB + j]);
 
     // Data copy for erotion filter - only change the boundary conditions
     if (gx == 0) {
-      imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
+      imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MAX_BR;
     } else {
-      if (gy == 0) imageArea[0][0] = MAX_BR;
-      if (gy == m-1) imageArea[2][0] = MAX_BR;
+      if (gy == 0) imageArea[0 * SZB +0] = MAX_BR;
+      if (gy == m-1) imageArea[2 * SZB +0] = MAX_BR;
     }
 
     if (gx == n - 1) {
-      imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
+      imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MAX_BR;
     } else {
-      if (gy == 0) imageArea[0][2] = MAX_BR;
-      if (gy == m-1) imageArea[2][2] = MAX_BR;
+      if (gy == 0) imageArea[0 * SZB +2] = MAX_BR;
+      if (gy == m-1) imageArea[2 * SZB +2] = MAX_BR;
     }
 
-    if (gy == 0) imageArea[0][1] = MAX_BR;
-    if (gy == m-1) imageArea[2][1] = MAX_BR;
+    if (gy == 0) imageArea[0 * SZB +1] = MAX_BR;
+    if (gy == m-1) imageArea[2 * SZB +1] = MAX_BR;
 
     // Compute pixel of eroded image
     float erodedPixel = MAX_BR;
     for (i = 0; i < SZB; i++)
       for (j = 0; j < SZB; j++)
-        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+        erodedPixel = _MIN(erodedPixel, imageArea[i * SZB +j] * B[i*SZB + j]);
 
-    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1];
+    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB +1];
     L[gy*n+gx] = laplacian;
   }
   //OutStruct output = {bytesB, bytesL};
   //if(gx == m-1 && gy == n-1)
     //std::cout << "Exit laplacian\n";
-  __visc__return(m);
+  __visc__return(1, m);
 }
 
 void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
@@ -321,7 +321,7 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
                           long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, Is, B, 1, L);
-  void* LNode = __visc__createNode2D(laplacianEstimate, m, n);
+  void* LNode = __visc__createNodeND(2, laplacianEstimate, m, n);
   __visc__bindIn(LNode, 0, 0, 0); // Bind Is
   __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs
   __visc__bindIn(LNode, 2, 2, 0); // Bind B
@@ -434,7 +434,7 @@ void computeZeroCrossings(float *L, size_t bytesL,
   //OutStruct output = {bytesB, bytesS};
   //if(gx == n-1 && gy == n-1)
     //std::cout << "Exit ZC\n";
-  __visc__return(m); 
+  __visc__return(1, m); 
 }
 
 void WrapperComputeZeroCrossings(float *L, size_t bytesL,
@@ -443,7 +443,7 @@ void WrapperComputeZeroCrossings(float *L, size_t bytesL,
                           long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, L, B, 1, S);
-  void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n);
+  void* ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n);
   __visc__bindIn(ZCNode, 0, 0, 0); // Bind L
   __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
   __visc__bindIn(ZCNode, 2, 2, 0); // Bind B
@@ -518,10 +518,10 @@ void computeGradient(float *Is, size_t bytesIs,
         Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
       }
 
-    G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy);
+    G[gloc] = sqrt(Gx*Gx + Gy*Gy);
     //G[gloc] = Gx*Gx + Gy*Gy;
   }
-  __visc__return(n);
+  __visc__return(1, n);
 }
 
 void WrapperComputeGradient(float *Is, size_t bytesIs,
@@ -531,7 +531,7 @@ void WrapperComputeGradient(float *Is, size_t bytesIs,
                                  long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, Is, Sx, Sy, 1, G);
-  void* CGNode = __visc__createNode2D(computeGradient, m, n);
+  void* CGNode = __visc__createNodeND(2, computeGradient, m, n);
   __visc__bindIn(CGNode, 0, 0, 0); // Bind Is
   __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
   __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx
@@ -557,7 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
                             float *maxG, size_t bytesMaxG,
                             long m, long n) {
 
-  __visc__hint(visc::DEVICE);
+  __visc__hint(visc::CPU_TARGET);
   //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, G, 1, maxG);
 
@@ -575,15 +575,16 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
   }
 
   // First thread iterates over all elements of the thread block
-  if (lx == 0) {
-    for (int i = 1; (i < dimx) && (i < m*n); i++)
+  long bounds = dimx < m*n ? dimx : m*n;
+	if (lx == 0) {
+    for (int i = 1; i < bounds; i++)
       if (G[lx] < G[i])
         G[lx] = G[i];
 
     *maxG = G[lx];
   }
 
-  __visc__return(n);
+  __visc__return(1, n);
 }
 
 /* 
@@ -626,7 +627,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
   //if (lx == 0)
     //__visc__atomic_max(maxG,G[gid]);
 
-  //__visc__return(m);
+  //__visc__return(1, m);
 //}
 
 void computeMaxGradientTB(float *G, size_t bytesG,
@@ -636,7 +637,7 @@ void computeMaxGradientTB(float *G, size_t bytesG,
   //__visc__hint(visc::DEVICE);
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
-  void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x);
+  void* CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x);
   __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
   __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
   __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
@@ -653,7 +654,7 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG,
                            long block_x, long grid_x) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
-  void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x);
+  void* CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x);
   __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G
   __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
   __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
@@ -693,7 +694,7 @@ void rejectZeroCrossings(float *S, size_t bytesS,
   if ((gx < n) && (gy < m)) {
     E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ;
   }
-  __visc__return(m);
+  __visc__return(1, m);
 }
 
 void WrapperRejectZeroCrossings(float *S, size_t bytesS,
@@ -703,7 +704,7 @@ void WrapperRejectZeroCrossings(float *S, size_t bytesS,
                          long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, S, G, maxG, 1, E);
-  void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n);
+  void* RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n);
   __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S
   __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS
   __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G
@@ -739,12 +740,12 @@ void edgeDetection(float *I, size_t bytesI, // 0
                    ) { 
   __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
   __visc__hint(visc::CPU_TARGET);
-  void* GSNode = __visc__createNode(WrapperGaussianSmoothing);
-  void* LNode = __visc__createNode(WrapperlaplacianEstimate);
-  void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings);
-  void* CGNode = __visc__createNode(WrapperComputeGradient);
-  void* CMGNode = __visc__createNode(WrapperComputeMaxGradient);
-  void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings);
+  void* GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing);
+  void* LNode = __visc__createNodeND(0, WrapperlaplacianEstimate);
+  void* CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings);
+  void* CGNode = __visc__createNodeND(0, WrapperComputeGradient);
+  void* CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient);
+  void* RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings);
 
   // Gaussian Inputs
   __visc__bindIn(GSNode, 0 , 0, 1); // Bind I
@@ -873,7 +874,7 @@ int main (int argc, char *argv[]) {
     }
 
     int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT);
-    //NUM_FRAMES = 5;
+    NUM_FRAMES = 600;
     std::cout << "Number of frames = " << NUM_FRAMES << "\n";
 
     // Used to store time after each frame computation is completed
-- 
GitLab