From c76be1e26e5cf7be5bcd25bc0ac92a2308e1be37 Mon Sep 17 00:00:00 2001 From: Adel Ejjeh <aejjeh@hpvmfpga1.cs.illinois.edu> Date: Mon, 20 Jan 2020 09:18:15 -0600 Subject: [PATCH] adding final working pipeline. All kernels work on GPU except computeMaxGradient. --- .../test/parboil/benchmarks/pipeline/Makefile | 2 +- .../pipeline/src/visc_parallel/Makefile | 8 +- .../pipeline/src/visc_parallel/main.cc | 99 ++++++++++--------- 3 files changed, 56 insertions(+), 53 deletions(-) diff --git a/hpvm/test/parboil/benchmarks/pipeline/Makefile b/hpvm/test/parboil/benchmarks/pipeline/Makefile index a83cacc2cb..5f0fd5b730 100644 --- a/hpvm/test/parboil/benchmarks/pipeline/Makefile +++ b/hpvm/test/parboil/benchmarks/pipeline/Makefile @@ -31,7 +31,7 @@ BIN = $(addsuffix -$(VERSION), $(APP)) SRCDIR = src/$(VERSION) BUILDDIR = build/$(VERSION)_$(PLATFORM) -DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP) +DATASET_DIR ?= $(PARBOIL_ROOT)/datasets/$(APP) IMAGE = $(DATASET_DIR)/$(TEST)/input/edgetest_10.png VIDEO1 = $(DATASET_DIR)/$(TEST)/input/taxi/taxi01.pgm diff --git a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile index 6dc39540f2..ea6b44788d 100644 --- a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile +++ b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/Makefile @@ -4,9 +4,11 @@ LANGUAGE=visc SRCDIR_OBJS=io.ll #compute_gold.o VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ -APP_CFLAGS+=-ffast-math -O3 -I/opt/opencv/include -APP_CXXFLAGS+=-ffast-math -O3 -I/opt/opencv/include -APP_LDFLAGS=`pkg-config ${OpenCV_DIR}/lib/pkgconfig/opencv.pc --libs` +APP_CFLAGS+=-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize #-I/shared/opencv/include +APP_CXXFLAGS+=-ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize #-I/shared/opencv/include +OpenCV_DIR=/shared/opencv +APP_LDFLAGS=`pkg-config opencv --libs` +#APP_LDFLAGS=`pkg-config ${OpenCV_DIR}/lib/pkgconfig/opencv.pc --libs` #APP_LDFLAGS=-L/usr/local/cuda/lib64 -rdynamic /opt/opencv/lib/libopencv_videostab.so.3.0.0 /opt/opencv/lib/libopencv_videoio.so.3.0.0 /opt/opencv/lib/libopencv_video.so.3.0.0 /opt/opencv/lib/libopencv_superres.so.3.0.0 /opt/opencv/lib/libopencv_stitching.so.3.0.0 /opt/opencv/lib/libopencv_shape.so.3.0.0 /opt/opencv/lib/libopencv_photo.so.3.0.0 /opt/opencv/lib/libopencv_objdetect.so.3.0.0 /opt/opencv/lib/libopencv_ml.so.3.0.0 /opt/opencv/lib/libopencv_imgproc.so.3.0.0 /opt/opencv/lib/libopencv_imgcodecs.so.3.0.0 /opt/opencv/lib/libopencv_highgui.so.3.0.0 /opt/opencv/lib/libopencv_hal.a /opt/opencv/lib/libopencv_flann.so.3.0.0 /opt/opencv/lib/libopencv_features2d.so.3.0.0 /opt/opencv/lib/libopencv_core.so.3.0.0 /opt/opencv/lib/libopencv_calib3d.so.3.0.0 /opt/opencv/lib/libopencv_hal.a -ldl -lm -lpthread -lrt /opt/opencv/share/OpenCV/3rdparty/lib/libippicv.a -Wl,-rpath,/usr/local/cuda/lib64:/opt/opencv/lib #OpenCV link flags all diff --git a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc index 12b3d42a66..892cdabd09 100644 --- a/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc +++ b/hpvm/test/parboil/benchmarks/pipeline/src/visc_parallel/main.cc @@ -11,7 +11,7 @@ */ #include "opencv2/opencv.hpp" -#include "opencv2/core/ocl.hpp" +#include "opencv2/ocl/ocl.hpp" #include <stdio.h> #include <math.h> #include <stdlib.h> @@ -201,7 +201,7 @@ void gaussianSmoothing(float *I, size_t bytesI, Is[gloc] = smoothedVal; } - __visc__return(m, n); + __visc__return(2, m, n); } void WrapperGaussianSmoothing(float *I, size_t bytesI, @@ -210,7 +210,7 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI, long m, long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, I, Gs, 1, Is); - void* GSNode = __visc__createNode2D(gaussianSmoothing, m, n); + void* GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n); __visc__bindIn(GSNode, 0, 0, 0); // Bind I __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs @@ -241,7 +241,7 @@ void laplacianEstimate(float *Is, size_t bytesIs, __visc__hint(visc::DEVICE); __visc__attributes(2, Is, B, 1, L); // 3x3 image area - float imageArea[SZB][SZB]; + float imageArea[SZB*SZB]; //int gx = get_global_id(0); //int gy = get_global_id(1); @@ -255,64 +255,64 @@ void laplacianEstimate(float *Is, size_t bytesIs, if ((gx < n) && (gy < m)) { // Data copy for dilation filter - imageArea[1][1] = Is[gy * n + gx]; + imageArea[1 * SZB +1] = Is[gy * n + gx]; if (gx == 0) { - imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR; + imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MIN_BR; } else { - imageArea[1][0] = Is[gy * n + gx - 1]; - imageArea[0][0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR; - imageArea[2][0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR; + imageArea[1 * SZB +0] = Is[gy * n + gx - 1]; + imageArea[0 * SZB +0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR; + imageArea[2 * SZB +0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR; } if (gx == n - 1) { - imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR; + imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MIN_BR; } else { - imageArea[1][2] = Is[gy * n + gx + 1]; - imageArea[0][2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR; - imageArea[2][2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR; + imageArea[1 * SZB +2] = Is[gy * n + gx + 1]; + imageArea[0 * SZB +2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR; + imageArea[2 * SZB +2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR; } - imageArea[0][1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR; - imageArea[2][1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR; + imageArea[0 * SZB +1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR; + imageArea[2 * SZB +1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR; // Compute pixel of dilated image float dilatedPixel = MIN_BR; for (i = 0; i < SZB; i++) for (j = 0; j < SZB; j++) - dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]); + dilatedPixel = _MAX(dilatedPixel, imageArea[i * SZB +j] * B[i*SZB + j]); // Data copy for erotion filter - only change the boundary conditions if (gx == 0) { - imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR; + imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MAX_BR; } else { - if (gy == 0) imageArea[0][0] = MAX_BR; - if (gy == m-1) imageArea[2][0] = MAX_BR; + if (gy == 0) imageArea[0 * SZB +0] = MAX_BR; + if (gy == m-1) imageArea[2 * SZB +0] = MAX_BR; } if (gx == n - 1) { - imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR; + imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MAX_BR; } else { - if (gy == 0) imageArea[0][2] = MAX_BR; - if (gy == m-1) imageArea[2][2] = MAX_BR; + if (gy == 0) imageArea[0 * SZB +2] = MAX_BR; + if (gy == m-1) imageArea[2 * SZB +2] = MAX_BR; } - if (gy == 0) imageArea[0][1] = MAX_BR; - if (gy == m-1) imageArea[2][1] = MAX_BR; + if (gy == 0) imageArea[0 * SZB +1] = MAX_BR; + if (gy == m-1) imageArea[2 * SZB +1] = MAX_BR; // Compute pixel of eroded image float erodedPixel = MAX_BR; for (i = 0; i < SZB; i++) for (j = 0; j < SZB; j++) - erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]); + erodedPixel = _MIN(erodedPixel, imageArea[i * SZB +j] * B[i*SZB + j]); - float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1][1]; + float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB +1]; L[gy*n+gx] = laplacian; } //OutStruct output = {bytesB, bytesL}; //if(gx == m-1 && gy == n-1) //std::cout << "Exit laplacian\n"; - __visc__return(m); + __visc__return(1, m); } void WrapperlaplacianEstimate(float *Is, size_t bytesIs, @@ -321,7 +321,7 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs, long m, long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, Is, B, 1, L); - void* LNode = __visc__createNode2D(laplacianEstimate, m, n); + void* LNode = __visc__createNodeND(2, laplacianEstimate, m, n); __visc__bindIn(LNode, 0, 0, 0); // Bind Is __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs __visc__bindIn(LNode, 2, 2, 0); // Bind B @@ -434,7 +434,7 @@ void computeZeroCrossings(float *L, size_t bytesL, //OutStruct output = {bytesB, bytesS}; //if(gx == n-1 && gy == n-1) //std::cout << "Exit ZC\n"; - __visc__return(m); + __visc__return(1, m); } void WrapperComputeZeroCrossings(float *L, size_t bytesL, @@ -443,7 +443,7 @@ void WrapperComputeZeroCrossings(float *L, size_t bytesL, long m, long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, L, B, 1, S); - void* ZCNode = __visc__createNode2D(computeZeroCrossings, m, n); + void* ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n); __visc__bindIn(ZCNode, 0, 0, 0); // Bind L __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL __visc__bindIn(ZCNode, 2, 2, 0); // Bind B @@ -518,10 +518,10 @@ void computeGradient(float *Is, size_t bytesIs, Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j]; } - G[gloc] = __visc__sqrt(Gx*Gx + Gy*Gy); + G[gloc] = sqrt(Gx*Gx + Gy*Gy); //G[gloc] = Gx*Gx + Gy*Gy; } - __visc__return(n); + __visc__return(1, n); } void WrapperComputeGradient(float *Is, size_t bytesIs, @@ -531,7 +531,7 @@ void WrapperComputeGradient(float *Is, size_t bytesIs, long m, long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(3, Is, Sx, Sy, 1, G); - void* CGNode = __visc__createNode2D(computeGradient, m, n); + void* CGNode = __visc__createNodeND(2, computeGradient, m, n); __visc__bindIn(CGNode, 0, 0, 0); // Bind Is __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx @@ -557,7 +557,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG, size_t bytesMaxG, long m, long n) { - __visc__hint(visc::DEVICE); + __visc__hint(visc::CPU_TARGET); //__visc__hint(visc::CPU_TARGET); __visc__attributes(1, G, 1, maxG); @@ -575,15 +575,16 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, } // First thread iterates over all elements of the thread block - if (lx == 0) { - for (int i = 1; (i < dimx) && (i < m*n); i++) + long bounds = dimx < m*n ? dimx : m*n; + if (lx == 0) { + for (int i = 1; i < bounds; i++) if (G[lx] < G[i]) G[lx] = G[i]; *maxG = G[lx]; } - __visc__return(n); + __visc__return(1, n); } /* @@ -626,7 +627,7 @@ void computeMaxGradientLeaf(float *G, size_t bytesG, //if (lx == 0) //__visc__atomic_max(maxG,G[gid]); - //__visc__return(m); + //__visc__return(1, m); //} void computeMaxGradientTB(float *G, size_t bytesG, @@ -636,7 +637,7 @@ void computeMaxGradientTB(float *G, size_t bytesG, //__visc__hint(visc::DEVICE); __visc__hint(visc::CPU_TARGET); __visc__attributes(2, G, maxG, 1, maxG); - void* CMGLeafNode = __visc__createNode1D(computeMaxGradientLeaf, block_x); + void* CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x); __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG @@ -653,7 +654,7 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG, long block_x, long grid_x) { __visc__hint(visc::CPU_TARGET); __visc__attributes(2, G, maxG, 1, maxG); - void* CMGTBNode = __visc__createNode1D(computeMaxGradientTB, grid_x); + void* CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x); __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG @@ -693,7 +694,7 @@ void rejectZeroCrossings(float *S, size_t bytesS, if ((gx < n) && (gy < m)) { E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ; } - __visc__return(m); + __visc__return(1, m); } void WrapperRejectZeroCrossings(float *S, size_t bytesS, @@ -703,7 +704,7 @@ void WrapperRejectZeroCrossings(float *S, size_t bytesS, long m, long n) { __visc__hint(visc::CPU_TARGET); __visc__attributes(3, S, G, maxG, 1, E); - void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n); + void* RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n); __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G @@ -739,12 +740,12 @@ void edgeDetection(float *I, size_t bytesI, // 0 ) { __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E); __visc__hint(visc::CPU_TARGET); - void* GSNode = __visc__createNode(WrapperGaussianSmoothing); - void* LNode = __visc__createNode(WrapperlaplacianEstimate); - void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings); - void* CGNode = __visc__createNode(WrapperComputeGradient); - void* CMGNode = __visc__createNode(WrapperComputeMaxGradient); - void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings); + void* GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing); + void* LNode = __visc__createNodeND(0, WrapperlaplacianEstimate); + void* CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings); + void* CGNode = __visc__createNodeND(0, WrapperComputeGradient); + void* CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient); + void* RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings); // Gaussian Inputs __visc__bindIn(GSNode, 0 , 0, 1); // Bind I @@ -873,7 +874,7 @@ int main (int argc, char *argv[]) { } int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT); - //NUM_FRAMES = 5; + NUM_FRAMES = 600; std::cout << "Number of frames = " << NUM_FRAMES << "\n"; // Used to store time after each frame computation is completed -- GitLab