Fixed pipeline benchmark to not specify return types of internal nodes. That...

Fixed the pipeline benchmark so that it no longer specifies return types for internal nodes; the return types are inferred automatically from the __visc__bindOut intrinsics.

Fixed pipeline benchmark to not specify return types of internal nodes. That...
0ad4f77d · Prakalp Srivastava · 25bedf2c · 0ad4f77d · 0ad4f77d
Commit 0ad4f77d authored 9 years ago by Prakalp Srivastava
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/Makefile
@@ -22,14 +22,15 @@ SRCDIR = src/$(VERSION)
 BUILDDIR = build/$(VERSION)_$(PLATFORM)
 DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)

-MATRIX1 = $(DATASET_DIR)/$(TEST)/input/matrix1.txt
-MATRIX2 = $(DATASET_DIR)/$(TEST)/input/matrix2.txt
-MATRIX2T = $(DATASET_DIR)/$(TEST)/input/matrix2t.txt
-REF_OUTPUT = $(DATASET_DIR)/$(TEST)/output/matrix3.txt
+IMAGE = $(DATASET_DIR)/$(TEST)/input/edgetest_10.png
+VIDEO = $(DATASET_DIR)/$(TEST)/input/360x240_1mb.mp4
+#VIDEO = $(DATASET_DIR)/$(TEST)/input/bird.avi
+REF_OUTPUT = $(DATASET_DIR)/$(TEST)/output/output.txt
 RUNDIR = run/$(VERSION)/$(TEST)
-OUTPUT = $(RUNDIR)/matrix3.txt
+OUTPUT = $(RUNDIR)/output.txt

-ARGS = -i $(MATRIX1),$(MATRIX2),$(MATRIX2T) -o $(OUTPUT)
+ARGS = -i $(IMAGE) -o $(OUTPUT)
+#ARGS = -i $(VIDEO) -o $(OUTPUT)
 TOOL = tools/compare-output
 #TOOL=echo
 include $(PARBOIL_ROOT)/common/mk/Makefile
--- a/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/pipeline/src/visc/main.cc
@@ -10,6 +10,8 @@
 * Main entry of dense matrix-matrix multiplication kernel
 */

+#include "opencv2/opencv.hpp"
+#include "opencv2/core/ocl.hpp"
 #include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
@@ -41,12 +43,12 @@ extern char* readFile(const char*);
 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))

 extern "C" {
-struct __attribute__((__packed__)) OutStruct {
+//struct __attribute__((__packed__)) OutStruct {
  //int m;
  //int n;
-  size_t bytesB;
-  size_t bytesOut;
-};
+  //size_t bytesB;
+  //size_t bytesOut;
+//};

 struct __attribute__((__packed__)) InStruct {
  float* I ;
@@ -57,6 +59,10 @@ struct __attribute__((__packed__)) InStruct {
  size_t bytesL;
  float* S;
  size_t bytesS;
+  float* G;
+  size_t bytesG;
+  float* E;
+  size_t bytesE;
  int m;
  int n;
 };
@@ -66,6 +72,8 @@ void packData(struct InStruct* args, float* I, size_t bytesI,
                                     float* B, size_t bytesB,
                                     float* L, size_t bytesL,
                                     float* S, size_t bytesS,
+                                     float* G, size_t bytesG,
+                                     float* E, size_t bytesE,
                                     int m, int n) {
  args->I = I;
  args->bytesI = bytesI;
@@ -75,6 +83,10 @@ void packData(struct InStruct* args, float* I, size_t bytesI,
  args->bytesL = bytesL;
  args->S = S;
  args->bytesS = bytesS;
+  args->G = G;
+  args->bytesG = bytesG;
+  args->E = E;
+  args->bytesE = bytesE;
  args->m = m;
  args->n = n;
 }
@@ -167,10 +179,10 @@ void laplacianEstimate(float *I, size_t bytesI,
  //OutStruct output = {bytesB, bytesL};
  //if(gx == m-1 && gy == n-1)
    //std::cout << "Exit laplacian\n";
-  __visc__return(bytesB, bytesL);
+  __visc__return(m);
 }

-OutStruct WrapperlaplacianEstimate(float *I, size_t bytesI,
+void WrapperlaplacianEstimate(float *I, size_t bytesI,
                          float *B, size_t bytesB,
                          float *L, size_t bytesL,
                          int m, int n) {
@@ -187,9 +199,7 @@ OutStruct WrapperlaplacianEstimate(float *I, size_t bytesI,
  __visc__bindIn(LNode, 7, 7, 0); // Bind n

  __visc__bindOut(LNode, 0, 0, 0); // bind output m
-  __visc__bindOut(LNode, 1, 1, 0); // bind output n

-  return {0};
 }

 /* Compute the zero crossings of input image L of size m x n */
@@ -279,10 +289,12 @@ void computeZeroCrossings(float *L, size_t bytesL,
  //OutStruct output = {bytesB, bytesS};
  //if(gx == n-1 && gy == n-1)
    //std::cout << "Exit ZC\n";
-  __visc__return(bytesB, bytesS);
+  // FIXME: Return bytesG instead of m. The RZC node expects bytesG.
+  // This is just temporary.
+  __visc__return(m); 
 }

-OutStruct WrapperComputeZeroCrossings(float *L, size_t bytesL,
+void WrapperComputeZeroCrossings(float *L, size_t bytesL,
                          float *B, size_t bytesB,
                          float *S, size_t bytesS,
                          int m, int n) {
@@ -299,80 +311,183 @@ OutStruct WrapperComputeZeroCrossings(float *L, size_t bytesL,
  __visc__bindIn(ZCNode, 7, 7, 0); // Bind n

  __visc__bindOut(ZCNode, 0, 0, 0); // bind output m
-  __visc__bindOut(ZCNode, 1, 1, 0); // bind output n

-  return {0};
 }

+/*
+ * S: sign of image
+ * G: gradient of (smoothed) image
+ * E: output image : edges
+ * theta: threshold value
+ * maxG: computed maximum gradient of G
+ */
+void rejectZeroCrossings(float *S, size_t bytesS,
+                         float *G, size_t bytesG,
+                         float *E, size_t bytesE,
+                         //float theta, float maxG,
+                         int m, int n) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, S, G, 1, E);
+
+  void* thisNode = __visc__getNode();
+  int gx = __visc__getNodeInstanceID_x(thisNode);
+  int gy = __visc__getNodeInstanceID_y(thisNode);
+  float theta = 1.0;
+  float maxG = 10.0;
+  if ((gx < n) && (gy < m)) {
+    E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > theta*maxG)) ? 1.0 : 0.0 ;
+  }
+  
+  __visc__return(m);
+       
+}
+
+void WrapperRejectZeroCrossings(float *S, size_t bytesS,
+                          float *G, size_t bytesG,
+                          float *E, size_t bytesE,
+                          //float theta, float maxG,
+                          int m, int n) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, S, G, 1, E);
+  void* RZCNode = __visc__createNode2D(rejectZeroCrossings, m, n);
+  __visc__bindIn(RZCNode, 0, 0, 0); // Bind S
+  __visc__bindIn(RZCNode, 1, 1, 0); // Bind bytesS
+  __visc__bindIn(RZCNode, 2, 2, 0); // Bind G
+  __visc__bindIn(RZCNode, 3, 3, 0); // Bind bytesG
+  __visc__bindIn(RZCNode, 4, 4, 0); // Bind E
+  __visc__bindIn(RZCNode, 5, 5, 0); // Bind bytesE
+  __visc__bindIn(RZCNode, 6, 6, 0); // Bind m
+  __visc__bindIn(RZCNode, 7, 7, 0); // Bind n
+
+  __visc__bindOut(RZCNode, 0, 0, 0); // bind output bytesG
+
+  //return {0};
+}
+
+
+
 // Pipelined Root node
-OutStruct edgeDetection(float *I, size_t bytesI,
+void edgeDetection(float *I, size_t bytesI,
                          float *B, size_t bytesB,
                          float *L, size_t bytesL,
                          float *S, size_t bytesS,
+                          float *G, size_t bytesG,
+                          float *E, size_t bytesE,
                          int m, int n) {
-  __visc__attributes(2, I, B, 2, L, S);
-  void* PNode = __visc__createNode(WrapperlaplacianEstimate);
-  void* CNode = __visc__createNode(WrapperComputeZeroCrossings);
-
-  __visc__bindIn(PNode, 0, 0, 1); // Bind I
-  __visc__bindIn(PNode, 1, 1, 1); // Bind bytesI
-  __visc__bindIn(PNode, 2, 2, 1); // Bind B
-  __visc__bindIn(PNode, 3, 3, 1); // Bind bytesB
-  __visc__bindIn(PNode, 4, 4, 1); // Bind L
-  __visc__bindIn(PNode, 5, 5, 1); // Bind bytesL
-  __visc__bindIn(PNode, 8, 6, 1); // Bind m
-  __visc__bindIn(PNode, 9, 7, 1); // Bind n
-
-  __visc__bindIn(CNode, 4, 0, 1); // Bind L
-  //__visc__bindIn(CNode, 5, 1, 1); // Bind bytesL
-  __visc__edge(PNode, CNode, 1, 1, 1); // Get bytesL
-  __visc__bindIn(CNode, 2, 2, 1); // Bind B
-  //__visc__bindIn(CNode, 3, 3, 1); // Bind bytesB
-  __visc__edge(PNode, CNode, 0, 3, 1); // Get bytesB
-  __visc__bindIn(CNode, 6, 4, 1); // Bind S
-  __visc__bindIn(CNode, 7, 5, 1); // Bind bytesS
-
-  __visc__bindIn(CNode, 8, 6, 1); // Bind m
-  __visc__bindIn(CNode, 9, 7, 1); // Bind n
-
-  //__visc__edge(PNode, CNode, 0, 6, 1); // pass m from P->C 
-  //__visc__edge(PNode, CNode, 1, 7, 1); // pass n from P->C
-
-  __visc__bindOut(CNode, 0, 0, 1); // dummy bind output to get pipeline functionality
-  __visc__bindOut(CNode, 1, 1, 1); // dummy bind output to get pipeline functionality
-  return {0};
+  __visc__attributes(2, I, B, 4, L, S, G, E);
+  void* LNode = __visc__createNode(WrapperlaplacianEstimate);
+  void* CZCNode = __visc__createNode(WrapperComputeZeroCrossings);
+  void* RZCNode = __visc__createNode(WrapperRejectZeroCrossings);
+
+  // Laplacian Inputs
+  __visc__bindIn(LNode, 0, 0, 1); // Bind I
+  __visc__bindIn(LNode, 1, 1, 1); // Bind bytesI
+  __visc__bindIn(LNode, 2, 2, 1); // Bind B
+  __visc__bindIn(LNode, 3, 3, 1); // Bind bytesB
+  __visc__bindIn(LNode, 4, 4, 1); // Bind L
+  __visc__bindIn(LNode, 5, 5, 1); // Bind bytesL
+  __visc__bindIn(LNode, 12, 6, 1); // Bind m
+  __visc__bindIn(LNode, 13, 7, 1); // Bind n
+
+  // Compute ZC Inputs
+  __visc__bindIn(CZCNode, 4, 0, 1); // Bind L
+  __visc__bindIn(CZCNode, 5, 1, 1); // Get bytesL
+  __visc__bindIn(CZCNode, 2, 2, 1); // Bind B
+  __visc__bindIn(CZCNode, 3, 3, 1); // Get bytesB
+  __visc__bindIn(CZCNode, 6, 4, 1); // Bind S
+  __visc__bindIn(CZCNode, 7, 5, 1); // Bind bytesS
+  //__visc__bindIn(CZCNode, 12, 6, 1); // Bind m
+  __visc__edge(LNode, CZCNode, 0, 6, 1);
+  __visc__bindIn(CZCNode, 13, 7, 1); // Bind n
+
+  // Reject ZC Inputs
+  __visc__bindIn(RZCNode, 6, 0, 1); // Bind S
+  __visc__bindIn(RZCNode, 7, 1, 1); // Get bytesS
+  __visc__bindIn(RZCNode, 8, 2, 1); // Bind G
+  __visc__bindIn(RZCNode, 9, 3, 1); // Get bytesG
+  __visc__bindIn(RZCNode, 10, 4, 1); // Bind E
+  __visc__bindIn(RZCNode, 11, 5, 1); // Bind bytesE
+  //__visc__bindIn(RZCNode, 12, 6, 1); // Bind m
+  __visc__edge(CZCNode, RZCNode, 0, 6, 1);
+  __visc__bindIn(RZCNode, 13, 7, 1); // Bind n
+
+  __visc__bindOut(RZCNode, 0, 0, 1); // dummy bind output to get pipeline functionality
 }

 }
-#define NUM_RUNS 5
+#define NUM_RUNS 1
 #define NUM_FRAMES 20
+using namespace cv;

 int main (int argc, char *argv[]) {

    struct pb_Parameters *params;
    struct pb_TimerSet timers;

-    size_t I_sz, L_sz, S_sz;
+    size_t I_sz, L_sz, S_sz, G_sz, E_sz;
    int matIrow, matIcol;
    std::vector<float> matI;
+    std::string window_name = "Edge Map";

+    std::cout << "Using OpenCV" << CV_VERSION << "\n";

    /* Read command line. Expect 3 inputs: A, B and B^T
       in column-major layout*/
    params = pb_ReadParameters(&argc, argv);
    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
+            || (params->inpFiles[1] != NULL))
    {
-        fprintf(stderr, "Expecting three input filenames\n");
+        fprintf(stderr, "Expecting input image filename\n");
        exit(-1);
    }

    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matIrow, matIcol, matI);
+    //std::cout << "Reading video file: " << params->inpFiles[0] << "\n";
+    //VideoCapture cap(params->inpFiles[0]);
+    //if(!cap.isOpened()) {
+      //std::cout << "Could not open video file" << "\n";
+      //return -1;
+    //}
+
+    //double count =  cap.get(CV_CAP_PROP_FRAME_COUNT);
+    //namedWindow(window_name, CV_WINDOW_AUTOSIZE);
+
+    //Mat src;
+    //while(1) {
+      //bool success = cap.read(src);
+      //if(!success) {
+        //std::cout << "Cannot read frame" << "\n";
+        //return -1;
+      //}
+      //imshow(window_name, src);
+      //if(waitKey(0) == 27) break;
+    //}
+    
+    Mat src = imread(params->inpFiles[0]);
+    if( !src.data )
+    {
+        return -1;
+    }
+    /// Create a window
+    namedWindow( window_name, CV_WINDOW_AUTOSIZE );
+    imshow(window_name, src);
+    waitKey(0);
+    std::cout << "Image dimension = " << src.size() << "\n";
+    /// Convert the image to grayscale
+    cvtColor( src, src, CV_BGR2GRAY );
+
+    src.convertTo(src, CV_32F, 1.0/255.0);
+    if(src.isContinuous()) {
+      matI.assign((float*)src.datastart, (float*)src.dataend);
+      std::cout << "Assigning image to float vector\n";
+      std::cout << "Vector size = " << matI.size() << "\n";
+      matIrow = src.size[0];
+      matIcol = src.size[1];
+    }
+    else {
+      std::cout << "Expecting contiguous storage of image in memory!\n";
+      exit(-1);
+    }

    pb_InitializeTimerSet(&timers);
    __visc__init();
@@ -384,10 +499,14 @@ int main (int argc, char *argv[]) {

    // allocate space for C
    S_sz = I_sz;
+    G_sz = I_sz;
+    E_sz = I_sz;

    // OpenCL memory allocation
    std::vector<float> matL(matIrow*matIcol);
    std::vector<float> matS(matIrow*matIcol);
+    std::vector<float> matG(matIrow*matIcol);
+    std::vector<float> matE(matIrow*matIcol);

    float B[] = {0, 1, 0, 1, 1, 1, 0, 1, 0};
    size_t bytesB = 9*sizeof(float);
@@ -402,6 +521,8 @@ int main (int argc, char *argv[]) {
    for(size_t i=0; i<matL.size(); i++) {
        matL[i] = 0.0f;
        matS[i] = 0.0f;
+        matG[i] = 0.0f;
+        matE[i] = 0.0f;
    }

    pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
@@ -411,6 +532,8 @@ int main (int argc, char *argv[]) {
                   B, bytesB,
                   &matL.front(), L_sz,
                   &matS.front(), S_sz,
+                   &matG.front(), G_sz,
+                   &matE.front(), E_sz,
                   matIrow, matIcol);

    assert(I_sz % BlockSize == 0);
@@ -421,6 +544,8 @@ int main (int argc, char *argv[]) {
      llvm_visc_track_mem(&matI[0], I_sz);
      llvm_visc_track_mem(&matL[0], L_sz);
      llvm_visc_track_mem(&matS[0], S_sz);
+      llvm_visc_track_mem(&matG[0], G_sz);
+      llvm_visc_track_mem(&matE[0], E_sz);
      llvm_visc_track_mem(B, bytesB);
      //packData(args, &matA[0], BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
      __visc__push(DFG, args);
@@ -435,6 +560,8 @@ int main (int argc, char *argv[]) {
      llvm_visc_untrack_mem(&matI[0]);
      llvm_visc_untrack_mem(&matL[0]);
      llvm_visc_untrack_mem(&matS[0]);
+      llvm_visc_untrack_mem(&matG[0]);
+      llvm_visc_untrack_mem(&matE[0]);
      llvm_visc_untrack_mem(B);

    //for(unsigned i=0 ; i < I_sz/sizeof(float); i = i + 16) {
@@ -472,6 +599,10 @@ int main (int argc, char *argv[]) {
        writeColMajorMatrixFile(params->outFile,
                                matIrow, matIcol, matS);
    }
+    Mat output(src.size[0], src.size[1], CV_32F);
+    memcpy(output.data, matS.data(), matS.size()*sizeof(float));
+    imshow(window_name, output);
+    waitKey(0);

    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
    std::cout<< "GFLOPs = " << 2.* matIrow * matIcol * matIcol/GPUtime/1e9 << std::endl;