MergeDFN: 2Level, CC with independent nodes testcase

109e2276 · Maria Kotsifakou · e635f322 · 109e2276 · 109e2276 · 109e2276
Commit 109e2276 authored 8 years ago by Maria Kotsifakou
--- a/llvm/test/VISC/parboil/benchmarks/merge-tests/src/2LevelICC/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/merge-tests/src/2LevelICC/Makefile
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+# Build settings for the 2LevelICC merge test.
+# NOTE(review): these variables are presumably consumed by the shared Parboil
+# makefiles; that is not visible from this file -- confirm.
+# Comments are kept on their own lines so no variable value gains trailing text.
+LANGUAGE=visc
+# compute_gold.o is commented out, so io.ll is the only entry here.
+SRCDIR_OBJS=io.ll #compute_gold.o
+VISC_OBJS=main.visc.ll
+# Links against libm and libstdc++.
+APP_CUDALDFLAGS=-lm -lstdc++
+# The C and C++ flags are kept identical.
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
+
--- a/llvm/test/VISC/parboil/benchmarks/merge-tests/src/2LevelICC/io.cc
+++ b/llvm/test/VISC/parboil/benchmarks/merge-tests/src/2LevelICC/io.cc
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+/* Read the whole of fileName into a freshly allocated, NUL-terminated buffer.
+ *
+ * Returns NULL (after printing a message to stderr) when the file cannot be
+ * opened. Otherwise returns a buffer allocated with new[]; the caller owns it
+ * and must release it with delete[]. An empty or unsizable file yields an
+ * empty string rather than NULL.
+ */
+char* readFile(const char* fileName)
+{
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	// Measure the file; tellg() reports -1 on failure, which is treated as empty.
+	f.seekg(0,std::ios::end);
+	const std::streamoff reported = f.tellg();
+	f.seekg(0,std::ios::beg);
+	const std::streamsize length = (reported>0) ? static_cast<std::streamsize>(reported) : 0;
+
+	// One extra byte for the terminator, so the last byte of the file is no
+	// longer overwritten. Always the array form of new, so delete[] is valid
+	// for every buffer this function returns.
+	char* buffer = new char[length+1];
+
+	std::streamsize got = 0;
+	if(length>0)
+	{
+		f.read(buffer,length);
+		// Terminate after what was actually read, which may be shorter
+		// than the reported length.
+		got = f.gcount();
+	}
+	buffer[got]=0;
+
+	// The stream closes itself when f goes out of scope.
+	return buffer;
+}
+
+/* Read a matrix file: two integers (rows, then columns) followed by
+ * whitespace-separated float values.
+ *
+ * fn      path of the file to read
+ * nr_row  receives the first integer of the file
+ * nr_col  receives the second integer of the file
+ * v       every value that parses as a float is appended (v is not cleared)
+ *
+ * Returns false when the file cannot be opened or the two header integers
+ * cannot be parsed, true otherwise. The number of values read is not checked
+ * against nr_row*nr_col.
+ */
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  if ( !(f >> nr_row >> nr_col) ) {
+    return false;
+  }
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+
+  // Test the extraction itself rather than f.good() before it: a value is
+  // stored only when it parsed, so the last element is kept whether or not
+  // the file ends with whitespace, and nothing has to be popped afterwards.
+  float data;
+  while (f >> data) {
+    v.push_back(data);
+  }
+  return true;
+
+}
+
+/* Write a matrix file: "<nr_row> <nr_col> " followed by every element of v,
+ * each followed by a single space, and a final newline.
+ *
+ * fn      path of the file to create or overwrite
+ * nr_row  first header integer
+ * nr_col  second header integer
+ * v       values to write, in storage order (not modified)
+ *
+ * Returns false when the file cannot be opened or the stream is in a failed
+ * state after the data has been flushed, true otherwise.
+ */
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Write # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  // The index uses the vector's own size type, avoiding a signed/unsigned comparison.
+  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+
+  // Flush before reporting, so a failed write is not reported as success.
+  f.flush();
+  return f.good();
+
+}
--- a/llvm/test/VISC/parboil/benchmarks/merge-tests/src/2LevelICC/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/merge-tests/src/2LevelICC/main.cc
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <vector>
+#include <iostream>
+#include <parboil.h>
+#include <visc.h>
+
+// I/O routines (defined in io.cc of this benchmark)
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
+extern char* readFile(const char*);
+
+// Parameters of tile sizes
+// TILE_M evaluates to 16*8 = 128. basicSgemm rejects inputs whose m is not a
+// multiple of TILE_M or whose n is not a multiple of TILE_N.
+#define TILE_N 16
+#define TILE_TB_HEIGHT 8
+#define TILE_M (TILE_N*TILE_TB_HEIGHT)
+
+// When clStatus != CL_SUCCESS: prints errorMessage followed by " Error!" and
+// the current source line to stdout, then exits with status 1.
+// Both clStatus and CL_SUCCESS must be visible where the macro is expanded;
+// neither is declared in the visible part of this file, and the macro is not
+// invoked anywhere in it.
+// The expansion is a bare if statement (no do/while wrapper), so placing it
+// directly before an else would capture that else.
+#define CHECK_ERROR(errorMessage)           \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     std::cout<<errorMessage<<" Error!\n";  \
+     std::cout<<"Line: "<<__LINE__<<"\n";   \
+     exit(1);                               \
+  }
+
+// Argument record for the Root graph: packData() fills it and basicSgemm()
+// passes its address to __visc__launch.
+// The members appear in the same order as Root's parameters (A, bytesA, B,
+// bytesB, C, bytesC, D, bytesD, block_x, block_y, grid_x, grid_y).
+// The struct is declared packed, so no padding is inserted between members.
+typedef struct __attribute__((__packed__)) {
+    float *A;        // first input matrix
+    size_t bytesA;   // size of A in bytes (main passes rows*cols*sizeof(float))
+    float *B;        // second input matrix (main passes B^T)
+    size_t bytesB;   // size of B in bytes
+    float *C;        // buffer updated by the Mul nodes
+    size_t bytesC;   // size of C in bytes
+    float *D;        // buffer updated by the Sum nodes
+    size_t bytesD;   // size of D in bytes
+    int block_x;     // instance counts used for the leaf nodes
+    int block_y;
+    int grid_x;      // instance counts used for the internal nodes
+    int grid_y;
+}
+RootIn;
+
+// Copy every launch argument into *args, the record later handed to
+// __visc__launch. args must point to a valid RootIn; nothing is allocated here.
+void packData(RootIn* args,
+              float *A, size_t bytesA,
+              float *B, size_t bytesB,
+              float *C, size_t bytesC,
+              float *D, size_t bytesD,
+              int block_x,
+              int block_y,
+              int grid_x,
+              int grid_y) {
+    RootIn& packed = *args;
+
+    // Node instance counts.
+    packed.grid_x  = grid_x;
+    packed.grid_y  = grid_y;
+    packed.block_x = block_x;
+    packed.block_y = block_y;
+
+    // Each buffer pointer is stored together with its byte count.
+    packed.A = A;    packed.bytesA = bytesA;
+    packed.B = B;    packed.bytesB = bytesB;
+    packed.C = C;    packed.bytesC = bytesC;
+    packed.D = D;    packed.bytesD = bytesD;
+}
+
+// Leaf node of the Mul branch: each instance updates a single element,
+// C[i] = C[i] + A[i] * B[i], where the flat index i is computed from this
+// instance's ID and its parent instance's ID. bytesB and bytesC are unused in
+// the body; bytesA is handed back through __visc__return.
+void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC)
+{
+    __visc__hint(visc::DEVICE);
+    //__visc__hint(visc::SPIR_TARGET);
+    // TODO: shB is not an in or out attribute
+    // NOTE(review): there is no shB in this function; the TODO above looks
+    // like a leftover from another kernel.
+    // Presumably declares 3 input pointers (A, B, C) and 1 output pointer (C)
+    // -- confirm against the VISC intrinsic documentation.
+    __visc__attributes(3, A, B, C, 1, C);
+
+    void* thisNode = __visc__getNode();
+    void* parentNode = __visc__getParentNode(thisNode);
+
+    // Instance ID of this leaf.
+    int lx = __visc__getNodeInstanceID_x(thisNode);
+    int ly = __visc__getNodeInstanceID_y(thisNode);
+
+    // Instance ID of the parent (InternalMul) node.
+    int gx = __visc__getNodeInstanceID_x(parentNode);
+    int gy = __visc__getNodeInstanceID_y(parentNode);
+
+    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
+    int gridx = __visc__getNumNodeInstances_x(parentNode);
+    int gridy = __visc__getNumNodeInstances_y(parentNode);
+    //int dimy = __visc__getNumNodeInstances_y(thisNode);
+
+    // NOTE(review): the parent ID is scaled by the parent's instance count
+    // (gridx / gridy), while the row stride dimx is blockDimx*gridx. If a
+    // global element index is intended, gx*blockDimx+lx (and the y analogue)
+    // would be expected; the two agree only when the leaf and parent instance
+    // counts are equal -- confirm the intent.
+    int x = gx*gridx+lx;
+    int y = gy*gridy+ly;
+    int dimx = blockDimx*gridx;
+
+    C[x+y*dimx] = C[x+y*dimx] + A[x+y*dimx] * B[x+y*dimx];
+    // bytesA is this node's only output; InternalMul binds it with __visc__bindOut.
+    __visc__return(bytesA);
+}
+
+// Internal node of the Mul branch: creates a 2D LeafMul node with
+// block_x by block_y instances and forwards its own first six arguments
+// (A, bytesA, B, bytesB, C, bytesC) to it.
+void InternalMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
+                  int block_x, int block_y ) {
+    __visc__hint(visc::DEVICE);
+    //__visc__hint(visc::SPIR_TARGET);
+    // TODO: shB is not an in or out attribute
+    // NOTE(review): there is no shB in this function; the TODO above looks
+    // like a leftover from another kernel.
+    __visc__attributes(3, A, B, C, 1, C);
+    void* LeafMulNode = __visc__createNode2D(LeafMul, block_x, block_y);
+
+    // Bind inputs
+    // Second argument: index into this function's parameters; third: index
+    // into LeafMul's parameters (they coincide here). The meaning of the final
+    // 0 is not visible in this file -- presumably a streaming flag, confirm.
+    __visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
+    __visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
+    __visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
+    __visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
+    __visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
+    __visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
+
+    // Bind outputs
+    // LeafMul's single output (bytesA) becomes output 0 of this node.
+    __visc__bindOut(LeafMulNode, 0, 0, 0); // Bind bytesA
+
+}
+
+// Leaf node of the Sum branch: each instance updates a single element,
+// D[i] = D[i] + A[i] + B[i], where the flat index i is computed from this
+// instance's ID and its parent instance's ID. bytesB and bytesD are unused in
+// the body; bytesA is handed back through __visc__return.
+void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD)
+{
+    __visc__hint(visc::DEVICE);
+    //__visc__hint(visc::SPIR_TARGET);
+    // TODO: shB is not an in or out attribute
+    // NOTE(review): there is no shB in this function; the TODO above looks
+    // like a leftover from another kernel.
+    // Presumably declares 3 input pointers (A, B, D) and 1 output pointer (D)
+    // -- confirm against the VISC intrinsic documentation.
+    __visc__attributes(3, A, B, D, 1, D);
+
+    void* thisNode = __visc__getNode();
+    void* parentNode = __visc__getParentNode(thisNode);
+
+    // Instance ID of this leaf.
+    int lx = __visc__getNodeInstanceID_x(thisNode);
+    int ly = __visc__getNodeInstanceID_y(thisNode);
+
+    // Instance ID of the parent (InternalSum) node.
+    int gx = __visc__getNodeInstanceID_x(parentNode);
+    int gy = __visc__getNodeInstanceID_y(parentNode);
+
+    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
+    int gridx = __visc__getNumNodeInstances_x(parentNode);
+    int gridy = __visc__getNumNodeInstances_y(parentNode);
+    //int dimy = __visc__getNumNodeInstances_y(thisNode);
+
+    // NOTE(review): the parent ID is scaled by the parent's instance count
+    // (gridx / gridy), while the row stride dimx is blockDimx*gridx. If a
+    // global element index is intended, gx*blockDimx+lx (and the y analogue)
+    // would be expected; the two agree only when the leaf and parent instance
+    // counts are equal -- confirm the intent.
+    int x = gx*gridx+lx;
+    int y = gy*gridy+ly;
+    int dimx = blockDimx*gridx;
+
+    D[x+y*dimx] = D[x+y*dimx] + A[x+y*dimx] + B[x+y*dimx];
+    // bytesA is this node's only output; InternalSum binds it with __visc__bindOut.
+    __visc__return(bytesA);
+}
+
+// Internal node of the Sum branch: creates a 2D LeafSum node with
+// block_x by block_y instances and forwards its own first six arguments
+// (A, bytesA, B, bytesB, D, bytesD) to it.
+void InternalSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD,
+                  int block_x, int block_y) {
+    __visc__hint(visc::DEVICE);
+    //__visc__hint(visc::SPIR_TARGET);
+    // TODO: shB is not an in or out attribute
+    // NOTE(review): there is no shB in this function; the TODO above looks
+    // like a leftover from another kernel.
+    __visc__attributes(3, A, B, D, 1, D);
+    void* LeafSumNode = __visc__createNode2D(LeafSum, block_x, block_y);
+
+    // Bind inputs
+    // Second argument: index into this function's parameters; third: index
+    // into LeafSum's parameters (they coincide here). The meaning of the final
+    // 0 is not visible in this file -- presumably a streaming flag, confirm.
+    __visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
+    __visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
+    __visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
+    __visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
+    __visc__bindIn(LeafSumNode, 4, 4, 0); // Bind D
+    __visc__bindIn(LeafSumNode, 5, 5, 0); // Bind bytesD
+
+    // Bind outputs
+    // LeafSum's single output (bytesA) becomes output 0 of this node.
+    __visc__bindOut(LeafSumNode, 0, 0, 0); // Bind bytesA
+}
+
+//void LeafDest(size_t bytesC, size_t bytesD) {
+    //__visc__hint(visc::DEVICE);
+    //__visc__attributes(0, 0);
+
+    //__visc__return(bytesC, bytesD);
+//}
+
+// Root node of the test graph. Creates two internal nodes, InternalMul and
+// InternalSum, each with grid_x by grid_y instances. Both read A and B;
+// InternalMul additionally receives C and InternalSum receives D. No edge
+// connects the two nodes (the only __visc__edge call is commented out), so
+// they are independent of each other.
+void Root(float *A, size_t bytesA,
+               float *B, size_t bytesB,
+               float *C, size_t bytesC,
+               float *D, size_t bytesD,
+               int block_x,
+               int block_y,
+               int grid_x,
+               int grid_y) {
+    __visc__hint(visc::CPU_TARGET);
+    // Presumably declares 4 input pointers (A, B, C, D) and 2 output pointers
+    // (C, D) -- confirm against the VISC intrinsic documentation.
+    __visc__attributes(4, A, B, C, D, 2, C, D);
+    void* InternalMulNode = __visc__createNode2D(InternalMul, grid_x, grid_y);
+    void* InternalSumNode = __visc__createNode2D(InternalSum, grid_x, grid_y);
+    //void* LeafDestNode = __visc__createNode(LeafDest);
+
+    // Bind inputs
+    // Second argument: position in Root's parameter list; third: position in
+    // the child's parameter list. Root's block_x/block_y sit at 8/9 but are
+    // the child's parameters 6/7.
+    __visc__bindIn(InternalMulNode, 0, 0, 0); // Bind A
+    __visc__bindIn(InternalMulNode, 1, 1, 0); // Bind bytesA
+    __visc__bindIn(InternalMulNode, 2, 2, 0); // Bind B
+    __visc__bindIn(InternalMulNode, 3, 3, 0); // Bind bytesB
+    __visc__bindIn(InternalMulNode, 4, 4, 0); // Bind C
+    __visc__bindIn(InternalMulNode, 5, 5, 0); // Bind bytesC
+    __visc__bindIn(InternalMulNode, 8, 6, 0); // Bind block_x
+    __visc__bindIn(InternalMulNode, 9, 7, 0); // Bind block_y
+
+    // Bind inputs
+    // Root's D/bytesD (positions 6/7) feed InternalSum's parameters 4/5.
+    __visc__bindIn(InternalSumNode, 0, 0, 0); // Bind A
+    __visc__bindIn(InternalSumNode, 1, 1, 0); // Bind bytesA [Pass as edge]
+    __visc__bindIn(InternalSumNode, 2, 2, 0); // Bind B
+    __visc__bindIn(InternalSumNode, 3, 3, 0); // Bind bytesB
+    __visc__bindIn(InternalSumNode, 6, 4, 0); // Bind D
+    __visc__bindIn(InternalSumNode, 7, 5, 0); // Bind bytesD
+    __visc__bindIn(InternalSumNode, 8, 6, 0); // Bind block_x
+    __visc__bindIn(InternalSumNode, 9, 7, 0); // Bind block_y
+
+    // Bind Edges
+    // Disabled: with this edge InternalSum would take bytesA from
+    // InternalMul's output instead of from Root's input.
+    //__visc__edge(InternalMulNode, InternalSumNode, 0, 0, 1, 0); // Bind bytesA
+
+    //TODO: bindOut : for now with out attribute
+    // Output 0 of each child is exposed as Root output 0 (Mul) and 1 (Sum).
+    __visc__bindOut(InternalMulNode, 0, 0, 0); // bind output bytesA
+    __visc__bindOut(InternalSumNode, 0, 1, 0); // bind output bytesA
+}
+
+// Validates the arguments, packs them into a RootIn record, launches the Root
+// graph and blocks until it has finished.
+//
+// timers           Parboil timer set; switched to visc_TimerID_COMPUTATION
+//                  around the launch and back to pb_TimerID_COMPUTE afterwards
+// transa, transb   must be 'N'/'n' and 'T'/'t' respectively
+// m, n             m must be a multiple of TILE_M and n of TILE_N; the
+//                  internal-node grid is (m/16) x (n/16)
+// A, B, C, D       buffers forwarded to the graph together with their sizes
+// k, alpha, beta, lda, ldb, ldc
+//                  accepted for the sgemm-style signature but not used here
+//
+// On any validation or allocation failure a message is written to stderr and
+// the function returns without launching anything.
+__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
+{
+    if ((transa != 'N') && (transa != 'n')) {
+        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+        return;
+    }
+
+    if ((transb != 'T') && (transb != 't')) {
+        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+        return;
+    }
+
+    // In this code we assume the matrix sizes are multiple of tile size
+    if ((m%TILE_M) || (n%TILE_N)) {
+        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
+                  << "; n should be multiple of " << TILE_N << std::endl;
+        return;
+    }
+
+//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
+//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
+//    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
+
+//    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
+
+    // Leaf nodes get 16x16 instances; internal nodes get one instance per
+    // 16x16 block of the m x n index space.
+    int block_x = 16;
+    int block_y = 16;
+
+    int grid_x = m/block_x;
+    int grid_y = n/block_y;
+    // Pack data in struct
+    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
+    if (args == NULL) {
+        std::cerr << "failed to allocate the Root argument struct" << std::endl;
+        return;
+    }
+    packData(args,
+             A, bytesA,
+             B, bytesB,
+             C, bytesC,
+             D, bytesD,
+             block_x,
+             block_y,
+             grid_x,
+             grid_y
+            );
+
+    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
+    void* DFG = __visc__launch(0, Root, (void*) args);
+
+    __visc__wait(DFG);
+    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
+
+    // __visc__wait has returned, so the argument record is released here
+    // instead of being leaked on every call.
+    free(args);
+}
+
+// Benchmark driver.
+//
+// Reads matrix A (input file 0) and B^T (input file 2), registers the four
+// buffers with the VISC runtime, runs the Root graph through basicSgemm(),
+// requests C and D back, prints the timers and, when an output file was given,
+// writes C to it.
+//
+// Exits with status -1 when the number of input files is not exactly three,
+// when either matrix cannot be read, or when a read matrix is empty or has a
+// non-positive dimension. Returns 0 otherwise.
+int main (int argc, char *argv[]) {
+
+    struct pb_Parameters *params;
+    struct pb_TimerSet timers;
+
+    size_t A_sz, B_sz, C_sz, D_sz;
+    // Initialised so a failed read cannot leave indeterminate dimensions behind.
+    int matArow = 0, matAcol = 0;
+    int matBrow = 0, matBcol = 0;
+    std::vector<float> matA, matBT;
+
+    /* Read command line. Expect 3 inputs: A, B and B^T
+       in column-major layout*/
+    params = pb_ReadParameters(&argc, argv);
+    if ((params->inpFiles[0] == NULL)
+            || (params->inpFiles[1] == NULL)
+            || (params->inpFiles[2] == NULL)
+            || (params->inpFiles[3] != NULL))
+    {
+        fprintf(stderr, "Expecting three input filenames\n");
+        exit(-1);
+    }
+
+    /* Read in data */
+    // Input file 1 (B) must be present on the command line but is never read;
+    // only A and B^T are loaded.
+    // load A
+    if (!readColMajorMatrixFile(params->inpFiles[0],
+                                matArow, matAcol, matA))
+    {
+        fprintf(stderr, "Failed to read matrix file %s\n", params->inpFiles[0]);
+        exit(-1);
+    }
+
+    // load B^T
+    if (!readColMajorMatrixFile(params->inpFiles[2],
+                                matBcol, matBrow, matBT))
+    {
+        fprintf(stderr, "Failed to read matrix file %s\n", params->inpFiles[2]);
+        exit(-1);
+    }
+
+    // front() on an empty vector is undefined, and every buffer below is
+    // addressed through front(), so reject empty or degenerate inputs here.
+    if (matA.empty() || matBT.empty()
+            || matArow <= 0 || matAcol <= 0
+            || matBrow <= 0 || matBcol <= 0)
+    {
+        fprintf(stderr, "Input matrices must be non-empty with positive dimensions\n");
+        exit(-1);
+    }
+
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    // Buffer sizes in bytes; the products are formed in size_t rather than int.
+    A_sz = (size_t)matArow*matAcol*sizeof(float);
+    B_sz = (size_t)matBrow*matBcol*sizeof(float);
+
+    // C and D both hold matArow x matBcol elements
+    C_sz = (size_t)matArow*matBcol*sizeof(float);
+    D_sz = (size_t)matArow*matBcol*sizeof(float);
+
+    // Host-side storage for the two result buffers
+    std::vector<float> matC((size_t)matArow*matBcol);
+    std::vector<float> matD((size_t)matArow*matBcol);
+
+    // Register all four buffers with the VISC runtime
+    llvm_visc_track_mem(&matA.front(), A_sz);
+    llvm_visc_track_mem(&matBT.front(), B_sz);
+    llvm_visc_track_mem(&matC.front(), C_sz);
+    llvm_visc_track_mem(&matD.front(), D_sz);
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    for(size_t i=0; i<matC.size(); i++)
+        matC[i] = 0.0f;
+
+    for(size_t i=0; i<matD.size(); i++)
+        matD[i] = 0.0f;
+
+    // Use standard sgemm interface
+    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f,
+               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);
+
+    // Ask the runtime for the current contents of C and D
+    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+    llvm_visc_request_mem(&matC.front(), C_sz);
+    llvm_visc_request_mem(&matD.front(), D_sz);
+
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    llvm_visc_untrack_mem(&matA.front());
+    llvm_visc_untrack_mem(&matBT.front());
+    llvm_visc_untrack_mem(&matC.front());
+    llvm_visc_untrack_mem(&matD.front());
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+
+    pb_PrintTimerSet(&timers);
+    __visc__cleanup();
+
+    if (params->outFile) {
+
+        /* Write C to file */
+        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        writeColMajorMatrixFile(params->outFile,
+                                matArow, matBcol, matC);
+    }
+
+    // NOTE(review): nothing in this file switches to pb_TimerID_KERNEL, so this
+    // elapsed time may be zero and the printed figure meaningless -- confirm
+    // whether the VISC runtime updates that timer.
+    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+    pb_FreeParameters(params);
+
+    return 0;
+}