visc_sh version: VISC with allocation of shared memory in allocation node - to be tested

cf32a23e · Maria Kotsifakou · 93ea2659 · cf32a23e · cf32a23e · cf32a23e
Commit cf32a23e authored 9 years ago by Maria Kotsifakou
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+# Build settings for the visc_sh variant of sgemm. These variables are
+# presumably consumed by the enclosing Parboil/VISC build rules, which are
+# not part of this directory -- TODO confirm.
+LANGUAGE=visc
+# Only io.ll is listed; compute_gold.o is left commented out.
+SRCDIR_OBJS=io.ll #compute_gold.o
+VISC_OBJS=main.visc.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/io.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/io.cc
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+/* Read the entire contents of fileName into a heap-allocated,
+ * NUL-terminated buffer.
+ *
+ * Returns NULL (after printing a message to std::cerr) when the file cannot
+ * be opened. Otherwise returns a buffer obtained with new[]; the caller
+ * releases it with delete[]. A file that is empty, or whose size cannot be
+ * determined, yields an empty string.
+ */
+char* readFile(const char* fileName)
+{
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	// Measure the file by seeking to its end; tellg() reports -1 on failure.
+	f.seekg(0,std::ios::end);
+	std::streamoff length = f.tellg();
+	f.seekg(0,std::ios::beg);
+	if(length<0)
+		length = 0;
+
+	// One extra byte for the terminator so the last byte of the file is kept,
+	// and always new[] so that delete[] is correct for every returned buffer.
+	char* buffer = new char[static_cast<std::size_t>(length)+1];
+	f.read(buffer,length);
+	// Terminate after what was actually read, which may be less than length.
+	buffer[f.gcount()] = 0;
+
+	f.close();
+
+	return buffer;
+}
+
+/* Read a matrix from the whitespace-separated text file fn.
+ *
+ * The file holds the row count, the column count, then the element values.
+ * Every value found is appended to v in file order; v is not cleared first
+ * and the number of values is not checked against nr_row*nr_col.
+ *
+ * Returns false if the file cannot be opened or the two dimensions cannot be
+ * parsed, true otherwise.
+ */
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  if ( !(f >> nr_row >> nr_col) ) {
+    return false;
+  }
+
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+
+  // Test each extraction itself rather than the stream state beforehand:
+  // a value is stored only when it was really parsed, so nothing has to be
+  // popped afterwards and a file without trailing whitespace keeps its last
+  // element.
+  float data;
+  while (f >> data) {
+    v.push_back(data);
+  }
+  return true;
+
+}
+
+/* Write a matrix to the text file fn: the row count, the column count, then
+ * every element of v in order, each followed by a single space, and a final
+ * newline. nr_row and nr_col are written as given; they are not checked
+ * against v.size().
+ *
+ * Returns false if the file cannot be opened or the stream is in a failed
+ * state after the data has been flushed, true otherwise.
+ */
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Write # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  for (std::size_t i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+
+  // Flush before reporting so that a failed write is not returned as success.
+  f.flush();
+  return f.good();
+
+}
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <vector>
+#include <iostream>
+#include <parboil.h>
+#include <visc.h>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
+extern char* readFile(const char*);
+
+// Parameters of tile sizes
+// TILE_N: basicSgemm uses it as block_x, and SgemmLeaf keeps TILE_N partial
+// sums per node instance.
+#define TILE_N 16
+// TILE_TB_HEIGHT: basicSgemm uses it as block_y, and SgemmLeaf advances
+// along k by this amount per iteration.
+#define TILE_TB_HEIGHT 8
+// TILE_M: row offset between consecutive thread blocks in SgemmLeaf
+// (TILE_N*TILE_TB_HEIGHT = 128).
+#define TILE_M (TILE_N*TILE_TB_HEIGHT)
+
+// Prints errorMessage and the current line number, then exits with status 1,
+// when a variable named clStatus (expected to be in scope at the expansion
+// site) is not CL_SUCCESS.
+// NOTE(review): neither clStatus nor CL_SUCCESS is declared in the code shown
+// and the macro is not used in it -- looks like an OpenCL leftover; confirm
+// before removing. The expansion is a bare if-statement, so it must not be
+// placed directly before an else.
+#define CHECK_ERROR(errorMessage)           \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     std::cout<<errorMessage<<" Error!\n";  \
+     std::cout<<"Line: "<<__LINE__<<"\n";   \
+     exit(1);                               \
+  }
+
+// Argument record that basicSgemm fills with packData and hands to
+// __visc__launch together with SgemmRoot. The fields appear in the same order
+// as SgemmRoot's parameters, so the order must not be changed independently.
+// NOTE(review): alpha and beta are int here (and in packData, SgemmRoot and
+// SgemmTB) while basicSgemm and SgemmLeaf use float -- confirm which type is
+// intended; non-integral scaling factors are truncated as written.
+typedef struct __attribute__((__packed__)) {
+  float *A; size_t bytesA;   // matrix A and its size in bytes
+  int lda;                   // leading dimension of A
+  float *B; size_t bytesB;   // matrix B and its size in bytes
+  int ldb;                   // leading dimension of B
+  float *C; size_t bytesC;   // matrix C and its size in bytes
+  int ldc;                   // leading dimension of C
+  int k;                     // upper bound of SgemmLeaf's accumulation loop
+  int alpha;                 // scales the accumulated product in SgemmLeaf
+  int beta;                  // scales the previous contents of C in SgemmLeaf
+  int block_x;               // leaf-node instances per thread block, x
+  int block_y;               // leaf-node instances per thread block, y
+  int grid_x;                // thread-block instances, x
+  int grid_y;                // thread-block instances, y
+} RootIn;
+
+// Store every launch argument in *args. The values are listed in RootIn's
+// field order, which is also the parameter order below.
+void packData(RootIn* args,
+              float *A, size_t bytesA, 
+              int lda,
+              float *B, size_t bytesB,
+              int ldb,
+              float *C, size_t bytesC,
+              int ldc,
+              int k,
+              int alpha,
+              int beta,
+              int block_x,
+              int block_y,
+              int grid_x,
+              int grid_y) {
+  // Build the whole record positionally, then copy it out in one step.
+  const RootIn packed = {
+    A, bytesA, lda,
+    B, bytesB, ldb,
+    C, bytesC, ldc,
+    k, alpha, beta,
+    block_x, block_y,
+    grid_x, grid_y
+  };
+  *args = packed;
+}
+
+// Return record of the Allocation node: a pointer to the buffer it provides
+// and that buffer's size in bytes. SgemmTB connects the node's outputs 0 and 1
+// to SgemmLeaf inputs 12 and 13 (presumably these two fields, in order --
+// TODO confirm against the VISC edge semantics).
+typedef struct __attribute__((packed)) {
+  void* shB; size_t bytes_shB;
+} AllocationOut;
+
+// TODO: decide between dynamic vs static allocation. Merely a convention - will
+// be translated.
+//
+// Allocation node: provides a block_y x block_x float buffer and returns its
+// address together with its size in bytes.
+// NOTE(review): shB is a local array, so in plain C++ the returned pointer
+// does not outlive this call; per the TODO above the body is only a
+// convention that is expected to be translated -- confirm.
+AllocationOut Allocation(int block_x, int block_y) {
+  // Memory shared between threadblocks
+  float shB[block_y][block_x];
+  void* ret_shB = (void*) shB;
+  
+  return {ret_shB, block_x*block_y*sizeof(float)}; 
+}
+
+
+// Leaf node of the sgemm graph; SgemmTB creates it as a 2D node of
+// block_x x block_y instances.
+//
+// Each instance accumulates TILE_N partial sums in c[] for row m, stepping
+// along k in chunks of TILE_TB_HEIGHT, then updates TILE_N elements of C as
+// C = C*beta + alpha*c.
+//
+// shB is the buffer produced by the Allocation node. It arrives as a flat
+// float*, so it is indexed as row*dimx + column, matching Allocation's
+// [block_y][block_x] layout (assumes dimx == block_x -- TODO confirm).
+// bytesA, bytesB, bytesC and bytesshB are not read in the body.
+// NOTE(review): alpha and beta are float here but int in SgemmTB/SgemmRoot.
+void SgemmLeaf( float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float* C, size_t bytesC, int ldc, int k, float alpha, float beta, float* shB, size_t bytesshB )
+{
+    __visc__hint(visc::GPU_TARGET);
+    // TODO: shB is not an in or out attribute
+    __visc__attributes(3, A, B, C, 1, C);
+
+    void* thisNode = __visc__getNode();
+    void* parentNode = __visc__getParentNode(thisNode);
+
+    // Position of this instance inside its thread block.
+    int lx = __visc__getNodeInstanceID_x(thisNode);
+    int ly = __visc__getNodeInstanceID_y(thisNode);
+
+    // Position of the enclosing thread block.
+    int gx = __visc__getNodeInstanceID_x(parentNode);
+    int gy = __visc__getNodeInstanceID_y(parentNode);
+
+    int dimx = __visc__getNumNodeInstances_x(thisNode);
+
+    float c[TILE_N];
+    for (int i=0; i < TILE_N; i++)
+      c[i] = 0.0f;
+   
+    // Linear index of this instance in its block; also its slot in shB.
+    int mid = ly*dimx+lx;
+    int m = gx * TILE_M + mid;
+    int n = gy * TILE_N + lx;
+
+    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
+      float a; 
+      // Every instance stages one element of B into the shared buffer.
+      shB[mid] = B[n+(i+ly)*ldb];
+
+      // Presumably waits until all instances of the block have stored their
+      // element -- TODO confirm the barrier scope.
+      __visc__barrier();
+
+      for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+        a = A[m + (i+j)*lda];
+        for (int kk = 0; kk < TILE_N; kk++)
+          c[kk] += a * shB[j*dimx+kk];
+
+      }
+      // Second barrier before the next iteration overwrites shB.
+      __visc__barrier();
+    }
+
+    int t = ldc * gy * TILE_N + m;
+    for (int i = 0; i < TILE_N; i++) {
+      C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
+    }
+}
+
+// Thread block node for sgemm - Creates allocation node and leaf (thread) node
+//
+// Creates one Allocation node and a 2D SgemmLeaf node of block_x x block_y
+// instances, forwards this node's inputs 0-11 to the leaf, feeds block_x and
+// block_y to the Allocation node, and connects the Allocation outputs to the
+// leaf's last two inputs.
+// In the bindIn calls the second argument is the index of this node's input
+// and the third the index of the child's input (see the block_x/block_y
+// binds); the meaning of the trailing 0 is not visible here -- TODO confirm.
+// NOTE(review): alpha and beta are int here but float in SgemmLeaf.
+void SgemmTB(float *A, size_t bytesA, 
+             int lda,
+             float *B, size_t bytesB,
+             int ldb,
+             float *C, size_t bytesC,
+             int ldc,
+             int k,
+             int alpha,
+             int beta,
+             int block_x,
+             int block_y) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
+  void* AllocationNode = __visc__createNode(Allocation);
+  void* SgemmLeafNode = __visc__createNode2D(SgemmLeaf, block_x, block_y);
+
+  // Bind edges
+  __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A
+  __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA
+  __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda
+  __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B
+  __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB
+  __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb
+  __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C
+  __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC
+  __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc
+  __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k
+  __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
+  __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
+
+  __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
+  __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
+  
+
+  // Create edges between AllocationNode and SgemmLeafNode
+  __visc__edge(AllocationNode, SgemmLeafNode, 0, 12, 0); // Edge shB (leaf input 12)
+  __visc__edge(AllocationNode, SgemmLeafNode, 1, 13, 0); // Edge bytesshB (leaf input 13)
+
+  //TODO: bindOut : for now with out attribute
+}
+
+// Root node for sgemm - Creates thread block node
+//
+// Creates a 2D SgemmTB node of grid_x x grid_y instances and forwards this
+// node's inputs 0-13 to the SgemmTB inputs with the same index. grid_x and
+// grid_y are used only for the node dimensions and are not forwarded.
+// The parameter order matches the field order of RootIn, the record that
+// basicSgemm passes to __visc__launch for this function.
+// NOTE(review): alpha and beta are int here but float in SgemmLeaf.
+void SgemmRoot(float *A, size_t bytesA, 
+               int lda,
+               float *B, size_t bytesB,
+               int ldb,
+               float *C, size_t bytesC,
+               int ldc,
+               int k,
+               int alpha,
+               int beta,
+               int block_x,
+               int block_y,
+               int grid_x,
+               int grid_y) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
+  void* SgemmTBNode = __visc__createNode2D(SgemmTB, grid_x, grid_y);
+
+  // Bind edges
+  __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A
+  __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA
+  __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda
+  __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B
+  __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB
+  __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb
+  __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C
+  __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC
+  __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc
+  __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k
+  __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
+  __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
+  __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
+  __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
+  
+  //TODO: bindOut : for now with out attribute
+}
+
+// Creates root node for sgemm
+//
+// Validates the arguments, packs them into a RootIn record, launches the
+// SgemmRoot graph and waits for it to finish.
+//
+// Only transa 'N'/'n' with transb 'T'/'t' is accepted, m must be a multiple
+// of TILE_M and n a multiple of TILE_N; otherwise a message is written to
+// std::cerr and the function returns without launching anything.
+// NOTE(review): alpha and beta are float here but packData/RootIn take int,
+// so non-integral values are truncated -- confirm the intended type.
+__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
+{
+    if ((transa != 'N') && (transa != 'n')) {
+        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+        return;
+    }
+
+    if ((transb != 'T') && (transb != 't')) {
+        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+        return;
+    }
+
+    // In this code we assume the matrix sizes are multiple of tile size
+    if ((m%TILE_M) || (n%TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
+              << "; n should be multiple of " << TILE_N << std::endl;
+        return;
+    }
+
+  // A thread block is block_x x block_y = TILE_N x TILE_TB_HEIGHT leaf
+  // instances. SgemmLeaf gives each block TILE_M rows (m = gx*TILE_M + mid)
+  // and TILE_N columns (n = gy*TILE_N + lx), so the grid holds m/TILE_M by
+  // n/TILE_N blocks. The previous m*TILE_N/TILE_M and n*TILE_TB_HEIGHT/TILE_N
+  // are total instance counts, not block counts, and created too many blocks.
+  int block_x = TILE_N;
+  int block_y = TILE_TB_HEIGHT;
+  int grid_x = m/TILE_M;
+  int grid_y = n/TILE_N;
+
+  // Pack data in struct
+  RootIn* args = (RootIn*) malloc(sizeof(RootIn));
+  if (args == NULL) {
+    std::cerr << "failed to allocate sgemm launch arguments" << std::endl;
+    return;
+  }
+  packData(args, 
+          A, bytesA, 
+          lda,
+          B, bytesB,
+          ldb,
+          C, bytesC,
+          ldc,
+          k,
+          alpha,
+          beta,
+          block_x,
+          block_y,
+          grid_x,
+          grid_y
+          );
+
+    void* sgemmDFG = __visc__launch(0, SgemmRoot, (void*) args);
+
+    __visc__wait(sgemmDFG);
+
+    // The record is needed only for the launch.
+    // NOTE(review): assumes the runtime no longer reads args once
+    // __visc__wait has returned -- confirm.
+    free(args);
+}
+
+// Program entry: reads A and B^T from the input files, runs basicSgemm to
+// compute C, optionally writes C to the output file and prints the timers
+// and a GFLOPs figure.
+//
+// Exits with status -1 when the command line does not name exactly three
+// input files or when either matrix cannot be read.
+int main (int argc, char *argv[]) {
+
+    struct pb_Parameters *params;
+    struct pb_TimerSet timers;
+
+    size_t A_sz, B_sz, C_sz;
+    int matArow, matAcol;
+    int matBrow, matBcol;
+    std::vector<float> matA, matBT;
+
+
+    /* Read command line. Expect 3 inputs: A, B and B^T
+       in column-major layout*/
+    params = pb_ReadParameters(&argc, argv);
+    if ((params->inpFiles[0] == NULL)
+            || (params->inpFiles[1] == NULL)
+            || (params->inpFiles[2] == NULL)
+            || (params->inpFiles[3] != NULL))
+    {
+        fprintf(stderr, "Expecting three input filenames\n");
+        exit(-1);
+    }
+
+    /* Read in data */
+    // load A; stop on failure, because the dimensions would be left
+    // uninitialized and front() would be called on an empty vector below.
+    if (!readColMajorMatrixFile(params->inpFiles[0],
+                                matArow, matAcol, matA)
+            || matA.empty())
+    {
+        fprintf(stderr, "Failed to read matrix A from %s\n", params->inpFiles[0]);
+        exit(-1);
+    }
+
+    // load B^T (the second input file, B, is required but never read)
+    if (!readColMajorMatrixFile(params->inpFiles[2],
+                                matBcol, matBrow, matBT)
+            || matBT.empty())
+    {
+        fprintf(stderr, "Failed to read matrix B^T from %s\n", params->inpFiles[2]);
+        exit(-1);
+    }
+
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    // sizes of A and B^T in bytes
+    A_sz = matArow*matAcol*sizeof(float);
+    B_sz = matBrow*matBcol*sizeof(float);
+
+    // size of C in bytes
+    C_sz = matArow*matBcol*sizeof(float);
+
+    // host storage for the result
+    std::vector<float> matC(matArow*matBcol);
+
+    // register the three buffers with the VISC runtime
+    llvm_visc_track_mem(&matA.front(), A_sz);
+    llvm_visc_track_mem(&matBT.front(), B_sz);
+    llvm_visc_track_mem(&matC.front(), C_sz);
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    // clear C before the multiplication
+    for(size_t i=0; i<matC.size(); i++)
+        matC[i] = 0.0f;
+
+    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+
+    // Use standard sgemm interface
+    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
+               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+
+    // ask the runtime for the current contents of C
+    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+    llvm_visc_request_mem(&matC.front(), C_sz);
+
+    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
+    llvm_visc_untrack_mem(&matA.front());
+    llvm_visc_untrack_mem(&matBT.front());
+    llvm_visc_untrack_mem(&matC.front());
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+
+    pb_PrintTimerSet(&timers);
+    __visc__cleanup();
+
+    if (params->outFile) {
+
+        /* Write C to file */
+        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        writeColMajorMatrixFile(params->outFile,
+                                matArow, matBcol, matC);
+    }
+
+    // NOTE(review): this function never switches to pb_TimerID_KERNEL;
+    // presumably the VISC runtime accounts kernel time there -- confirm,
+    // otherwise GPUtime is zero and the figure below is not meaningful.
+    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+    pb_FreeParameters(params);
+
+    return 0;
+}