Skip to content
Snippets Groups Projects
Commit cf32a23e authored by Maria Kotsifakou's avatar Maria Kotsifakou
Browse files

visc_sh version: VISC with allocation of shared memory in allocation node - to be tested

parent 93ea2659
No related branches found
No related tags found
No related merge requests found
# (c) 2010 The Board of Trustees of the University of Illinois.
# Build variables for the sources that accompany this file.
# NOTE(review): presumably consumed by the Parboil benchmark build system -
# confirm which makefile includes this fragment.

# Language / toolchain selector for the build.
LANGUAGE=visc
# Objects built from the plain source directory; compute_gold.o is disabled
# by the trailing comment.
SRCDIR_OBJS=io.ll #compute_gold.o
# Objects built from the VISC sources.
VISC_OBJS=main.visc.ll
# Extra link flags: libm and libstdc++.
APP_CUDALDFLAGS=-lm -lstdc++
# C and C++ compile flags: -O3 with fast-math enabled for both.
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include<fstream>
#include<iostream>
#include<vector>
/* Read the whole of `fileName` into a freshly allocated, NUL-terminated
 * character buffer.
 *
 * Returns NULL (after printing a message to std::cerr) when the file cannot
 * be opened. An empty file, or one whose size cannot be determined, yields an
 * empty string. The buffer is always allocated with new[], so the caller
 * releases it with delete[].
 */
char* readFile(const char* fileName)
{
  std::fstream f(fileName,std::fstream::in);
  if(!f.good())
  {
    std::cerr<<"Error Reading File!!"<<std::endl;
    return NULL;
  }

  // Measure the file by seeking to its end; tellg() reports -1 on failure.
  f.seekg(0,std::ios::end);
  const std::streamoff length = f.tellg();
  f.seekg(0,std::ios::beg);

  const std::streamsize capacity =
      (length > 0) ? static_cast<std::streamsize>(length) : 0;

  // One extra byte so the terminator never overwrites file content.
  char* buffer = new char[capacity + 1];

  std::streamsize received = 0;
  if(capacity > 0)
  {
    f.read(buffer,capacity);
    // Terminate after what was actually read, which can be shorter than the
    // measured length (e.g. text-mode newline translation).
    received = f.gcount();
  }
  buffer[received] = 0;

  f.close();
  return buffer;
}
/* Read a matrix from the text file `fn`.
 *
 * The file starts with two integers (stored into nr_row and nr_col) followed
 * by whitespace-separated floats, which are appended to `v` in file order.
 *
 * Returns false when the file cannot be opened or the two dimensions cannot
 * be parsed; true otherwise. The number of values read is not checked against
 * nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
{
  std::cerr << "Opening file:"<< fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if ( !f.good() ) {
    return false;
  }

  // Read # of rows and cols
  if ( !(f >> nr_row >> nr_col) ) {
    return false;
  }
  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

  // Store a value only when its extraction succeeded. This keeps the final
  // element whether or not the file ends in whitespace, and never pops from
  // an empty vector.
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}
/* Write a matrix to the text file `fn`.
 *
 * Output is a single line: nr_row, nr_col, then every element of `v` in
 * order, each followed by one space, terminated by a newline.
 *
 * Returns false when the file cannot be opened or any write fails.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
{
  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if ( !f.good() ) {
    return false;
  }

  // Write # of rows and cols
  f << nr_row << " "<<nr_col<<" ";
  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";

  // Flush so a failed write (e.g. disk full) shows up in the return value.
  f.flush();
  return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Main entry of dense matrix-matrix multiplication kernel
*/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <malloc.h>
#include <vector>
#include <iostream>
#include <parboil.h>
#include <visc.h>
// I/O routines
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
extern char* readFile(const char*);
// Parameters of tile sizes
#define TILE_N 16
#define TILE_TB_HEIGHT 8
#define TILE_M (TILE_N*TILE_TB_HEIGHT)
// Abort-on-failure helper: when the variable `clStatus` visible at the point
// of expansion differs from CL_SUCCESS, prints errorMessage followed by
// " Error!" and the current line number to std::cout, then calls exit(1).
// NOTE(review): the expansion is a bare `if` statement (no do { } while (0)
// wrapper), so take care when using it inside an unbraced if/else.
// NOTE(review): neither clStatus nor CL_SUCCESS is declared in the visible
// code and the macro is not invoked here - presumably left over from an
// OpenCL version of this benchmark; confirm before removing.
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
std::cout<<errorMessage<<" Error!\n"; \
std::cout<<"Line: "<<__LINE__<<"\n"; \
exit(1); \
}
// Packed argument record handed to __visc__launch() together with SgemmRoot.
// The fields appear in the same order as SgemmRoot's parameters; the packed
// attribute removes all padding between them.
// NOTE(review): alpha and beta are ints here, while basicSgemm() receives
// them as float and SgemmLeaf() declares them float - confirm the intended
// type before relying on non-integral scaling factors.
typedef struct __attribute__((__packed__)) {
  // Matrix A: pointer, size in bytes, leading dimension.
  float *A;
  size_t bytesA;
  int lda;

  // Matrix B: pointer, size in bytes, leading dimension.
  float *B;
  size_t bytesB;
  int ldb;

  // Matrix C: pointer, size in bytes, leading dimension.
  float *C;
  size_t bytesC;
  int ldc;

  // Shared (inner) dimension and the two scaling factors.
  int k;
  int alpha;
  int beta;

  // Node instance counts for the leaf (block_*) and thread-block (grid_*)
  // levels of the graph.
  int block_x;
  int block_y;
  int grid_x;
  int grid_y;
} RootIn;
// Fills the RootIn record that `args` points to with the given launch
// arguments, one field per parameter of the same name.
// NOTE(review): alpha and beta are taken as int, so a float passed by the
// caller is converted to an integer on the way in - confirm this is intended.
void packData(RootIn* args,
              float *A, size_t bytesA,
              int lda,
              float *B, size_t bytesB,
              int ldb,
              float *C, size_t bytesC,
              int ldc,
              int k,
              int alpha,
              int beta,
              int block_x,
              int block_y,
              int grid_x,
              int grid_y) {
  RootIn &record = *args;

  // Matrix operands: pointer, byte size and leading dimension of each.
  record.A = A;
  record.bytesA = bytesA;
  record.lda = lda;

  record.B = B;
  record.bytesB = bytesB;
  record.ldb = ldb;

  record.C = C;
  record.bytesC = bytesC;
  record.ldc = ldc;

  // Scalar problem parameters.
  record.k = k;
  record.alpha = alpha;
  record.beta = beta;

  // Graph dimensions.
  record.block_x = block_x;
  record.block_y = block_y;
  record.grid_x = grid_x;
  record.grid_y = grid_y;
}
// Result record of the Allocation node: the address of the scratch array and
// its size in bytes. SgemmTB() connects these two outputs to the shB and
// bytesshB inputs of SgemmLeaf().
typedef struct __attribute__((packed)) {
  void* shB; size_t bytes_shB;
} AllocationOut;

// TODO: decide between dynamic vs static allocation. Merely a convention - will
// be translated.
//
// Allocation node body. Declares a block_y x block_x float array and returns
// its address together with its size in bytes.
// NOTE(review): read as plain C++ the returned address refers to a local
// array; per the TODO above this is presumably a placeholder that the VISC
// toolchain rewrites into a real allocation - confirm before calling it
// directly.
AllocationOut Allocation(int block_x, int block_y) {
  // Memory shared between threadblocks
  float shB[block_y][block_x];
  void* ret_shB = (void*) shB;
  return {ret_shB, block_x*block_y*sizeof(float)};
}
// Leaf node of the sgemm graph; SgemmTB() instantiates it block_x x block_y
// times.
//
// Each instance accumulates TILE_N partial sums in c[] and finally updates
// TILE_N entries of C, ldc apart, as C = C * beta + alpha * c.
//
// shB is a scratch buffer addressed here as TILE_TB_HEIGHT rows of TILE_N
// floats. NOTE(review): this assumes the node is created with
// block_x == TILE_N and block_y == TILE_TB_HEIGHT, as basicSgemm() does.
void SgemmLeaf( float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float* C, size_t bytesC, int ldc, int k, float alpha, float beta, float* shB, size_t bytesshB )
{
  __visc__hint(visc::GPU_TARGET);
  // TODO: shB is not an in or out attribute
  __visc__attributes(3, A, B, C, 1, C);

  void* thisNode = __visc__getNode();
  void* parentNode = __visc__getParentNode(thisNode);

  // Instance IDs of this node (lx, ly) and of its parent node (gx, gy).
  int lx = __visc__getNodeInstanceID_x(thisNode);
  int ly = __visc__getNodeInstanceID_y(thisNode);
  int gx = __visc__getNodeInstanceID_x(parentNode);
  int gy = __visc__getNodeInstanceID_y(parentNode);
  int dimx = __visc__getNumNodeInstances_x(thisNode);

  float c[TILE_N];
  for (int i=0; i < TILE_N; i++)
    c[i] = 0.0f;

  int mid = ly*dimx+lx;          // linear index of this instance within its parent
  int m = gx * TILE_M + mid;     // offset used for A and C
  int n = gy * TILE_N + lx;      // offset used when loading B

  for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
    float a;
    // shB is a flat float*, so the 2D tile is addressed row-major with a row
    // length of TILE_N.
    shB[ly*TILE_N + lx] = B[n+(i+ly)*ldb];
    __visc__barrier();
    for (int j = 0; j < TILE_TB_HEIGHT; j++) {
      a = A[m + (i+j)*lda];
      for (int kk = 0; kk < TILE_N; kk++)
        c[kk] += a * shB[j*TILE_N + kk];
    }
    // Second barrier keeps the next iteration's store from overwriting the
    // tile while other instances may still be reading it.
    __visc__barrier();
  }

  int t = ldc * gy * TILE_N + m;
  for (int i = 0; i < TILE_N; i++) {
    C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
  }
}
// Thread block node for sgemm - Creates allocation node and leaf (thread) node
//
// Creates one Allocation node and a block_x x block_y grid of SgemmLeaf nodes,
// forwards this node's first twelve inputs (A .. beta) to the leaf, forwards
// block_x / block_y to the Allocation node, and connects the two Allocation
// outputs to leaf inputs 12 and 13 (shB and bytesshB).
//
// NOTE(review): judging by the trailing comments, __visc__bindIn takes
// (child node, index of this function's parameter, index of the child's
// parameter, 0) and __visc__edge takes (source, destination, source output
// index, destination input index, 0) - confirm against the VISC intrinsics
// documentation.
// NOTE(review): alpha and beta are declared int here but float in SgemmLeaf.
void SgemmTB(float *A, size_t bytesA,
int lda,
float *B, size_t bytesB,
int ldb,
float *C, size_t bytesC,
int ldc,
int k,
int alpha,
int beta,
int block_x,
int block_y) {
__visc__hint(visc::CPU_TARGET);
__visc__attributes(3, A, B, C, 1, C);
void* AllocationNode = __visc__createNode(Allocation);
void* SgemmLeafNode = __visc__createNode2D(SgemmLeaf, block_x, block_y);
// Bind edges
__visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A
__visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA
__visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda
__visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B
__visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB
__visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb
__visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C
__visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC
__visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc
__visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k
__visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
__visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
__visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
__visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
// Create edges between AllocationNode and SgemmLeafNode
__visc__edge(AllocationNode, SgemmLeafNode, 0, 12, 0); // Edge local_B
__visc__edge(AllocationNode, SgemmLeafNode, 1, 13, 0); // Edge bytes_local_B
//TODO: bindOut : for now with out attribute
}
// Root node for sgemm - Creates thread block node
//
// Creates a grid_x x grid_y grid of SgemmTB nodes and forwards this node's
// first fourteen inputs (A .. block_y) to the SgemmTB inputs with the same
// index. grid_x and grid_y are used only to size the grid.
//
// The parameter list matches the field order of RootIn, the packed record
// that basicSgemm() passes to __visc__launch() together with this function.
// NOTE(review): alpha and beta are declared int here but float in SgemmLeaf.
void SgemmRoot(float *A, size_t bytesA,
int lda,
float *B, size_t bytesB,
int ldb,
float *C, size_t bytesC,
int ldc,
int k,
int alpha,
int beta,
int block_x,
int block_y,
int grid_x,
int grid_y) {
__visc__hint(visc::CPU_TARGET);
__visc__attributes(3, A, B, C, 1, C);
void* SgemmTBNode = __visc__createNode2D(SgemmTB, grid_x, grid_y);
// Bind edges
__visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A
__visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA
__visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda
__visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B
__visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB
__visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb
__visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C
__visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC
__visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc
__visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k
__visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
__visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
__visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
__visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
//TODO: bindOut : for now with out attribute
}
// Creates root node for sgemm
//
// Validates the arguments, packs them into a RootIn record, launches the
// SgemmRoot dataflow graph and waits for it to finish.
//
// Only transa == 'N'/'n' and transb == 'T'/'t' are accepted, and m and n must
// be multiples of TILE_M and TILE_N respectively; otherwise a message is
// printed to std::cerr and the function returns without launching anything.
__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
{
  if ((transa != 'N') && (transa != 'n')) {
    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
    return;
  }

  if ((transb != 'T') && (transb != 't')) {
    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
    return;
  }

  // In this code we assume the matrix sizes are multiple of tile size
  if ((m%TILE_M) || (n%TILE_N)) {
    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
              << "; n should be multiple of " << TILE_N << std::endl;
    return;
  }

  // Previous launch interface, kept for reference. Note that it passed
  // dg/db (the number of blocks), not dg (the total number of threads):
  // unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
  // unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
  // unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
  // unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);

  int block_x = TILE_N;
  int block_y = TILE_TB_HEIGHT;
  // Number of SgemmTB instances per dimension. SgemmLeaf offsets into the
  // matrices by gx*TILE_M and gy*TILE_N, so the grid must be m/TILE_M by
  // n/TILE_N; using the total thread count here would index far past the
  // end of A, B and C.
  int grid_x = m/TILE_M;
  int grid_y = n/TILE_N;

  // Pack data in struct
  RootIn* args = (RootIn*) malloc(sizeof(RootIn));
  if (args == NULL) {
    std::cerr << "failed to allocate sgemm launch arguments" << std::endl;
    return;
  }
  packData(args,
           A, bytesA,
           lda,
           B, bytesB,
           ldb,
           C, bytesC,
           ldc,
           k,
           alpha,
           beta,
           block_x,
           block_y,
           grid_x,
           grid_y
          );

  void* sgemmDFG = __visc__launch(0, SgemmRoot, (void*) args);
  __visc__wait(sgemmDFG);
  // NOTE(review): args is never freed; confirm whether the VISC runtime still
  // references it after __visc__wait() before adding free(args) here.
}
int main (int argc, char *argv[]) {
struct pb_Parameters *params;
struct pb_TimerSet timers;
size_t A_sz, B_sz, C_sz;
int matArow, matAcol;
int matBrow, matBcol;
std::vector<float> matA, matBT;
/* Read command line. Expect 3 inputs: A, B and B^T
in column-major layout*/
params = pb_ReadParameters(&argc, argv);
if ((params->inpFiles[0] == NULL)
|| (params->inpFiles[1] == NULL)
|| (params->inpFiles[2] == NULL)
|| (params->inpFiles[3] != NULL))
{
fprintf(stderr, "Expecting three input filenames\n");
exit(-1);
}
/* Read in data */
// load A
readColMajorMatrixFile(params->inpFiles[0],
matArow, matAcol, matA);
// load B^T
readColMajorMatrixFile(params->inpFiles[2],
matBcol, matBrow, matBT);
pb_InitializeTimerSet(&timers);
__visc__init();
pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
// copy A to device memory
A_sz = matArow*matAcol*sizeof(float);
B_sz = matBrow*matBcol*sizeof(float);
// allocate space for C
C_sz = matArow*matBcol*sizeof(float);
// OpenCL memory allocation
std::vector<float> matC(matArow*matBcol);
llvm_visc_track_mem(&matA.front(), A_sz);
llvm_visc_track_mem(&matBT.front(), B_sz);
llvm_visc_track_mem(&matC.front(), C_sz);
// Copy A and B^T into device memory
pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
for(size_t i=0; i<matC.size(); i++)
matC[i] = 0.0f;
pb_SwitchToTimer( &timers, pb_TimerID_NONE );
// Use standard sgemm interface
basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
&matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
pb_SwitchToTimer( &timers, pb_TimerID_COPY );
llvm_visc_request_mem(&matC.front(), C_sz);
pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
llvm_visc_untrack_mem(&matA.front());
llvm_visc_untrack_mem(&matBT.front());
llvm_visc_untrack_mem(&matC.front());
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
__visc__cleanup();
if (params->outFile) {
/* Write C to file */
//pb_SwitchToTimer(&timers, pb_TimerID_IO);
writeColMajorMatrixFile(params->outFile,
matArow, matBcol, matC);
}
double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
pb_FreeParameters(params);
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment