Skip to content
Snippets Groups Projects
Commit 109e2276 authored by Maria Kotsifakou's avatar Maria Kotsifakou
Browse files

MergeDFN: 2Level, CC with independent nodes testcase

parent e635f322
No related branches found
No related tags found
No related merge requests found
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=visc
SRCDIR_OBJS=io.ll #compute_gold.o
VISC_OBJS=main.visc.ll
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include<fstream>
#include<iostream>
#include<vector>
/**
 * Reads the whole content of a file into a newly allocated,
 * NUL-terminated character buffer.
 *
 * @param fileName Path of the file to read.
 * @return Buffer allocated with new[] that holds every byte that could be
 *         read, followed by a terminating '\0'. The caller owns it and must
 *         release it with delete[]. Returns NULL, after printing a message
 *         to std::cerr, when the file cannot be opened.
 */
char* readFile(const char* fileName)
{
    std::fstream f(fileName, std::fstream::in);
    if (!f.good())
    {
        std::cerr << "Error Reading File!!" << std::endl;
        return NULL;
    }

    // Determine the file size; tellg() yields -1 on failure, which is
    // handled like an empty file.
    f.seekg(0, std::ios::end);
    const std::streamoff reported = f.tellg();
    const std::streamsize length =
        reported > 0 ? static_cast<std::streamsize>(reported) : 0;
    f.seekg(0, std::ios::beg);

    // One extra byte for the terminator so the last byte of the file is
    // preserved. Always an array allocation, so delete[] is correct for
    // both the empty and the non-empty case.
    char* buffer = new char[length + 1];
    std::streamsize got = 0;
    if (length > 0)
    {
        f.read(buffer, length);
        // Terminate after what was actually read, which may be shorter
        // than the reported size.
        got = f.gcount();
    }
    buffer[got] = 0;

    f.close();
    return buffer;
}
/**
 * Reads a matrix stored as whitespace-separated text: the row count, the
 * column count, then the element values.
 *
 * @param fn      Path of the file to read.
 * @param nr_row  Receives the first integer of the file.
 * @param nr_col  Receives the second integer of the file.
 * @param v       Every successfully parsed float is appended to this vector;
 *                existing content is kept.
 * @return false when the file cannot be opened or the two dimensions cannot
 *         be parsed, true otherwise. The number of values is not checked
 *         against nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
{
    std::cerr << "Opening file:"<< fn << std::endl;
    std::fstream f(fn, std::fstream::in);
    if ( !f.good() ) {
        return false;
    }

    // Read # of rows and cols
    if ( !(f >> nr_row >> nr_col) ) {
        return false;
    }
    std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

    // Test the extraction itself rather than the stream state before it, so
    // a failed read is never stored and the last value is kept even when
    // the file has no trailing whitespace.
    float data;
    while (f >> data) {
        v.push_back(data);
    }
    return true;
}
/**
 * Writes a matrix as text: "<nr_row> <nr_col> " followed by every element
 * of v, each followed by a single space, and a final newline.
 *
 * @param fn      Path of the file to create or overwrite.
 * @param nr_row  Row count written as the first value.
 * @param nr_col  Column count written as the second value.
 * @param v       Elements to write, in storage order.
 * @return false when the file cannot be opened or the stream is in a failed
 *         state after the data has been flushed, true otherwise.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
{
    std::cerr << "Opening file:"<< fn << " for write." << std::endl;
    std::fstream f(fn, std::fstream::out);
    if ( !f.good() ) {
        return false;
    }

    // Write # of rows and cols
    f << nr_row << " "<<nr_col<<" ";
    std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

    // Index with the vector's own size type to avoid a signed/unsigned
    // comparison and overflow on very large vectors.
    for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
        f << v[i] << ' ';
    }
    f << "\n";

    // Flush so that a failed write is reported instead of being lost.
    f.flush();
    return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Main entry of dense matrix-matrix multiplication kernel
*/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <malloc.h>
#include <vector>
#include <iostream>
#include <parboil.h>
#include <visc.h>
// I/O routines
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
extern char* readFile(const char*);
// Parameters of tile sizes
// TILE_N: basicSgemm rejects inputs whose n is not a multiple of this value.
#define TILE_N 16
// TILE_TB_HEIGHT: multiplier used below to derive TILE_M from TILE_N.
#define TILE_TB_HEIGHT 8
// TILE_M: basicSgemm rejects inputs whose m is not a multiple of this value.
#define TILE_M (TILE_N*TILE_TB_HEIGHT)
// CHECK_ERROR(errorMessage): when the variable `clStatus` visible at the
// point of expansion differs from CL_SUCCESS, prints errorMessage followed
// by " Error!" and the current source line to std::cout, then calls exit(1).
// NOTE(review): the expansion is a bare `if` statement, so an `else` that
// follows a use of the macro would attach to it. The visible part of this
// file neither invokes the macro nor declares clStatus / CL_SUCCESS.
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
std::cout<<errorMessage<<" Error!\n"; \
std::cout<<"Line: "<<__LINE__<<"\n"; \
exit(1); \
}
// Argument record that basicSgemm fills with packData and passes to
// __visc__launch together with the Root node function. The fields are
// declared in the same order as Root's parameters, and the struct is
// declared packed.
typedef struct __attribute__((__packed__)) {
// First input buffer and its size in bytes.
float *A;
size_t bytesA;
// Second input buffer and its size in bytes.
float *B;
size_t bytesB;
// Buffer updated by the multiply nodes, and its size in bytes.
float *C;
size_t bytesC;
// Buffer updated by the sum nodes, and its size in bytes.
float *D;
size_t bytesD;
// Dimensions handed to __visc__createNode2D for the leaf nodes.
int block_x;
int block_y;
// Dimensions handed to __visc__createNode2D for the internal nodes.
int grid_x;
int grid_y;
}
RootIn;
/**
 * Copies every launch argument into the RootIn record pointed to by args.
 *
 * @param args     Destination record; each of its fields is overwritten.
 * @param A        Stored in args->A, with its byte size bytesA.
 * @param B        Stored in args->B, with its byte size bytesB.
 * @param C        Stored in args->C, with its byte size bytesC.
 * @param D        Stored in args->D, with its byte size bytesD.
 * @param block_x  Stored in args->block_x.
 * @param block_y  Stored in args->block_y.
 * @param grid_x   Stored in args->grid_x.
 * @param grid_y   Stored in args->grid_y.
 */
void packData(RootIn* args,
              float *A, size_t bytesA,
              float *B, size_t bytesB,
              float *C, size_t bytesC,
              float *D, size_t bytesD,
              int block_x,
              int block_y,
              int grid_x,
              int grid_y) {
    RootIn& record = *args;

    // Buffer pointers.
    record.A = A;
    record.B = B;
    record.C = C;
    record.D = D;

    // Matching buffer sizes in bytes.
    record.bytesA = bytesA;
    record.bytesB = bytesB;
    record.bytesC = bytesC;
    record.bytesD = bytesD;

    // Node dimensions.
    record.block_x = block_x;
    record.block_y = block_y;
    record.grid_x = grid_x;
    record.grid_y = grid_y;
}
/**
 * Leaf node: each instance updates one element, C[i] += A[i] * B[i], where
 * i is derived from the instance ID of this node and of its parent node.
 *
 * @param A, B    Input buffers read at index i.
 * @param C       Buffer read and written at index i.
 * @param bytesA  Returned through __visc__return.
 * @param bytesB, bytesC  Not used in the body.
 *
 * NOTE(review): assumes A, B and C each hold at least
 * (instances in x over both levels) * (instances in y over both levels)
 * elements - confirm against the caller.
 */
void LeafMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC)
{
    __visc__hint(visc::DEVICE);
    //__visc__hint(visc::SPIR_TARGET);
    __visc__attributes(3, A, B, C, 1, C);

    void* thisNode = __visc__getNode();
    void* parentNode = __visc__getParentNode(thisNode);

    // Position of this instance inside its parent instance ...
    int lx = __visc__getNodeInstanceID_x(thisNode);
    int ly = __visc__getNodeInstanceID_y(thisNode);
    // ... and position of the parent instance.
    int gx = __visc__getNodeInstanceID_x(parentNode);
    int gy = __visc__getNodeInstanceID_y(parentNode);

    // Number of leaf instances per parent instance, and number of parent
    // instances along x.
    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
    int blockDimy = __visc__getNumNodeInstances_y(thisNode);
    int gridx = __visc__getNumNodeInstances_x(parentNode);

    // Global coordinate = parent index * leaves per parent + local index.
    // Scaling by the number of parent instances instead is only correct
    // when both dimensions happen to be equal; otherwise instances overlap
    // and some elements are never visited.
    int x = gx*blockDimx+lx;
    int y = gy*blockDimy+ly;
    int dimx = blockDimx*gridx;

    C[x+y*dimx] = C[x+y*dimx] + A[x+y*dimx] * B[x+y*dimx];
    __visc__return(bytesA);
}
// Internal node of the multiply branch: creates a block_x by block_y 2D
// node that runs LeafMul and forwards its own first six arguments to it.
// block_x / block_y are only used as the dimensions of the created node.
void InternalMul( float* A, size_t bytesA, float* B, size_t bytesB, float* C, size_t bytesC,
int block_x, int block_y ) {
__visc__hint(visc::DEVICE);
//__visc__hint(visc::SPIR_TARGET);
// TODO (looks stale): mentions 'shB', which is not a parameter of this function
// Presumably marks A, B, C as input pointers and C as output pointer - confirm
// against the VISC intrinsic documentation.
__visc__attributes(3, A, B, C, 1, C);
void* LeafMulNode = __visc__createNode2D(LeafMul, block_x, block_y);
// Bind inputs: second argument is the index of this function's parameter,
// third argument the index of the LeafMul parameter it feeds.
// NOTE(review): the trailing 0 is presumably the streaming flag - confirm.
__visc__bindIn(LeafMulNode, 0, 0, 0); // Bind A
__visc__bindIn(LeafMulNode, 1, 1, 0); // Bind bytesA
__visc__bindIn(LeafMulNode, 2, 2, 0); // Bind B
__visc__bindIn(LeafMulNode, 3, 3, 0); // Bind bytesB
__visc__bindIn(LeafMulNode, 4, 4, 0); // Bind C
__visc__bindIn(LeafMulNode, 5, 5, 0); // Bind bytesC
// Bind outputs: LeafMul's output 0 (the value it returns, bytesA) becomes
// output 0 of this node.
__visc__bindOut(LeafMulNode, 0, 0, 0); // Bind bytesA
}
/**
 * Leaf node: each instance updates one element, D[i] += A[i] + B[i], where
 * i is derived from the instance ID of this node and of its parent node.
 *
 * @param A, B    Input buffers read at index i.
 * @param D       Buffer read and written at index i.
 * @param bytesA  Returned through __visc__return.
 * @param bytesB, bytesD  Not used in the body.
 *
 * NOTE(review): assumes A, B and D each hold at least
 * (instances in x over both levels) * (instances in y over both levels)
 * elements - confirm against the caller.
 */
void LeafSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD)
{
    __visc__hint(visc::DEVICE);
    //__visc__hint(visc::SPIR_TARGET);
    __visc__attributes(3, A, B, D, 1, D);

    void* thisNode = __visc__getNode();
    void* parentNode = __visc__getParentNode(thisNode);

    // Position of this instance inside its parent instance ...
    int lx = __visc__getNodeInstanceID_x(thisNode);
    int ly = __visc__getNodeInstanceID_y(thisNode);
    // ... and position of the parent instance.
    int gx = __visc__getNodeInstanceID_x(parentNode);
    int gy = __visc__getNodeInstanceID_y(parentNode);

    // Number of leaf instances per parent instance, and number of parent
    // instances along x.
    int blockDimx = __visc__getNumNodeInstances_x(thisNode);
    int blockDimy = __visc__getNumNodeInstances_y(thisNode);
    int gridx = __visc__getNumNodeInstances_x(parentNode);

    // Global coordinate = parent index * leaves per parent + local index.
    // Scaling by the number of parent instances instead is only correct
    // when both dimensions happen to be equal; otherwise instances overlap
    // and some elements are never visited.
    int x = gx*blockDimx+lx;
    int y = gy*blockDimy+ly;
    int dimx = blockDimx*gridx;

    D[x+y*dimx] = D[x+y*dimx] + A[x+y*dimx] + B[x+y*dimx];
    __visc__return(bytesA);
}
// Internal node of the sum branch: creates a block_x by block_y 2D node
// that runs LeafSum and forwards its own first six arguments to it.
// block_x / block_y are only used as the dimensions of the created node.
void InternalSum( float* A, size_t bytesA, float* B, size_t bytesB, float* D, size_t bytesD,
int block_x, int block_y) {
__visc__hint(visc::DEVICE);
//__visc__hint(visc::SPIR_TARGET);
// TODO (looks stale): mentions 'shB', which is not a parameter of this function
// Presumably marks A, B, D as input pointers and D as output pointer - confirm
// against the VISC intrinsic documentation.
__visc__attributes(3, A, B, D, 1, D);
void* LeafSumNode = __visc__createNode2D(LeafSum, block_x, block_y);
// Bind inputs: second argument is the index of this function's parameter,
// third argument the index of the LeafSum parameter it feeds.
// NOTE(review): the trailing 0 is presumably the streaming flag - confirm.
__visc__bindIn(LeafSumNode, 0, 0, 0); // Bind A
__visc__bindIn(LeafSumNode, 1, 1, 0); // Bind bytesA
__visc__bindIn(LeafSumNode, 2, 2, 0); // Bind B
__visc__bindIn(LeafSumNode, 3, 3, 0); // Bind bytesB
__visc__bindIn(LeafSumNode, 4, 4, 0); // Bind D
__visc__bindIn(LeafSumNode, 5, 5, 0); // Bind bytesD
// Bind outputs: LeafSum's output 0 (the value it returns, bytesA) becomes
// output 0 of this node.
__visc__bindOut(LeafSumNode, 0, 0, 0); // Bind bytesA
}
//void LeafDest(size_t bytesC, size_t bytesD) {
//__visc__hint(visc::DEVICE);
//__visc__attributes(0, 0);
//__visc__return(bytesC, bytesD);
//}
// Root node of the dataflow graph. Creates two grid_x by grid_y 2D child
// nodes, InternalMul (works on A, B, C) and InternalSum (works on A, B, D).
// No edge connects the two children (the only __visc__edge call is
// commented out); each one receives its inputs directly from Root's
// parameters and exposes one output of Root.
// Root parameter indices used by the bindings below:
//   0 A, 1 bytesA, 2 B, 3 bytesB, 4 C, 5 bytesC, 6 D, 7 bytesD,
//   8 block_x, 9 block_y (10 grid_x and 11 grid_y are only used as the
//   dimensions of the created nodes).
void Root(float *A, size_t bytesA,
float *B, size_t bytesB,
float *C, size_t bytesC,
float *D, size_t bytesD,
int block_x,
int block_y,
int grid_x,
int grid_y) {
__visc__hint(visc::CPU_TARGET);
// Presumably marks A, B, C, D as input pointers and C, D as output
// pointers - confirm against the VISC intrinsic documentation.
__visc__attributes(4, A, B, C, D, 2, C, D);
void* InternalMulNode = __visc__createNode2D(InternalMul, grid_x, grid_y);
void* InternalSumNode = __visc__createNode2D(InternalSum, grid_x, grid_y);
//void* LeafDestNode = __visc__createNode(LeafDest);
// Bind inputs of InternalMul: second argument is the Root parameter index,
// third argument the InternalMul parameter index.
__visc__bindIn(InternalMulNode, 0, 0, 0); // Bind A
__visc__bindIn(InternalMulNode, 1, 1, 0); // Bind bytesA
__visc__bindIn(InternalMulNode, 2, 2, 0); // Bind B
__visc__bindIn(InternalMulNode, 3, 3, 0); // Bind bytesB
__visc__bindIn(InternalMulNode, 4, 4, 0); // Bind C
__visc__bindIn(InternalMulNode, 5, 5, 0); // Bind bytesC
__visc__bindIn(InternalMulNode, 8, 6, 0); // Bind block_x
__visc__bindIn(InternalMulNode, 9, 7, 0); // Bind block_y
// Bind inputs of InternalSum: D / bytesD (Root parameters 6 and 7) take the
// parameter slots 4 and 5 that C / bytesC occupy in InternalMul.
__visc__bindIn(InternalSumNode, 0, 0, 0); // Bind A
__visc__bindIn(InternalSumNode, 1, 1, 0); // Bind bytesA [Pass as edge]
__visc__bindIn(InternalSumNode, 2, 2, 0); // Bind B
__visc__bindIn(InternalSumNode, 3, 3, 0); // Bind bytesB
__visc__bindIn(InternalSumNode, 6, 4, 0); // Bind D
__visc__bindIn(InternalSumNode, 7, 5, 0); // Bind bytesD
__visc__bindIn(InternalSumNode, 8, 6, 0); // Bind block_x
__visc__bindIn(InternalSumNode, 9, 7, 0); // Bind block_y
// Bind Edges
//__visc__edge(InternalMulNode, InternalSumNode, 0, 0, 1, 0); // Bind bytesA
//TODO: bindOut : for now with out attribute
// Output 0 of each child becomes output 0 (InternalMul) and output 1
// (InternalSum) of Root.
__visc__bindOut(InternalMulNode, 0, 0, 0); // bind output bytesA
__visc__bindOut(InternalSumNode, 0, 1, 0); // bind output bytesA
}
/**
 * Validates the arguments, packs them into a RootIn record, launches the
 * dataflow graph rooted at Root and waits for it to finish.
 *
 * @param timers  Timer set switched to visc_TimerID_COMPUTATION around the
 *                launch and back to pb_TimerID_COMPUTE afterwards.
 * @param transa  Must be 'N' or 'n'; otherwise an error is printed and the
 *                function returns without launching.
 * @param transb  Must be 'T' or 't'; otherwise an error is printed and the
 *                function returns without launching.
 * @param m       Must be a multiple of TILE_M; m/16 is the grid x dimension.
 * @param n       Must be a multiple of TILE_N; n/16 is the grid y dimension.
 * @param A, B, C, D              Buffers forwarded to the graph.
 * @param bytesA, bytesB, bytesC, bytesD  Sizes forwarded with the buffers.
 * @param k, alpha, beta, lda, ldb, ldc   Accepted but not used in the body.
 */
__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc, float* D, size_t bytesD )
{
    if ((transa != 'N') && (transa != 'n')) {
        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
        return;
    }
    if ((transb != 'T') && (transb != 't')) {
        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
        return;
    }
    // In this code we assume the matrix sizes are multiple of tile size
    if ((m%TILE_M) || (n%TILE_N)) {
        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
                  << "; n should be multiple of " << TILE_N << std::endl;
        return;
    }

    // unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
    // unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
    // unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
    // unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
    int block_x = 16;
    int block_y = 16;
    int grid_x = m/block_x;
    int grid_y = n/block_y;

    // Pack data in struct
    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
    if (args == NULL) {
        std::cerr << "failed to allocate launch arguments" << std::endl;
        return;
    }
    packData(args,
             A, bytesA,
             B, bytesB,
             C, bytesC,
             D, bytesD,
             block_x,
             block_y,
             grid_x,
             grid_y
            );

    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
    void* DFG = __visc__launch(0, Root, (void*) args);
    __visc__wait(DFG);
    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );

    // The graph has completed, so the argument record is no longer needed;
    // releasing it here fixes a leak on every call.
    free(args);
}
/**
 * Program entry point: reads the input matrices, runs the dataflow graph
 * through basicSgemm, optionally writes C to the output file and prints the
 * timers and a GFLOPs figure.
 *
 * Exactly three input file names are required. Only the first (A) and the
 * third (B^T) are read; the second is required but not opened here.
 *
 * @return 0 on success. Exits with status -1 when the input file list is
 *         wrong or an input matrix cannot be read.
 */
int main (int argc, char *argv[]) {
    struct pb_Parameters *params;
    struct pb_TimerSet timers;
    size_t A_sz, B_sz, C_sz, D_sz;
    int matArow, matAcol;
    int matBrow, matBcol;
    std::vector<float> matA, matBT;

    /* Read command line. Expect 3 inputs: A, B and B^T
       in column-major layout*/
    params = pb_ReadParameters(&argc, argv);
    if ((params->inpFiles[0] == NULL)
            || (params->inpFiles[1] == NULL)
            || (params->inpFiles[2] == NULL)
            || (params->inpFiles[3] != NULL))
    {
        fprintf(stderr, "Expecting three input filenames\n");
        exit(-1);
    }

    /* Read in data */
    // load A; stop on failure, the dimensions would otherwise be used
    // uninitialised below
    if (!readColMajorMatrixFile(params->inpFiles[0],
                                matArow, matAcol, matA)) {
        fprintf(stderr, "Cannot read matrix file %s\n", params->inpFiles[0]);
        exit(-1);
    }
    // load B^T
    if (!readColMajorMatrixFile(params->inpFiles[2],
                                matBcol, matBrow, matBT)) {
        fprintf(stderr, "Cannot read matrix file %s\n", params->inpFiles[2]);
        exit(-1);
    }

    pb_InitializeTimerSet(&timers);
    __visc__init();
    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );

    // Buffer sizes in bytes; widen to size_t before multiplying so the
    // product is not computed in int.
    A_sz = (size_t)matArow*matAcol*sizeof(float);
    B_sz = (size_t)matBrow*matBcol*sizeof(float);
    // allocate space for C
    C_sz = (size_t)matArow*matBcol*sizeof(float);
    D_sz = (size_t)matArow*matBcol*sizeof(float);

    // Result buffers
    std::vector<float> matC((size_t)matArow*matBcol);
    std::vector<float> matD((size_t)matArow*matBcol);

    llvm_visc_track_mem(&matA.front(), A_sz);
    llvm_visc_track_mem(&matBT.front(), B_sz);
    llvm_visc_track_mem(&matC.front(), C_sz);
    llvm_visc_track_mem(&matD.front(), D_sz);

    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
    for(size_t i=0; i<matC.size(); i++)
        matC[i] = 0.0f;
    for(size_t i=0; i<matD.size(); i++)
        matD[i] = 0.0f;

    // Use standard sgemm interface
    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f,
               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow, &matD.front(), D_sz);

    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
    llvm_visc_request_mem(&matC.front(), C_sz);
    llvm_visc_request_mem(&matD.front(), D_sz);

    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
    llvm_visc_untrack_mem(&matA.front());
    llvm_visc_untrack_mem(&matBT.front());
    llvm_visc_untrack_mem(&matC.front());
    llvm_visc_untrack_mem(&matD.front());

    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
    pb_PrintTimerSet(&timers);
    __visc__cleanup();

    if (params->outFile) {
        /* Write C to file */
        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
        writeColMajorMatrixFile(params->outFile,
                                matArow, matBcol, matC);
    }

    // NOTE(review): nothing in this file switches to pb_TimerID_KERNEL
    // (basicSgemm uses visc_TimerID_COMPUTATION), so this elapsed time may
    // be zero - confirm which timer is meant.
    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
    pb_FreeParameters(params);
    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment