Skip to content
Snippets Groups Projects
Commit ec3e7deb authored by Yifan Zhao's avatar Yifan Zhao
Browse files

Removed unused versions of benchmarks

parent 20a3c17f
No related branches found
No related tags found
No related merge requests found
Showing
with 0 additions and 1817 deletions
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=hpvm
SRCDIR_OBJS=io.ll #compute_gold.o
HPVM_OBJS=main.hpvm.ll
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include <fstream>
#include <iostream>
#include <vector>
/**
 * Read an entire file into a freshly allocated, NUL-terminated buffer.
 *
 * @param fileName path of the file to read.
 * @return a buffer allocated with new[] (release it with delete[]) that holds
 *         every byte read followed by a terminating '\0', or NULL when the
 *         file cannot be opened. An empty file, or one whose size cannot be
 *         determined, yields an empty string.
 */
char *readFile(const char *fileName) {
  std::fstream f(fileName, std::fstream::in);
  if (!f.good()) {
    std::cerr << "Error Reading File!!" << std::endl;
    return NULL;
  }
  f.seekg(0, std::ios::end);
  const std::streamoff length = f.tellg(); // -1 if the position is unknown
  f.seekg(0, std::ios::beg);

  // Reserve one extra byte for the terminator so that no file content is
  // overwritten, and always use the array form of new so callers can
  // uniformly delete[] the result.
  const std::streamsize capacity = length > 0 ? length : 0;
  char *buffer = new char[capacity + 1];
  std::streamsize got = 0;
  if (capacity > 0) {
    f.read(buffer, capacity);
    got = f.gcount(); // may be smaller than capacity (e.g. text-mode reads)
  }
  buffer[got] = 0;
  f.close();
  return buffer;
}
/**
 * Load a matrix stored as whitespace-separated text: two integers (row and
 * column counts) followed by the element values.
 *
 * @param fn     path of the file to read.
 * @param nr_row receives the first integer found in the file.
 * @param nr_col receives the second integer found in the file.
 * @param v      every value that parses as a float is appended in file order;
 *               elements already present in v are left untouched.
 * @return false if the file cannot be opened or the two dimensions cannot be
 *         parsed, true otherwise. The number of values read is not compared
 *         against nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
                            std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if (!f.good()) {
    return false;
  }
  // Read # of rows and cols
  if (!(f >> nr_row >> nr_col)) {
    return false;
  }
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  // Test the extraction itself rather than the stream state before it: this
  // stores exactly the values that were parsed, whether or not the file ends
  // with trailing whitespace, and never needs a compensating pop_back().
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}
/**
 * Write a matrix as text: "<nr_row> <nr_col> " followed by every element of
 * v, each followed by one space, and a final newline.
 *
 * @param fn     path of the file to create or overwrite.
 * @param nr_row value written as the first header field.
 * @param nr_col value written as the second header field.
 * @param v      values to write, emitted in storage order (not modified).
 * @return false if the file cannot be opened or the stream reports an error
 *         after the data has been flushed, true otherwise.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
                             std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if (!f.good()) {
    return false;
  }
  // Write # of rows and cols
  f << nr_row << " " << nr_col << " ";
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";
  // Flush before inspecting the state so that a failed write (disk full,
  // I/O error) is reported to the caller instead of being lost on close.
  f.flush();
  return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Kernel of dense matrix-matrix multiplication kernel.
*/
/*
 * Each work-item keeps TILE_N running sums. For every index below k it reads
 * one element of A and TILE_N consecutive elements of B, and finally blends
 * the sums into C as C = C * beta + alpha * sum.
 * k is walked in strips of TILE_TB_HEIGHT; TILE_N, TILE_M and TILE_TB_HEIGHT
 * are not defined in this kernel source and must be supplied at build time.
 */
__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
{
    // Running sums owned by this work-item
    float acc[TILE_N];
    for (int n = 0; n < TILE_N; ++n) {
        acc[n] = 0.0f;
    }

    // Flattened position of the work-item inside its work-group
    const int localLinear = get_local_id(1)*get_local_size(0)+get_local_id(0);
    // Offset into A and C handled by this work-item
    const int row = get_group_id(0) * TILE_M + localLinear;

    for (int strip = 0; strip < k; strip += TILE_TB_HEIGHT) {
        // First element of B used by this work-group for the current strip
        const int bStart = get_group_id(1) * TILE_N + strip * ldb;
        for (int step = 0; step < TILE_TB_HEIGHT; ++step) {
            const float aVal = A[row + (strip+step)*lda];
            for (int n = 0; n < TILE_N; ++n) {
                acc[n] += aVal * B[bStart + step * ldb + n];
            }
        }
    }

    // Blend the accumulated sums into C
    const int outBase = ldc * get_group_id(1) * TILE_N + row;
    for (int n = 0; n < TILE_N; ++n) {
        const int outIdx = outBase + n*ldc;
        C[outIdx] = C[outIdx] * beta + alpha * acc[n];
    }
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Main entry of dense matrix-matrix multiplication kernel
*/
#include <hpvm.h>
#include <iostream>
#include <malloc.h>
#include <math.h>
#include <parboil.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <vector>
// I/O routines. Only declared here; no definition appears in this file, so
// they are expected to be supplied by a separately built object at link time.
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
std::vector<float> &v);
extern bool writeColMajorMatrixFile(const char *fn, int, int,
std::vector<float> &);
extern char *readFile(const char *);
// Parameters of tile sizes.
// TILE_N is the number of results accumulated per mysgemmNT invocation,
// TILE_TB_HEIGHT is the step used when walking the k dimension, and
// TILE_M is their product (16 * 8 = 128).
#define TILE_N 16
#define TILE_TB_HEIGHT 8
#define TILE_M (TILE_N * TILE_TB_HEIGHT)
// Aborts the program with the given message and the current line number when
// a variable named clStatus differs from CL_SUCCESS.
// NOTE(review): neither clStatus nor CL_SUCCESS is declared in this file and
// the macro is never invoked here; it looks like a leftover from an OpenCL
// host program - confirm before relying on it.
#define CHECK_ERROR(errorMessage) \
if (clStatus != CL_SUCCESS) { \
std::cout << errorMessage << " Error!\n"; \
std::cout << "Line: " << __LINE__ << "\n"; \
exit(1); \
}
void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
float alpha, float beta) {
__hpvm__hint(hpvm::GPU_TARGET);
__hpvm__attributes(3, A, B, C, 1, C);
float c[TILE_N];
for (int i = 0; i < TILE_N; i++)
c[i] = 0.0f;
int mid = get_local_id(1) * get_local_size(0) + get_local_id(0);
int m = get_group_id(0) * TILE_M + mid;
int b_base = 0;
for (int i = 0; i < k; i += TILE_TB_HEIGHT) {
float a;
b_base = get_group_id(1) * TILE_N + i * ldb;
for (int j = 0; j < TILE_TB_HEIGHT; j++) {
a = A[m + (i + j) * lda];
for (int kk = 0; kk < TILE_N; kk++)
c[kk] += a * B[b_base + j * ldb + kk];
}
}
int t = ldc * get_group_id(1) * TILE_N + m;
for (int i = 0; i < TILE_N; i++) {
C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
}
}
/**
 * Validate the request and run mysgemmNT as an HPVM node, blocking until it
 * has finished.
 *
 * Only transa 'N'/'n' together with transb 'T'/'t' is accepted, and m and n
 * must be multiples of TILE_M and TILE_N respectively; any other request is
 * reported on std::cerr and the function returns without touching C.
 *
 * @param transa,transb  transpose selectors (see above).
 * @param m,n,k          problem dimensions.
 * @param alpha,beta     scaling factors forwarded to mysgemmNT.
 * @param A,B,C          matrix buffers with byte sizes bytesA/bytesB/bytesC
 *                       and leading dimensions lda/ldb/ldc.
 */
__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
                                          int n, int k, float alpha, float *A,
                                          size_t bytesA, int lda, float *B,
                                          size_t bytesB, int ldb, float beta,
                                          float *C, size_t bytesC, int ldc) {
  const bool aIsPlain = (transa == 'N') || (transa == 'n');
  if (!aIsPlain) {
    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
    return;
  }

  const bool bIsTransposed = (transb == 'T') || (transb == 't');
  if (!bIsTransposed) {
    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
    return;
  }

  // In this code we assume the matrix sizes are multiple of tile size
  const bool sizesAligned = (m % TILE_M == 0) && (n % TILE_N == 0);
  if (!sizesAligned) {
    std::cerr << "unsupported size of matrix. m should be multiple of "
              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
    return;
  }

  // Block extents and overall extents, kept unsigned as in the launch call
  const unsigned blockX = TILE_N;
  const unsigned blockY = TILE_TB_HEIGHT;
  const unsigned globalX = m * blockX / TILE_M;
  const unsigned globalY = n * blockY / TILE_N;

  // The literal 12 equals the number of arguments listed after it, up to the
  // trailing 0.
  unsigned sgemmDFG =
      __hpvm__node(mysgemmNT, 2, 2, blockX, blockY, globalX / blockX,
                   globalY / blockY, 12, A, bytesA, lda, B, bytesB, ldb, C,
                   bytesC, ldc, k, alpha, beta, 0);
  __hpvm__wait(sgemmDFG);
}
int main(int argc, char *argv[]) {
struct pb_Parameters *params;
struct pb_TimerSet timers;
size_t A_sz, B_sz, C_sz;
int matArow, matAcol;
int matBrow, matBcol;
std::vector<float> matA, matBT;
/* Read command line. Expect 3 inputs: A, B and B^T
in column-major layout*/
params = pb_ReadParameters(&argc, argv);
if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
(params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
fprintf(stderr, "Expecting three input filenames\n");
exit(-1);
}
/* Read in data */
// load A
readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
// load B^T
readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
pb_InitializeTimerSet(&timers);
__hpvm__init();
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
// copy A to device memory
A_sz = matArow * matAcol * sizeof(float);
B_sz = matBrow * matBcol * sizeof(float);
// allocate space for C
C_sz = matArow * matBcol * sizeof(float);
// OpenCL memory allocation
std::vector<float> matC(matArow * matBcol);
llvm_hpvm_track_mem(&matA.front(), A_sz);
llvm_hpvm_track_mem(&matBT.front(), B_sz);
llvm_hpvm_track_mem(&matC.front(), C_sz);
// Copy A and B^T into device memory
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
for (size_t i = 0; i < matC.size(); i++)
matC[i] = 0.0f;
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
// Use standard sgemm interface
basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
matArow);
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
llvm_hpvm_request_mem(&matC.front(), C_sz);
pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
llvm_hpvm_untrack_mem(&matA.front());
llvm_hpvm_untrack_mem(&matBT.front());
llvm_hpvm_untrack_mem(&matC.front());
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
__hpvm__cleanup();
if (params->outFile) {
/* Write C to file */
// pb_SwitchToTimer(&timers, pb_TimerID_IO);
writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
}
double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
<< std::endl;
pb_FreeParameters(params);
return 0;
}
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=hpvm
SRCDIR_OBJS=io.ll #compute_gold.o
HPVM_OBJS=main.hpvm.ll
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
APP_OPTFLAGS=-unroll-threshold=300 -loop-unroll -sroa
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include <fstream>
#include <iostream>
#include <vector>
/**
 * Read an entire file into a freshly allocated, NUL-terminated buffer.
 *
 * @param fileName path of the file to read.
 * @return a buffer allocated with new[] (release it with delete[]) that holds
 *         every byte read followed by a terminating '\0', or NULL when the
 *         file cannot be opened. An empty file, or one whose size cannot be
 *         determined, yields an empty string.
 */
char *readFile(const char *fileName) {
  std::fstream f(fileName, std::fstream::in);
  if (!f.good()) {
    std::cerr << "Error Reading File!!" << std::endl;
    return NULL;
  }
  f.seekg(0, std::ios::end);
  const std::streamoff length = f.tellg(); // -1 if the position is unknown
  f.seekg(0, std::ios::beg);

  // Reserve one extra byte for the terminator so that no file content is
  // overwritten, and always use the array form of new so callers can
  // uniformly delete[] the result.
  const std::streamsize capacity = length > 0 ? length : 0;
  char *buffer = new char[capacity + 1];
  std::streamsize got = 0;
  if (capacity > 0) {
    f.read(buffer, capacity);
    got = f.gcount(); // may be smaller than capacity (e.g. text-mode reads)
  }
  buffer[got] = 0;
  f.close();
  return buffer;
}
/**
 * Load a matrix stored as whitespace-separated text: two integers (row and
 * column counts) followed by the element values.
 *
 * @param fn     path of the file to read.
 * @param nr_row receives the first integer found in the file.
 * @param nr_col receives the second integer found in the file.
 * @param v      every value that parses as a float is appended in file order;
 *               elements already present in v are left untouched.
 * @return false if the file cannot be opened or the two dimensions cannot be
 *         parsed, true otherwise. The number of values read is not compared
 *         against nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
                            std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if (!f.good()) {
    return false;
  }
  // Read # of rows and cols
  if (!(f >> nr_row >> nr_col)) {
    return false;
  }
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  // Test the extraction itself rather than the stream state before it: this
  // stores exactly the values that were parsed, whether or not the file ends
  // with trailing whitespace, and never needs a compensating pop_back().
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}
/**
 * Write a matrix as text: "<nr_row> <nr_col> " followed by every element of
 * v, each followed by one space, and a final newline.
 *
 * @param fn     path of the file to create or overwrite.
 * @param nr_row value written as the first header field.
 * @param nr_col value written as the second header field.
 * @param v      values to write, emitted in storage order (not modified).
 * @return false if the file cannot be opened or the stream reports an error
 *         after the data has been flushed, true otherwise.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
                             std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if (!f.good()) {
    return false;
  }
  // Write # of rows and cols
  f << nr_row << " " << nr_col << " ";
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";
  // Flush before inspecting the state so that a failed write (disk full,
  // I/O error) is reported to the caller instead of being lost on close.
  f.flush();
  return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Main entry of dense matrix-matrix multiplication kernel
*/
#include <hpvm.h>
#include <iostream>
#include <malloc.h>
#include <math.h>
#include <parboil.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <vector>
// I/O routines. Only declared here; no definition appears in this file, so
// they are expected to be supplied by a separately built object at link time.
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
std::vector<float> &v);
extern bool writeColMajorMatrixFile(const char *fn, int, int,
std::vector<float> &);
extern char *readFile(const char *);
// Parameters of tile sizes.
// TILE_N is the number of results accumulated per SgemmLeaf instance and the
// x extent of a block (block_x in basicSgemm); TILE_TB_HEIGHT is the y extent
// of a block and the step used when walking the k dimension; TILE_M is their
// product (16 * 8 = 128).
#define TILE_N 16
#define TILE_TB_HEIGHT 8
#define TILE_M (TILE_N * TILE_TB_HEIGHT)
// Aborts the program with the given message and the current line number when
// a variable named clStatus differs from CL_SUCCESS.
// NOTE(review): neither clStatus nor CL_SUCCESS is declared in this file and
// the macro is never invoked here; it looks like a leftover from an OpenCL
// host program - confirm before relying on it.
#define CHECK_ERROR(errorMessage) \
if (clStatus != CL_SUCCESS) { \
std::cout << errorMessage << " Error!\n"; \
std::cout << "Line: " << __LINE__ << "\n"; \
exit(1); \
}
// Argument record filled by packData and handed to __hpvm__launch in
// basicSgemm. The fields appear in exactly the order of SgemmWrapper's
// parameters, and the packed attribute removes any padding between them.
// NOTE(review): presumably the HPVM runtime unpacks this record by position,
// so field order, types and packing must stay in sync with SgemmWrapper -
// confirm against the HPVM documentation before changing anything here.
typedef struct __attribute__((__packed__)) {
// Matrix A: buffer, size in bytes, leading dimension
float *A;
size_t bytesA;
int lda;
// Matrix B: buffer, size in bytes, leading dimension
float *B;
size_t bytesB;
int ldb;
// Matrix C: buffer, size in bytes, leading dimension
float *C;
size_t bytesC;
int ldc;
// Accumulation length and scaling factors
int k;
float alpha;
float beta;
// Instance counts used for the leaf (block_*) and work-group (grid_*) nodes
long block_x;
long block_y;
long grid_x;
long grid_y;
} RootIn;
/**
 * Store every launch argument into the record pointed to by args.
 *
 * Each parameter is copied verbatim into the RootIn field of the same name;
 * no validation or conversion is performed. args must point to writable
 * storage for one RootIn.
 */
void packData(RootIn *args, float *A, size_t bytesA, int lda, float *B,
              size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k,
              float alpha, float beta, long block_x, long block_y, long grid_x,
              long grid_y) {
  RootIn &record = *args;

  // Matrix A
  record.A = A;
  record.bytesA = bytesA;
  record.lda = lda;

  // Matrix B
  record.B = B;
  record.bytesB = bytesB;
  record.ldb = ldb;

  // Matrix C
  record.C = C;
  record.bytesC = bytesC;
  record.ldc = ldc;

  // Scalars
  record.k = k;
  record.alpha = alpha;
  record.beta = beta;

  // Launch geometry
  record.block_x = block_x;
  record.block_y = block_y;
  record.grid_x = grid_x;
  record.grid_y = grid_y;
}
// Allocation node: obtains a buffer of block_x * block_y floats through
// __hpvm__malloc and hands back two values via __hpvm__return - the buffer
// pointer and its size in bytes. SgemmTB routes these two outputs to the shB
// and bytesshB parameters of SgemmLeaf.
// NOTE(review): the leading 2 in __hpvm__return presumably is the number of
// returned values - confirm against the HPVM intrinsic reference.
void Allocation(long block_x, long block_y) {
void *shB = __hpvm__malloc(block_x * block_y * sizeof(float));
__hpvm__return(2, shB, block_x * block_y * sizeof(float));
}
// Leaf (work-item) node of the sgemm graph.
//
// Each instance keeps TILE_N running sums. For every strip of TILE_TB_HEIGHT
// indices below k, the instances of one block cooperatively copy a tile of B
// into shB, every instance then combines one element of A per strip index
// with TILE_N entries of shB, and finally the sums are blended into C as
// C = C * beta + alpha * sum.
//
// A, B, C, their byte sizes and leading dimensions, k, alpha and beta arrive
// from the parent node; shB/bytesshB are the outputs of the Allocation node
// (see the edges created in SgemmTB).
void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB,
int ldb, float *C, size_t bytesC, int ldc, int k, float alpha,
float beta, float *shB, size_t bytesshB) {
__hpvm__hint(hpvm::DEVICE);
//__hpvm__hint(hpvm::SPIR_TARGET);
//__hpvm__hint(hpvm::GPU_TARGET);
__hpvm__attributes(3, A, B, C, 1, C);
void *thisNode = __hpvm__getNode();
void *parentNode = __hpvm__getParentNode(thisNode);
// Instance coordinates of this node (lx, ly) and of its parent (gx, gy)
long lx = __hpvm__getNodeInstanceID_x(thisNode);
long ly = __hpvm__getNodeInstanceID_y(thisNode);
long gx = __hpvm__getNodeInstanceID_x(parentNode);
long gy = __hpvm__getNodeInstanceID_y(parentNode);
// Number of instances of this node along x
long dimx = __hpvm__getNumNodeInstances_x(thisNode);
// Running sums owned by this instance
float c[TILE_N];
for (int i = 0; i < TILE_N; i++)
c[i] = 0.0f;
// Flattened position inside the block, and the offsets into A/C (m) and
// B (n) derived from it
int mid = ly * dimx + lx;
int m = gx * TILE_M + mid;
int n = gy * TILE_N + lx;
for (int i = 0; i < k; i += TILE_TB_HEIGHT) {
float a;
// shB[ly][lx] = B[n+(i+ly)*ldb];
// Every instance stores exactly one element of the current B tile.
shB[ly * dimx + lx] = B[n + (i + ly) * ldb];
// Wait until the whole tile has been written before anyone reads it.
__hpvm__barrier();
// NOTE(review): reading shB[j * dimx + kk] for j < TILE_TB_HEIGHT and
// kk < TILE_N assumes the block is TILE_N x TILE_TB_HEIGHT instances, which
// is what basicSgemm passes as block_x / block_y.
for (int j = 0; j < TILE_TB_HEIGHT; j++) {
a = A[m + (i + j) * lda];
for (int kk = 0; kk < TILE_N; kk++) {
// c[kk] += a * shB[j][kk];
c[kk] += a * shB[j * dimx + kk];
}
}
// Wait until every instance is done reading before the next iteration
// overwrites shB.
__hpvm__barrier();
}
// Blend the accumulated sums into C
int t = ldc * gy * TILE_N + m;
for (int i = 0; i < TILE_N; i++) {
C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
}
}
// Work group node for sgemm - Creates allocation node and leaf (work item) node
//
// Parameters 0-11 (A .. beta) are forwarded unchanged to SgemmLeaf inputs
// 0-11; block_x/block_y (parameters 12-13) size the leaf node and feed the
// Allocation node, whose two outputs become SgemmLeaf inputs 12 (shB) and
// 13 (bytesshB).
// NOTE(review): the argument meanings of __hpvm__bindIn (child, parent input
// index, child input index, flag) and __hpvm__edge (source, destination,
// flag, source output index, destination input index, flag) are inferred
// from how the indices line up with the parameter lists - confirm against
// the HPVM intrinsic reference.
void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb,
float *C, size_t bytesC, int ldc, int k, float alpha, float beta,
long block_x, long block_y) {
__hpvm__hint(hpvm::CPU_TARGET);
__hpvm__attributes(3, A, B, C, 1, C);
// One Allocation node without dimensions, and a 2-D leaf node created with
// block_x and block_y as its extents
void *AllocationNode = __hpvm__createNodeND(0, Allocation);
void *SgemmLeafNode = __hpvm__createNodeND(2, SgemmLeaf, block_x, block_y);
// Bind edges
__hpvm__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A
__hpvm__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA
__hpvm__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda
__hpvm__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B
__hpvm__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB
__hpvm__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb
__hpvm__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C
__hpvm__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC
__hpvm__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc
__hpvm__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k
__hpvm__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
__hpvm__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
__hpvm__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
__hpvm__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
// Create Edges between AllocationNode and SgemmLeafNode
__hpvm__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B
__hpvm__edge(AllocationNode, SgemmLeafNode, 1, 1, 13,
0); // Edge bytes_local_B
}
// Root node for sgemm - Creates work group node
//
// Creates a 2-D SgemmTB node with grid_x and grid_y as its extents and
// forwards parameters 0-13 (A .. block_y) to the SgemmTB inputs with the same
// indices. grid_x and grid_y are used only for the node creation and are not
// bound to any SgemmTB input.
void SgemmRoot(float *A, size_t bytesA, int lda, // 0-2
float *B, size_t bytesB, int ldb, // 3-5
float *C, size_t bytesC, int ldc, // 6-8
int k, float alpha, float beta, // 9-11
long block_x, long block_y, long grid_x, long grid_y // 12-15
) {
__hpvm__hint(hpvm::CPU_TARGET);
__hpvm__attributes(3, A, B, C, 1, C);
void *SgemmTBNode = __hpvm__createNodeND(2, SgemmTB, grid_x, grid_y);
// Bind edges
__hpvm__bindIn(SgemmTBNode, 0, 0, 0); // Bind A
__hpvm__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA
__hpvm__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda
__hpvm__bindIn(SgemmTBNode, 3, 3, 0); // Bind B
__hpvm__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB
__hpvm__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb
__hpvm__bindIn(SgemmTBNode, 6, 6, 0); // Bind C
__hpvm__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC
__hpvm__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc
__hpvm__bindIn(SgemmTBNode, 9, 9, 0); // Bind k
__hpvm__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
__hpvm__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
__hpvm__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
__hpvm__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
}
// Wrapper node passed to __hpvm__launch by basicSgemm. It creates a single
// SgemmRoot node without dimensions and forwards all sixteen parameters
// (indices 0-15) to the SgemmRoot inputs with the same indices. The parameter
// order matches the field order of RootIn.
void SgemmWrapper(float *A, size_t bytesA, int lda, // 0-2
float *B, size_t bytesB, int ldb, // 3-5
float *C, size_t bytesC, int ldc, // 6-8
int k, float alpha, float beta, // 9-11
long block_x, long block_y, long grid_x, long grid_y // 12-15
) {
__hpvm__hint(hpvm::CPU_TARGET);
__hpvm__attributes(3, A, B, C, 1, C);
void *SgemmRootNode = __hpvm__createNodeND(0, SgemmRoot);
// Bind edges
__hpvm__bindIn(SgemmRootNode, 0, 0, 0); // Bind A
__hpvm__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA
__hpvm__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda
__hpvm__bindIn(SgemmRootNode, 3, 3, 0); // Bind B
__hpvm__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB
__hpvm__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb
__hpvm__bindIn(SgemmRootNode, 6, 6, 0); // Bind C
__hpvm__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC
__hpvm__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc
__hpvm__bindIn(SgemmRootNode, 9, 9, 0); // Bind k
__hpvm__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha
__hpvm__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta
__hpvm__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x
__hpvm__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y
__hpvm__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x
__hpvm__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y
}
// Creates root node for sgemm
/**
 * Validate the request, pack the arguments into a RootIn record, launch the
 * SgemmWrapper graph and block until it has finished.
 *
 * Only transa 'N'/'n' together with transb 'T'/'t' is accepted, and m and n
 * must be multiples of TILE_M and TILE_N respectively; any other request, or
 * a failure to allocate the argument record, is reported on std::cerr and
 * the function returns without launching anything.
 *
 * @param timers         timer set; switched to hpvm_TimerID_COMPUTATION for
 *                       the launch and back to pb_TimerID_COMPUTE afterwards.
 * @param transa,transb  transpose selectors (see above).
 * @param m,n,k          problem dimensions.
 * @param alpha,beta     scaling factors forwarded to the graph.
 * @param A,B,C          matrix buffers with byte sizes bytesA/bytesB/bytesC
 *                       and leading dimensions lda/ldb/ldc.
 */
__attribute__((noinline)) void basicSgemm(struct pb_TimerSet *timers,
                                          char transa, char transb, int m,
                                          int n, int k, float alpha, float *A,
                                          size_t bytesA, int lda, float *B,
                                          size_t bytesB, int ldb, float beta,
                                          float *C, size_t bytesC, int ldc) {
  if ((transa != 'N') && (transa != 'n')) {
    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
    return;
  }
  if ((transb != 'T') && (transb != 't')) {
    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
    return;
  }
  // In this code we assume the matrix sizes are multiple of tile size
  if ((m % TILE_M) || (n % TILE_N)) {
    std::cerr << "unsupported size of matrix. m should be multiple of "
              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
    return;
  }
  // One block is TILE_N x TILE_TB_HEIGHT leaf instances; the grid covers m in
  // steps of TILE_M and n in steps of TILE_N.
  long block_x = TILE_N;
  long block_y = TILE_TB_HEIGHT;
  long grid_x = m / TILE_M;
  long grid_y = n / TILE_N;
  // Pack data in struct
  RootIn *args = (RootIn *)malloc(sizeof(RootIn));
  if (args == NULL) {
    std::cerr << "unable to allocate launch arguments in regtileSgemm()"
              << std::endl;
    return;
  }
  packData(args, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta,
           block_x, block_y, grid_x, grid_y);
  pb_SwitchToTimer(timers, hpvm_TimerID_COMPUTATION);
  void *sgemmDFG = __hpvm__launch(0, SgemmWrapper, (void *)args);
  __hpvm__wait(sgemmDFG);
  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
  // The graph has completed once __hpvm__wait returns, so the argument
  // record is no longer referenced and can be released.
  free(args);
}
/**
 * Benchmark driver: loads A and B^T, runs basicSgemm through HPVM, optionally
 * writes the result matrix, and prints timers and a GFLOPs figure.
 *
 * Exits with status -1 when the command line does not name exactly three
 * input files or when an input matrix cannot be loaded.
 */
int main(int argc, char *argv[]) {
  struct pb_Parameters *params;
  struct pb_TimerSet timers;
  size_t A_sz, B_sz, C_sz;
  int matArow, matAcol;
  int matBrow, matBcol;
  std::vector<float> matA, matBT;

  /* Read command line. Expect 3 inputs: A, B and B^T
     in column-major layout*/
  params = pb_ReadParameters(&argc, argv);
  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
    fprintf(stderr, "Expecting three input filenames\n");
    exit(-1);
  }

  /* Read in data */
  // load A
  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA)) {
    fprintf(stderr, "Cannot read matrix file %s\n", params->inpFiles[0]);
    exit(-1);
  }
  // load B^T
  if (!readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT)) {
    fprintf(stderr, "Cannot read matrix file %s\n", params->inpFiles[2]);
    exit(-1);
  }
  // The buffers are handed out below through &front(), which is undefined
  // for an empty vector, so reject degenerate inputs up front.
  if (matArow <= 0 || matAcol <= 0 || matBrow <= 0 || matBcol <= 0 ||
      matA.empty() || matBT.empty()) {
    fprintf(stderr, "Input matrices must not be empty\n");
    exit(-1);
  }

  pb_InitializeTimerSet(&timers);
  __hpvm__init();
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  // Byte sizes are computed in size_t so that large matrices cannot overflow
  // an int product before the widening takes place.
  A_sz = static_cast<size_t>(matArow) * matAcol * sizeof(float);
  B_sz = static_cast<size_t>(matBrow) * matBcol * sizeof(float);
  // allocate space for C
  C_sz = static_cast<size_t>(matArow) * matBcol * sizeof(float);
  std::vector<float> matC(static_cast<size_t>(matArow) * matBcol);

  // Register the three buffers with the HPVM memory tracker
  llvm_hpvm_track_mem(&matA.front(), A_sz);
  llvm_hpvm_track_mem(&matBT.front(), B_sz);
  llvm_hpvm_track_mem(&matC.front(), C_sz);

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  for (size_t i = 0; i < matC.size(); i++)
    matC[i] = 0.0f;

  // Use standard sgemm interface
  basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(),
             A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(),
             C_sz, matArow);

  // Request the result buffer back before it is untracked
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
  llvm_hpvm_request_mem(&matC.front(), C_sz);
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  llvm_hpvm_untrack_mem(&matA.front());
  llvm_hpvm_untrack_mem(&matBT.front());
  llvm_hpvm_untrack_mem(&matC.front());
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  __hpvm__cleanup();

  if (params->outFile) {
    /* Write C to file */
    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
    if (!writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC)) {
      fprintf(stderr, "Cannot write matrix file %s\n", params->outFile);
    }
  }

  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
            << std::endl;
  pb_FreeParameters(params);
  return 0;
}
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=hpvm
SRCDIR_OBJS=io.ll #compute_gold.o
HPVM_OBJS=main.hpvm.ll
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include <fstream>
#include <iostream>
#include <vector>
/**
 * Read an entire file into a freshly allocated, NUL-terminated buffer.
 *
 * @param fileName path of the file to read.
 * @return a buffer allocated with new[] (release it with delete[]) that holds
 *         every byte read followed by a terminating '\0', or NULL when the
 *         file cannot be opened. An empty file, or one whose size cannot be
 *         determined, yields an empty string.
 */
char *readFile(const char *fileName) {
  std::fstream f(fileName, std::fstream::in);
  if (!f.good()) {
    std::cerr << "Error Reading File!!" << std::endl;
    return NULL;
  }
  f.seekg(0, std::ios::end);
  const std::streamoff length = f.tellg(); // -1 if the position is unknown
  f.seekg(0, std::ios::beg);

  // Reserve one extra byte for the terminator so that no file content is
  // overwritten, and always use the array form of new so callers can
  // uniformly delete[] the result.
  const std::streamsize capacity = length > 0 ? length : 0;
  char *buffer = new char[capacity + 1];
  std::streamsize got = 0;
  if (capacity > 0) {
    f.read(buffer, capacity);
    got = f.gcount(); // may be smaller than capacity (e.g. text-mode reads)
  }
  buffer[got] = 0;
  f.close();
  return buffer;
}
/**
 * Load a matrix stored as whitespace-separated text: two integers (row and
 * column counts) followed by the element values.
 *
 * @param fn     path of the file to read.
 * @param nr_row receives the first integer found in the file.
 * @param nr_col receives the second integer found in the file.
 * @param v      every value that parses as a float is appended in file order;
 *               elements already present in v are left untouched.
 * @return false if the file cannot be opened or the two dimensions cannot be
 *         parsed, true otherwise. The number of values read is not compared
 *         against nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
                            std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if (!f.good()) {
    return false;
  }
  // Read # of rows and cols
  if (!(f >> nr_row >> nr_col)) {
    return false;
  }
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  // Test the extraction itself rather than the stream state before it: this
  // stores exactly the values that were parsed, whether or not the file ends
  // with trailing whitespace, and never needs a compensating pop_back().
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}
/**
 * Write a matrix as text: "<nr_row> <nr_col> " followed by every element of
 * v, each followed by one space, and a final newline.
 *
 * @param fn     path of the file to create or overwrite.
 * @param nr_row value written as the first header field.
 * @param nr_col value written as the second header field.
 * @param v      values to write, emitted in storage order (not modified).
 * @return false if the file cannot be opened or the stream reports an error
 *         after the data has been flushed, true otherwise.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
                             std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if (!f.good()) {
    return false;
  }
  // Write # of rows and cols
  f << nr_row << " " << nr_col << " ";
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";
  // Flush before inspecting the state so that a failed write (disk full,
  // I/O error) is reported to the caller instead of being lost on close.
  f.flush();
  return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Main entry of dense matrix-matrix multiplication kernel
*/
#include <hpvm.h>
#include <iostream>
#include <malloc.h>
#include <math.h>
#include <parboil.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <vector>
// I/O routines. Only declared here; no definition appears in this file, so
// they are expected to be supplied by a separately built object at link time.
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
std::vector<float> &v);
extern bool writeColMajorMatrixFile(const char *fn, int, int,
std::vector<float> &);
extern char *readFile(const char *);
// Parameters of tile sizes.
// basicSgemm requires m and n to be multiples of TILE_SZ and derives the
// launch extents {TILE_SZ / 4, TILE_SZ} from it.
#define TILE_SZ 16
// Aborts the program with the given message and the current line number when
// a variable named clStatus differs from CL_SUCCESS.
// NOTE(review): neither clStatus nor CL_SUCCESS is declared in this file and
// the macro is never invoked here; it looks like a leftover from an OpenCL
// host program - confirm before relying on it.
#define CHECK_ERROR(errorMessage) \
if (clStatus != CL_SUCCESS) { \
std::cout << errorMessage << " Error!\n"; \
std::cout << "Line: " << __LINE__ << "\n"; \
exit(1); \
}
/**
 * Compute body launched by basicSgemm through __hpvm__node.
 *
 * Each invocation produces four consecutive elements of one column of C:
 * for every index below k it multiplies four consecutive elements of A by a
 * single element of B and accumulates the products, then blends the sums
 * into C as C = C * beta + alpha * sum.
 *
 * @param A,lda  first input buffer and the stride applied per k index.
 * @param B,ldb  second input buffer and the stride applied per k index.
 * @param C,ldc  buffer read and updated in place, and its stride.
 * @param k      number of products accumulated per output element.
 * @param alpha  scale applied to the accumulated sums.
 * @param beta   scale applied to the previous contents of C.
 */
void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
               float alpha, float beta) {
  __hpvm__attributes(3, A, B, C, 1, C);

  // Four independent running sums, one per output element
  float sum0 = 0.0f;
  float sum1 = 0.0f;
  float sum2 = 0.0f;
  float sum3 = 0.0f;

  // First of the four consecutive A/C offsets, and the B/C column offset
  const int row = 4 * get_global_id(0);
  const int col = get_global_id(1);

  for (int step = 0; step < k; ++step) {
    const int aBase = row + step * lda;
    const float bVal = B[col + step * ldb];
    sum0 += A[aBase] * bVal;
    sum1 += A[aBase + 1] * bVal;
    sum2 += A[aBase + 2] * bVal;
    sum3 += A[aBase + 3] * bVal;
  }

  // Blend the accumulated sums into C
  const int outBase = row + col * ldc;
  C[outBase] = C[outBase] * beta + alpha * sum0;
  C[outBase + 1] = C[outBase + 1] * beta + alpha * sum1;
  C[outBase + 2] = C[outBase + 2] * beta + alpha * sum2;
  C[outBase + 3] = C[outBase + 3] * beta + alpha * sum3;
}
/**
 * Validate the request and run mysgemmNT as an HPVM node, blocking until it
 * has finished.
 *
 * Only transa 'N'/'n' together with transb 'T'/'t' is accepted, and both m
 * and n must be multiples of TILE_SZ; any other request is reported on
 * std::cerr and the function returns without touching C.
 *
 * @param transa,transb  transpose selectors (see above).
 * @param m,n,k          problem dimensions.
 * @param alpha,beta     scaling factors forwarded to mysgemmNT.
 * @param A,B,C          matrix buffers with byte sizes bytesA/bytesB/bytesC
 *                       and leading dimensions lda/ldb/ldc.
 */
__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
                                          int n, int k, float alpha, float *A,
                                          size_t bytesA, int lda, float *B,
                                          size_t bytesB, int ldb, float beta,
                                          float *C, size_t bytesC, int ldc) {
  if ((transa != 'N') && (transa != 'n')) {
    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
    return;
  }
  if ((transb != 'T') && (transb != 't')) {
    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
    return;
  }
  // In this code we assume the matrix sizes are multiple of tile size
  if ((m % TILE_SZ) || (n % TILE_SZ)) {
    std::cerr << "unsupported size of matrix. m should be multiple of "
              << TILE_SZ << "; n should be multiple of " << TILE_SZ
              << std::endl;
    // Stop here like the other argument checks do: launching anyway would
    // cover only the truncated m / TILE_SZ and n / TILE_SZ tiles and leave
    // the remaining part of C silently uncomputed.
    return;
  }
  // mysgemmNT handles four consecutive elements per invocation along x,
  // hence TILE_SZ / 4 invocations per tile in that dimension.
  unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
  // The literal 12 equals the number of arguments listed after it, up to the
  // trailing 0.
  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
  __hpvm__wait(sgemmDFG);
}
/**
 * Benchmark driver: loads A and B^T, runs basicSgemm through HPVM, optionally
 * writes the result matrix, and prints a GFLOPs figure and the timers.
 *
 * Exits with status -1 when the command line does not name exactly three
 * input files or when an input matrix cannot be loaded.
 */
int main(int argc, char *argv[]) {
  struct pb_Parameters *params;
  struct pb_TimerSet timers;
  size_t A_sz, B_sz, C_sz;
  int matArow, matAcol;
  int matBrow, matBcol;
  std::vector<float> matA, matBT;

  pb_InitializeTimerSet(&timers);
  __hpvm__init();

  /* Read command line. Expect 3 inputs: A, B and B^T
     in column-major layout*/
  params = pb_ReadParameters(&argc, argv);
  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
    fprintf(stderr, "Expecting three input filenames\n");
    exit(-1);
  }

  /* Read in data */
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  // load A
  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA)) {
    fprintf(stderr, "Cannot read matrix file %s\n", params->inpFiles[0]);
    exit(-1);
  }
  // load B^T
  if (!readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT)) {
    fprintf(stderr, "Cannot read matrix file %s\n", params->inpFiles[2]);
    exit(-1);
  }
  // The buffers are handed out below through &front(), which is undefined
  // for an empty vector, so reject degenerate inputs up front.
  if (matArow <= 0 || matAcol <= 0 || matBrow <= 0 || matBcol <= 0 ||
      matA.empty() || matBT.empty()) {
    fprintf(stderr, "Input matrices must not be empty\n");
    exit(-1);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  // Byte sizes are computed in size_t so that large matrices cannot overflow
  // an int product before the widening takes place.
  A_sz = static_cast<size_t>(matArow) * matAcol * sizeof(float);
  B_sz = static_cast<size_t>(matBrow) * matBcol * sizeof(float);
  // allocate space for C
  C_sz = static_cast<size_t>(matArow) * matBcol * sizeof(float);
  std::vector<float> matC(static_cast<size_t>(matArow) * matBcol);

  // Register the three buffers with the HPVM memory tracker
  llvm_hpvm_track_mem(&matA.front(), A_sz);
  llvm_hpvm_track_mem(&matBT.front(), B_sz);
  llvm_hpvm_track_mem(&matC.front(), C_sz);

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  for (size_t i = 0; i < matC.size(); i++)
    matC[i] = 0.0f;
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  // Use standard sgemm interface
  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
             matArow);

  if (params->outFile) {
    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
    /* Write C to file */
    llvm_hpvm_request_mem(&matC.front(), C_sz);
    pb_SwitchToTimer(&timers, pb_TimerID_IO);
    if (!writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC)) {
      fprintf(stderr, "Cannot write matrix file %s\n", params->outFile);
    }
  }

  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
  llvm_hpvm_untrack_mem(&matA.front());
  llvm_hpvm_untrack_mem(&matBT.front());
  llvm_hpvm_untrack_mem(&matC.front());
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);

  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
            << std::endl;
  pb_PrintTimerSet(&timers);
  __hpvm__cleanup();
  pb_FreeParameters(params);
  return 0;
}
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=hpvm
SRCDIR_OBJS=io.ll #compute_gold.o
HPVM_OBJS=main.hpvm.ll
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include <fstream>
#include <iostream>
#include <vector>
/**
 * Read an entire file into a freshly allocated, NUL-terminated buffer.
 *
 * @param fileName path of the file to read.
 * @return a buffer allocated with new[] (release it with delete[]) that holds
 *         every byte read followed by a terminating '\0', or NULL when the
 *         file cannot be opened. An empty file, or one whose size cannot be
 *         determined, yields an empty string.
 */
char *readFile(const char *fileName) {
  std::fstream f(fileName, std::fstream::in);
  if (!f.good()) {
    std::cerr << "Error Reading File!!" << std::endl;
    return NULL;
  }
  f.seekg(0, std::ios::end);
  const std::streamoff length = f.tellg(); // -1 if the position is unknown
  f.seekg(0, std::ios::beg);

  // Reserve one extra byte for the terminator so that no file content is
  // overwritten, and always use the array form of new so callers can
  // uniformly delete[] the result.
  const std::streamsize capacity = length > 0 ? length : 0;
  char *buffer = new char[capacity + 1];
  std::streamsize got = 0;
  if (capacity > 0) {
    f.read(buffer, capacity);
    got = f.gcount(); // may be smaller than capacity (e.g. text-mode reads)
  }
  buffer[got] = 0;
  f.close();
  return buffer;
}
/**
 * Load a matrix stored as whitespace-separated text: two integers (row and
 * column counts) followed by the element values.
 *
 * @param fn     path of the file to read.
 * @param nr_row receives the first integer found in the file.
 * @param nr_col receives the second integer found in the file.
 * @param v      every value that parses as a float is appended in file order;
 *               elements already present in v are left untouched.
 * @return false if the file cannot be opened or the two dimensions cannot be
 *         parsed, true otherwise. The number of values read is not compared
 *         against nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
                            std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if (!f.good()) {
    return false;
  }
  // Read # of rows and cols
  if (!(f >> nr_row >> nr_col)) {
    return false;
  }
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  // Test the extraction itself rather than the stream state before it: this
  // stores exactly the values that were parsed, whether or not the file ends
  // with trailing whitespace, and never needs a compensating pop_back().
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}
/**
 * Write a matrix as text: "<nr_row> <nr_col> " followed by every element of
 * v, each followed by one space, and a final newline.
 *
 * @param fn     path of the file to create or overwrite.
 * @param nr_row value written as the first header field.
 * @param nr_col value written as the second header field.
 * @param v      values to write, emitted in storage order (not modified).
 * @return false if the file cannot be opened or the stream reports an error
 *         after the data has been flushed, true otherwise.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
                             std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if (!f.good()) {
    return false;
  }
  // Write # of rows and cols
  f << nr_row << " " << nr_col << " ";
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";
  // Flush before inspecting the state so that a failed write (disk full,
  // I/O error) is reported to the caller instead of being lost on close.
  f.flush();
  return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Main entry of dense matrix-matrix multiplication kernel
*/
#include <hpvm.h>
#include <iostream>
#include <malloc.h>
#include <math.h>
#include <parboil.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <vector>
// I/O routines. Only declared here; no definition appears in this file, so
// they are expected to be supplied by a separately built object at link time.
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
std::vector<float> &v);
extern bool writeColMajorMatrixFile(const char *fn, int, int,
std::vector<float> &);
extern char *readFile(const char *);
// Parameters of tile sizes.
// basicSgemm requires m and n to be multiples of TILE_SZ and derives the
// launch extents {TILE_SZ / 4, TILE_SZ} from it.
#define TILE_SZ 16
// Aborts the program with the given message and the current line number when
// a variable named clStatus differs from CL_SUCCESS.
// NOTE(review): neither clStatus nor CL_SUCCESS is declared in this file and
// the macro is never invoked here; it looks like a leftover from an OpenCL
// host program - confirm before relying on it.
#define CHECK_ERROR(errorMessage) \
if (clStatus != CL_SUCCESS) { \
std::cout << errorMessage << " Error!\n"; \
std::cout << "Line: " << __LINE__ << "\n"; \
exit(1); \
}
/**
 * Compute body launched by basicSgemm through __hpvm__node.
 *
 * Each invocation produces four consecutive elements of one column of C:
 * for every index below k it multiplies four consecutive elements of A by a
 * single element of B and accumulates the products, then blends the sums
 * into C as C = C * beta + alpha * sum.
 *
 * @param A,lda  first input buffer and the stride applied per k index.
 * @param B,ldb  second input buffer and the stride applied per k index.
 * @param C,ldc  buffer read and updated in place, and its stride.
 * @param k      number of products accumulated per output element.
 * @param alpha  scale applied to the accumulated sums.
 * @param beta   scale applied to the previous contents of C.
 */
void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
               float alpha, float beta) {
  __hpvm__attributes(3, A, B, C, 1, C);

  // Four independent running sums, one per output element
  float sum0 = 0.0f;
  float sum1 = 0.0f;
  float sum2 = 0.0f;
  float sum3 = 0.0f;

  // First of the four consecutive A/C offsets, and the B/C column offset
  const int row = 4 * get_global_id(0);
  const int col = get_global_id(1);

  for (int step = 0; step < k; ++step) {
    const int aBase = row + step * lda;
    const float bVal = B[col + step * ldb];
    sum0 += A[aBase] * bVal;
    sum1 += A[aBase + 1] * bVal;
    sum2 += A[aBase + 2] * bVal;
    sum3 += A[aBase + 3] * bVal;
  }

  // Blend the accumulated sums into C
  const int outBase = row + col * ldc;
  C[outBase] = C[outBase] * beta + alpha * sum0;
  C[outBase + 1] = C[outBase + 1] * beta + alpha * sum1;
  C[outBase + 2] = C[outBase + 2] * beta + alpha * sum2;
  C[outBase + 3] = C[outBase + 3] * beta + alpha * sum3;
}
/**
 * Validate the request and run mysgemmNT as an HPVM node, blocking until it
 * has finished.
 *
 * Only transa 'N'/'n' together with transb 'T'/'t' is accepted, and both m
 * and n must be multiples of TILE_SZ; any other request is reported on
 * std::cerr and the function returns without touching C.
 *
 * @param transa,transb  transpose selectors (see above).
 * @param m,n,k          problem dimensions.
 * @param alpha,beta     scaling factors forwarded to mysgemmNT.
 * @param A,B,C          matrix buffers with byte sizes bytesA/bytesB/bytesC
 *                       and leading dimensions lda/ldb/ldc.
 */
__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
                                          int n, int k, float alpha, float *A,
                                          size_t bytesA, int lda, float *B,
                                          size_t bytesB, int ldb, float beta,
                                          float *C, size_t bytesC, int ldc) {
  if ((transa != 'N') && (transa != 'n')) {
    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
    return;
  }
  if ((transb != 'T') && (transb != 't')) {
    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
    return;
  }
  // In this code we assume the matrix sizes are multiple of tile size
  if ((m % TILE_SZ) || (n % TILE_SZ)) {
    std::cerr << "unsupported size of matrix. m should be multiple of "
              << TILE_SZ << "; n should be multiple of " << TILE_SZ
              << std::endl;
    // Stop here like the other argument checks do: launching anyway would
    // cover only the truncated m / TILE_SZ and n / TILE_SZ tiles and leave
    // the remaining part of C silently uncomputed.
    return;
  }
  // mysgemmNT handles four consecutive elements per invocation along x,
  // hence TILE_SZ / 4 invocations per tile in that dimension.
  unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
  // The literal 12 equals the number of arguments listed after it, up to the
  // trailing 0.
  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
  __hpvm__wait(sgemmDFG);
}
/*
 * Program entry point.
 *
 * Expects exactly three input files on the command line (A, B and B^T); only
 * inpFiles[0] (A) and inpFiles[2] (B^T) are actually read. Runs basicSgemm()
 * with alpha = 1 and beta = 0 into a zero-initialised C, optionally writes C
 * to params->outFile, then prints a GFLOPs figure derived from the
 * pb_TimerID_KERNEL timer and the full timer set.
 *
 * Exits with status -1 when the arguments are wrong or an input matrix cannot
 * be read.
 */
int main(int argc, char *argv[]) {
  struct pb_Parameters *params;
  struct pb_TimerSet timers;
  size_t A_sz, B_sz, C_sz;
  int matArow, matAcol;
  int matBrow, matBcol;
  std::vector<float> matA, matBT;
  pb_InitializeTimerSet(&timers);
  __hpvm__init();
  /* Read command line. Expect 3 inputs: A, B and B^T
     in column-major layout */
  params = pb_ReadParameters(&argc, argv);
  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
    fprintf(stderr, "Expecting three input filenames\n");
    exit(-1);
  }
  /* Read in data */
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  // load A; a failed read would leave matA empty and make &matA.front()
  // below undefined, so stop here instead.
  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA) ||
      matA.empty()) {
    fprintf(stderr, "Failed to read matrix A from %s\n", params->inpFiles[0]);
    exit(-1);
  }
  // load B^T (note the swapped row/column outputs)
  if (!readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT) ||
      matBT.empty()) {
    fprintf(stderr, "Failed to read matrix B^T from %s\n",
            params->inpFiles[2]);
    exit(-1);
  }
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  // Buffer sizes in bytes; widen to size_t before multiplying so that large
  // dimensions do not overflow int.
  A_sz = static_cast<size_t>(matArow) * matAcol * sizeof(float);
  B_sz = static_cast<size_t>(matBrow) * matBcol * sizeof(float);
  // allocate space for C
  C_sz = static_cast<size_t>(matArow) * matBcol * sizeof(float);
  std::vector<float> matC(static_cast<size_t>(matArow) * matBcol);
  // Register the host buffers with the HPVM runtime.
  llvm_hpvm_track_mem(&matA.front(), A_sz);
  llvm_hpvm_track_mem(&matBT.front(), B_sz);
  llvm_hpvm_track_mem(&matC.front(), C_sz);
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  for (size_t i = 0; i < matC.size(); i++)
    matC[i] = 0.0f;
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  // Use standard sgemm interface
  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
             matArow);
  if (params->outFile) {
    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
    /* Write C to file */
    // Ask the runtime for the current contents of C before writing it out.
    llvm_hpvm_request_mem(&matC.front(), C_sz);
    pb_SwitchToTimer(&timers, pb_TimerID_IO);
    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
  }
  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
  llvm_hpvm_untrack_mem(&matA.front());
  llvm_hpvm_untrack_mem(&matBT.front());
  llvm_hpvm_untrack_mem(&matC.front());
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
            << std::endl;
  pb_PrintTimerSet(&timers);
  __hpvm__cleanup();
  pb_FreeParameters(params);
  return 0;
}
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=hpvm
SRCDIR_OBJS=io.ll #compute_gold.o
HPVM_OBJS=main.hpvm.ll
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include <fstream>
#include <iostream>
#include <vector>
/*
 * Reads the whole file into a freshly allocated, NUL-terminated buffer.
 *
 * fileName  path of the file to read.
 * returns   NULL (after printing an error to std::cerr) when the file cannot
 *           be opened; otherwise a buffer allocated with new[] that the
 *           caller must release with delete[]. An empty or unsizable file
 *           yields a one-byte buffer holding only the terminator.
 */
char *readFile(const char *fileName) {
  std::fstream f(fileName, std::fstream::in);
  if (!f.good()) {
    std::cerr << "Error Reading File!!" << std::endl;
    return NULL;
  }
  f.seekg(0, std::ios::end);
  const std::streamoff length = f.tellg();
  f.seekg(0, std::ios::beg);
  if (length <= 0) {
    // Always allocate with new[] so callers can use delete[] uniformly.
    char *empty = new char[1];
    empty[0] = 0;
    return empty;
  }
  // One extra byte for the terminator so the last character of the file is
  // kept instead of being overwritten.
  char *buffer = new char[static_cast<std::size_t>(length) + 1];
  f.read(buffer, length);
  // Terminate after the bytes actually read, which may be fewer than length.
  buffer[f.gcount()] = 0;
  return buffer;
}
/*
 * Reads a whitespace-separated matrix file: two integers (row and column
 * counts) followed by the matrix values.
 *
 * fn      path of the file to read.
 * nr_row  receives the first integer in the file.
 * nr_col  receives the second integer in the file.
 * v       every value that follows the header is appended (v is not cleared).
 * returns false when the file cannot be opened or the two header integers
 *         cannot be parsed; true otherwise.
 *
 * The number of values read is not checked against nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
                            std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if (!f.good()) {
    return false;
  }
  // Read # of rows and cols
  if (!(f >> nr_row >> nr_col)) {
    return false;
  }
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  // Test the extraction itself rather than good(): this neither pushes a
  // bogus element at end of file nor drops the final value when the file has
  // no trailing whitespace.
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}
/*
 * Writes a matrix file: the row and column counts followed by every element
 * of v, each followed by a single space, and a final newline.
 *
 * fn      path of the file to create or overwrite.
 * nr_row  row count written as the first header value.
 * nr_col  column count written as the second header value.
 * v       values to write, in storage order.
 * returns false when the file cannot be opened or the stream reports an
 *         error after the data has been flushed; true otherwise.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
                             std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if (!f.good()) {
    return false;
  }
  // Write # of rows and cols
  f << nr_row << " " << nr_col << " ";
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  for (std::size_t i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";
  // Flush so that a failed write is visible in the return value.
  f.flush();
  return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* OpenCL kernel for dense matrix-matrix multiplication.
*/
// Each work-item updates eight consecutive elements of C starting at
// C[m + n * ldc], with m = get_global_id(0) * 8 and n = get_global_id(1):
//   C[m + r + n * ldc] = C[m + r + n * ldc] * beta
//                        + alpha * sum_i A[m + r + i * lda] * B[n + i * ldb]
// for r = 0..7 and i in [0, k).
// bytesA, bytesB and bytesC are accepted but not used in the kernel body.
// No bounds checks are performed on m, n or k.
__kernel void mysgemmNT( __global float *A, size_t bytesA, int lda, __global float *B, size_t bytesB, int ldb, __global float* C, size_t bytesC, int ldc, int k, float alpha, float beta )
{
/*
Disabled array-based formulation of the same per-element computation:
// Partial results
float c[8];
for (int i=0; i < 8; i++)
c[i] = 0.0f;
float a[8];
float b[8];
int m = get_global_id(0) * 8;
int n = get_global_id(1);
for (int i = 0; i < k; ++i) {
for (int id = 0; id < 8; id++) {
a[id] = A[m + id + i * lda];
b[id] = B[n + i * ldb];
c[id] += a[id] * b[id];
}
}
for (int id = 0; id < 8; id++)
C[m+id+n*ldc] = C[m+id+n*ldc] * beta + alpha * c[id];
*/
// Partial results
float8 cp = (float8)(0.0f);
int m = get_global_id(0) * 8;
int n = get_global_id(1);
for (int i = 0; i < k; ++i) {
// Load eight consecutive A elements and multiply them by one B element
// replicated across all eight lanes.
float8 a = vload8(0, A + (m + i * lda));
float8 b = (float8)(B[n + i * ldb]);
cp += a * b;
}
// Read-modify-write of the eight C elements owned by this work-item.
float8 c = vload8(0, C + (m+n*ldc));
c = c * beta + alpha * cp;
vstore8(c, 0, C + (m+n*ldc));
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Main entry of dense matrix-matrix multiplication kernel
*/
#include <hpvm.h>
#include <iostream>
#include <malloc.h>
#include <math.h>
#include <parboil.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <vector>
// I/O routines
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
std::vector<float> &v);
extern bool writeColMajorMatrixFile(const char *fn, int, int,
std::vector<float> &);
extern char *readFile(const char *);
// Parameters of tile sizes
// TILE_SZ: basicSgemm() expects m and n to be multiples of this value.
// VEC_SZ:  divisor applied to TILE_SZ for the first node dimension in
//          basicSgemm() (db[0] = TILE_SZ / VEC_SZ).
#define TILE_SZ 16
#define VEC_SZ 8
// Prints errorMessage and the current line number to std::cout and calls
// exit(1) when `clStatus` is not CL_SUCCESS.
// The macro reads a variable named `clStatus` at the expansion site; neither
// `clStatus` nor CL_SUCCESS is declared in the visible part of this source,
// so they must come from the caller's scope / included headers.
// NOTE(review): no expansion of this macro is visible in this file -- confirm
// whether it is still needed.
#define CHECK_ERROR(errorMessage) \
if (clStatus != CL_SUCCESS) { \
std::cout << errorMessage << " Error!\n"; \
std::cout << "Line: " << __LINE__ << "\n"; \
exit(1); \
}
// Function handed to __hpvm__node() by basicSgemm().
// As written, each invocation updates a single entry of C:
//   C[m + n * ldc] = C[m + n * ldc] * beta
//                    + alpha * sum_i A[m + i * lda] * B[n + i * ldb]
// with m = get_global_id(0), n = get_global_id(1) and i in [0, k).
// The trailing comment states that this body is replaced at the LLVM level by
// the float8 kernel quoted there, which covers eight elements per invocation.
// No bounds checks are performed on m, n or k.
void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
float alpha, float beta) {
// NOTE(review): presumably requests GPU code generation for this node --
// confirm against the HPVM intrinsic documentation.
__hpvm__hint(hpvm::GPU_TARGET);
// NOTE(review): presumably declares A, B, C as inputs and C as the output of
// this node -- confirm against the HPVM intrinsic documentation.
__hpvm__attributes(3, A, B, C, 1, C);
float c = 0.0f;
int m = get_global_id(0);
int n = get_global_id(1);
for (int i = 0; i < k; ++i) {
float a = A[m + i * lda];
float b = B[n + i * ldb];
c += a * b;
}
// Scale the existing C value by beta and add the alpha-scaled sum.
C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c;
/*
Will be substituted by this kernel at the llvm level
// Partial results
float8 cp = (float8)(0.0f);
int m = get_global_id(0) * 8;
int n = get_global_id(1);
for (int i = 0; i < k; ++i) {
float8 a = vload8(0, A + (m + i * lda));
float8 b = (float8)(B[n + i * ldb]);
cp += a * b;
}
float8 c = vload8(0, C + (m+n*ldc));
c = c * beta + alpha * cp;
vstore8(c, 0, C + (m+n*ldc));
*/
}
/*
 * Validates the requested operation and launches mysgemmNT through HPVM,
 * blocking until the node has finished.
 *
 * transa        must be 'N' or 'n'; anything else is rejected.
 * transb        must be 'T' or 't'; anything else is rejected.
 * m, n          output dimensions; both must be multiples of TILE_SZ.
 * k             number of accumulation steps forwarded to the node.
 * alpha, beta   scaling factors forwarded to the node.
 * A, B, C       matrix buffers; bytesA/bytesB/bytesC are their sizes in bytes.
 * lda, ldb, ldc strides forwarded to the node.
 *
 * On any rejected argument an error is printed to std::cerr and the function
 * returns without launching anything, leaving C untouched.
 */
__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
                                          int n, int k, float alpha, float *A,
                                          size_t bytesA, int lda, float *B,
                                          size_t bytesB, int ldb, float beta,
                                          float *C, size_t bytesC, int ldc) {
  if ((transa != 'N') && (transa != 'n')) {
    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
    return;
  }
  if ((transb != 'T') && (transb != 't')) {
    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
    return;
  }
  // In this code we assume the matrix sizes are multiple of tile size
  if ((m % TILE_SZ) || (n % TILE_SZ)) {
    std::cerr << "unsupported size of matrix. m should be multiple of "
              << TILE_SZ << "; n should be multiple of " << TILE_SZ
              << std::endl;
    // Previously execution fell through here and launched a grid that did not
    // cover the whole matrix; reject the call like the other guards do.
    return;
  }
  // db: per-group dimensions; the first dimension is divided by VEC_SZ.
  unsigned db[2] = {TILE_SZ / VEC_SZ, TILE_SZ};
  // dg: total dimensions across all groups.
  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
  // 12 = number of node arguments that follow (each pointer is paired with its
  // byte size).
  unsigned sgemmDFG = __hpvm__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
  __hpvm__wait(sgemmDFG);
}
/*
 * Program entry point.
 *
 * Expects exactly three input files on the command line (A, B and B^T); only
 * inpFiles[0] (A) and inpFiles[2] (B^T) are actually read. The inputs are
 * loaded before the timers and the HPVM runtime are initialised. Runs
 * basicSgemm() with alpha = 1 and beta = 0 into a zero-initialised C, prints
 * the timer set, optionally writes C to params->outFile, and finally prints
 * a GFLOPs figure derived from the pb_TimerID_KERNEL timer.
 *
 * Exits with status -1 when the arguments are wrong or an input matrix cannot
 * be read.
 */
int main(int argc, char *argv[]) {
  struct pb_Parameters *params;
  struct pb_TimerSet timers;
  size_t A_sz, B_sz, C_sz;
  int matArow, matAcol;
  int matBrow, matBcol;
  std::vector<float> matA, matBT;
  /* Read command line. Expect 3 inputs: A, B and B^T
     in column-major layout */
  params = pb_ReadParameters(&argc, argv);
  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
    fprintf(stderr, "Expecting three input filenames\n");
    exit(-1);
  }
  /* Read in data */
  // load A; a failed read would leave matA empty and make &matA.front()
  // below undefined, so stop here instead.
  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA) ||
      matA.empty()) {
    fprintf(stderr, "Failed to read matrix A from %s\n", params->inpFiles[0]);
    exit(-1);
  }
  // load B^T (note the swapped row/column outputs)
  if (!readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT) ||
      matBT.empty()) {
    fprintf(stderr, "Failed to read matrix B^T from %s\n",
            params->inpFiles[2]);
    exit(-1);
  }
  pb_InitializeTimerSet(&timers);
  __hpvm__init();
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  // Buffer sizes in bytes; widen to size_t before multiplying so that large
  // dimensions do not overflow int.
  A_sz = static_cast<size_t>(matArow) * matAcol * sizeof(float);
  B_sz = static_cast<size_t>(matBrow) * matBcol * sizeof(float);
  // allocate space for C
  C_sz = static_cast<size_t>(matArow) * matBcol * sizeof(float);
  std::vector<float> matC(static_cast<size_t>(matArow) * matBcol);
  // Register the host buffers with the HPVM runtime.
  llvm_hpvm_track_mem(&matA.front(), A_sz);
  llvm_hpvm_track_mem(&matBT.front(), B_sz);
  llvm_hpvm_track_mem(&matC.front(), C_sz);
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  for (size_t i = 0; i < matC.size(); i++)
    matC[i] = 0.0f;
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  // Use standard sgemm interface
  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
             matArow);
  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
  // Ask the runtime for the current contents of C before untracking it.
  llvm_hpvm_request_mem(&matC.front(), C_sz);
  pb_SwitchToTimer(&timers, hpvm_TimerID_MEM_UNTRACK);
  llvm_hpvm_untrack_mem(&matA.front());
  llvm_hpvm_untrack_mem(&matBT.front());
  llvm_hpvm_untrack_mem(&matC.front());
  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  __hpvm__cleanup();
  if (params->outFile) {
    /* Write C to file */
    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
  }
  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
            << std::endl;
  pb_FreeParameters(params);
  return 0;
}
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=hpvm
SRCDIR_OBJS=io.ll #compute_gold.o
HPVM_OBJS=main.hpvm.ll
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include <fstream>
#include <iostream>
#include <vector>
/*
 * Reads the whole file into a freshly allocated, NUL-terminated buffer.
 *
 * fileName  path of the file to read.
 * returns   NULL (after printing an error to std::cerr) when the file cannot
 *           be opened; otherwise a buffer allocated with new[] that the
 *           caller must release with delete[]. An empty or unsizable file
 *           yields a one-byte buffer holding only the terminator.
 */
char *readFile(const char *fileName) {
  std::fstream f(fileName, std::fstream::in);
  if (!f.good()) {
    std::cerr << "Error Reading File!!" << std::endl;
    return NULL;
  }
  f.seekg(0, std::ios::end);
  const std::streamoff length = f.tellg();
  f.seekg(0, std::ios::beg);
  if (length <= 0) {
    // Always allocate with new[] so callers can use delete[] uniformly.
    char *empty = new char[1];
    empty[0] = 0;
    return empty;
  }
  // One extra byte for the terminator so the last character of the file is
  // kept instead of being overwritten.
  char *buffer = new char[static_cast<std::size_t>(length) + 1];
  f.read(buffer, length);
  // Terminate after the bytes actually read, which may be fewer than length.
  buffer[f.gcount()] = 0;
  return buffer;
}
/*
 * Reads a whitespace-separated matrix file: two integers (row and column
 * counts) followed by the matrix values.
 *
 * fn      path of the file to read.
 * nr_row  receives the first integer in the file.
 * nr_col  receives the second integer in the file.
 * v       every value that follows the header is appended (v is not cleared).
 * returns false when the file cannot be opened or the two header integers
 *         cannot be parsed; true otherwise.
 *
 * The number of values read is not checked against nr_row * nr_col.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
                            std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << std::endl;
  std::fstream f(fn, std::fstream::in);
  if (!f.good()) {
    return false;
  }
  // Read # of rows and cols
  if (!(f >> nr_row >> nr_col)) {
    return false;
  }
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  // Test the extraction itself rather than good(): this neither pushes a
  // bogus element at end of file nor drops the final value when the file has
  // no trailing whitespace.
  float data;
  while (f >> data) {
    v.push_back(data);
  }
  return true;
}
/*
 * Writes a matrix file: the row and column counts followed by every element
 * of v, each followed by a single space, and a final newline.
 *
 * fn      path of the file to create or overwrite.
 * nr_row  row count written as the first header value.
 * nr_col  column count written as the second header value.
 * v       values to write, in storage order.
 * returns false when the file cannot be opened or the stream reports an
 *         error after the data has been flushed; true otherwise.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
                             std::vector<float> &v) {
  std::cerr << "Opening file:" << fn << " for write." << std::endl;
  std::fstream f(fn, std::fstream::out);
  if (!f.good()) {
    return false;
  }
  // Write # of rows and cols
  f << nr_row << " " << nr_col << " ";
  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
  for (std::size_t i = 0; i < v.size(); ++i) {
    f << v[i] << ' ';
  }
  f << "\n";
  // Flush so that a failed write is visible in the return value.
  f.flush();
  return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* OpenCL kernel for dense matrix-matrix multiplication.
*/
// Parameters of tile sizes
// TILE_N:         number of C entries each work-item of mysgemmNT accumulates
//                 (length of its c[] array) and the per-group offset factor
//                 applied to get_group_id(1).
// TILE_TB_HEIGHT: step of the outer loop over k in mysgemmNT.
// TILE_M:         per-group offset factor applied to get_group_id(0) when
//                 computing the row index m.
#define TILE_N 8
#define TILE_TB_HEIGHT 8
#define TILE_M (TILE_N*TILE_TB_HEIGHT)
/*
 * Each work-item updates TILE_N entries of C in a single row m:
 *   C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i],  i = 0..TILE_N-1
 * with t = ldc * get_group_id(1) * TILE_N + m and
 *   m = get_group_id(0) * TILE_M + (linearised local id).
 * c[i] accumulates A[m + p * lda] * B[get_group_id(1) * TILE_N + p * ldb + i]
 * over the accumulation index p.
 *
 * The outer loop advances p in steps of TILE_TB_HEIGHT and the inner loop
 * always runs TILE_TB_HEIGHT times, so when k is not a multiple of
 * TILE_TB_HEIGHT the kernel reads past index k. No bounds checks are performed.
 */
__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
{
    // Partial results
    float c[TILE_N];
    for (int i=0; i < TILE_N; i++)
        c[i] = 0.0f;
    // Linearised position of this work-item inside its work-group.
    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
    int m = get_group_id(0) * TILE_M + mid;
    int b_base = 0;
    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
        float a;
        b_base = get_group_id(1) * TILE_N + i * ldb;
        for (int j = 0; j < TILE_TB_HEIGHT; j++) {
            a = A[m + (i+j)*lda];
            for (int kk = 0; kk < TILE_N; kk++)
                c[kk] += a * B[b_base + j * ldb + kk];
        }
    }
    int t = ldc * get_group_id(1) * TILE_N + m;
    for (int i = 0; i < TILE_N; i++) {
        C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
    }
    /*
    Will be substituted by this kernel at the llvm level
    // Partial results
    floatn cp = (floatn)(0.0f);
    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
    int m = get_group_id(0) * TILE_M + mid;
    int b_base = 0;
    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
    float a;
    b_base = get_group_id(1) * TILE_N + i * ldb;
    for (int j = 0; j < TILE_TB_HEIGHT; j++) {
    a = A[m + (i+j)*lda];
    cp += a * vloadn(0, B + b_base + j * ldb);
    }
    }
    cp = alpha * cp;
    float c[TILE_N];
    c[0] = cp.s0;
    c[1] = cp.s1;
    c[2] = cp.s2;
    c[3] = cp.s3;
    c[4] = cp.s4;
    c[5] = cp.s5;
    c[6] = cp.s6;
    c[7] = cp.s7;
    int t = ldc * get_group_id(1) * TILE_N + m;
    for (int i = 0; i < TILE_N; i++) {
    C[t+i*ldc] = C[t+i*ldc] * beta + c[i];
    }
    */
    // A second, unmatched comment terminator used to follow the block above;
    // it sat outside any comment and made the kernel fail to compile.
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment