Skip to content
Snippets Groups Projects
Commit 320a7119 authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

sgemm opencl vector version committed

parent 877bca7e
No related branches found
No related tags found
No related merge requests found
# (c) 2010 The Board of Trustees of the University of Illinois.
LANGUAGE=opencl
SRCDIR_OBJS=main.o io.o #compute_gold.o
APP_CUDALDFLAGS=-lm -lstdc++
APP_CFLAGS=-ffast-math -O3
APP_CXXFLAGS=-ffast-math -O3
KERNEL_OBJS=kernel_offline.nvptx.s
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* I/O routines for reading and writing matrices in column-major
* layout
*/
#include<fstream>
#include<iostream>
#include<vector>
/**
 * Reads the whole contents of a file into a freshly allocated C string.
 *
 * @param fileName Path of the file to read.
 * @return A buffer allocated with new[] that holds every byte read from the
 *         file followed by a terminating '\0' (an empty string when the file
 *         is empty or its size cannot be determined), or NULL when the file
 *         cannot be opened. Ownership passes to the caller (delete[]).
 */
char* readFile(const char* fileName)
{
    std::fstream f(fileName, std::fstream::in);
    if (!f.good())
    {
        std::cerr<<"Error Reading File!!"<<std::endl;
        return NULL;
    }

    // Determine the file size by seeking to the end; tellg() reports -1 on failure.
    f.seekg(0, std::ios::end);
    const std::streamoff reported = f.tellg();
    f.seekg(0, std::ios::beg);
    const std::streamsize size = reported > 0 ? static_cast<std::streamsize>(reported) : 0;

    // One extra byte for the terminator so the last character of the file is
    // preserved instead of being overwritten.
    char* buffer = new char[size + 1];
    std::streamsize got = 0;
    if (size > 0)
    {
        f.read(buffer, size);
        // A text-mode read may deliver fewer bytes than the reported size;
        // terminate right after what was actually read.
        got = f.gcount();
    }
    buffer[got] = 0;

    f.close();
    return buffer;
}
/**
 * Reads a matrix from a whitespace-separated text file.
 *
 * The file starts with two integers (row count, then column count) followed
 * by the matrix values, which are appended to v in file order.
 *
 * @param fn     Path of the file to read.
 * @param nr_row Receives the row count from the file header.
 * @param nr_col Receives the column count from the file header.
 * @param v      Vector the values are appended to (existing content is kept).
 * @return true when the file was opened and its header parsed; false when the
 *         file cannot be opened or the two header integers cannot be read.
 */
bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
{
    std::cerr << "Opening file:"<< fn << std::endl;
    std::fstream f(fn, std::fstream::in);
    if ( !f.good() ) {
        return false;
    }

    // Read # of rows and cols
    if ( !(f >> nr_row >> nr_col) ) {
        return false;
    }
    std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

    // Test the extraction itself: a value is stored only when it was actually
    // parsed, so no element is duplicated at end-of-file and a file without
    // trailing whitespace does not lose its last value.
    float data;
    while (f >> data) {
        v.push_back(data);
    }
    return true;
}
/**
 * Writes a matrix to a text file in the format readColMajorMatrixFile reads:
 * "<rows> <cols> " followed by every value of v, each followed by a single
 * space, and a final newline.
 *
 * @param fn     Path of the file to create or overwrite.
 * @param nr_row Row count written to the header.
 * @param nr_col Column count written to the header.
 * @param v      Values to write, emitted in vector order.
 * @return true when the file was opened and all data reached the stream
 *         without error; false otherwise.
 */
bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
{
    std::cerr << "Opening file:"<< fn << " for write." << std::endl;
    std::fstream f(fn, std::fstream::out);
    if ( !f.good() ) {
        return false;
    }

    // Write # of rows and cols
    f << nr_row << " "<<nr_col<<" ";
    std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;

    for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
        f << v[i] << ' ';
    }
    f << "\n";

    // Flush before reporting so a failed write (e.g. a full disk) is not
    // reported as success.
    f.flush();
    return f.good();
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Dense matrix-matrix multiplication (SGEMM) kernel.
*/
/*
 * One work-item produces one element of C.
 *
 * The work-item with global ids (row, col) accumulates
 *   sum over p in [0, k) of A[row + p*lda] * B[col + p*ldb]
 * and stores C[row + col*ldc] = C[row + col*ldc] * beta + alpha * sum.
 *
 * A, B     : input matrices, addressed with leading dimensions lda / ldb
 * C        : read and updated in place, addressed with leading dimension ldc
 * k        : number of terms in the accumulated product
 * alpha    : scale applied to the accumulated product
 * beta     : scale applied to the previous value of C
 *
 * No bounds checks are performed; the launch grid must match the extent of C.
 */
__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
{
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int out = row + col * ldc;

    float acc = 0.0f;
    for (int p = 0; p < k; ++p) {
        acc += A[row + p * lda] * B[col + p * ldb];
    }

    C[out] = C[out] * beta + alpha * acc;
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Dense matrix-matrix multiplication (SGEMM) kernel.
*/
/*
 * One work-item produces one element of C.
 *
 * The work-item with global ids (row, col) accumulates
 *   sum over p in [0, k) of A[row + p*lda] * B[col + p*ldb]
 * and stores C[row + col*ldc] = C[row + col*ldc] * beta + alpha * sum.
 *
 * A, B     : input matrices, addressed with leading dimensions lda / ldb
 * C        : read and updated in place, addressed with leading dimension ldc
 * k        : number of terms in the accumulated product
 * alpha    : scale applied to the accumulated product
 * beta     : scale applied to the previous value of C
 *
 * No bounds checks are performed; the launch grid must match the extent of C.
 */
__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
{
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    const int out = row + col * ldc;

    float acc = 0.0f;
    for (int p = 0; p < k; ++p) {
        acc += A[row + p * lda] * B[col + p * ldb];
    }

    C[out] = C[out] * beta + alpha * acc;
}
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Main entry of dense matrix-matrix multiplication kernel
*/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <malloc.h>
#include <vector>
#include <iostream>
#include <CL/cl.h>
#include <parboil.h>
// I/O routines
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
extern char* readFile(const char*);
// Parameters of tile sizes
#define TILE_SZ 16
// Aborts the program when the most recent OpenCL call failed.
// Reads a variable named `clStatus` from the enclosing scope; when it differs
// from CL_SUCCESS the macro prints errorMessage, the status code and the
// current source line to std::cout and calls exit(1).
// The expansion is a bare if-block (no do/while wrapper) and the call sites in
// this file omit the trailing semicolon, so do not place it directly before
// an `else`.
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \
std::cout<<"Line: "<<__LINE__<<"\n"; \
exit(1); \
}
/*
 * Launches the mysgemmNT kernel with an sgemm-style argument list and waits
 * for it to finish.
 *
 * transa, transb : only 'N'/'n' for A and 'T'/'t' for B are supported
 * m, n           : extent of the launch grid (one work-item per element of C);
 *                  both must be multiples of TILE_SZ
 * k              : passed to the kernel as its inner loop count
 * alpha, beta    : scaling factors forwarded to the kernel
 * A, B, C        : device buffers; lda, ldb, ldc are their leading dimensions
 * clKernel       : kernel object whose nine arguments are set here
 * clCommandQueue : queue the kernel is enqueued on
 *
 * Unsupported arguments are reported on std::cerr and the function returns
 * without launching anything. OpenCL failures terminate the program through
 * CHECK_ERROR.
 */
void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
{
    if ((transa != 'N') && (transa != 'n')) {
        std::cerr << "unsupported value of 'transa' in basicSgemm()" << std::endl;
        return;
    }
    if ((transb != 'T') && (transb != 't')) {
        std::cerr << "unsupported value of 'transb' in basicSgemm()" << std::endl;
        return;
    }

    // In this code we assume the matrix sizes are multiple of tile size.
    // Bail out like the other argument checks do: launching anyway would
    // silently leave the remainder rows/columns of C uncomputed.
    if ((m%TILE_SZ) || (n%TILE_SZ)) {
        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
                  << "; n should be multiple of " << TILE_SZ << std::endl;
        return;
    }

    // Work-group size and global size: one work-item per element of C.
    size_t db[2] = {TILE_SZ,TILE_SZ};
    size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};

    // Check every argument individually so a failure is not masked by a
    // later successful call.
    cl_int clStatus;
    clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
    CHECK_ERROR("clSetKernelArg")
    clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
    CHECK_ERROR("clSetKernelArg")
    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
    CHECK_ERROR("clSetKernelArg")
    clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
    CHECK_ERROR("clSetKernelArg")
    clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
    CHECK_ERROR("clSetKernelArg")
    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
    CHECK_ERROR("clSetKernelArg")
    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
    CHECK_ERROR("clSetKernelArg")
    clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
    CHECK_ERROR("clSetKernelArg")
    clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
    CHECK_ERROR("clSetKernelArg")

    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
    CHECK_ERROR("clEnqueueNDRangeKernel")

    // Block until the kernel has completed.
    clStatus = clFinish(clCommandQueue);
    CHECK_ERROR("clFinish")
}
int main (int argc, char *argv[]) {
struct pb_Parameters *params;
struct pb_TimerSet timers;
size_t A_sz, B_sz, C_sz;
int matArow, matAcol;
int matBrow, matBcol;
std::vector<float> matA, matBT;
/* Read command line. Expect 3 inputs: A, B and B^T
in column-major layout*/
params = pb_ReadParameters(&argc, argv);
if ((params->inpFiles[0] == NULL)
|| (params->inpFiles[1] == NULL)
|| (params->inpFiles[2] == NULL)
|| (params->inpFiles[3] != NULL))
{
fprintf(stderr, "Expecting three input filenames\n");
exit(-1);
}
/* Read in data */
// load A
readColMajorMatrixFile(params->inpFiles[0],
matArow, matAcol, matA);
// load B^T
readColMajorMatrixFile(params->inpFiles[2],
matBcol, matBrow, matBT);
pb_InitializeTimerSet(&timers);
pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
cl_int clStatus;
cl_uint numPlatforms;
clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id clPlatform[numPlatforms];
clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
CHECK_ERROR("clGetPlatformIDs")
cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
CHECK_ERROR("clCreateContextFromType")
cl_device_id clDevice;
clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
CHECK_ERROR("clGetDeviceIDs")
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
CHECK_ERROR("clCreateCommandQueue")
pb_SetOpenCL(&clContext, &clCommandQueue);
const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
//cl_kernel clKernel;
//cl_program clProgram;
//pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50];
sprintf(clOptions,"");
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
CHECK_ERROR("clBuildProgram")
cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
CHECK_ERROR("clCreateKernel")
pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
// copy A to device memory
A_sz = matArow*matAcol*sizeof(float);
B_sz = matBrow*matBcol*sizeof(float);
// allocate space for C
C_sz = matArow*matBcol*sizeof(float);
// OpenCL memory allocation
std::vector<float> matC(matArow*matBcol);
pb_SwitchToTimer( &timers, pb_TimerID_COPY );
cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
// Copy A and B^T into device memory
pb_SwitchToTimer( &timers, pb_TimerID_COPY );
clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
for(int i=0;i<matC.size();i++)
matC[i] = 0.0f;
clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
// Use standard sgemm interface
basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
pb_SwitchToTimer( &timers, pb_TimerID_COPY );
clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
clStatus = clReleaseKernel(clKernel);
clStatus = clReleaseProgram(clProgram);
clStatus = clReleaseMemObject(dA);
clStatus = clReleaseMemObject(dB);
clStatus = clReleaseMemObject(dC);
clStatus = clReleaseCommandQueue(clCommandQueue);
clStatus = clReleaseContext(clContext);
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
if (params->outFile) {
/* Write C to file */
//pb_SwitchToTimer(&timers, pb_TimerID_IO);
writeColMajorMatrixFile(params->outFile,
matArow, matBcol, matC);
}
double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
pb_FreeParameters(params);
//free((void*)clSource[0]);
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment