diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/Makefile b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..36b421ec6f1359114ea0035d21048ab0b95bf30e --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/Makefile @@ -0,0 +1,8 @@ +# (c) 2010 The Board of Trustees of the University of Illinois. + +LANGUAGE=opencl +SRCDIR_OBJS=main.o io.o #compute_gold.o +APP_CUDALDFLAGS=-lm -lstdc++ +APP_CFLAGS=-ffast-math -O3 +APP_CXXFLAGS=-ffast-math -O3 +KERNEL_OBJS=kernel_offline.nvptx.s diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc @@ -0,0 +1,91 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* I/O routines for reading and writing matrices in column-major + * layout + */ + +#include<fstream> +#include<iostream> +#include<vector> + +char* readFile(const char* fileName) +{ + std::fstream f(fileName,std::fstream::in); + if(!f.good()) + { + std::cerr<<"Error Reading File!!"<<std::endl; + return NULL; + } + + f.seekg(0,std::ios::end); + int length = f.tellg(); + f.seekg(0,std::ios::beg); + + char* buffer; + + if(length>0) + { + buffer = new char[length]; + f.read(buffer,length); + buffer[length-1]=0; + } + else + { + buffer = new char; + buffer[0] = 0; + } + + f.close(); + + return buffer; +} + +bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v) +{ + std::cerr << "Opening file:"<< fn << std::endl; + std::fstream f(fn, std::fstream::in); + if ( !f.good() ) { + return false; + } + + // Read # of rows and cols + f >> nr_row; + f >> nr_col; + + float data; + std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + while (f.good() ) { + f >> data; + v.push_back(data); + } + v.pop_back(); // remove the duplicated last element + return true; + +} + +bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v) +{ + std::cerr << "Opening file:"<< fn << " for write." << std::endl; + std::fstream f(fn, std::fstream::out); + if ( !f.good() ) { + return false; + } + + // Read # of rows and cols + f << nr_row << " "<<nr_col<<" "; + + float data; + std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl; + for (int i = 0; i < v.size(); ++i) { + f << v[i] << ' '; + } + f << "\n"; + return true; + +} diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel.cl b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..f376a27d90003e3c7c18dafb9f64a8b459a40029 --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel.cl @@ -0,0 +1,25 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* + * Kernel of dense matrix-matrix multiplication kernel. + */ + +__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta ) +{ + float c = 0.0f; + int m = get_global_id(0); + int n = get_global_id(1); + + for (int i = 0; i < k; ++i) { + float a = A[m + i * lda]; + float b = B[n + i * ldb]; + c += a * b; + } + C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c; +} diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel_offline.cl new file mode 100644 index 0000000000000000000000000000000000000000..f376a27d90003e3c7c18dafb9f64a8b459a40029 --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel_offline.cl @@ -0,0 +1,25 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* + * Kernel of dense matrix-matrix multiplication kernel. + */ + +__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta ) +{ + float c = 0.0f; + int m = get_global_id(0); + int n = get_global_id(1); + + for (int i = 0; i < k; ++i) { + float a = A[m + i * lda]; + float b = B[n + i * ldb]; + c += a * b; + } + C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c; +} diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc new file mode 100644 index 0000000000000000000000000000000000000000..5cd29c205bbf06f245561e81ca6d48c9c4125def --- /dev/null +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc @@ -0,0 +1,225 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* + * Main entry of dense matrix-matrix multiplication kernel + */ + +#include <stdio.h> +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include <malloc.h> +#include <vector> +#include <iostream> +#include <CL/cl.h> +#include <parboil.h> + +// I/O routines +extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v); +extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&); +extern char* readFile(const char*); + +// Parameters of tile sizes +#define TILE_SZ 16 + +#define CHECK_ERROR(errorMessage) \ + if(clStatus != CL_SUCCESS) \ + { \ + std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \ + std::cout<<"Line: "<<__LINE__<<"\n"; \ + exit(1); \ + } + +void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue ) +{ + if ((transa != 'N') && (transa != 'n')) { + std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl; + return; + } + + if ((transb != 'T') && (transb != 't')) { + std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl; + return; + } + + // In this code we assume the matrix sizes are multiple of tile size + if ((m%TILE_SZ) || (n%TILE_SZ)) { + std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ + << "; n should be multiple of " << TILE_SZ << std::endl; + } + + size_t db[2] = {TILE_SZ,TILE_SZ}; + size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]}; + + cl_int clStatus; + + clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); + clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); + clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); + clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); + clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); + clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); + clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); + clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); + clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); + CHECK_ERROR("clSetKernelArg") + + clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL); + CHECK_ERROR("clEnqueueNDRangeKernel") + + clStatus = clFinish(clCommandQueue); + CHECK_ERROR("clFinish") +} + +int main (int argc, char *argv[]) { + + struct pb_Parameters *params; + struct pb_TimerSet timers; + + size_t A_sz, B_sz, C_sz; + int matArow, matAcol; + int matBrow, matBcol; + std::vector<float> matA, matBT; + + + /* Read command line. Expect 3 inputs: A, B and B^T + in column-major layout*/ + params = pb_ReadParameters(&argc, argv); + if ((params->inpFiles[0] == NULL) + || (params->inpFiles[1] == NULL) + || (params->inpFiles[2] == NULL) + || (params->inpFiles[3] != NULL)) + { + fprintf(stderr, "Expecting three input filenames\n"); + exit(-1); + } + + /* Read in data */ + // load A + readColMajorMatrixFile(params->inpFiles[0], + matArow, matAcol, matA); + // load B^T + readColMajorMatrixFile(params->inpFiles[2], + matBcol, matBrow, matBT); + + pb_InitializeTimerSet(&timers); + + pb_SwitchToTimer(&timers, visc_TimerID_SETUP); + cl_int clStatus; + + cl_uint numPlatforms; + clStatus = clGetPlatformIDs(0, NULL, &numPlatforms); + + cl_platform_id clPlatform[numPlatforms]; + clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL); + CHECK_ERROR("clGetPlatformIDs") + + cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0}; + cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus); + CHECK_ERROR("clCreateContextFromType") + + cl_device_id clDevice; + clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL); + CHECK_ERROR("clGetDeviceIDs") + + cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + CHECK_ERROR("clCreateCommandQueue") + + pb_SetOpenCL(&clContext, &clCommandQueue); + + const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")}; + //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + //cl_kernel clKernel; + //cl_program clProgram; + //pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); + cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions,""); + + clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); + CHECK_ERROR("clCreateKernel") + + pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + // copy A to device memory + A_sz = matArow*matAcol*sizeof(float); + B_sz = matBrow*matBcol*sizeof(float); + + // allocate space for C + C_sz = matArow*matBcol*sizeof(float); + + // OpenCL memory allocation + std::vector<float> matC(matArow*matBcol); + + pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + + // Copy A and B^T into device memory + pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + for(int i=0;i<matC.size();i++) + matC[i] = 0.0f; + + clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); + pb_SwitchToTimer( &timers, pb_TimerID_KERNEL ); + + // Use standard sgemm interface + basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \ + dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue); + + pb_SwitchToTimer( &timers, pb_TimerID_COPY ); + clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL); + + pb_SwitchToTimer( &timers, visc_TimerID_SETUP); + clStatus = clReleaseKernel(clKernel); + clStatus = clReleaseProgram(clProgram); + clStatus = clReleaseMemObject(dA); + clStatus = clReleaseMemObject(dB); + clStatus = clReleaseMemObject(dC); + clStatus = clReleaseCommandQueue(clCommandQueue); + clStatus = clReleaseContext(clContext); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + + if (params->outFile) { + + /* Write C to file */ + //pb_SwitchToTimer(&timers, pb_TimerID_IO); + writeColMajorMatrixFile(params->outFile, + matArow, matBcol, matC); + } + + + double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL])); + std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl; + pb_FreeParameters(params); + + //free((void*)clSource[0]); + + + return 0; +}