Skip to content
Snippets Groups Projects
Commit c62c1e51 authored by Akash Kothari's avatar Akash Kothari
Browse files

Removing paper and cfar-report

parent 4effc383
No related branches found
No related tags found
No related merge requests found
Showing
with 0 additions and 1276 deletions
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
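* Main entry of the dense matrix benchmark host program. Despite the
* sgemm-style naming used below, the kernel it builds and launches is
* "matAdd", which computes the element-wise sum C = A + B.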
*/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <malloc.h>
#include <vector>
#include <iostream>
#include <cassert>
#include <CL/cl.h>
#include <parboil.h>
// I/O routines. These are only declared here; their definitions live in
// another translation unit that is not part of this file.
//
// readColMajorMatrixFile(fn, nr_row, nr_col, v): takes a file name plus
//   output references for the row count, column count and element data.
//   NOTE(review): presumably returns true on success -- confirm against the
//   definition.
extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
// writeColMajorMatrixFile(fn, rows, cols, data): called from main() with the
//   output file name, the result dimensions and the result vector.
extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
// readFile(fn): only referenced from commented-out code in this file.
extern char* readFile(const char*);
// Parameters of tile sizes. Only used by basicSgemm() to warn when m or n is
// not a multiple of this value.
#define TILE_SZ 16
// CHECK_ERROR(errorMessage): tests a variable named `clStatus` that must be
// in scope at the expansion site. If it differs from CL_SUCCESS, prints
// errorMessage, the status value and the current source line to std::cout and
// terminates the process with exit(1).
//
// The macro expands to a bare `if (...) { ... }` statement and every call site
// in this file invokes it WITHOUT a trailing semicolon. Because it is not
// wrapped in do { } while (0), do not use it as the unbraced body of an
// if/else.
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \
std::cout<<"Line: "<<__LINE__<<"\n"; \
exit(1); \
}
/*
 * Binds the five kernel arguments (A, B, C, m, n) to clKernel, enqueues it as
 * a 1-D NDRange on clCommandQueue and blocks until the queue has drained.
 *
 *   m, n            matrix dimensions forwarded to the kernel as arguments 3, 4
 *   A, B, C         device buffers forwarded as kernel arguments 0, 1, 2
 *   clKernel        kernel object to configure and launch
 *   clCommandQueue  queue used for the launch and the final clFinish
 *
 * The global work size is m when ROWM is defined and n otherwise; the local
 * work size is left to the OpenCL implementation (NULL).
 *
 * Any OpenCL call that does not return CL_SUCCESS terminates the process via
 * CHECK_ERROR. Every clSetKernelArg call is checked individually so that a
 * failure on an early argument is not masked by a later successful call.
 */
void basicSgemm( int m, int n, cl_mem A, cl_mem B, cl_mem C, cl_kernel clKernel, cl_command_queue clCommandQueue )
{
  // In this code we assume the matrix sizes are multiple of tile size.
  // This is only a diagnostic: the launch below still proceeds.
  if ((m%TILE_SZ) || (n%TILE_SZ)) {
    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
      << "; n should be multiple of " << TILE_SZ << std::endl;
  }

  //#ifdef ROWM
  //size_t db = m;
  //size_t dg = (m*n);
  //#else
  //size_t db = n;
  //size_t dg = (m*n);
  //#endif

  // One work-item per row (ROWM) or per column (default).
#ifdef ROWM
  size_t dg = m;
#else
  size_t dg = n;
#endif

  cl_int clStatus;
  //std::cout << "Block dim = " << db << ", Group dim = " << dg/db << "\n";

  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),(void*)&B);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&C);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&m);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&n);
  CHECK_ERROR("clSetKernelArg")

  //clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dg,&db,0,NULL,NULL);
  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dg,NULL,0,NULL,NULL);
  CHECK_ERROR("clEnqueueNDRangeKernel")

  // Wait for completion so the caller's kernel timer covers the whole run.
  clStatus = clFinish(clCommandQueue);
  CHECK_ERROR("clFinish")
}
/*
 * Host driver.
 *
 * Reads two input matrices of identical dimensions, sets up an OpenCL GPU
 * context/queue, builds the "matAdd" kernel from a pre-compiled binary,
 * copies the inputs to the device, launches the kernel through basicSgemm(),
 * reads the result back, prints the timer set and optionally writes the
 * result matrix to params->outFile.
 *
 * Exits with a non-zero status when the command line is malformed, an input
 * cannot be loaded, the input dimensions are inconsistent, or any checked
 * OpenCL call fails.
 */
int main (int argc, char *argv[]) {
  struct pb_Parameters *params;
  struct pb_TimerSet timers;

  size_t A_sz, B_sz, C_sz;
  // Zero-initialised so a failed load can never leave them indeterminate.
  int matArow = 0, matAcol = 0;
  int matBrow = 0, matBcol = 0;
  std::vector<float> matA, matB;

  /* Read command line. Expect exactly 2 inputs: A and B
     in column-major layout */
  params = pb_ReadParameters(&argc, argv);
  if ((params->inpFiles[0] == NULL)
      || (params->inpFiles[1] == NULL)
      || (params->inpFiles[2] != NULL))
  {
    // The condition above accepts exactly two file names, so say so.
    fprintf(stderr, "Expecting two input filenames\n");
    exit(-1);
  }

  /* Read in data */
  // NOTE(review): assumes readColMajorMatrixFile returns true on success --
  // confirm against its definition.
  // load A
  if (!readColMajorMatrixFile(params->inpFiles[0],
                              matArow, matAcol, matA)) {
    fprintf(stderr, "Failed to read input matrix %s\n", params->inpFiles[0]);
    exit(-1);
  }
  // load B
  if (!readColMajorMatrixFile(params->inpFiles[1],
                              matBrow, matBcol, matB)) {
    fprintf(stderr, "Failed to read input matrix %s\n", params->inpFiles[1]);
    exit(-1);
  }

  // Explicit check instead of assert(): it must also hold in NDEBUG builds,
  // because the device copies below are sized from these dimensions.
  if (matArow != matBrow || matAcol != matBcol) {
    fprintf(stderr, "Dimensions of two input matrices should match\n");
    exit(-1);
  }
  if (matArow <= 0 || matAcol <= 0) {
    fprintf(stderr, "Input matrices must have positive dimensions\n");
    exit(-1);
  }

  // Widen before multiplying so large matrices do not overflow int.
  const size_t numElems = static_cast<size_t>(matArow) * static_cast<size_t>(matAcol);
  // The host-to-device copies read numElems floats from each vector.
  if (matA.size() < numElems || matB.size() < numElems) {
    fprintf(stderr, "Input matrix data is shorter than its declared dimensions\n");
    exit(-1);
  }

  pb_InitializeTimerSet(&timers);
  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);

  cl_int clStatus;
  cl_platform_id clPlatform;
  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
  CHECK_ERROR("clGetPlatformIDs")

  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
  CHECK_ERROR("clCreateContextFromType")

  cl_device_id clDevice;
  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
  CHECK_ERROR("clGetDeviceIDs")

  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
  CHECK_ERROR("clCreateCommandQueue")

  pb_SetOpenCL(&clContext, &clCommandQueue);

  // const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
  // cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
  cl_kernel clKernel;
  cl_program clProgram;
  pb_CreateAndBuildKernelFromBinary("build/opencl_default/kernel_offline.nvptx.s", "matAdd", &clContext, &clDevice, &clProgram, &clKernel);
  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
  //CHECK_ERROR("clCreateProgramWithSource")

  //char clOptions[50];
  //sprintf(clOptions,"");

  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
  //CHECK_ERROR("clBuildProgram")

  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
  //CHECK_ERROR("clCreateKernel")

  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );

  // Byte sizes of the device buffers (computed in size_t, see numElems).
  A_sz = numElems*sizeof(float);
  B_sz = static_cast<size_t>(matBrow)*static_cast<size_t>(matBcol)*sizeof(float);

  // allocate space for C
  C_sz = static_cast<size_t>(matArow)*static_cast<size_t>(matBcol)*sizeof(float);

  // Host-side result storage (zero-filled by the vector constructor)
  std::vector<float> matC(static_cast<size_t>(matArow)*static_cast<size_t>(matBcol));

  pb_SwitchToTimer( &timers, pb_TimerID_COPY );

  // OpenCL memory allocation
  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
  CHECK_ERROR("clCreateBuffer")
  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
  CHECK_ERROR("clCreateBuffer")
  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
  CHECK_ERROR("clCreateBuffer")

  // Copy A and B into device memory. The first two writes are non-blocking;
  // the final blocking write on the same queue is the synchronisation point.
  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
  CHECK_ERROR("clEnqueueWriteBuffer")
  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matB.front(),0,NULL,NULL);
  CHECK_ERROR("clEnqueueWriteBuffer")
  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
  CHECK_ERROR("clEnqueueWriteBuffer")

  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );

  // Use standard sgemm interface
  basicSgemm(matArow, matAcol, dA, dB, dC, clKernel, clCommandQueue);

  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
  // Blocking read of the result; a failure here would otherwise leave matC
  // silently holding its initial contents.
  clStatus = clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
  CHECK_ERROR("clEnqueueReadBuffer")

  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
  // Teardown is best-effort: the statuses are recorded but not acted on.
  clStatus = clReleaseKernel(clKernel);
  clStatus = clReleaseProgram(clProgram);
  clStatus = clReleaseMemObject(dA);
  clStatus = clReleaseMemObject(dB);
  clStatus = clReleaseMemObject(dC);
  clStatus = clReleaseCommandQueue(clCommandQueue);
  clStatus = clReleaseContext(clContext);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);

  if (params->outFile) {
    /* Write C to file */
    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
    writeColMajorMatrixFile(params->outFile,
                            matArow, matBcol, matC);
  }

  // NOTE(review): this uses the 2*m*n*k sgemm operation count although the
  // launched kernel is "matAdd" -- confirm whether the figure is still wanted.
  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
  pb_FreeParameters(params);

  //free((void*)clSource[0]);

  return 0;
}
/*
 * Element-wise addition C = A + B.
 * Each work-item takes its index in dimension 0 as j and walks i from 0 to
 * m-1, writing the element at offset i*n + j.
 */
__kernel void matAdd( __global float *A, __global float *B, __global float* C, int m, int n ) {
  const int col = get_global_id(0);
  for (int row = 0; row < m; ++row) {
    const int idx = row*n + col;
    C[idx] = A[idx] + B[idx];
  }
}
void basicSgemm( int m, int n, cl_mem A, cl_mem B, cl_mem C, cl_kernel clKernel, cl_command_queue clCommandQueue )
{
size_t global_work_group = n;
clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
clSetKernelArg(clKernel,1,sizeof(cl_mem),(void*)&B);
clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&C);
clSetKernelArg(clKernel,3,sizeof(int),(void*)&m);
clSetKernelArg(clKernel,4,sizeof(int),(void*)&n);
clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &global_work_group, NULL, 0, NULL, NULL);
clFinish(clCommandQueue);
}
/*
 * Element-wise addition C = A + B.
 * Each work-item takes its index in dimension 0 as i and walks j from 0 to
 * n-1, writing the element at offset i*n + j.
 */
__kernel void matAdd( __global float *A, __global float *B, __global float* C, int m, int n ) {
  const int row = get_global_id(0);
  int col = 0;
  while (col < n) {
    const int idx = row*n + col;
    C[idx] = A[idx] + B[idx];
    ++col;
  }
}
void basicSgemm( int m, int n, cl_mem A, cl_mem B, cl_mem C, cl_kernel clKernel, cl_command_queue clCommandQueue )
{
size_t global_work_group = m;
clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
clSetKernelArg(clKernel,1,sizeof(cl_mem),(void*)&B);
clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&C);
clSetKernelArg(clKernel,3,sizeof(int),(void*)&m);
clSetKernelArg(clKernel,4,sizeof(int),(void*)&n);
clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL,
&global_work_group, NULL, 0, NULL, NULL);
clFinish(clCommandQueue);
}
File deleted
hpvm/cfar-report/Figures/RARpic.jpg

165 KiB

File deleted
File deleted
hpvm/cfar-report/Figures/compilationFlow.png

235 KiB

File deleted
File deleted
File deleted
This diff is collapsed.
hpvm/cfar-report/Figures/gpu-fpga-cmp.jpg

58.3 KiB

File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment