Skip to content
Snippets Groups Projects
Commit e96af743 authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

(1) Added routines to parboil_opencl to use compiled binary for opencl kernels.

(2) Added a compiled ptx assembly file for sgemm. Working for sgemm
parent bacafe03
No related branches found
No related tags found
No related merge requests found
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* Kernel of dense matrix-matrix multiplication kernel.
*/
__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
{
float c = 0.0f;
int m = get_global_id(0);
int n = get_global_id(1);
for (int i = 0; i < k; ++i) {
float a = A[m + i * lda];
float b = B[n + i * ldb];
c += a * b;
}
C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
}
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20, texmode_independent
.address_size 32
// .globl mysgemmNT
.entry mysgemmNT(
.param .u32 .ptr .global .align 4 mysgemmNT_param_0,
.param .u32 mysgemmNT_param_1,
.param .u32 .ptr .global .align 4 mysgemmNT_param_2,
.param .u32 mysgemmNT_param_3,
.param .u32 .ptr .global .align 4 mysgemmNT_param_4,
.param .u32 mysgemmNT_param_5,
.param .u32 mysgemmNT_param_6,
.param .f32 mysgemmNT_param_7,
.param .f32 mysgemmNT_param_8
)
{
.reg .pred %p<3>;
.reg .f32 %f<14>;
.reg .s32 %r<36>;
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
ld.param.u32 %r23, [mysgemmNT_param_4];
mov.u32 %r5, %ctaid.y;
ld.param.u32 %r24, [mysgemmNT_param_5];
mov.u32 %r6, %ntid.y;
ld.param.u32 %r25, [mysgemmNT_param_6];
ld.param.f32 %f4, [mysgemmNT_param_7];
mov.u32 %r7, %tid.y;
ld.param.f32 %f5, [mysgemmNT_param_8];
mad.lo.s32 %r8, %r6, %r5, %r7;
setp.lt.s32 %p1, %r25, 1;
@%p1 bra BB0_1;
ld.param.u32 %r19, [mysgemmNT_param_0];
ld.param.u32 %r20, [mysgemmNT_param_1];
ld.param.u32 %r21, [mysgemmNT_param_2];
ld.param.u32 %r22, [mysgemmNT_param_3];
shl.b32 %r27, %r8, 2;
add.s32 %r9, %r21, %r27;
shl.b32 %r10, %r22, 2;
shl.b32 %r29, %r4, 2;
add.s32 %r11, %r19, %r29;
shl.b32 %r12, %r20, 2;
mov.f32 %f7, 0f00000000;
BB0_3:
ld.global.f32 %f8, [%r11];
ld.global.f32 %f9, [%r9];
fma.rn.f32 %f2, %f8, %f9, %f7;
add.s32 %r16, %r25, -1;
add.s32 %r17, %r9, %r10;
add.s32 %r18, %r11, %r12;
setp.ne.s32 %p2, %r16, 0;
mov.u32 %r11, %r18;
mov.u32 %r9, %r17;
mov.u32 %r25, %r16;
mov.f32 %f7, %f2;
@%p2 bra BB0_3;
bra.uni BB0_4;
BB0_1:
mov.f32 %f7, 0f00000000;
BB0_4:
mad.lo.s32 %r30, %r8, %r24, %r4;
shl.b32 %r31, %r30, 2;
add.s32 %r32, %r23, %r31;
ld.global.f32 %f10, [%r32];
mul.f32 %f11, %f7, %f4;
fma.rn.f32 %f12, %f10, %f5, %f11;
st.global.f32 [%r32], %f12;
ret;
}
...@@ -32,7 +32,7 @@ extern char* readFile(const char*); ...@@ -32,7 +32,7 @@ extern char* readFile(const char*);
#define CHECK_ERROR(errorMessage) \ #define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \ if(clStatus != CL_SUCCESS) \
{ \ { \
std::cout<<errorMessage<<" Error!\n"; \ std::cout<< errorMessage <<": "<< clStatus <<" Error!\n"; \
std::cout<<"Line: "<<__LINE__<<"\n"; \ std::cout<<"Line: "<<__LINE__<<"\n"; \
exit(1); \ exit(1); \
} }
...@@ -121,18 +121,23 @@ int main (int argc, char *argv[]) { ...@@ -121,18 +121,23 @@ int main (int argc, char *argv[]) {
pb_SetOpenCL(&clContext, &clCommandQueue); pb_SetOpenCL(&clContext, &clCommandQueue);
const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
CHECK_ERROR("clCreateProgramWithSource") // cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
cl_kernel clKernel;
cl_program clProgram;
pb_CreateAndBuildKernelFromBinary("src/opencl_base/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
//CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50]; //char clOptions[50];
sprintf(clOptions,""); //sprintf(clOptions,"");
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
CHECK_ERROR("clBuildProgram") //CHECK_ERROR("clBuildProgram")
cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus); //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
CHECK_ERROR("clCreateKernel") //CHECK_ERROR("clCreateKernel")
/* Read in data */ /* Read in data */
pb_SwitchToTimer(&timers, pb_TimerID_IO); pb_SwitchToTimer(&timers, pb_TimerID_IO);
......
...@@ -188,6 +188,9 @@ pb_DestroyTimerSet(struct pb_TimerSet * timers); ...@@ -188,6 +188,9 @@ pb_DestroyTimerSet(struct pb_TimerSet * timers);
void void
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr); pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
void
pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, void* clContextPtr, void* clDevicePtr, void* clProgramPtr, void* clKerenlPtr);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
...@@ -593,6 +593,95 @@ void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) { ...@@ -593,6 +593,95 @@ void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) {
clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr); clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr);
} }
static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
{
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
pFileStream = fopen(Filename, "rb");
if(pFileStream == 0)
{
return NULL;
}
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + 1);
if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
{
fclose(pFileStream);
free(cSourceString);
return 0;
}
// close the file and return the total length of the combined (preamble + source) string
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength;
}
cSourceString[szSourceLength] = '\0';
return cSourceString;
}
static inline void checkErr(cl_int err, cl_int success, const char * name) {
if (err != success) {
printf("ERROR: %s\n", name);
exit(EXIT_FAILURE);
}
}
void pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, void* clContextPtr, void* clDevicePtr, void* clProgramPtr, void* clKernelPtr) {
size_t kernelLength;
char *programSource = LoadProgSource(file, &kernelLength);
checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
cl_int binaryStatus;
cl_int errcode;
cl_device_id clDevice = *(cl_device_id*) clDevicePtr;
cl_context clContext = *(cl_context*) clContextPtr;
cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice,
&kernelLength,
(const unsigned char **)&programSource,
&binaryStatus, &errcode);
checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
// printf("Building kernel - %s, from file %s\n", kernel, file);
errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
// If build fails, get build log from device
if(errcode != CL_SUCCESS) {
printf("ERROR: Failure to build program\n");
size_t len = 0;
errcode = clGetProgramBuildInfo(clProgram, clDevice , CL_PROGRAM_BUILD_LOG, 0,
NULL, &len);
printf("LOG LENGTH: %lu\n", len);
checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
char *log = (char*) malloc(len*sizeof(char));
errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, len,
log, NULL);
checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
printf("Device Build Log: %s\n", log);
free(log);
exit(EXIT_FAILURE);
}
cl_kernel clKernel = clCreateKernel(clProgram, kernel, &errcode);
checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
*(cl_program*) clProgramPtr = clProgram;
*(cl_kernel*)clKernelPtr = clKernel;
free(programSource);
}
void void
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment