From b770deb6c31310a54b343a6387d84dd58a969ca6 Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava <psrivas2@illinois.edu> Date: Thu, 6 Nov 2014 08:11:23 +0000 Subject: [PATCH] (1) Modifying visc_gemm_ptx.ll to also compile the kernel.ll file to produce ptx binary (2) Several hacks in visc-rt to estimate the work required to get MatrixMultiplication to work. 3 issues need to be resolved i) The metadata for kernel followed by number of kernels has to be one single metadata ii) Get Kernel name and work group sizes in NVPTX pass iii) Major: Cannot use device memory pointer as cl_mem. Fundamentally this is a design compatibility issue with VISC and OpenCL 1.1. OpenCL 2.0 might resolve it --- llvm/projects/visc-rt/visc-rt.cpp | 48 ++++++++++++++++++- .../MatrixMultiplication/visc_gemm_ptx.ll | 2 + 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp index 6c032289e7..3cd64258af 100644 --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -77,18 +77,22 @@ void* llvm_visc_ptx_initContext() { extern "C" void llvm_visc_ptx_clearContext() { + printf("Clear Context\n"); clReleaseContext(globalGPUContext); } extern "C" void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) { + printf("Set Scalar Input. Argument Index = %d\n", arg_index); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); } +cl_mem C; extern "C" void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) { + printf("Set Pointer Input. Argument Index = %d\n", arg_index); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode; cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE | @@ -96,11 +100,14 @@ void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); checkErr(errcode, CL_SUCCESS, "Failure to set pointer input argument"); + printf("Set Input: DevicePtr = 0x%x, Size = %d\n", d_input, size); + C = d_input; return d_input; } extern "C" void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) { + printf("Set Pointer Output. Argument Index = %d\n", arg_index); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode; cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE, @@ -108,11 +115,15 @@ void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) { checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output); checkErr(errcode, CL_SUCCESS, "Failure to set pointer output argument"); + printf("Set Output: DevicePtr = 0x%x, Size = %d\n", d_output, size); return d_output; } - +int count = 0; extern "C" void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) { + if(size > 1000) + d_output = (void*) C; + printf("Get Output: DevicePtr = 0x%x, Size = %d\n", d_output, size); void* h_output = malloc(size); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, @@ -122,9 +133,22 @@ void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) { } extern "C" -void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) { +void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* + localWorkSize, const size_t* globalWorkSize) { + + printf("Execute Node\n"); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_event* event; + // -------------- Just for testing ------ + if(globalWorkSize == NULL) { + // TODO: Remove this hack + size_t WorkSize[2] = {1024, 1024}; + cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, + Context->clKernel, workDim, NULL, WorkSize, localWorkSize, 0, NULL, event); + checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); + return event; + } + cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event); checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); @@ -141,6 +165,7 @@ void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* ////////////////////////////////////////////////////////////////////////////// static char* LoadProgSource(const char* Filename, size_t* szFinalLength) { + printf("Load Prog Source\n"); // locals FILE* pFileStream = NULL; size_t szSourceLength; @@ -179,6 +204,7 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength) extern "C" void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { + printf("Launch PTX Kernel\n"); // Initialize OpenCL // OpenCL specific variables @@ -256,6 +282,22 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL); + // If build fails, get build log from device + if(errcode != CL_SUCCESS) { + size_t len = 0; + errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0, + NULL, &len); + printf("LOG LENGTH: %lu\n", len); + checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length"); + char *log = (char*) malloc(len*sizeof(char)); + errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len, + log, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to collect program build log"); + + printf("Device Build Log:\n%s\n", log); + + + } checkErr(errcode, CL_SUCCESS, "Failure to build program"); Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode); @@ -286,6 +328,7 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { extern "C" void llvm_visc_ptx_wait(void* graphID) { + printf("Wait\n"); DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID; clFinish(Context->clCommandQue); @@ -293,4 +336,5 @@ void llvm_visc_ptx_wait(void* graphID) { // clReleaseContext(Context->clGPUContext); clReleaseKernel(Context->clKernel); clReleaseProgram(Context->clProgram); + printf("Done with PTX kernel\n"); } diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll index 80b33744ba..c9bfaecf62 100644 --- a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll +++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll @@ -1,4 +1,6 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s +;; RUN: llvm-link /home/psrivas2/Hetero/VISC/Code/trunk/libclc/built_libs/nvptx--nvidiacl.bc %t.ll.kernels.ll -o %t.ll.kernels.linked.bc +;; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %t.nvptx.s ; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin ; RUN: %t.bin -- GitLab