diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp index 6c032289e713a4010ef3e1fb3e7a4aa11eab93d8..3cd64258af529c11937c3662ae3e7c8616cac9c1 100644 --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -77,18 +77,22 @@ void* llvm_visc_ptx_initContext() { extern "C" void llvm_visc_ptx_clearContext() { + printf("Clear Context\n"); clReleaseContext(globalGPUContext); } extern "C" void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) { + printf("Set Scalar Input. Argument Index = %d\n", arg_index); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); } +cl_mem C; extern "C" void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) { + printf("Set Pointer Input. Argument Index = %d\n", arg_index); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode; cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE | @@ -96,11 +100,14 @@ void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); checkErr(errcode, CL_SUCCESS, "Failure to set pointer input argument"); + printf("Set Input: DevicePtr = 0x%x, Size = %d\n", d_input, size); + C = d_input; return d_input; } extern "C" void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) { + printf("Set Pointer Output. Argument Index = %d\n", arg_index); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode; cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE, @@ -108,11 +115,15 @@ void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) { checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output); checkErr(errcode, CL_SUCCESS, "Failure to set pointer output argument"); + printf("Set Output: DevicePtr = 0x%x, Size = %d\n", d_output, size); return d_output; } - +int count = 0; extern "C" void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) { + if(size > 1000) + d_output = (void*) C; + printf("Get Output: DevicePtr = 0x%x, Size = %d\n", d_output, size); void* h_output = malloc(size); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, @@ -122,9 +133,22 @@ void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) { } extern "C" -void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) { +void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* + localWorkSize, const size_t* globalWorkSize) { + + printf("Execute Node\n"); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_event* event; + // -------------- Just for testing ------ + if(globalWorkSize == NULL) { + // TODO: Remove this hack + size_t WorkSize[2] = {1024, 1024}; + cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, + Context->clKernel, workDim, NULL, WorkSize, localWorkSize, 0, NULL, event); + checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); + return event; + } + cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event); checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); @@ -141,6 +165,7 @@ void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* ////////////////////////////////////////////////////////////////////////////// static char* LoadProgSource(const char* Filename, size_t* szFinalLength) { + printf("Load Prog Source\n"); // locals FILE* pFileStream = NULL; size_t szSourceLength; @@ -179,6 +204,7 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength) extern "C" void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { + printf("Launch PTX Kernel\n"); // Initialize OpenCL // OpenCL specific variables @@ -256,6 +282,22 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL); + // If build fails, get build log from device + if(errcode != CL_SUCCESS) { + size_t len = 0; + errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0, + NULL, &len); + printf("LOG LENGTH: %lu\n", len); + checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length"); + char *log = (char*) malloc(len*sizeof(char)); + errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len, + log, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to collect program build log"); + + printf("Device Build Log:\n%s\n", log); + + + } checkErr(errcode, CL_SUCCESS, "Failure to build program"); Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode); @@ -286,6 +328,7 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { extern "C" void llvm_visc_ptx_wait(void* graphID) { + printf("Wait\n"); DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID; clFinish(Context->clCommandQue); @@ -293,4 +336,5 @@ void llvm_visc_ptx_wait(void* graphID) { // clReleaseContext(Context->clGPUContext); clReleaseKernel(Context->clKernel); clReleaseProgram(Context->clProgram); + printf("Done with PTX kernel\n"); } diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll index 80b33744ba9388300851b83258e038df03c2bd78..c9bfaecf623a5568f8d66d4e42854d113d008710 100644 --- a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll +++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll @@ -1,4 +1,6 @@ ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s +;; RUN: llvm-link /home/psrivas2/Hetero/VISC/Code/trunk/libclc/built_libs/nvptx--nvidiacl.bc %t.ll.kernels.ll -o %t.ll.kernels.linked.bc +;; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %t.nvptx.s ; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll ; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin ; RUN: %t.bin