From b770deb6c31310a54b343a6387d84dd58a969ca6 Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Thu, 6 Nov 2014 08:11:23 +0000
Subject: [PATCH] (1) Modifying visc_gemm_ptx.ll to also compile the kernel.ll
 file to produce ptx     binary (2) Several hacks in visc-rt to estimate the
 work required to get     MatrixMultiplication to work. 3 issues need to be
 resolved     i)    The metadata for kernel followed by number of kernels has
 to be one           single metadata     ii)   Get Kernel name and work group
 sizes in NVPTX pass     iii)  Major: Cannot use device memory pointer as
 cl_mem. Fundamentally           this is a design compatibility issue with
 VISC and OpenCL 1.1.           OpenCL 2.0 might resolve it

---
 llvm/projects/visc-rt/visc-rt.cpp             | 48 ++++++++++++++++++-
 .../MatrixMultiplication/visc_gemm_ptx.ll     |  2 +
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 6c032289e7..3cd64258af 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -77,18 +77,22 @@ void* llvm_visc_ptx_initContext() {
 
 extern "C"
 void llvm_visc_ptx_clearContext() {
+  printf("Clear Context\n");
   clReleaseContext(globalGPUContext);
 }
 
 extern "C"
 void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) {
+  printf("Set Scalar Input. Argument Index = %d\n", arg_index);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
   checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
 }
 
+cl_mem C;
 extern "C"
 void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) {
+  printf("Set Pointer Input. Argument Index = %d\n", arg_index);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode;
   cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE |
@@ -96,11 +100,14 @@ void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
   errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
   checkErr(errcode, CL_SUCCESS, "Failure to set pointer input argument");
+  printf("Set Input: DevicePtr = 0x%x, Size = %d\n", d_input, size);
+  C = d_input;
   return d_input;
 }
 
 extern "C"
 void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) {
+  printf("Set Pointer Output. Argument Index = %d\n", arg_index);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode;
   cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE,
@@ -108,11 +115,15 @@ void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) {
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
   errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
   checkErr(errcode, CL_SUCCESS, "Failure to set pointer output argument");
+  printf("Set Output: DevicePtr = 0x%x, Size = %d\n", d_output, size);
   return d_output;
 }
-
+int count = 0;
 extern "C"
 void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
+  if(size > 1000)
+    d_output = (void*) C;
+  printf("Get Output: DevicePtr = 0x%x, Size = %d\n", d_output, size);
   void* h_output = malloc(size);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
@@ -122,9 +133,22 @@ void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
 }
 
 extern "C"
-void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) {
+void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t*
+                                localWorkSize, const size_t* globalWorkSize) {
+
+  printf("Execute Node\n");
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_event* event;
+  // -------------- Just for testing ------
+  if(globalWorkSize == NULL) {
+    // TODO: Remove this hack
+    size_t WorkSize[2] = {1024, 1024};
+    cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
+        Context->clKernel, workDim, NULL, WorkSize, localWorkSize, 0, NULL, event);
+    checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
+    return event;
+  }
+
   cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
       Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event);
   checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
@@ -141,6 +165,7 @@ void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t*
 //////////////////////////////////////////////////////////////////////////////
 static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
 {
+  printf("Load Prog Source\n");
   // locals
   FILE* pFileStream = NULL;
   size_t szSourceLength;
@@ -179,6 +204,7 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
 
 extern "C"
 void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
+  printf("Launch PTX Kernel\n");
   // Initialize OpenCL
 
   // OpenCL specific variables
@@ -256,6 +282,22 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
   checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
 
   errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL);
+  // If build fails, get build log from device
+  if(errcode != CL_SUCCESS) {
+    size_t len = 0;
+    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0,
+        NULL, &len);
+    printf("LOG LENGTH: %lu\n", len);
+    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
+    char *log = (char*) malloc(len*sizeof(char));
+    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len,
+        log, NULL);
+    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
+
+    printf("Device Build Log:\n%s\n", log);
+
+
+  }
   checkErr(errcode, CL_SUCCESS, "Failure to build program");
 
   Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode);
@@ -286,6 +328,7 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
 
 extern "C" 
 void llvm_visc_ptx_wait(void* graphID) {
+  printf("Wait\n");
   DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID;
   clFinish(Context->clCommandQue);
 
@@ -293,4 +336,5 @@ void llvm_visc_ptx_wait(void* graphID) {
   // clReleaseContext(Context->clGPUContext);
   clReleaseKernel(Context->clKernel);
   clReleaseProgram(Context->clProgram);
+  printf("Done with PTX kernel\n");
 }
diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
index 80b33744ba..c9bfaecf62 100644
--- a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
+++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
@@ -1,4 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
+;; RUN: llvm-link /home/psrivas2/Hetero/VISC/Code/trunk/libclc/built_libs/nvptx--nvidiacl.bc %t.ll.kernels.ll -o %t.ll.kernels.linked.bc
+;; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %t.nvptx.s
 ; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
 ; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin
-- 
GitLab