diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 6c032289e713a4010ef3e1fb3e7a4aa11eab93d8..3cd64258af529c11937c3662ae3e7c8616cac9c1 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -77,18 +77,22 @@ void* llvm_visc_ptx_initContext() {
 
 extern "C"
 void llvm_visc_ptx_clearContext() {
+  printf("Clear Context\n");
   clReleaseContext(globalGPUContext);
 }
 
 extern "C"
 void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) {
+  printf("Set Scalar Input. Argument Index = %d\n", arg_index);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
   checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
 }
 
+cl_mem C;
 extern "C"
 void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) {
+  printf("Set Pointer Input. Argument Index = %d\n", arg_index);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode;
   cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE |
@@ -96,11 +100,14 @@ void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
   errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
   checkErr(errcode, CL_SUCCESS, "Failure to set pointer input argument");
+  printf("Set Input: DevicePtr = 0x%x, Size = %d\n", d_input, size);
+  C = d_input;
   return d_input;
 }
 
 extern "C"
 void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) {
+  printf("Set Pointer Output. Argument Index = %d\n", arg_index);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode;
   cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE,
@@ -108,11 +115,15 @@ void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) {
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
   errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
   checkErr(errcode, CL_SUCCESS, "Failure to set pointer output argument");
+  printf("Set Output: DevicePtr = 0x%x, Size = %d\n", d_output, size);
   return d_output;
 }
-
+int count = 0;
 extern "C"
 void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
+  if(size > 1000)
+    d_output = (void*) C;
+  printf("Get Output: DevicePtr = 0x%x, Size = %d\n", d_output, size);
   void* h_output = malloc(size);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
@@ -122,9 +133,22 @@ void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
 }
 
 extern "C"
-void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) {
+void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t*
+                                localWorkSize, const size_t* globalWorkSize) {
+
+  printf("Execute Node\n");
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_event* event;
+  // -------------- Just for testing ------
+  if(globalWorkSize == NULL) {
+    // TODO: Remove this hack
+    size_t WorkSize[2] = {1024, 1024};
+    cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
+        Context->clKernel, workDim, NULL, WorkSize, localWorkSize, 0, NULL, event);
+    checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
+    return event;
+  }
+
   cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
       Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event);
   checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
@@ -141,6 +165,7 @@ void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t*
 //////////////////////////////////////////////////////////////////////////////
 static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
 {
+  printf("Load Prog Source\n");
   // locals
   FILE* pFileStream = NULL;
   size_t szSourceLength;
@@ -179,6 +204,7 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
 
 extern "C"
 void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
+  printf("Launch PTX Kernel\n");
   // Initialize OpenCL
 
   // OpenCL specific variables
@@ -256,6 +282,22 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
   checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
 
   errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL);
+  // If build fails, get build log from device
+  if(errcode != CL_SUCCESS) {
+    size_t len = 0;
+    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0,
+        NULL, &len);
+    printf("LOG LENGTH: %lu\n", len);
+    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
+    char *log = (char*) malloc(len*sizeof(char));
+    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len,
+        log, NULL);
+    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
+
+    printf("Device Build Log:\n%s\n", log);
+
+
+  }
   checkErr(errcode, CL_SUCCESS, "Failure to build program");
 
   Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode);
@@ -286,6 +328,7 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
 
 extern "C" 
 void llvm_visc_ptx_wait(void* graphID) {
+  printf("Wait\n");
   DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID;
   clFinish(Context->clCommandQue);
 
@@ -293,4 +336,5 @@ void llvm_visc_ptx_wait(void* graphID) {
   // clReleaseContext(Context->clGPUContext);
   clReleaseKernel(Context->clKernel);
   clReleaseProgram(Context->clProgram);
+  printf("Done with PTX kernel\n");
 }
diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
index 80b33744ba9388300851b83258e038df03c2bd78..c9bfaecf623a5568f8d66d4e42854d113d008710 100644
--- a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
+++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
@@ -1,4 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
+;; RUN: llvm-link /home/psrivas2/Hetero/VISC/Code/trunk/libclc/built_libs/nvptx--nvidiacl.bc %t.ll.kernels.ll -o %t.ll.kernels.linked.bc
+;; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %t.nvptx.s
 ; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
 ; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin