diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl
new file mode 100644
index 0000000000000000000000000000000000000000..f376a27d90003e3c7c18dafb9f64a8b459a40029
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.cl
@@ -0,0 +1,25 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Kernel of dense matrix-matrix multiplication kernel.
+ */
+
+__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
+{
+    float c = 0.0f;
+    int m = get_global_id(0);
+    int n = get_global_id(1);
+
+    for (int i = 0; i < k; ++i) {
+	float a = A[m + i * lda]; 
+	float b = B[n + i * ldb];
+	c += a * b;
+    }
+    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.nvptx.s b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.nvptx.s
new file mode 100644
index 0000000000000000000000000000000000000000..13cfd2d24547432388d9d3de6ef92eca164fc822
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel_offline.nvptx.s
@@ -0,0 +1,79 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 3.1
+.target sm_20, texmode_independent
+.address_size 32
+
+	// .globl	mysgemmNT
+
+.entry mysgemmNT(
+	.param .u32 .ptr .global .align 4 mysgemmNT_param_0,
+	.param .u32 mysgemmNT_param_1,
+	.param .u32 .ptr .global .align 4 mysgemmNT_param_2,
+	.param .u32 mysgemmNT_param_3,
+	.param .u32 .ptr .global .align 4 mysgemmNT_param_4,
+	.param .u32 mysgemmNT_param_5,
+	.param .u32 mysgemmNT_param_6,
+	.param .f32 mysgemmNT_param_7,
+	.param .f32 mysgemmNT_param_8
+)
+{
+	.reg .pred 	%p<3>;
+	.reg .f32 	%f<14>;
+	.reg .s32 	%r<36>;
+
+	mov.u32	%r1, %ctaid.x;
+	mov.u32	%r2, %ntid.x;
+	mov.u32	%r3, %tid.x;
+	mad.lo.s32 	%r4, %r2, %r1, %r3;
+	ld.param.u32 	%r23, [mysgemmNT_param_4];
+	mov.u32	%r5, %ctaid.y;
+	ld.param.u32 	%r24, [mysgemmNT_param_5];
+	mov.u32	%r6, %ntid.y;
+	ld.param.u32 	%r25, [mysgemmNT_param_6];
+	ld.param.f32 	%f4, [mysgemmNT_param_7];
+	mov.u32	%r7, %tid.y;
+	ld.param.f32 	%f5, [mysgemmNT_param_8];
+	mad.lo.s32 	%r8, %r6, %r5, %r7;
+	setp.lt.s32 	%p1, %r25, 1;
+	@%p1 bra 	BB0_1;
+	ld.param.u32 	%r19, [mysgemmNT_param_0];
+	ld.param.u32 	%r20, [mysgemmNT_param_1];
+	ld.param.u32 	%r21, [mysgemmNT_param_2];
+	ld.param.u32 	%r22, [mysgemmNT_param_3];
+	shl.b32 	%r27, %r8, 2;
+	add.s32 	%r9, %r21, %r27;
+	shl.b32 	%r10, %r22, 2;
+	shl.b32 	%r29, %r4, 2;
+	add.s32 	%r11, %r19, %r29;
+	shl.b32 	%r12, %r20, 2;
+	mov.f32 	%f7, 0f00000000;
+BB0_3:
+	ld.global.f32 	%f8, [%r11];
+	ld.global.f32 	%f9, [%r9];
+	fma.rn.f32 	%f2, %f8, %f9, %f7;
+	add.s32 	%r16, %r25, -1;
+	add.s32 	%r17, %r9, %r10;
+	add.s32 	%r18, %r11, %r12;
+	setp.ne.s32 	%p2, %r16, 0;
+	mov.u32 	%r11, %r18;
+	mov.u32 	%r9, %r17;
+	mov.u32 	%r25, %r16;
+	mov.f32 	%f7, %f2;
+	@%p2 bra 	BB0_3;
+	bra.uni 	BB0_4;
+BB0_1:
+	mov.f32 	%f7, 0f00000000;
+BB0_4:
+	mad.lo.s32 	%r30, %r8, %r24, %r4;
+	shl.b32 	%r31, %r30, 2;
+	add.s32 	%r32, %r23, %r31;
+	ld.global.f32 	%f10, [%r32];
+	mul.f32 	%f11, %f7, %f4;
+	fma.rn.f32 	%f12, %f10, %f5, %f11;
+	st.global.f32 	[%r32], %f12;
+	ret;
+}
+
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
index 4d199149e29953b834253881faa60202fa29a7fb..020b1256ee09c63a2d7f16d230cb9a5120cca270 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
@@ -32,7 +32,7 @@ extern char* readFile(const char*);
 #define CHECK_ERROR(errorMessage)           \
   if(clStatus != CL_SUCCESS)                \
   {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
+     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
      std::cout<<"Line: "<<__LINE__<<"\n";   \
      exit(1);                               \
   }
@@ -121,18 +121,23 @@ int main (int argc, char *argv[]) {
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  CHECK_ERROR("clCreateProgramWithSource")
+  // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
+  // cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  cl_kernel clKernel;
+  cl_program clProgram;
+  pb_CreateAndBuildKernelFromBinary("src/opencl_base/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  //CHECK_ERROR("clCreateProgramWithSource")
 
-  char clOptions[50];
-  sprintf(clOptions,"");
+  //char clOptions[50];
+  //sprintf(clOptions,"");
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  CHECK_ERROR("clBuildProgram")
+  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  //CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  CHECK_ERROR("clCreateKernel")
+  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  //CHECK_ERROR("clCreateKernel")
 
   /* Read in data */
   pb_SwitchToTimer(&timers, pb_TimerID_IO);
diff --git a/llvm/test/VISC/parboil/common/include/parboil.h b/llvm/test/VISC/parboil/common/include/parboil.h
index 691e5759093db115d1f92bc7a8e666ec0e9a08b6..41f78a99c07cea79e013eee86bc746f223517fdc 100644
--- a/llvm/test/VISC/parboil/common/include/parboil.h
+++ b/llvm/test/VISC/parboil/common/include/parboil.h
@@ -188,6 +188,9 @@ pb_DestroyTimerSet(struct pb_TimerSet * timers);
 void
 pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
 
+void
+pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, void* clContextPtr, void* clDevicePtr, void* clProgramPtr, void* clKerenlPtr);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/llvm/test/VISC/parboil/common/src/parboil_opencl.c b/llvm/test/VISC/parboil/common/src/parboil_opencl.c
index c1200a2479a28791356f2e3e0b786b8e2cabbb1d..1b038efc0525b511cfa3a6884e14cbdcbef47e92 100644
--- a/llvm/test/VISC/parboil/common/src/parboil_opencl.c
+++ b/llvm/test/VISC/parboil/common/src/parboil_opencl.c
@@ -593,6 +593,95 @@ void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) {
   clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr);
 }
 
+static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
+{
+  // locals
+  FILE* pFileStream = NULL;
+  size_t szSourceLength;
+
+  // open the OpenCL source code file
+  pFileStream = fopen(Filename, "rb");
+  if(pFileStream == 0)
+  {
+    return NULL;
+  }
+
+  // get the length of the source code
+  fseek(pFileStream, 0, SEEK_END);
+  szSourceLength = ftell(pFileStream);
+  fseek(pFileStream, 0, SEEK_SET);
+
+  // allocate a buffer for the source code string and read it in
+  char* cSourceString = (char *)malloc(szSourceLength + 1);
+  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
+  {
+      fclose(pFileStream);
+      free(cSourceString);
+      return 0;
+  }
+
+  // close the file and return the total length of the combined (preamble + source) string
+  fclose(pFileStream);
+  if(szFinalLength != 0)
+  {
+      *szFinalLength = szSourceLength;
+  }
+  cSourceString[szSourceLength] = '\0';
+
+  return cSourceString;
+}
+
+static inline void checkErr(cl_int err, cl_int success, const char * name) {
+  if (err != success) {
+    printf("ERROR: %s\n", name);
+    exit(EXIT_FAILURE);
+  }
+}
+
+void pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, void* clContextPtr, void* clDevicePtr, void* clProgramPtr, void* clKernelPtr) {
+  size_t kernelLength;
+  char *programSource = LoadProgSource(file, &kernelLength);
+  checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
+
+  cl_int binaryStatus;
+  cl_int errcode;
+  cl_device_id clDevice = *(cl_device_id*) clDevicePtr;
+  cl_context clContext = *(cl_context*) clContextPtr;
+  cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice,
+                                        &kernelLength,
+                                        (const unsigned char **)&programSource,
+                                        &binaryStatus, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
+
+  // printf("Building kernel - %s, from file %s\n", kernel, file);
+  errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
+  // If build fails, get build log from device
+  if(errcode != CL_SUCCESS) {
+    printf("ERROR: Failure to build program\n");
+    size_t len = 0;
+    errcode = clGetProgramBuildInfo(clProgram, clDevice , CL_PROGRAM_BUILD_LOG, 0,
+        NULL, &len);
+    printf("LOG LENGTH: %lu\n", len);
+    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
+    char *log = (char*) malloc(len*sizeof(char));
+    errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, len,
+        log, NULL);
+    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
+
+    printf("Device Build Log: %s\n", log);
+    free(log);
+    exit(EXIT_FAILURE);
+  }
+
+  cl_kernel clKernel = clCreateKernel(clProgram, kernel, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
+  
+  *(cl_program*) clProgramPtr = clProgram;
+  *(cl_kernel*)clKernelPtr = clKernel;
+
+  free(programSource);
+}
+
 void
 pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) {