diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/Makefile b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..36b421ec6f1359114ea0035d21048ab0b95bf30e
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=opencl
+SRCDIR_OBJS=main.o io.o #compute_gold.o
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
+KERNEL_OBJS=kernel_offline.nvptx.s
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+char* readFile(const char* fileName)
+{
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	f.seekg(0,std::ios::end);
+	int length = f.tellg();
+	f.seekg(0,std::ios::beg);
+
+	char* buffer;
+
+	if(length>0)
+	{
+		buffer = new char[length];
+		f.read(buffer,length);
+		buffer[length-1]=0;
+	}
+	else
+	{
+		buffer = new char;
+		buffer[0] = 0;
+	}
+	
+	f.close();
+
+	return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f >> nr_row;
+  f >> nr_col;
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  while (f.good() ) {
+    f >> data;
+    v.push_back(data);
+  }
+  v.pop_back(); // remove the duplicated last element
+  return true;
+
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Read # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  for (int i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel.cl b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..f376a27d90003e3c7c18dafb9f64a8b459a40029
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel.cl
@@ -0,0 +1,25 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Kernel of dense matrix-matrix multiplication kernel.
+ */
+
+__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
+{
+    float c = 0.0f;
+    int m = get_global_id(0);
+    int n = get_global_id(1);
+
+    for (int i = 0; i < k; ++i) {
+	float a = A[m + i * lda]; 
+	float b = B[n + i * ldb];
+	c += a * b;
+    }
+    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel_offline.cl
new file mode 100644
index 0000000000000000000000000000000000000000..f376a27d90003e3c7c18dafb9f64a8b459a40029
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/kernel_offline.cl
@@ -0,0 +1,25 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Kernel of dense matrix-matrix multiplication kernel.
+ */
+
+__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
+{
+    float c = 0.0f;
+    int m = get_global_id(0);
+    int n = get_global_id(1);
+
+    for (int i = 0; i < k; ++i) {
+	float a = A[m + i * lda]; 
+	float b = B[n + i * ldb];
+	c += a * b;
+    }
+    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5cd29c205bbf06f245561e81ca6d48c9c4125def
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc
@@ -0,0 +1,225 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <vector>
+#include <iostream>
+#include <CL/cl.h>
+#include <parboil.h>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
+extern char* readFile(const char*);
+
+// Parameters of tile sizes
+#define TILE_SZ 16
+
+#define CHECK_ERROR(errorMessage)           \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
+     std::cout<<"Line: "<<__LINE__<<"\n";   \
+     exit(1);                               \
+  }
+
+void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
+{
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+  
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+  
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m%TILE_SZ) || (n%TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
+      << "; n should be multiple of " << TILE_SZ << std::endl;
+  }
+
+  size_t db[2] = {TILE_SZ,TILE_SZ};
+  size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+
+  cl_int clStatus;
+ 
+  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
+  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
+  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
+  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
+  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
+  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
+  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
+  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
+  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(clCommandQueue); 
+  CHECK_ERROR("clFinish")
+}
+
+int main (int argc, char *argv[]) {
+
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
+
+
+  /* Read command line. Expect 3 inputs: A, B and B^T 
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) 
+      || (params->inpFiles[1] == NULL)
+      || (params->inpFiles[2] == NULL)
+      || (params->inpFiles[3] != NULL))
+    {
+      fprintf(stderr, "Expecting three input filenames\n");
+      exit(-1);
+    }
+
+  /* Read in data */
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0],
+      matArow, matAcol, matA);
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2],
+      matBcol, matBrow, matBT);
+
+  pb_InitializeTimerSet(&timers);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+
+  cl_uint numPlatforms; 
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
+  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+   
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};
+  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  //cl_kernel clKernel;
+  //cl_program clProgram;
+  //pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions,"");
+
+  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // copy A to device memory
+  A_sz = matArow*matAcol*sizeof(float);
+  B_sz = matBrow*matBcol*sizeof(float);
+
+  // allocate space for C
+  C_sz = matArow*matBcol*sizeof(float);
+
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow*matBcol);
+  
+  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  for(int i=0;i<matC.size();i++)
+	matC[i] = 0.0f;
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
+      dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
+
+  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+
+  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseMemObject(dA);
+  clStatus = clReleaseMemObject(dB);
+  clStatus = clReleaseMemObject(dC);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext); 
+ 
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  
+  if (params->outFile) {
+   
+    /* Write C to file */
+    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile,
+	matArow, matBcol, matC); 
+  }
+
+
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  pb_FreeParameters(params);
+
+  //free((void*)clSource[0]);
+
+ 
+  return 0;
+}