diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/Makefile b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..d5f6d9708d0589222d80f654586af06ef491a12c
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=opencl
+SRCDIR_OBJS=main.o file.o
+APP_CUDALDFLAGS=-lm
+APP_CFLAGS=-ffast-math -g3 -O3
+APP_CXXFLAGS=-ffast-math -g3 -O3
+KERNEL_OBJS=kernel_offline.nvptx.s
diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/common.h b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..33bb06d5bd7e02e009565688882ed4e0ef2d52d4
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/common.h
@@ -0,0 +1,12 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef _COMMON_H_
+#define _COMMON_H_
+#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#endif
diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/file.cc b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/file.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c57469f7a4b1886f14be77a373750e1a7635cbe
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/file.cc
@@ -0,0 +1,87 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <endian.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#if __BYTE_ORDER != __LITTLE_ENDIAN
+# error "File I/O is not implemented for this system: wrong endianness."
+#endif
+
+extern "C"
+void inputData(char* fName, int* nx, int* ny, int* nz)
+{
+  FILE* fid = fopen(fName, "r");
+
+  if (fid == NULL)
+    {
+      fprintf(stderr, "Cannot open input file\n");
+      exit(-1);
+    }
+	
+  fread(nx, sizeof(int ),1,fid);
+  fread(ny, sizeof(int ),1,fid);
+  fread(nz, sizeof(int ),1,fid);
+  fclose (fid); 
+}
+
+extern "C"
+void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
+{
+  FILE* fid = fopen(fName, "w");
+  uint32_t tmp32;
+  if (fid == NULL)
+    {
+      fprintf(stderr, "Cannot open output file\n");
+      exit(-1);
+    }
+  tmp32 = nx*ny*nz;
+  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
+  fwrite(h_A0, sizeof(float), tmp32, fid);
+
+  fclose (fid);
+}
+
+extern "C"
+char* readFile(const char* fileName)
+  {
+        FILE* fp;
+        fp = fopen(fileName,"r");
+        if(fp == NULL)
+        {
+                printf("Error 1!\n");
+                exit(1);
+        }
+
+        fseek(fp,0,SEEK_END);
+        long size = ftell(fp);
+        rewind(fp);
+
+        char* buffer = (char*)malloc(sizeof(char)*(size+1));
+        if(buffer  == NULL)
+        {
+                printf("Error 2!\n");
+                fclose(fp);
+                exit(1);
+        }
+
+        size_t res = fread(buffer,1,size,fp);
+        if(res != size)
+        {
+                printf("Error 3!\n");
+                fclose(fp);
+                exit(1);
+        }
+
+	buffer[size] = 0;
+        fclose(fp);
+        return buffer;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/file.h b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/file.h
new file mode 100644
index 0000000000000000000000000000000000000000..40c69734802ba06297418a3895d6eebd7af7b29b
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/file.h
@@ -0,0 +1,23 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+#ifndef __FILEH__
+#define __FILEH__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void inputData(char* fName, int* nx, int* ny, int* nz);
+void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
+char* readFile(const char* fileName);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel.cl b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..4c5d1263db5948e4e61ea2baa27261613cd0ea06
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel.cl
@@ -0,0 +1,28 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include "common.h"
+
+__kernel void naive_kernel(float c0,float c1,__global float* A0,__global float *Anext,int nx,int ny,int nz)
+{
+    	int i = get_global_id(0)+1;
+    	int j = get_global_id(1)+1;
+    	int k = get_global_id(2)+1;
+
+if(i<nx-1)
+{
+		Anext[Index3D (nx, ny, i, j, k)] = c1 *
+		( A0[Index3D (nx, ny, i, j, k + 1)] +
+		  A0[Index3D (nx, ny, i, j, k - 1)] +
+		  A0[Index3D (nx, ny, i, j + 1, k)] +
+		  A0[Index3D (nx, ny, i, j - 1, k)] +
+	 	  A0[Index3D (nx, ny, i + 1, j, k)] +
+		  A0[Index3D (nx, ny, i - 1, j, k)] )
+		- A0[Index3D (nx, ny, i, j, k)] * c0;
+}
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl
new file mode 100644
index 0000000000000000000000000000000000000000..4c5d1263db5948e4e61ea2baa27261613cd0ea06
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/kernel_offline.cl
@@ -0,0 +1,28 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include "common.h"
+
+__kernel void naive_kernel(float c0,float c1,__global float* A0,__global float *Anext,int nx,int ny,int nz)
+{
+    	int i = get_global_id(0)+1;
+    	int j = get_global_id(1)+1;
+    	int k = get_global_id(2)+1;
+
+if(i<nx-1)
+{
+		Anext[Index3D (nx, ny, i, j, k)] = c1 *
+		( A0[Index3D (nx, ny, i, j, k + 1)] +
+		  A0[Index3D (nx, ny, i, j, k - 1)] +
+		  A0[Index3D (nx, ny, i, j + 1, k)] +
+		  A0[Index3D (nx, ny, i, j - 1, k)] +
+	 	  A0[Index3D (nx, ny, i + 1, j, k)] +
+		  A0[Index3D (nx, ny, i - 1, j, k)] )
+		- A0[Index3D (nx, ny, i, j, k)] * c0;
+}
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/main.c b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..60c75d6b4c26c652550cbdf4f421a190cc264809
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_cpu/main.c
@@ -0,0 +1,246 @@
+
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <CL/cl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <parboil.h>
+
+#include "file.h"
+#include "common.h"
+
+#define CHECK_ERROR(errorMessage)           \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     printf("Error: %s!\n",errorMessage);   \
+     printf("Line: %d\n",__LINE__);         \
+     exit(1);                               \
+  }
+
+static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
+{
+    int s=0;
+    int i,j,k;
+    for(i=0; i<nz; i++)
+    {
+        for(j=0; j<ny; j++)
+        {
+            for(k=0; k<nx; k++)
+            {
+                fread(A0+s,sizeof(float),1,fp);
+                s++;
+            }
+        }
+    }
+    return 0;
+}
+
+int main(int argc, char** argv) {
+    struct pb_TimerSet timers;
+    struct pb_Parameters *parameters;
+
+    printf("OpenCL accelerated 7 points stencil codes****\n");
+    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+    parameters = pb_ReadParameters(&argc, argv);
+
+
+    //declaration
+    unsigned nx,ny,nz;
+    unsigned size;
+    int iteration;
+    float c0=1.0f/6.0f;
+    float c1=1.0f/6.0f/6.0f;
+
+    if (argc<5)
+    {
+        printf("Usage: probe nx ny nz t\n"
+               "nx: the grid size x\n"
+               "ny: the grid size y\n"
+               "nz: the grid size z\n"
+               "t: the iteration time\n");
+        return -1;
+    }
+
+    nx = atoi(argv[1]);
+    if (nx<1)
+        return -1;
+    ny = atoi(argv[2]);
+    if (ny<1)
+        return -1;
+    nz = atoi(argv[3]);
+    if (nz<1)
+        return -1;
+    iteration = atoi(argv[4]);
+    if(iteration<1)
+        return -1;
+
+    //host data
+    float *h_A0;
+    float *h_Anext;
+    //load data from files
+
+    size=nx*ny*nz;
+
+    h_A0=(float*)malloc(sizeof(float)*size);
+    h_Anext=(float*)malloc(sizeof(float)*size);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    FILE *fp = fopen(parameters->inpFiles[0], "rb");
+    read_data(h_A0, nx,ny,nz,fp);
+    fclose(fp);
+ 
+    pb_InitializeTimerSet(&timers);
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    cl_int clStatus;
+
+    cl_uint numPlatforms;
+    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+    CHECK_ERROR("clGetPlaformIDs")
+    
+    cl_platform_id clPlatform[numPlatforms];
+    clStatus = clGetPlatformIDs(2,clPlatform,NULL);
+    CHECK_ERROR("clGetPlaformIDs")
+
+    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
+
+    cl_device_id clDevice;
+    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+    CHECK_ERROR("clGetDeviceIDs")
+
+    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
+    CHECK_ERROR("clCreateContextFromType")
+
+    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+    CHECK_ERROR("clCreateCommandQueue")
+
+    pb_SetOpenCL(&clContext, &clCommandQueue);
+
+    /*cl_program clProgram;*/
+    /*cl_kernel clKernel;*/
+
+    /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/
+    const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};
+    cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+    CHECK_ERROR("clCreateProgramWithSource")
+
+    char clOptions[50];
+    sprintf(clOptions,"-I src/opencl_base");
+    clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+    CHECK_ERROR("clBuildProgram")
+
+    cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
+    CHECK_ERROR("clCreateKernel")
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+    //device
+    cl_mem d_A0;
+    cl_mem d_Anext;
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    memcpy (h_Anext,h_A0,sizeof(float)*size);
+
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+    //memory allocation
+    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
+    CHECK_ERROR("clCreateBuffer")
+    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
+    CHECK_ERROR("clCreateBuffer")
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    //memory copy
+    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
+    CHECK_ERROR("clEnqueueWriteBuffer")
+    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
+    CHECK_ERROR("clEnqueueWriteBuffer")
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+    //only use 1D thread block
+    unsigned tx =256;
+    size_t block[3] = {tx,1,1};
+    size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
+    //size_t grid[3] = {nx-2,ny-2,nz-2};
+    size_t offset[3] = {1,1,1};
+//  printf("block x is %d and y is %d z \n",block[0],block[1]);
+//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
+    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
+    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
+    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
+    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
+    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
+    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
+    CHECK_ERROR("clSetKernelArg")
+
+    //main execution
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    int t;
+    for(t=0; t<iteration; t++)
+    {
+        pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+        clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
+        //printf("iteration %d\n",t)
+        CHECK_ERROR("clEnqueueNDRangeKernel")
+        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+        cl_mem d_temp = d_A0;
+        d_A0 = d_Anext;
+        d_Anext = d_temp;
+
+        pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+        clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
+        clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
+        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+    }
+
+
+    cl_mem d_temp = d_A0;
+    d_A0 = d_Anext;
+    d_Anext = d_temp;
+
+    clStatus = clFinish(clCommandQueue);
+    CHECK_ERROR("clFinish")
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
+    CHECK_ERROR("clEnqueueReadBuffer")
+
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    clStatus = clReleaseMemObject(d_A0);
+    clStatus = clReleaseMemObject(d_Anext);
+    clStatus = clReleaseKernel(clKernel);
+    clStatus = clReleaseProgram(clProgram);
+    clStatus = clReleaseCommandQueue(clCommandQueue);
+    clStatus = clReleaseContext(clContext);
+    CHECK_ERROR("clReleaseContext")
+
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+    pb_PrintTimerSet(&timers);
+
+    if (parameters->outFile) {
+        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+
+    }
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+    //free((void*)clSource[0]);
+
+    free(h_A0);
+    free(h_Anext);
+    pb_FreeParameters(parameters);
+
+    return 0;
+}