From 877bca7eba4e82d9344248ee28514db113d6cea7 Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <prakalps@gmail.com>
Date: Tue, 14 Jul 2015 12:43:56 -0500
Subject: [PATCH] Start timers after IO done to increase accuracy of timers.

---
 llvm/test/VISC/parboil/.ycm_extra_conf.py     |  1 +
 .../benchmarks/lbm/src/opencl_nvidia/main.c   | 65 ++++++++++--
 .../parboil/benchmarks/lbm/src/visc/main.c    | 17 ++--
 .../VISC/parboil/benchmarks/nodeSwap/Makefile |  9 +-
 .../nodeSwap/src/opencl/kernel_offline.cl     | 25 +++--
 .../benchmarks/nodeSwap/src/opencl/main.cc    | 21 ++--
 .../benchmarks/sgemm/src/opencl_base/main.cc  | 49 +++++-----
 .../parboil/benchmarks/sgemm/src/visc/main.cc | 33 ++++---
 .../benchmarks/spmv/src/opencl_nvidia/main.c  | 98 ++++++++++---------
 .../parboil/benchmarks/spmv/src/visc/main.c   | 36 +++----
 .../benchmarks/stencil/src/opencl_base/main.c | 42 ++++----
 .../benchmarks/stencil/src/visc/stencil.c     | 40 ++++----
 llvm/test/VISC/parboil/common/include/visc.h  |  2 +
 llvm/test/VISC/parboil/common/mk/opencl.mk    |  2 +-
 llvm/test/VISC/parboil/common/mk/visc.mk      |  7 ++
 .../parboil/common/platform/visc.default.mk   |  4 +-
 16 files changed, 272 insertions(+), 179 deletions(-)

diff --git a/llvm/test/VISC/parboil/.ycm_extra_conf.py b/llvm/test/VISC/parboil/.ycm_extra_conf.py
index cb5dfd4b78..d8ca60904f 100644
--- a/llvm/test/VISC/parboil/.ycm_extra_conf.py
+++ b/llvm/test/VISC/parboil/.ycm_extra_conf.py
@@ -42,6 +42,7 @@ flags = [
     # harmful
     '-isystem', '/home/psrivas2/current-test/parboil/common/include',
     '-I', '/home/psrivas2/current-test/parboil/common/include',
+    '-I', '/home/psrivas2/current-src/include'
     '-isystem', '/System/Library/Frameworks/Python.framework/Headers',
     '-isystem', '/usr/local/include',
     '-isystem', '/usr/local/include/eigen3',
diff --git a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
index 7894ea9b70..c58dcc27e4 100644
--- a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
@@ -33,20 +33,53 @@ int main( int nArgs, char* arg[] ) {
 
     OpenCL_Param prm;
 
-    pb_InitializeTimerSet(&timers);
     struct pb_Parameters* params;
     params = pb_ReadParameters(&nArgs, arg);
 
 
-    static LBM_GridPtr TEMP_srcGrid;
     //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
     MAIN_parseCommandLine( nArgs, arg, &param, params );
     MAIN_printInfo( &param );
 
+    /*MAIN_initialize( &param, &prm ); */ // This has been inlined
+
+    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    //Setup TEMP datastructures
+    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+    LBM_allocateGrid( (float**) &TEMP_dstGrid );
+    LBM_initializeGrid( TEMP_srcGrid );
+    LBM_initializeGrid( TEMP_dstGrid );
+
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    if( param.obstacleFilename != NULL ) {
+        LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename );
+        LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename );
+    }
+
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
+    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    LBM_showGridStatistics( TEMP_srcGrid );
+
+    pb_InitializeTimerSet(&timers);
     pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+ 
     OpenCL_initialize(&prm);
-    MAIN_initialize( &param, &prm );
+
+    //Setup DEVICE datastructures
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid );
+    OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid );
+
+    //Initialize DEVICE datastructures
+    OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid );
+    OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid );
+
+
 
     for( t = 1; t <= param.nTimeSteps; t++ ) {
         pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
@@ -63,13 +96,31 @@ int main( int nArgs, char* arg[] ) {
         }
     }
 
-    MAIN_finalize( &param, &prm );
+    /*MAIN_finalize( &param, &prm );*/ // inlined
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
+    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+
+    clReleaseProgram(prm.clProgram);
+    clReleaseKernel(prm.clKernel);
+    clReleaseCommandQueue(prm.clCommandQueue);
+    clReleaseContext(prm.clContext);
 
     pb_SwitchToTimer(&timers, pb_TimerID_NONE);
     pb_PrintTimerSet(&timers);
+
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    LBM_showGridStatistics( TEMP_srcGrid );
+    LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE );
+
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    LBM_freeGrid( (float**) &TEMP_srcGrid );
+    LBM_freeGrid( (float**) &TEMP_dstGrid );
+
     pb_FreeParameters(params);
     return 0;
 }
diff --git a/llvm/test/VISC/parboil/benchmarks/lbm/src/visc/main.c b/llvm/test/VISC/parboil/benchmarks/lbm/src/visc/main.c
index 0a9d87c708..251b10b243 100644
--- a/llvm/test/VISC/parboil/benchmarks/lbm/src/visc/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/lbm/src/visc/main.c
@@ -32,8 +32,6 @@ int main( int nArgs, char* arg[] ) {
     MAIN_Param param;
     int t;
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
     struct pb_Parameters* params;
     params = pb_ReadParameters(&nArgs, arg);
 
@@ -46,6 +44,9 @@ int main( int nArgs, char* arg[] ) {
 
     MAIN_initialize( &param );
 
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
     size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
     pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
     llvm_visc_track_mem(srcGrid-MARGIN, size);
@@ -72,18 +73,20 @@ int main( int nArgs, char* arg[] ) {
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
     llvm_visc_request_mem(srcGrid-MARGIN, size);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    MAIN_finalize( &param );
-
     pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
     llvm_visc_untrack_mem(srcGrid-MARGIN);
     llvm_visc_untrack_mem(dstGrid-MARGIN);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
 
     pb_SwitchToTimer(&timers, pb_TimerID_NONE);
     pb_PrintTimerSet(&timers);
     __visc__cleanup();
+
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    MAIN_finalize( &param );
+
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    LBM_freeGrid( (float**) &TEMP_srcGrid );
+
     pb_FreeParameters(params);
     return 0;
 }
diff --git a/llvm/test/VISC/parboil/benchmarks/nodeSwap/Makefile b/llvm/test/VISC/parboil/benchmarks/nodeSwap/Makefile
index 1c2cafd3fd..67fea898b6 100644
--- a/llvm/test/VISC/parboil/benchmarks/nodeSwap/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/nodeSwap/Makefile
@@ -18,6 +18,7 @@ endif
 ifeq ($(ROWM),1)
   APP_CFLAGS=-DROWM
   APP_CXXFLAGS=-DROWM
+  APP_ISPCFLAGS=-DROWM
 endif
 
 BIN = $(addsuffix -$(VERSION), $(APP))
@@ -26,17 +27,13 @@ SRCDIR = src/$(VERSION)
 BUILDDIR = build/$(VERSION)_$(PLATFORM)
 DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
 
-ifneq ($(TEST),small)
-  TEST=medium
-endif
-
 MATRIX1 = $(DATASET_DIR)/$(TEST)/input/matrix1.txt
-MATRIX2 = $(DATASET_DIR)/$(TEST)/input/matrix1.txt
+MATRIX2 = $(DATASET_DIR)/$(TEST)/input/matrix2.txt
 REF_OUTPUT = $(DATASET_DIR)/$(TEST)/output/matrix3.txt
 RUNDIR = run/$(VERSION)/$(TEST)
 OUTPUT = $(RUNDIR)/matrix3.txt
 
 ARGS = -i $(MATRIX1),$(MATRIX2) -o $(OUTPUT)
-TOOL = tools/compare-output
+TOOL = diff
 
 include $(PARBOIL_ROOT)/common/mk/Makefile
diff --git a/llvm/test/VISC/parboil/benchmarks/nodeSwap/src/opencl/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/nodeSwap/src/opencl/kernel_offline.cl
index 56f16e2068..f689bdaf79 100644
--- a/llvm/test/VISC/parboil/benchmarks/nodeSwap/src/opencl/kernel_offline.cl
+++ b/llvm/test/VISC/parboil/benchmarks/nodeSwap/src/opencl/kernel_offline.cl
@@ -10,17 +10,24 @@
  * Kernel of dense matrix-matrix multiplication kernel.
  */
 
-__kernel void matAdd( __global float *A, __global float *B, __global float* C )
+__kernel void matAdd( __global float *A, __global float *B, __global float* C, int m, int n )
 {
+//#ifdef ROWM
+    //int i = get_group_id(0);
+    //int j = get_local_id(0);
+    //int dim = get_local_size(0);
+//#else
+    //int j = get_group_id(0);
+    //int i = get_local_id(0);
+    //int dim = get_global_size(0)/get_local_size(0);
+//#endif
 #ifdef ROWM
-    int i = get_group_id(0);
-    int j = get_local_id(0);
-    int dim = get_local_size(0);
+    int i = get_global_id(0);
+    for(int j=0; j < n; j++)
+      C[i*n+j] = A[i*n+j] + B[i*n+j];
 #else
-    int j = get_group_id(0);
-    int i = get_local_id(0);
-    int dim = get_global_size(0)/get_local_size(0);
+    int j = get_global_id(0);
+    for(int i=0; i < m; i++)
+      C[i*n+j] = A[i*n+j] + B[i*n+j];
 #endif
-
-    C[i*dim+j] = A[i*dim+j] + B[i*dim+j];
 }
diff --git a/llvm/test/VISC/parboil/benchmarks/nodeSwap/src/opencl/main.cc b/llvm/test/VISC/parboil/benchmarks/nodeSwap/src/opencl/main.cc
index 5f76fc9bc5..23d2950c2a 100644
--- a/llvm/test/VISC/parboil/benchmarks/nodeSwap/src/opencl/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/nodeSwap/src/opencl/main.cc
@@ -46,22 +46,29 @@ void basicSgemm( int m, int n, cl_mem A, cl_mem B, cl_mem C, cl_kernel clKernel,
       << "; n should be multiple of " << TILE_SZ << std::endl;
   }
 
+//#ifdef ROWM
+  //size_t db = m;
+  //size_t dg = (m*n);
+//#else
+  //size_t db = n;
+  //size_t dg = (m*n);
+//#endif
 #ifdef ROWM
-  size_t db = m;
-  size_t dg = (m*n);
+  size_t dg = m;
 #else
-  size_t db = n;
-  size_t dg = (m*n);
+  size_t dg = n;
 #endif
-
   cl_int clStatus;
-  std::cout << "Block dim = " << db << ", Group dim = " << dg/db << "\n";
+  //std::cout << "Block dim = " << db << ", Group dim = " << dg/db << "\n";
   clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
   clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),(void*)&B);
   clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&C);
+  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&m);
+  clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&n);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dg,&db,0,NULL,NULL);
+  //clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dg,&db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dg,NULL,0,NULL,NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
 
   clStatus = clFinish(clCommandQueue); 
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
index 47ac5c4271..34ff5ea5cf 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
@@ -88,7 +88,6 @@ int main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-  pb_InitializeTimerSet(&timers);
 
   /* Read command line. Expect 3 inputs: A, B and B^T 
      in column-major layout*/
@@ -102,6 +101,16 @@ int main (int argc, char *argv[]) {
       exit(-1);
     }
 
+  /* Read in data */
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0],
+      matArow, matAcol, matA);
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2],
+      matBcol, matBrow, matBT);
+
+  pb_InitializeTimerSet(&timers);
+
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
@@ -138,16 +147,6 @@ int main (int argc, char *argv[]) {
   //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
   //CHECK_ERROR("clCreateKernel")
 
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
-  // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
-
   pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
   // copy A to device memory
   A_sz = matArow*matAcol*sizeof(float);
@@ -187,12 +186,24 @@ int main (int argc, char *argv[]) {
   basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
       dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
 
+  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+
+  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseMemObject(dA);
+  clStatus = clReleaseMemObject(dB);
+  clStatus = clReleaseMemObject(dC);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext); 
+ 
+  pb_PrintTimerSet(&timers);
+  
   if (params->outFile) {
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
    
     /* Write C to file */
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
     writeColMajorMatrixFile(params->outFile,
 	matArow, matBcol, matC); 
   }
@@ -201,18 +212,10 @@ int main (int argc, char *argv[]) {
 
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
   std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-  pb_PrintTimerSet(&timers);
   pb_FreeParameters(params);
 
   //free((void*)clSource[0]);
 
-  clStatus = clReleaseKernel(clKernel);
-  clStatus = clReleaseProgram(clProgram);
-  clStatus = clReleaseMemObject(dA);
-  clStatus = clReleaseMemObject(dB);
-  clStatus = clReleaseMemObject(dC);
-  clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext); 
-  
+ 
   return 0;
 }
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc/main.cc
index 23d89aeb16..04de5d2c97 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc/main.cc
@@ -39,6 +39,7 @@ extern char* readFile(const char*);
 
 void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta )
 {
+    __visc__hint(visc::GPU_TARGET);
     __visc__attributes(3, A, B, C, 1, C);
     float c = 0.0f;
     int m = get_global_id(0);
@@ -87,8 +88,6 @@ int main (int argc, char *argv[]) {
     int matBrow, matBcol;
     std::vector<float> matA, matBT;
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
 
     /* Read command line. Expect 3 inputs: A, B and B^T
        in column-major layout*/
@@ -103,8 +102,6 @@ int main (int argc, char *argv[]) {
     }
 
     /* Read in data */
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
     // load A
     readColMajorMatrixFile(params->inpFiles[0],
                            matArow, matAcol, matA);
@@ -113,6 +110,9 @@ int main (int argc, char *argv[]) {
     readColMajorMatrixFile(params->inpFiles[2],
                            matBcol, matBrow, matBT);
 
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
     pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
     // copy A to device memory
     A_sz = matArow*matAcol*sizeof(float);
@@ -139,15 +139,8 @@ int main (int argc, char *argv[]) {
     basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
                &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
 
-    if (params->outFile) {
-        pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-
-        /* Write C to file */
-        llvm_visc_request_mem(&matC.front(), C_sz);
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
+    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+    llvm_visc_request_mem(&matC.front(), C_sz);
 
     pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
     llvm_visc_untrack_mem(&matA.front());
@@ -155,10 +148,20 @@ int main (int argc, char *argv[]) {
     llvm_visc_untrack_mem(&matC.front());
     pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+
     pb_PrintTimerSet(&timers);
     __visc__cleanup();
+
+    if (params->outFile) {
+
+        /* Write C to file */
+        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        writeColMajorMatrixFile(params->outFile,
+                                matArow, matBcol, matC);
+    }
+
+    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
     pb_FreeParameters(params);
 
     return 0;
diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
index 87088bf98c..b42bf009c9 100644
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
@@ -43,6 +43,51 @@ int main(int argc, char** argv) {
         exit(-1);
     }
 
+    //load matrix from files
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+    //    &h_data, &h_indices, &h_ptr,
+    //    &h_perm, &h_nzcnt);
+    int col_count;
+
+    //parameters declaration
+    int len;
+    int depth;
+    int dim;
+    int pad=32;
+    int nzcnt_len;
+
+    //host memory allocation
+    //matrix
+    float *h_data;
+    int *h_indices;
+    int *h_ptr;
+    int *h_perm;
+    int *h_nzcnt;
+
+    //vector
+    float *h_Ax_vector;
+    float *h_x_vector;
+
+
+    coo_to_jds(
+        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+        1, // row padding
+        pad, // warp size
+        1, // pack size
+        1, // is mirrored?
+        0, // binary matrix
+        1, // debug level [0:2]
+        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
+        &col_count, &dim, &len, &nzcnt_len, &depth
+    );
+
+
+    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
+    h_x_vector=(float*)malloc(sizeof(float)*dim);
+    input_vec( parameters->inpFiles[1],h_x_vector,dim);
+
+
     pb_InitializeTimerSet(&timers);
     pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
 
@@ -81,25 +126,6 @@ int main(int argc, char** argv) {
     pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);
 
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //parameters declaration
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
     //device memory allocation
     //matrix
     cl_mem d_data;
@@ -112,30 +138,6 @@ int main(int argc, char** argv) {
     cl_mem jds_ptr_int;
     cl_mem sh_zcnt_int;
 
-    //load matrix from files
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
     pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
 
     OpenCLDeviceProp clDeviceProp;
@@ -247,12 +249,15 @@ int main(int argc, char** argv) {
     clStatus = clReleaseCommandQueue(clCommandQueue);
     clStatus = clReleaseContext(clContext);
 
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+    pb_PrintTimerSet(&timers);
     if (parameters->outFile) {
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
         outputData(parameters->outFile,h_Ax_vector,dim);
 
     }
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
     //free((void*)clSource[0]);
 
@@ -264,9 +269,6 @@ int main(int argc, char** argv) {
     free (h_Ax_vector);
     free (h_x_vector);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    pb_PrintTimerSet(&timers);
     pb_FreeParameters(parameters);
 
     return 0;
diff --git a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c
index cf153ac4d4..25d81214dc 100644
--- a/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/spmv/src/visc/main.c
@@ -112,9 +112,7 @@ int main(int argc, char** argv) {
         exit(-1);
     }
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
     //parameters declaration
     int len;
@@ -136,7 +134,7 @@ int main(int argc, char** argv) {
     float *h_x_vector;
 
     //load matrix from files
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
     //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
     //    &h_data, &h_indices, &h_ptr,
     //    &h_perm, &h_nzcnt);
@@ -158,6 +156,9 @@ int main(int argc, char** argv) {
     h_x_vector=(float*)malloc(sizeof(float)*dim);
     input_vec( parameters->inpFiles[1],h_x_vector,dim);
 
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
     memset(h_Ax_vector, 0, dim*sizeof(float));
 
@@ -218,17 +219,12 @@ int main(int argc, char** argv) {
          */
     }
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
     //HtoD memory copy
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    llvm_visc_request_mem(h_Ax_vector, dim*sizeof(float));
 
-    if (parameters->outFile) {
-        llvm_visc_request_mem(h_Ax_vector, dim*sizeof(float));
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        outputData(parameters->outFile,h_Ax_vector,dim);
 
-    }
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+   pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
 
     llvm_visc_untrack_mem(h_Ax_vector);
     llvm_visc_untrack_mem(h_data);
@@ -237,7 +233,17 @@ int main(int argc, char** argv) {
     llvm_visc_untrack_mem(h_x_vector);
     llvm_visc_untrack_mem(h_ptr);
     llvm_visc_untrack_mem(h_nzcnt);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+    pb_PrintTimerSet(&timers);
+    __visc__cleanup();
+
+    if (parameters->outFile) {
+        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+        outputData(parameters->outFile,h_Ax_vector,dim);
+
+    }
+ 
     free (h_data);
     free (h_indices);
     free (h_ptr);
@@ -246,10 +252,6 @@ int main(int argc, char** argv) {
     free (h_Ax_vector);
     free (h_x_vector);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
     pb_FreeParameters(parameters);
 
     return 0;
diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_base/main.c b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_base/main.c
index 5238110bae..717efc5341 100644
--- a/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_base/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/opencl_base/main.c
@@ -50,8 +50,6 @@ int main(int argc, char** argv) {
     printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
     parameters = pb_ReadParameters(&argc, argv);
 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
     //declaration
     unsigned nx,ny,nz;
@@ -83,6 +81,21 @@ int main(int argc, char** argv) {
     if(iteration<1)
         return -1;
 
+    //host data
+    float *h_A0;
+    float *h_Anext;
+    //load data from files
+
+    size=nx*ny*nz;
+
+    h_A0=(float*)malloc(sizeof(float)*size);
+    h_Anext=(float*)malloc(sizeof(float)*size);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    FILE *fp = fopen(parameters->inpFiles[0], "rb");
+    read_data(h_A0, nx,ny,nz,fp);
+    fclose(fp);
+ 
+    pb_InitializeTimerSet(&timers);
     pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
     cl_int clStatus;
     cl_platform_id clPlatform;
@@ -120,24 +133,12 @@ int main(int argc, char** argv) {
     //CHECK_ERROR("clCreateKernel")
 
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //host data
-    float *h_A0;
-    float *h_Anext;
 
     //device
     cl_mem d_A0;
     cl_mem d_Anext;
 
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
     memcpy (h_Anext,h_A0,sizeof(float)*size);
 
     pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
@@ -219,20 +220,21 @@ int main(int argc, char** argv) {
     clStatus = clReleaseContext(clContext);
     CHECK_ERROR("clReleaseContext")
 
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+    pb_PrintTimerSet(&timers);
+
     if (parameters->outFile) {
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
         outputData(parameters->outFile,h_Anext,nx,ny,nz);
 
     }
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
     //free((void*)clSource[0]);
 
     free(h_A0);
     free(h_Anext);
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    pb_PrintTimerSet(&timers);
     pb_FreeParameters(parameters);
 
     return 0;
diff --git a/llvm/test/VISC/parboil/benchmarks/stencil/src/visc/stencil.c b/llvm/test/VISC/parboil/benchmarks/stencil/src/visc/stencil.c
index c2ec412111..865b82d781 100644
--- a/llvm/test/VISC/parboil/benchmarks/stencil/src/visc/stencil.c
+++ b/llvm/test/VISC/parboil/benchmarks/stencil/src/visc/stencil.c
@@ -61,9 +61,7 @@ int main(int argc, char** argv) {
     printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
     parameters = pb_ReadParameters(&argc, argv);
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
     //declaration
     int nx,ny,nz;
@@ -106,18 +104,24 @@ int main(int argc, char** argv) {
     h_A0=(float*)malloc(sizeof(float)*size);
     h_Anext=(float*)malloc(sizeof(float)*size);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(h_A0, sizeof(float)*size);
-    llvm_visc_track_mem(h_Anext, sizeof(float)*size);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
     FILE *fp = fopen(parameters->inpFiles[0], "rb");
     read_data(h_A0, nx,ny,nz,fp);
     fclose(fp);
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-    
+
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
+    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
+    llvm_visc_track_mem(h_A0, sizeof(float)*size);
+    llvm_visc_track_mem(h_Anext, sizeof(float)*size);
+
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
+    memcpy (h_Anext,h_A0,sizeof(float)*size);
+
+
     //only use 1D thread block
     int tx =256;
     int block[3] = {tx,1,1};
@@ -149,22 +153,24 @@ int main(int argc, char** argv) {
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
     llvm_visc_request_mem(h_Anext, bytes);
 
-    if (parameters->outFile) {
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
-
-    }
     pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
 
     llvm_visc_untrack_mem(h_A0);
     llvm_visc_untrack_mem(h_Anext);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    free(h_A0);
-    free(h_Anext);
+
     pb_SwitchToTimer(&timers, pb_TimerID_NONE);
     pb_PrintTimerSet(&timers);
 
     __visc__cleanup();
+
+    if (parameters->outFile) {
+        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+
+    }
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    free(h_A0);
+    free(h_Anext);
     pb_FreeParameters(parameters);
 
     return 0;
diff --git a/llvm/test/VISC/parboil/common/include/visc.h b/llvm/test/VISC/parboil/common/include/visc.h
index e311c73021..ce3a1f2edb 100644
--- a/llvm/test/VISC/parboil/common/include/visc.h
+++ b/llvm/test/VISC/parboil/common/include/visc.h
@@ -5,10 +5,12 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "llvm/SupportVISC/VISCHint.h"
 
 #ifdef __cplusplus
 extern "C" {
 void __visc__attributes(unsigned, ...);
+void __visc__hint(visc::target);
 void __visc__wait(unsigned);
 unsigned __visc__node(...);
 void __visc__init();
diff --git a/llvm/test/VISC/parboil/common/mk/opencl.mk b/llvm/test/VISC/parboil/common/mk/opencl.mk
index 7c3ed00d10..5bd73bb97f 100644
--- a/llvm/test/VISC/parboil/common/mk/opencl.mk
+++ b/llvm/test/VISC/parboil/common/mk/opencl.mk
@@ -115,7 +115,7 @@ $(BUILDDIR)/%.linked.bc : $(BUILDDIR)/%.ll
 	$(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
 
 $(BUILDDIR)/%.ll : $(SRCDIR)/%.cl
-	$(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl $< -O3 -emit-llvm -S -o $@
+	$(LLVM_CC) $(CFLAGS) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl $< -O3 -emit-llvm -S -o $@
 	sed -e "s/ addrspace(.)//g" -i $@
 no_opencl:
 	@echo "OPENCL_PATH is not set. Open $(PARBOIL_ROOT)/common/Makefile.conf to set default value."
diff --git a/llvm/test/VISC/parboil/common/mk/visc.mk b/llvm/test/VISC/parboil/common/mk/visc.mk
index 4445260532..3c7382d1d8 100644
--- a/llvm/test/VISC/parboil/common/mk/visc.mk
+++ b/llvm/test/VISC/parboil/common/mk/visc.mk
@@ -158,10 +158,14 @@ $(BUILDDIR)/%.ll: $(SRCDIR)/%.c
 $(BUILDDIR)/%.ll : $(SRCDIR)/%.cc
 	$(CXX) $(CXXFLAGS) -S -emit-llvm $< -o $@
 
+$(BUILDDIR)/%.ll : $(SRCDIR)/%.cpp
+	$(CXX) $(CXXFLAGS) -S -emit-llvm $< -o $@
+
 $(BUILDDIR)/%.visc.ll: $(BUILDDIR)/%.ll
 	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
 	cat $(LLVM_SRC_ROOT)/test/VISC/parboil/RUN.parboil.script $@ > $@.tmp
 	mv $@.tmp $(BUILDDIR)/$(APP).visc.ll
+	#@cp $(VISC_OBJS) $(BUILDDIR)/$(VISC_OBJS)
 
 $(BUILDDIR)/%.o : $(SRCDIR)/%.c
 	$(CC) $(CFLAGS) -c $< -o $@
@@ -175,6 +179,9 @@ $(BUILDDIR)/%.o : $(SRCDIR)/%.cc
 $(BUILDDIR)/%.o : $(SRCDIR)/%.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+$(BUILDDIR)/%.o : $(SRCDIR)/%.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 no_opencl:
 	@echo "OPENCL_PATH is not set. Open $(PARBOIL_ROOT)/common/Makefile.conf to set default value."
 	@echo "You may use $(PLATFORM_MK) if you want a platform specific configurations."
diff --git a/llvm/test/VISC/parboil/common/platform/visc.default.mk b/llvm/test/VISC/parboil/common/platform/visc.default.mk
index 983c3c971f..a7ab89acb0 100644
--- a/llvm/test/VISC/parboil/common/platform/visc.default.mk
+++ b/llvm/test/VISC/parboil/common/platform/visc.default.mk
@@ -15,10 +15,10 @@
 VISC_BUILD_DIR = Release+Asserts
 # gcc (default)
 CC = $(LLVM_SRC_ROOT)/$(VISC_BUILD_DIR)/bin/clang
-PLATFORM_CFLAGS =
+PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include
 
 CXX = $(LLVM_SRC_ROOT)/$(VISC_BUILD_DIR)/bin/clang++
-PLATFORM_CXXFLAGS =
+PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include
 
 LINKER = $(LLVM_SRC_ROOT)/$(VISC_BUILD_DIR)/bin/clang++
 PLATFORM_LDFLAGS = -lm -lpthread -lOpenCL
-- 
GitLab