diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/Makefile b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..36b421ec6f1359114ea0035d21048ab0b95bf30e
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=opencl
+SRCDIR_OBJS=main.o io.o #compute_gold.o
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
+KERNEL_OBJS=kernel_offline.nvptx.s
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/io.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/io.cc
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+char* readFile(const char* fileName)
+{
+	// Returns the file contents as a NUL-terminated buffer allocated with new[]
+	// (caller releases it with delete[]), or NULL if the file cannot be opened.
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	f.seekg(0,std::ios::end);
+	int length = f.tellg();
+	f.seekg(0,std::ios::beg);
+
+	char* buffer;
+	if(length>0)
+	{
+		// One extra byte so the terminator never overwrites file data.
+		buffer = new char[length+1];
+		f.read(buffer,length);
+		buffer[f.gcount()]=0; // terminate after the bytes actually read
+	}
+	else
+	{
+		buffer = new char[1]; // array form so delete[] is valid on every path
+		buffer[0] = 0;
+	}
+	f.close();
+	return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  // Loads "<rows> <cols> <values...>" from fn, appending the values to v.
+  // Returns false if the file cannot be opened or the header is unreadable.
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+  // Read # of rows and cols
+  if ( !(f >> nr_row >> nr_col) ) {
+    return false;
+  }
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  // Test the extraction itself so a failed read is never stored in v.
+  while ( f >> data ) {
+    v.push_back(data);
+  }
+  return true;
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  // Writes "<rows> <cols> " followed by every element of v, space separated,
+  // then a newline. Returns false only if the file cannot be opened.
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Write # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  // Index with the vector's own size type to avoid a signed/unsigned comparison.
+  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/kernel.cl b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..d2eff0bb89fc7a40e072dd2b151222e8b47afad0
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/kernel.cl
@@ -0,0 +1,36 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Dense matrix-matrix multiplication (SGEMM) kernel.
+ */
+
+__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
+{
+    /* Each work-item produces four elements of C stored at consecutive
+     * offsets, starting at 4*get_global_id(0) + get_global_id(1)*ldc. */
+    const int row = 4 * get_global_id(0);
+    const int col = get_global_id(1);
+    float acc0 = 0.0f, acc1 = 0.0f, acc2 = 0.0f, acc3 = 0.0f;
+
+    /* Accumulate the k-term sums; B[col + step*ldb] is shared by all four. */
+    for (int step = 0; step < k; ++step) {
+        const int aIdx = row + step * lda;
+        const float bVal = B[col + step * ldb];
+        acc0 += A[aIdx] * bVal;
+        acc1 += A[aIdx + 1] * bVal;
+        acc2 += A[aIdx + 2] * bVal;
+        acc3 += A[aIdx + 3] * bVal;
+    }
+    /* Scale the existing C values by beta and add alpha times the sums. */
+    const int cIdx = row + col * ldc;
+    C[cIdx]     = C[cIdx]     * beta + alpha * acc0;
+    C[cIdx + 1] = C[cIdx + 1] * beta + alpha * acc1;
+    C[cIdx + 2] = C[cIdx + 2] * beta + alpha * acc2;
+    C[cIdx + 3] = C[cIdx + 3] * beta + alpha * acc3;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/kernel_offline.cl
new file mode 100644
index 0000000000000000000000000000000000000000..d2eff0bb89fc7a40e072dd2b151222e8b47afad0
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/kernel_offline.cl
@@ -0,0 +1,36 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Dense matrix-matrix multiplication (SGEMM) kernel.
+ */
+
+__kernel void mysgemmNT( __global const float *A, int lda, __global const float *B, int ldb, __global float* C, int ldc, int k, float alpha, float beta )
+{
+    /* Each work-item produces four elements of C stored at consecutive
+     * offsets, starting at 4*get_global_id(0) + get_global_id(1)*ldc. */
+    const int row = 4 * get_global_id(0);
+    const int col = get_global_id(1);
+    float acc0 = 0.0f, acc1 = 0.0f, acc2 = 0.0f, acc3 = 0.0f;
+
+    /* Accumulate the k-term sums; B[col + step*ldb] is shared by all four. */
+    for (int step = 0; step < k; ++step) {
+        const int aIdx = row + step * lda;
+        const float bVal = B[col + step * ldb];
+        acc0 += A[aIdx] * bVal;
+        acc1 += A[aIdx + 1] * bVal;
+        acc2 += A[aIdx + 2] * bVal;
+        acc3 += A[aIdx + 3] * bVal;
+    }
+    /* Scale the existing C values by beta and add alpha times the sums. */
+    const int cIdx = row + col * ldc;
+    C[cIdx]     = C[cIdx]     * beta + alpha * acc0;
+    C[cIdx + 1] = C[cIdx + 1] * beta + alpha * acc1;
+    C[cIdx + 2] = C[cIdx + 2] * beta + alpha * acc2;
+    C[cIdx + 3] = C[cIdx + 3] * beta + alpha * acc3;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0fc71d7fcf4c6addb9f0d30c9b668e6b3f01cd52
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base_tc/main.cc
@@ -0,0 +1,218 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* 
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <vector>
+#include <iostream>
+#include <CL/cl.h>
+#include <parboil.h>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
+extern char* readFile(const char*);
+
+// Parameters of tile sizes
+#define TILE_SZ 16
+
+#define CHECK_ERROR(errorMessage)  /* expects a cl_int named clStatus in the caller's scope */ \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
+     std::cout<<"Line: "<<__LINE__<<"\n";   \
+     exit(1);  /* any status other than CL_SUCCESS terminates the program */ \
+  }
+
+void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
+{
+  // Binds the arguments of clKernel, launches it over an (m/4) x n range and waits for completion.
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in basicSgemm()" << std::endl;
+    return;
+  }
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in basicSgemm()" << std::endl;
+    return;
+  }
+  
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m%TILE_SZ) || (n%TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
+      << "; n should be multiple of " << TILE_SZ << std::endl;
+    return; // do not launch with a range the work-group size cannot tile
+  }
+
+  size_t db[2] = {TILE_SZ/4,TILE_SZ};
+  size_t dg[2] = {m/4,n};
+
+  cl_int clStatus;
+  // Check every clSetKernelArg call; a later success must not mask an earlier failure.
+  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A); CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda); CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B); CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb); CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C); CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc); CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k); CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha); CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta); CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(clCommandQueue); 
+  CHECK_ERROR("clFinish")
+}
+
+int main (int argc, char *argv[]) {
+  // Reads A and B^T, runs mysgemmNT on a GPU device, optionally writes C, and prints timings.
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
+
+  pb_InitializeTimerSet(&timers);
+
+  /* Read command line. Expect 3 inputs: A, B and B^T 
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) 
+      || (params->inpFiles[1] == NULL)
+      || (params->inpFiles[2] == NULL)
+      || (params->inpFiles[3] != NULL))
+    {
+      fprintf(stderr, "Expecting three input filenames\n");
+      exit(-1);
+    }
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
+  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+   
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+  // NOTE(review): the path below names the opencl_base build dir although this source is in opencl_base_tc - confirm.
+  // const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
+  // cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  cl_kernel clKernel;
+  cl_program clProgram;
+  pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  //CHECK_ERROR("clCreateProgramWithSource")
+
+  //char clOptions[50];
+  //sprintf(clOptions,"");
+
+  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  //CHECK_ERROR("clBuildProgram")
+
+  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  //CHECK_ERROR("clCreateKernel")
+
+  /* Read in data */
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // load A, then B^T (the B^T file's row count is B's column count, hence the swapped outputs)
+  if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA)
+      || !readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT)
+      || matA.empty() || matBT.empty()) {
+    fprintf(stderr, "Failed to read input matrices\n");
+    exit(-1);
+  }
+
+  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // byte sizes of A and B^T
+  A_sz = matArow*matAcol*sizeof(float);
+  B_sz = matBrow*matBcol*sizeof(float);
+
+  // byte size of C
+  C_sz = matArow*matBcol*sizeof(float);
+
+  // host-side result buffer
+  std::vector<float> matC(matArow*matBcol);
+  // dC is READ_WRITE because the kernel reads C (C*beta) before overwriting it
+  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  cl_mem dC = clCreateBuffer(clContext,CL_MEM_READ_WRITE,C_sz,NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  for(size_t i=0;i<matC.size();i++)
+	matC[i] = 0.0f;
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
+      dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
+
+  if (params->outFile) {
+    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+    clStatus = clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+    CHECK_ERROR("clEnqueueReadBuffer")
+    /* Write C to file */
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile,
+	matArow, matBcol, matC); 
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+
+  //free((void*)clSource[0]);
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseMemObject(dA);
+  clStatus = clReleaseMemObject(dB);
+  clStatus = clReleaseMemObject(dC);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext); 
+  
+  return 0;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc
index 141ea8da1627548cdb38262931e8a3b8bfd0d086..04b6579d254bf6648d50870724558a5ce7773bca 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc
@@ -65,6 +65,7 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
+  return true;
 
 }
 
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/Makefile b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f74ee8921a534b6963ba06d089398114571d070b
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=visc
+SRCDIR_OBJS=io.ll #compute_gold.o
+VISC_OBJS=main.visc.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/io.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/io.cc
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+char* readFile(const char* fileName)
+{
+	// Returns the file contents as a NUL-terminated buffer allocated with new[]
+	// (caller releases it with delete[]), or NULL if the file cannot be opened.
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	f.seekg(0,std::ios::end);
+	int length = f.tellg();
+	f.seekg(0,std::ios::beg);
+
+	char* buffer;
+	if(length>0)
+	{
+		// One extra byte so the terminator never overwrites file data.
+		buffer = new char[length+1];
+		f.read(buffer,length);
+		buffer[f.gcount()]=0; // terminate after the bytes actually read
+	}
+	else
+	{
+		buffer = new char[1]; // array form so delete[] is valid on every path
+		buffer[0] = 0;
+	}
+	f.close();
+	return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  // Loads "<rows> <cols> <values...>" from fn, appending the values to v.
+  // Returns false if the file cannot be opened or the header is unreadable.
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+  // Read # of rows and cols
+  if ( !(f >> nr_row >> nr_col) ) {
+    return false;
+  }
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  // Test the extraction itself so a failed read is never stored in v.
+  while ( f >> data ) {
+    v.push_back(data);
+  }
+  return true;
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  // Writes "<rows> <cols> " followed by every element of v, space separated,
+  // then a newline. Returns false only if the file cannot be opened.
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Write # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  // Index with the vector's own size type to avoid a signed/unsigned comparison.
+  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..71a615026f979a70ffb7d99341e3e5a1ba23e8b2
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc/main.cc
@@ -0,0 +1,177 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <vector>
+#include <iostream>
+#include <parboil.h>
+#include <visc.h>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
+extern char* readFile(const char*);
+
+// Parameters of tile sizes
+#define TILE_SZ 16
+
+#define CHECK_ERROR(errorMessage)  /* not invoked in this file; would need clStatus and CL_SUCCESS in scope */ \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     std::cout<<errorMessage<<" Error!\n";  \
+     std::cout<<"Line: "<<__LINE__<<"\n";   \
+     exit(1);  /* any status other than CL_SUCCESS terminates the program */ \
+  }
+
+void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta )
+{ // Kernel body: updates four elements of C per invocation, selected by get_global_id(0) and get_global_id(1).
+    __visc__attributes(3, A, B, C, 1, C); // presumably 3 input pointers (A, B, C) and 1 output pointer (C) - confirm against visc.h
+    float c0, c1, c2, c3;
+    c0 = c1 = c2 = c3 = 0.0f;
+    int m = 4 * get_global_id(0); // first of the four consecutive offsets handled here
+    int n = get_global_id(1);
+    // Accumulate c_j = sum over i of A[m+j + i*lda] * B[n + i*ldb], for j = 0..3.
+    for (int i = 0; i < k; ++i) {
+        float a0 = A[m + i * lda]; 
+        float a1 = A[m + 1 + i * lda]; 
+        float a2 = A[m + 2 + i * lda]; 
+        float a3 = A[m + 3 + i * lda]; 
+
+        float b = B[n + i * ldb];
+        // b is shared by all four accumulators in this iteration
+        c0 += a0 * b;
+        c1 += a1 * b;
+        c2 += a2 * b;
+        c3 += a3 * b;
+    }
+    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c0; // each element becomes C*beta + alpha*c_j
+    C[m+1+n*ldc] = C[m+1+n*ldc] * beta + alpha * c1;
+    C[m+2+n*ldc] = C[m+2+n*ldc] * beta + alpha * c2;
+    C[m+3+n*ldc] = C[m+3+n*ldc] * beta + alpha * c3;
+}
+
+__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
+{
+    // Validates the arguments, then launches mysgemmNT through __visc__node and waits for it.
+    if ((transa != 'N') && (transa != 'n')) {
+        std::cerr << "unsupported value of 'transa' in basicSgemm()" << std::endl;
+        return;
+    }
+    if ((transb != 'T') && (transb != 't')) {
+        std::cerr << "unsupported value of 'transb' in basicSgemm()" << std::endl;
+        return;
+    }
+    // In this code we assume the matrix sizes are multiple of tile size
+    if ((m%TILE_SZ) || (n%TILE_SZ)) {
+        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
+                  << "; n should be multiple of " << TILE_SZ << std::endl;
+        return; // the integer divisions below would otherwise drop the partial tiles
+    }
+
+    unsigned db[2] = {TILE_SZ/4,TILE_SZ};
+    unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+
+    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
+    __visc__wait(sgemmDFG);
+}
+
+int main (int argc, char *argv[]) {
+    // Reads A and B^T, runs mysgemmNT via basicSgemm, optionally writes C, and prints timings.
+    struct pb_Parameters *params;
+    struct pb_TimerSet timers;
+
+    size_t A_sz, B_sz, C_sz;
+    int matArow, matAcol;
+    int matBrow, matBcol;
+    std::vector<float> matA, matBT;
+
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
+    /* Read command line. Expect 3 inputs: A, B and B^T
+       in column-major layout*/
+    params = pb_ReadParameters(&argc, argv);
+    if ((params->inpFiles[0] == NULL)
+            || (params->inpFiles[1] == NULL)
+            || (params->inpFiles[2] == NULL)
+            || (params->inpFiles[3] != NULL))
+    {
+        fprintf(stderr, "Expecting three input filenames\n");
+        exit(-1);
+    }
+
+    /* Read in data */
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+
+    // load A, then B^T (the B^T file's row count is B's column count, hence the swapped outputs)
+    if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA)
+            || !readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT)
+            || matA.empty() || matBT.empty()) {
+        fprintf(stderr, "Failed to read input matrices\n");
+        exit(-1);
+    }
+
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    // byte sizes of A and B^T
+    A_sz = matArow*matAcol*sizeof(float);
+    B_sz = matBrow*matBcol*sizeof(float);
+
+    // byte size of C
+    C_sz = matArow*matBcol*sizeof(float);
+
+    // host-side result buffer
+    std::vector<float> matC(matArow*matBcol);
+    // Hand the three host buffers and their byte sizes to llvm_visc_track_mem
+    llvm_visc_track_mem(&matA.front(), A_sz);
+    llvm_visc_track_mem(&matBT.front(), B_sz);
+    llvm_visc_track_mem(&matC.front(), C_sz);
+    // Zero-fill C on the host
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    for(size_t i=0; i<matC.size(); i++)
+        matC[i] = 0.0f;
+
+    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+
+    // Use standard sgemm interface
+    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
+               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+
+    if (params->outFile) {
+        pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+
+        /* Write C to file */
+        llvm_visc_request_mem(&matC.front(), C_sz);
+        pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        writeColMajorMatrixFile(params->outFile,
+                                matArow, matBcol, matC);
+    }
+
+    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
+    llvm_visc_untrack_mem(&matA.front());
+    llvm_visc_untrack_mem(&matBT.front());
+    llvm_visc_untrack_mem(&matC.front());
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+    // NOTE(review): main never switches to pb_TimerID_KERNEL; presumably the VISC runtime does - confirm, else GPUtime is 0.
+    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+    pb_PrintTimerSet(&timers);
+    __visc__cleanup();
+    pb_FreeParameters(params);
+
+    return 0;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f74ee8921a534b6963ba06d089398114571d070b
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/Makefile
@@ -0,0 +1,8 @@
+# (c) 2010 The Board of Trustees of the University of Illinois.
+
+LANGUAGE=visc
+SRCDIR_OBJS=io.ll #compute_gold.o
+VISC_OBJS=main.visc.ll
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS=-ffast-math -O3
+APP_CXXFLAGS=-ffast-math -O3
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045983722390eaa48deff0df0944dff481ee148a
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc
@@ -0,0 +1,91 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/* I/O routines for reading and writing matrices in column-major
+ * layout
+ */
+
+#include<fstream>
+#include<iostream>
+#include<vector>
+
+char* readFile(const char* fileName)
+{
+	// Returns the file contents as a NUL-terminated buffer allocated with new[]
+	// (caller releases it with delete[]), or NULL if the file cannot be opened.
+	std::fstream f(fileName,std::fstream::in);
+	if(!f.good())
+	{
+		std::cerr<<"Error Reading File!!"<<std::endl;
+		return NULL;
+	}
+
+	f.seekg(0,std::ios::end);
+	int length = f.tellg();
+	f.seekg(0,std::ios::beg);
+
+	char* buffer;
+	if(length>0)
+	{
+		// One extra byte so the terminator never overwrites file data.
+		buffer = new char[length+1];
+		f.read(buffer,length);
+		buffer[f.gcount()]=0; // terminate after the bytes actually read
+	}
+	else
+	{
+		buffer = new char[1]; // array form so delete[] is valid on every path
+		buffer[0] = 0;
+	}
+	f.close();
+	return buffer;
+}
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
+{
+  // Loads "<rows> <cols> <values...>" from fn, appending the values to v.
+  // Returns false if the file cannot be opened or the header is unreadable.
+  std::cerr << "Opening file:"<< fn << std::endl;
+  std::fstream f(fn, std::fstream::in);
+  if ( !f.good() ) {
+    return false;
+  }
+  // Read # of rows and cols
+  if ( !(f >> nr_row >> nr_col) ) {
+    return false;
+  }
+
+  float data;
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  // Test the extraction itself so a failed read is never stored in v.
+  while ( f >> data ) {
+    v.push_back(data);
+  }
+  return true;
+}
+
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
+{
+  // Writes "<rows> <cols> " followed by every element of v, space separated,
+  // then a newline. Returns false only if the file cannot be opened.
+  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+  std::fstream f(fn, std::fstream::out);
+  if ( !f.good() ) {
+    return false;
+  }
+
+  // Write # of rows and cols
+  f << nr_row << " "<<nr_col<<" ";
+  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  // Index with the vector's own size type to avoid a signed/unsigned comparison.
+  for (std::vector<float>::size_type i = 0; i < v.size(); ++i) {
+    f << v[i] << ' ';
+  }
+  f << "\n";
+  return true;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..71a615026f979a70ffb7d99341e3e5a1ba23e8b2
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc
@@ -0,0 +1,177 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+/*
+ * Main entry of dense matrix-matrix multiplication kernel
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <malloc.h>
+#include <vector>
+#include <iostream>
+#include <parboil.h>
+#include <visc.h>
+
+// I/O routines
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
+extern char* readFile(const char*);
+
+// Parameters of tile sizes
+#define TILE_SZ 16
+
+#define CHECK_ERROR(errorMessage)  /* not invoked in this file; would need clStatus and CL_SUCCESS in scope */ \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     std::cout<<errorMessage<<" Error!\n";  \
+     std::cout<<"Line: "<<__LINE__<<"\n";   \
+     exit(1);  /* any status other than CL_SUCCESS terminates the program */ \
+  }
+
+void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta )
+{ // Kernel body: updates four elements of C per invocation, selected by get_global_id(0) and get_global_id(1).
+    __visc__attributes(3, A, B, C, 1, C); // presumably 3 input pointers (A, B, C) and 1 output pointer (C) - confirm against visc.h
+    float c0, c1, c2, c3;
+    c0 = c1 = c2 = c3 = 0.0f;
+    int m = 4 * get_global_id(0); // first of the four consecutive offsets handled here
+    int n = get_global_id(1);
+    // Accumulate c_j = sum over i of A[m+j + i*lda] * B[n + i*ldb], for j = 0..3.
+    for (int i = 0; i < k; ++i) {
+        float a0 = A[m + i * lda]; 
+        float a1 = A[m + 1 + i * lda]; 
+        float a2 = A[m + 2 + i * lda]; 
+        float a3 = A[m + 3 + i * lda]; 
+
+        float b = B[n + i * ldb];
+        // b is shared by all four accumulators in this iteration
+        c0 += a0 * b;
+        c1 += a1 * b;
+        c2 += a2 * b;
+        c3 += a3 * b;
+    }
+    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c0; // each element becomes C*beta + alpha*c_j
+    C[m+1+n*ldc] = C[m+1+n*ldc] * beta + alpha * c1;
+    C[m+2+n*ldc] = C[m+2+n*ldc] * beta + alpha * c2;
+    C[m+3+n*ldc] = C[m+3+n*ldc] * beta + alpha * c3;
+}
+
+__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
+{
+    // Validates the arguments, then launches mysgemmNT through __visc__node and waits for it.
+    if ((transa != 'N') && (transa != 'n')) {
+        std::cerr << "unsupported value of 'transa' in basicSgemm()" << std::endl;
+        return;
+    }
+    if ((transb != 'T') && (transb != 't')) {
+        std::cerr << "unsupported value of 'transb' in basicSgemm()" << std::endl;
+        return;
+    }
+    // In this code we assume the matrix sizes are multiple of tile size
+    if ((m%TILE_SZ) || (n%TILE_SZ)) {
+        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
+                  << "; n should be multiple of " << TILE_SZ << std::endl;
+        return; // the integer divisions below would otherwise drop the partial tiles
+    }
+
+    unsigned db[2] = {TILE_SZ/4,TILE_SZ};
+    unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+
+    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
+    __visc__wait(sgemmDFG);
+}
+
+int main (int argc, char *argv[]) {
+    // Reads A and B^T, runs mysgemmNT via basicSgemm, optionally writes C, and prints timings.
+    struct pb_Parameters *params;
+    struct pb_TimerSet timers;
+
+    size_t A_sz, B_sz, C_sz;
+    int matArow, matAcol;
+    int matBrow, matBcol;
+    std::vector<float> matA, matBT;
+
+    pb_InitializeTimerSet(&timers);
+    __visc__init();
+
+    /* Read command line. Expect 3 inputs: A, B and B^T
+       in column-major layout*/
+    params = pb_ReadParameters(&argc, argv);
+    if ((params->inpFiles[0] == NULL)
+            || (params->inpFiles[1] == NULL)
+            || (params->inpFiles[2] == NULL)
+            || (params->inpFiles[3] != NULL))
+    {
+        fprintf(stderr, "Expecting three input filenames\n");
+        exit(-1);
+    }
+
+    /* Read in data */
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+
+    // load A, then B^T (the B^T file's row count is B's column count, hence the swapped outputs)
+    if (!readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA)
+            || !readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT)
+            || matA.empty() || matBT.empty()) {
+        fprintf(stderr, "Failed to read input matrices\n");
+        exit(-1);
+    }
+
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+    // byte sizes of A and B^T
+    A_sz = matArow*matAcol*sizeof(float);
+    B_sz = matBrow*matBcol*sizeof(float);
+
+    // byte size of C
+    C_sz = matArow*matBcol*sizeof(float);
+
+    // host-side result buffer
+    std::vector<float> matC(matArow*matBcol);
+    // Hand the three host buffers and their byte sizes to llvm_visc_track_mem
+    llvm_visc_track_mem(&matA.front(), A_sz);
+    llvm_visc_track_mem(&matBT.front(), B_sz);
+    llvm_visc_track_mem(&matC.front(), C_sz);
+    // Zero-fill C on the host
+    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    for(size_t i=0; i<matC.size(); i++)
+        matC[i] = 0.0f;
+
+    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+
+    // Use standard sgemm interface
+    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
+               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+
+    if (params->outFile) {
+        pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+
+        /* Write C to file */
+        llvm_visc_request_mem(&matC.front(), C_sz);
+        pb_SwitchToTimer(&timers, pb_TimerID_IO);
+        writeColMajorMatrixFile(params->outFile,
+                                matArow, matBcol, matC);
+    }
+
+    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
+    llvm_visc_untrack_mem(&matA.front());
+    llvm_visc_untrack_mem(&matBT.front());
+    llvm_visc_untrack_mem(&matC.front());
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+    // NOTE(review): main never switches to pb_TimerID_KERNEL; presumably the VISC runtime does - confirm, else GPUtime is 0.
+    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+    pb_PrintTimerSet(&timers);
+    __visc__cleanup();
+    pb_FreeParameters(params);
+
+    return 0;
+}
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ea1e7b3b7cc4092f69dd0de9b33ad9b693bcac1c
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_tc_vec/main.visc.ll
@@ -0,0 +1,894 @@
+; ModuleID = 'build/visc_tc_vec_default/main.ll'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%"class.std::ios_base::Init" = type { i8 }
+%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_ios" }
+%"class.std::basic_ios" = type { %"class.std::ios_base", %"class.std::basic_ostream"*, i8, i8, %"class.std::basic_streambuf"*, %"class.std::ctype"*, %"class.std::num_put"*, %"class.std::num_get"* }
+%"class.std::ios_base" = type { i32 (...)**, i64, i64, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %"struct.std::ios_base::_Words", [8 x %"struct.std::ios_base::_Words"], i32, %"struct.std::ios_base::_Words"*, %"class.std::locale" }
+%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"class.std::ios_base"*, i32)*, i32, i32 }
+%"struct.std::ios_base::_Words" = type { i8*, i64 }
+%"class.std::locale" = type { %"class.std::locale::_Impl"* }
+%"class.std::locale::_Impl" = type { i32, %"class.std::locale::facet"**, i64, %"class.std::locale::facet"**, i8** }
+%"class.std::locale::facet" = type { i32 (...)**, i32 }
+%"class.std::basic_streambuf" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"class.std::locale" }
+%"class.std::ctype" = type { %"class.std::locale::facet", %struct.__locale_struct*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 }
+%struct.__locale_struct = type { [13 x %struct.__locale_data*], i16*, i32*, i32*, [13 x i8*] }
+%struct.__locale_data = type opaque
+%"class.std::num_put" = type { %"class.std::locale::facet" }
+%"class.std::num_get" = type { %"class.std::locale::facet" }
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+%rtype = type {}  ; empty struct: return type of the mysgemmNT node function, which ends in `ret %rtype undef`
+%struct.arg = type <{ float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32, %rtype }>  ; packed launch record; basicSgemm stores fields 0-15 in order: A, bytes_A, lda, B, bytes_B, ldb, C, bytes_C, ldc, k, alpha, beta, dimX0, dimY0, dimX1, dimY1
+%struct.pb_TimerSet = type { i32, %struct.pb_async_time_marker_list*, i64, i64, [24 x %struct.pb_Timer], [24 x %struct.pb_SubTimerList*] }
+%struct.pb_async_time_marker_list = type { i8*, i32, i8*, %struct.pb_async_time_marker_list* }
+%struct.pb_Timer = type { i32, i64, i64 }
+%struct.pb_SubTimerList = type { %struct.pb_SubTimer*, %struct.pb_SubTimer* }
+%struct.pb_SubTimer = type { i8*, %struct.pb_Timer, %struct.pb_SubTimer* }
+%"class.std::vector" = type { %"struct.std::_Vector_base" }
+%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" }
+%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* }
+%struct.pb_Parameters = type { i8*, i8** }
+
+@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1
+@__dso_handle = external global i8
+@_ZSt4cerr = external global %"class.std::basic_ostream"
+@.str = private unnamed_addr constant [48 x i8] c"unsupported value of 'transa' in regtileSgemm()\00", align 1  ; written to std::cerr by basicSgemm when transa is rejected
+@.str1 = private unnamed_addr constant [48 x i8] c"unsupported value of 'transb' in regtileSgemm()\00", align 1  ; written to std::cerr by basicSgemm when transb is rejected
+@.str2 = private unnamed_addr constant [53 x i8] c"unsupported size of matrix. m should be multiple of \00", align 1  ; first half of basicSgemm's size diagnostic
+@.str3 = private unnamed_addr constant [27 x i8] c"; n should be multiple of \00", align 1  ; second half of basicSgemm's size diagnostic
+@stderr = external global %struct._IO_FILE*
+@.str4 = private unnamed_addr constant [33 x i8] c"Expecting three input filenames\0A\00", align 1  ; fwrite'n to stderr by main before exit(-1)
+@_ZSt4cout = external global %"class.std::basic_ostream"
+@.str5 = private unnamed_addr constant [10 x i8] c"GFLOPs = \00", align 1  ; NOTE(review): no use in the visible part of the module; presumably the GFLOPs report in main
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+@viscTimerSet_GenVISC = common global i8* null  ; main stores the result of llvm_visc_initializeTimerSet() here; its address is passed to llvm_visc_switchToTimer
+@0 = internal constant [14 x i8] c"GenVISC_Timer\00"  ; NOTE(review): no use in the visible part of the module
+
+declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) #0
+
+declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) #0
+
+; Function Attrs: nounwind
+declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #1
+
+; Function Attrs: nounwind uwtable -- VISC node function: each instance updates C[col*ldc + row + j], j = 0..3, to alpha * sum_{i<k}(A[i*lda + row + j] * B[i*ldb + col]) + beta * (old value).
+define %rtype @_Z9mysgemmNTPfiS_iS_iiff(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta) #2 {
+entry:  ; derive this instance's global x/y indices from the parent-node and node instance ids
+  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.getNode()
+  %_Z9mysgemmNTPfiS_iS_iiff.parentNode = call i8* @llvm.visc.getParentNode(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %0 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
+  %1 = call i32 @llvm.visc.getNumNodeInstances.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %2 = mul i32 %0, %1
+  %3 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %4 = add i32 %2, %3  ; global x index = parent id * instances per parent + local id
+  %mul = shl nsw i32 %4, 2  ; row = 4 * global x index (first of the four elements handled here)
+  %5 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.parentNode)
+  %6 = call i32 @llvm.visc.getNumNodeInstances.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %7 = mul i32 %5, %6
+  %8 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %_Z9mysgemmNTPfiS_iS_iiff.node)
+  %9 = add i32 %7, %8  ; col = global y index
+  %cmp147 = icmp sgt i32 %k, 0
+  %add3144 = or i32 %mul, 1
+  %add8145 = or i32 %mul, 2
+  %add13146 = or i32 %mul, 3
+; Index vector {row, row+1, row+2, row+3}. The `or`s above act as adds because %mul has its low two bits clear.
+  %mul.tmp1 = insertelement <4 x i32> < i32 0, i32 0, i32 0, i32 0 >, i32 %mul, i32 0
+  %mul.tmp2 = insertelement <4 x i32> %mul.tmp1, i32 %add3144, i32 1
+  %mul.tmp3 = insertelement <4 x i32> %mul.tmp2, i32 %add8145, i32 2
+  %mul.vector = insertelement <4 x i32> %mul.tmp3, i32 %add13146, i32 3
+; Splat of %lda into four lanes. Neither %mul.vector nor %lda.vector is read anywhere else in this function.
+  %lda.tmp = insertelement <1 x i32> < i32 0 >, i32 %lda, i32 0
+  %lda.vector = shufflevector <1 x i32> %lda.tmp, <1 x i32> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 >
+
+  br i1 %cmp147, label %for.body, label %for.end  ; k <= 0: skip the loop, accumulators stay 0
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+; The four scalar accumulators of the original form (commented out) are carried as the lanes of %c.vector.
+;  %c0.0152 = phi float [ %add23, %for.body ], [ 0.000000e+00, %entry ]
+;  %c1.0151 = phi float [ %add25, %for.body ], [ 0.000000e+00, %entry ]
+;  %c2.0150 = phi float [ %add27, %for.body ], [ 0.000000e+00, %entry ]
+;  %c3.0149 = phi float [ %add29, %for.body ], [ 0.000000e+00, %entry ]
+  %c.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ]
+
+  %10 = trunc i64 %indvars.iv to i32
+  %mul2 = mul nsw i32 %10, %lda
+; Original form: four scalar loads A[i*lda + row + j] (commented out); replaced by the single <4 x float> load below.
+;  %add = add nsw i32 %mul2, %mul
+;  %idxprom = sext i32 %add to i64
+;  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
+;  %11 = load float* %arrayidx, align 4, !tbaa !0
+;  %add5 = add nsw i32 %mul2, %add3144
+;  %idxprom6 = sext i32 %add5 to i64
+;  %arrayidx7 = getelementptr inbounds float* %A, i64 %idxprom6
+;  %12 = load float* %arrayidx7, align 4, !tbaa !0
+;  %add10 = add nsw i32 %mul2, %add8145
+;  %idxprom11 = sext i32 %add10 to i64
+;  %arrayidx12 = getelementptr inbounds float* %A, i64 %idxprom11
+;  %13 = load float* %arrayidx12, align 4, !tbaa !0
+;  %add15 = add nsw i32 %mul2, %add13146
+;  %idxprom16 = sext i32 %add15 to i64
+;  %arrayidx17 = getelementptr inbounds float* %A, i64 %idxprom16
+;  %14 = load float* %arrayidx17, align 4, !tbaa !0
+  %add = add nsw i32 %mul2, %mul
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
+  %arrayidx.cast = bitcast float* %arrayidx to <4 x float>*
+  %11 = load <4 x float>* %arrayidx.cast, align 4  ; a = A[i*lda + row .. row+3]; only element (4-byte) alignment is claimed
+
+  %mul18 = mul nsw i32 %10, %ldb
+  %add19 = add nsw i32 %mul18, %9
+  %idxprom20 = sext i32 %add19 to i64
+  %arrayidx21 = getelementptr inbounds float* %B, i64 %idxprom20
+;  %15 = load float* %arrayidx21, align 4, !tbaa !0
+  %12 = load float* %arrayidx21, align 4, !tbaa !0  ; b = B[i*ldb + col]
+; Broadcast b to all four lanes so one vector multiply covers the four rows.
+  %b.tmp = insertelement <1 x float> < float 0.000000e+00 >, float %12, i32 0
+  %b.vector = shufflevector <1 x float> %b.tmp, <1 x float> undef, <4 x i32> < i32 0, i32 0, i32 0, i32 0 >
+; c += a * b. The original four scalar multiply/add pairs are kept commented out below.
+;  %mul22 = fmul fast float %11, %15
+;  %add23 = fadd fast float %c0.0152, %mul22
+;  %mul24 = fmul fast float %12, %15
+;  %add25 = fadd fast float %c1.0151, %mul24
+;  %mul26 = fmul fast float %13, %15
+;  %add27 = fadd fast float %c2.0150, %mul26
+;  %mul28 = fmul fast float %14, %15
+;  %add29 = fadd fast float %c3.0149, %mul28
+  %mul22 = fmul fast <4 x float> %11, %b.vector
+  %add23 = fadd fast <4 x float> %c.vector, %mul22
+
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %k
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+;  %c0.0.lcssa = phi float [ %add23, %for.body ], [ 0.000000e+00, %entry ]
+;  %c1.0.lcssa = phi float [ %add25, %for.body ], [ 0.000000e+00, %entry ]
+;  %c2.0.lcssa = phi float [ %add27, %for.body ], [ 0.000000e+00, %entry ]
+;  %c3.0.lcssa = phi float [ %add29, %for.body ], [ 0.000000e+00, %entry ]
+  %c.end.vector = phi <4 x float> [ %add23, %for.body ], [ < float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 >, %entry ]
+; Unpack the accumulated lanes; the write-back to C below is done one scalar element at a time.
+  %c0.0.lcssa = extractelement <4 x float> %c.end.vector, i32 0
+  %c1.0.lcssa = extractelement <4 x float> %c.end.vector, i32 1
+  %c2.0.lcssa = extractelement <4 x float> %c.end.vector, i32 2
+  %c3.0.lcssa = extractelement <4 x float> %c.end.vector, i32 3
+
+  %mul30 = mul nsw i32 %9, %ldc
+  %add31 = add nsw i32 %mul30, %mul
+  %idxprom32 = sext i32 %add31 to i64
+  %arrayidx33 = getelementptr inbounds float* %C, i64 %idxprom32
+; Lane 0: C[col*ldc + row] = alpha*c0 + beta*C[col*ldc + row]
+;  %16 = load float* %arrayidx33, align 4, !tbaa !0
+;  %mul34 = fmul fast float %16, %beta
+  %13 = load float* %arrayidx33, align 4, !tbaa !0
+  %mul34 = fmul fast float %13, %beta
+
+  %mul35 = fmul fast float %c0.0.lcssa, %alpha
+  %add36 = fadd fast float %mul35, %mul34
+  store float %add36, float* %arrayidx33, align 4, !tbaa !0
+  %add43 = add nsw i32 %add3144, %mul30
+  %idxprom44 = sext i32 %add43 to i64
+  %arrayidx45 = getelementptr inbounds float* %C, i64 %idxprom44
+; Lane 1: same update at C[col*ldc + row + 1]
+;  %17 = load float* %arrayidx45, align 4, !tbaa !0
+;  %mul46 = fmul fast float %17, %beta
+  %14 = load float* %arrayidx45, align 4, !tbaa !0
+  %mul46 = fmul fast float %14, %beta
+
+  %mul47 = fmul fast float %c1.0.lcssa, %alpha
+  %add48 = fadd fast float %mul47, %mul46
+  store float %add48, float* %arrayidx45, align 4, !tbaa !0
+  %add56 = add nsw i32 %add8145, %mul30
+  %idxprom57 = sext i32 %add56 to i64
+  %arrayidx58 = getelementptr inbounds float* %C, i64 %idxprom57
+; Lane 2: same update at C[col*ldc + row + 2]
+;  %18 = load float* %arrayidx58, align 4, !tbaa !0
+;  %mul59 = fmul fast float %18, %beta
+  %15 = load float* %arrayidx58, align 4, !tbaa !0
+  %mul59 = fmul fast float %15, %beta
+
+  %mul60 = fmul fast float %c2.0.lcssa, %alpha
+  %add61 = fadd fast float %mul60, %mul59
+  store float %add61, float* %arrayidx58, align 4, !tbaa !0
+  %add69 = add nsw i32 %add13146, %mul30
+  %idxprom70 = sext i32 %add69 to i64
+  %arrayidx71 = getelementptr inbounds float* %C, i64 %idxprom70
+; Lane 3: same update at C[col*ldc + row + 3]
+;  %19 = load float* %arrayidx71, align 4, !tbaa !0
+;  %mul72 = fmul fast float %19, %beta
+  %16 = load float* %arrayidx71, align 4, !tbaa !0
+  %mul72 = fmul fast float %16, %beta
+
+  %mul73 = fmul fast float %c3.0.lcssa, %alpha
+  %add74 = fadd fast float %mul73, %mul72
+  store float %add74, float* %arrayidx71, align 4, !tbaa !0
+  ret %rtype undef
+}
+
+; Function Attrs: noinline nounwind uwtable -- host-side driver: checks transa/transb and the m/n tile multiples, packs a %struct.arg on the stack, launches the VISC graph and waits for it.
+define void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext %transa, i8 signext %transb, i32 %m, i32 %n, i32 %k, float %alpha, float* %A, i64 %bytesA, i32 %lda, float* %B, i64 %bytesB, i32 %ldb, float %beta, float* %C, i64 %bytesC, i32 %ldc) #3 {
+entry:  ; transa must be 'N' (78) or 'n' (110)
+  switch i8 %transa, label %if.then [
+    i8 78, label %if.end
+    i8 110, label %if.end
+  ]
+; transa rejected: write @.str to std::cerr, then a widened '\n' (char 10) and a flush, and return without launching.
+if.then:                                          ; preds = %entry
+  %call1.i = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str, i64 0, i64 0), i64 47) #1
+  %vtable.i = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3
+  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
+  %0 = bitcast i8* %vbase.offset.ptr.i to i64*
+  %vbase.offset.i = load i64* %0, align 8
+  %add.ptr.i.sum = add i64 %vbase.offset.i, 240
+  %_M_ctype.i = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i.sum
+  %1 = bitcast i8* %_M_ctype.i to %"class.std::ctype"**
+  %2 = load %"class.std::ctype"** %1, align 8, !tbaa !4
+  %tobool.i97 = icmp eq %"class.std::ctype"* %2, null
+  br i1 %tobool.i97, label %if.then.i98, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
+; A null ctype facet pointer ends in std::__throw_bad_cast().
+if.then.i98:                                      ; preds = %if.then
+  tail call void @_ZSt16__throw_bad_castv() #7
+  unreachable
+; Widen char 10: read the facet's table entry when its widen-ok flag is set, otherwise initialise and call through the vtable.
+_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit:    ; preds = %if.then
+  %_M_widen_ok.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 6
+  %3 = load i8* %_M_widen_ok.i, align 1, !tbaa !1
+  %tobool.i = icmp eq i8 %3, 0
+  br i1 %tobool.i, label %if.end.i, label %if.then.i
+
+if.then.i:                                        ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
+  %arrayidx.i = getelementptr inbounds %"class.std::ctype"* %2, i64 0, i32 7, i64 10
+  %4 = load i8* %arrayidx.i, align 1, !tbaa !1
+  br label %_ZNKSt5ctypeIcE5widenEc.exit
+
+if.end.i:                                         ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit
+  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %2) #1
+  %5 = bitcast %"class.std::ctype"* %2 to i8 (%"class.std::ctype"*, i8)***
+  %vtable.i71 = load i8 (%"class.std::ctype"*, i8)*** %5, align 8, !tbaa !3
+  %vfn.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i71, i64 6
+  %6 = load i8 (%"class.std::ctype"*, i8)** %vfn.i, align 8
+  %call.i72 = tail call signext i8 %6(%"class.std::ctype"* %2, i8 signext 10) #1
+  br label %_ZNKSt5ctypeIcE5widenEc.exit
+
+_ZNKSt5ctypeIcE5widenEc.exit:                     ; preds = %if.end.i, %if.then.i
+  %retval.0.i = phi i8 [ %4, %if.then.i ], [ %call.i72, %if.end.i ]
+  %call1.i47 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i) #1
+  %call.i = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i47) #1
+  br label %return
+; transa accepted: transb must be 'T' (84) or 't' (116).
+if.end:                                           ; preds = %entry, %entry
+  switch i8 %transb, label %if.then9 [
+    i8 84, label %if.end12
+    i8 116, label %if.end12
+  ]
+; transb rejected: same reporting sequence as above with @.str1, then return without launching.
+if.then9:                                         ; preds = %if.end
+  %call1.i49 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([48 x i8]* @.str1, i64 0, i64 0), i64 47) #1
+  %vtable.i51 = load i8** bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8**), align 8, !tbaa !3
+  %vbase.offset.ptr.i52 = getelementptr i8* %vtable.i51, i64 -24
+  %7 = bitcast i8* %vbase.offset.ptr.i52 to i64*
+  %vbase.offset.i53 = load i64* %7, align 8
+  %add.ptr.i54.sum = add i64 %vbase.offset.i53, 240
+  %_M_ctype.i73 = getelementptr inbounds i8* bitcast (%"class.std::basic_ostream"* @_ZSt4cerr to i8*), i64 %add.ptr.i54.sum
+  %8 = bitcast i8* %_M_ctype.i73 to %"class.std::ctype"**
+  %9 = load %"class.std::ctype"** %8, align 8, !tbaa !4
+  %tobool.i100 = icmp eq %"class.std::ctype"* %9, null
+  br i1 %tobool.i100, label %if.then.i101, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
+
+if.then.i101:                                     ; preds = %if.then9
+  tail call void @_ZSt16__throw_bad_castv() #7
+  unreachable
+
+_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103: ; preds = %if.then9
+  %_M_widen_ok.i75 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 6
+  %10 = load i8* %_M_widen_ok.i75, align 1, !tbaa !1
+  %tobool.i76 = icmp eq i8 %10, 0
+  br i1 %tobool.i76, label %if.end.i82, label %if.then.i78
+
+if.then.i78:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
+  %arrayidx.i77 = getelementptr inbounds %"class.std::ctype"* %9, i64 0, i32 7, i64 10
+  %11 = load i8* %arrayidx.i77, align 1, !tbaa !1
+  br label %_ZNKSt5ctypeIcE5widenEc.exit84
+
+if.end.i82:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit103
+  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %9) #1
+  %12 = bitcast %"class.std::ctype"* %9 to i8 (%"class.std::ctype"*, i8)***
+  %vtable.i79 = load i8 (%"class.std::ctype"*, i8)*** %12, align 8, !tbaa !3
+  %vfn.i80 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i79, i64 6
+  %13 = load i8 (%"class.std::ctype"*, i8)** %vfn.i80, align 8
+  %call.i81 = tail call signext i8 %13(%"class.std::ctype"* %9, i8 signext 10) #1
+  br label %_ZNKSt5ctypeIcE5widenEc.exit84
+
+_ZNKSt5ctypeIcE5widenEc.exit84:                   ; preds = %if.end.i82, %if.then.i78
+  %retval.0.i83 = phi i8 [ %11, %if.then.i78 ], [ %call.i81, %if.end.i82 ]
+  %call1.i56 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* @_ZSt4cerr, i8 signext %retval.0.i83) #1
+  %call.i57 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i56) #1
+  br label %return
+; Both flags accepted: m and n must each have their low four bits clear, i.e. be multiples of 16.
+if.end12:                                         ; preds = %if.end, %if.end
+  %rem44 = and i32 %m, 15
+  %tobool = icmp eq i32 %rem44, 0
+  br i1 %tobool, label %lor.lhs.false, label %if.then15
+
+lor.lhs.false:                                    ; preds = %if.end12
+  %rem1345 = and i32 %n, 15
+  %tobool14 = icmp eq i32 %rem1345, 0
+  br i1 %tobool14, label %if.end21, label %if.then15
+; Size check failed: print @.str2, 16, @.str3, 16 and a newline to std::cerr. NOTE: this path ends in `br label %if.end21`, so the launch still runs.
+if.then15:                                        ; preds = %lor.lhs.false, %if.end12
+  %call1.i59 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cerr, i8* getelementptr inbounds ([53 x i8]* @.str2, i64 0, i64 0), i64 52) #1
+  %call17 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cerr, i32 16) #1
+  %call1.i61 = tail call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* %call17, i8* getelementptr inbounds ([27 x i8]* @.str3, i64 0, i64 0), i64 26) #1
+  %call19 = tail call %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"* %call17, i32 16) #1
+  %14 = bitcast %"class.std::basic_ostream"* %call19 to i8**
+  %vtable.i63 = load i8** %14, align 8, !tbaa !3
+  %vbase.offset.ptr.i64 = getelementptr i8* %vtable.i63, i64 -24
+  %15 = bitcast i8* %vbase.offset.ptr.i64 to i64*
+  %vbase.offset.i65 = load i64* %15, align 8
+  %16 = bitcast %"class.std::basic_ostream"* %call19 to i8*
+  %add.ptr.i66.sum = add i64 %vbase.offset.i65, 240
+  %_M_ctype.i85 = getelementptr inbounds i8* %16, i64 %add.ptr.i66.sum
+  %17 = bitcast i8* %_M_ctype.i85 to %"class.std::ctype"**
+  %18 = load %"class.std::ctype"** %17, align 8, !tbaa !4
+  %tobool.i104 = icmp eq %"class.std::ctype"* %18, null
+  br i1 %tobool.i104, label %if.then.i105, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
+
+if.then.i105:                                     ; preds = %if.then15
+  tail call void @_ZSt16__throw_bad_castv() #7
+  unreachable
+
+_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107: ; preds = %if.then15
+  %_M_widen_ok.i87 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 6
+  %19 = load i8* %_M_widen_ok.i87, align 1, !tbaa !1
+  %tobool.i88 = icmp eq i8 %19, 0
+  br i1 %tobool.i88, label %if.end.i94, label %if.then.i90
+
+if.then.i90:                                      ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
+  %arrayidx.i89 = getelementptr inbounds %"class.std::ctype"* %18, i64 0, i32 7, i64 10
+  %20 = load i8* %arrayidx.i89, align 1, !tbaa !1
+  br label %_ZNKSt5ctypeIcE5widenEc.exit96
+
+if.end.i94:                                       ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit107
+  tail call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %18) #1
+  %21 = bitcast %"class.std::ctype"* %18 to i8 (%"class.std::ctype"*, i8)***
+  %vtable.i91 = load i8 (%"class.std::ctype"*, i8)*** %21, align 8, !tbaa !3
+  %vfn.i92 = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i91, i64 6
+  %22 = load i8 (%"class.std::ctype"*, i8)** %vfn.i92, align 8
+  %call.i93 = tail call signext i8 %22(%"class.std::ctype"* %18, i8 signext 10) #1
+  br label %_ZNKSt5ctypeIcE5widenEc.exit96
+
+_ZNKSt5ctypeIcE5widenEc.exit96:                   ; preds = %if.end.i94, %if.then.i90
+  %retval.0.i95 = phi i8 [ %20, %if.then.i90 ], [ %call.i93, %if.end.i94 ]
+  %call1.i68 = tail call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call19, i8 signext %retval.0.i95) #1
+  %call.i69 = tail call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i68) #1
+  br label %if.end21
+; Pack every kernel argument plus four launch dimensions into a stack %struct.arg, then launch the graph and block until it finishes.
+if.end21:                                         ; preds = %_ZNKSt5ctypeIcE5widenEc.exit96, %lor.lhs.false
+  %div = sdiv i32 %m, 16
+  %mul = and i32 %div, 1073741823  ; m/16 masked to 30 bits; stored as dimX1 below
+  %div22 = sdiv i32 %n, 16
+  %mul24 = and i32 %div22, 268435455  ; n/16 masked to 28 bits; stored as dimY1 below
+  %conv33 = fpext float %alpha to double  ; widened here and truncated back to float just before its store
+  %conv34 = fpext float %beta to double  ; same round trip for beta
+  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 19)  ; timer id 19 covers the argument packing; NOTE(review): the id's name is not visible here
+  %in.addr = alloca %struct.arg
+  %in.addr.A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
+  store float* %A, float** %in.addr.A
+  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
+  store i64 %bytesA, i64* %in.addr.bytes_A
+  %in.addr.lda = getelementptr %struct.arg* %in.addr, i32 0, i32 2
+  store i32 %lda, i32* %in.addr.lda
+  %in.addr.B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
+  store float* %B, float** %in.addr.B
+  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 4
+  store i64 %bytesB, i64* %in.addr.bytes_B
+  %in.addr.ldb = getelementptr %struct.arg* %in.addr, i32 0, i32 5
+  store i32 %ldb, i32* %in.addr.ldb
+  %in.addr.C = getelementptr %struct.arg* %in.addr, i32 0, i32 6
+  store float* %C, float** %in.addr.C
+  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 7
+  store i64 %bytesC, i64* %in.addr.bytes_C
+  %in.addr.ldc = getelementptr %struct.arg* %in.addr, i32 0, i32 8
+  store i32 %ldc, i32* %in.addr.ldc
+  %in.addr.k = getelementptr %struct.arg* %in.addr, i32 0, i32 9
+  store i32 %k, i32* %in.addr.k
+  %in.addr.alpha = getelementptr %struct.arg* %in.addr, i32 0, i32 10
+  %in.addr.alpha.cast = fptrunc double %conv33 to float
+  store float %in.addr.alpha.cast, float* %in.addr.alpha
+  %in.addr.beta = getelementptr %struct.arg* %in.addr, i32 0, i32 11
+  %in.addr.beta.cast = fptrunc double %conv34 to float
+  store float %in.addr.beta.cast, float* %in.addr.beta
+  %in.addr.dimX0 = getelementptr %struct.arg* %in.addr, i32 0, i32 12
+  store i32 4, i32* %in.addr.dimX0  ; dimX0 = 4
+  %in.addr.dimY0 = getelementptr %struct.arg* %in.addr, i32 0, i32 13
+  store i32 16, i32* %in.addr.dimY0  ; dimY0 = 16
+  %in.addr.dimX1 = getelementptr %struct.arg* %in.addr, i32 0, i32 14
+  store i32 %mul, i32* %in.addr.dimX1
+  %in.addr.dimY1 = getelementptr %struct.arg* %in.addr, i32 0, i32 15
+  store i32 %mul24, i32* %in.addr.dimY1
+  %args = bitcast %struct.arg* %in.addr to i8*
+  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
+  %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2 = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level2 to i8*), i8* %args)  ; root function is defined outside the visible range
+  call void @llvm.visc.wait(i8* %graph_Z9mysgemmNTPfiS_iS_iiffInternal_level2)
+  br label %return
+
+return:                                           ; preds = %if.end21, %_ZNKSt5ctypeIcE5widenEc.exit84, %_ZNKSt5ctypeIcE5widenEc.exit
+  ret void
+}
+
+declare %"class.std::basic_ostream"* @_ZNSolsEi(%"class.std::basic_ostream"*, i32) #0
+
+; Function Attrs: nounwind uwtable
+define i32 @main(i32 %argc, i8** %argv) #2 {
+entry:
+  %argc.addr = alloca i32, align 4
+  %timers = alloca %struct.pb_TimerSet, align 8
+  %matArow = alloca i32, align 4
+  %matAcol = alloca i32, align 4
+  %matBrow = alloca i32, align 4
+  %matBcol = alloca i32, align 4
+  %matA = alloca %"class.std::vector", align 8
+  %matBT = alloca %"class.std::vector", align 8
+  %matC = alloca %"class.std::vector", align 8
+  store i32 %argc, i32* %argc.addr, align 4, !tbaa !5
+  %0 = bitcast %struct.pb_TimerSet* %timers to i8*
+  call void @llvm.lifetime.start(i64 800, i8* %0) #1
+  %1 = bitcast %"class.std::vector"* %matA to i8*
+  call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 24, i32 8, i1 false) #1
+  %2 = bitcast %"class.std::vector"* %matBT to i8*
+  call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i32 8, i1 false) #1
+  call void @pb_InitializeTimerSet(%struct.pb_TimerSet* %timers) #1
+  %3 = call i8* @llvm_visc_initializeTimerSet()
+  store i8* %3, i8** @viscTimerSet_GenVISC
+  call void @llvm_visc_switchToTimer(i8** @viscTimerSet_GenVISC, i32 0)
+  call void @llvm.visc.init()
+  %call = call %struct.pb_Parameters* @pb_ReadParameters(i32* %argc.addr, i8** %argv) #1
+  %inpFiles = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 1
+  %4 = load i8*** %inpFiles, align 8, !tbaa !4
+  %5 = load i8** %4, align 8, !tbaa !4
+  %cmp = icmp eq i8* %5, null
+  br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %arrayidx2 = getelementptr inbounds i8** %4, i64 1
+  %6 = load i8** %arrayidx2, align 8, !tbaa !4
+  %cmp3 = icmp eq i8* %6, null
+  br i1 %cmp3, label %if.then, label %lor.lhs.false4
+
+lor.lhs.false4:                                   ; preds = %lor.lhs.false
+  %arrayidx6 = getelementptr inbounds i8** %4, i64 2
+  %7 = load i8** %arrayidx6, align 8, !tbaa !4
+  %cmp7 = icmp eq i8* %7, null
+  br i1 %cmp7, label %if.then, label %lor.lhs.false8
+
+lor.lhs.false8:                                   ; preds = %lor.lhs.false4
+  %arrayidx10 = getelementptr inbounds i8** %4, i64 3
+  %8 = load i8** %arrayidx10, align 8, !tbaa !4
+  %cmp11 = icmp eq i8* %8, null
+  br i1 %cmp11, label %if.end, label %if.then
+
+if.then:                                          ; preds = %lor.lhs.false8, %lor.lhs.false4, %lor.lhs.false, %entry
+  %9 = load %struct._IO_FILE** @stderr, align 8, !tbaa !4
+  %10 = call i64 @fwrite(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), i64 32, i64 1, %struct._IO_FILE* %9)
+  call void @exit(i32 -1) #7
+  unreachable
+
+if.end:                                           ; preds = %lor.lhs.false8
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1
+  %11 = load i8*** %inpFiles, align 8, !tbaa !4
+  %12 = load i8** %11, align 8, !tbaa !4
+  %call15 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %12, i32* %matArow, i32* %matAcol, %"class.std::vector"* %matA) #1
+  %13 = load i8*** %inpFiles, align 8, !tbaa !4
+  %arrayidx17 = getelementptr inbounds i8** %13, i64 2
+  %14 = load i8** %arrayidx17, align 8, !tbaa !4
+  %call18 = call zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8* %14, i32* %matBcol, i32* %matBrow, %"class.std::vector"* %matBT) #1
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
+  %15 = load i32* %matArow, align 4, !tbaa !5
+  %16 = load i32* %matAcol, align 4, !tbaa !5
+  %mul = mul nsw i32 %16, %15
+  %conv = sext i32 %mul to i64
+  %mul19 = shl nsw i64 %conv, 2
+  %17 = load i32* %matBrow, align 4, !tbaa !5
+  %18 = load i32* %matBcol, align 4, !tbaa !5
+  %mul20 = mul nsw i32 %18, %17
+  %conv21 = sext i32 %mul20 to i64
+  %mul22 = shl nsw i64 %conv21, 2
+  %mul23 = mul nsw i32 %18, %15
+  %conv24 = sext i32 %mul23 to i64
+  %mul25 = shl nsw i64 %conv24, 2
+  %19 = bitcast %"class.std::vector"* %matC to i8*
+  call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 24, i32 8, i1 false) #1
+  %cmp.i.i.i.i = icmp eq i32 %mul23, 0
+  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i, label %cond.true.i.i.i.i
+
+cond.true.i.i.i.i:                                ; preds = %if.end
+  %cmp.i.i.i.i.i = icmp slt i32 %mul23, 0
+  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, !prof !6
+
+if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
+  call void @_ZSt17__throw_bad_allocv() #7
+  unreachable
+
+_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i: ; preds = %cond.true.i.i.i.i
+  %call2.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul25) #1
+  %20 = bitcast i8* %call2.i.i.i.i.i to float*
+  br label %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
+
+_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i:    ; preds = %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i, %if.end
+  %cond.i.i.i.i = phi float* [ %20, %_ZN9__gnu_cxx13new_allocatorIfE8allocateEmPKv.exit.i.i.i.i ], [ null, %if.end ]
+  %_M_start.i.i.i81 = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 0
+  store float* %cond.i.i.i.i, float** %_M_start.i.i.i81, align 8, !tbaa !4
+  %_M_finish.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 1
+  store float* %cond.i.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4
+  %add.ptr.i.i.i = getelementptr inbounds float* %cond.i.i.i.i, i64 %conv24
+  %_M_end_of_storage.i.i.i = getelementptr inbounds %"class.std::vector"* %matC, i64 0, i32 0, i32 0, i32 2
+  store float* %add.ptr.i.i.i, float** %_M_end_of_storage.i.i.i, align 8, !tbaa !4
+  br i1 %cmp.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.lr.ph.i.i.i.i.i.i.i.i
+
+for.body.lr.ph.i.i.i.i.i.i.i.i:                   ; preds = %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
+  %n.mod.vf.i.i.i.i.i.i.i.i = and i64 %conv24, 7
+  %n.vec.i.i.i.i.i.i.i.i = sub i64 %conv24, %n.mod.vf.i.i.i.i.i.i.i.i
+  %cmp.zero.i.i.i.i.i.i.i.i = icmp eq i64 %n.mod.vf.i.i.i.i.i.i.i.i, %conv24
+  %ptr.ind.end.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %n.vec.i.i.i.i.i.i.i.i
+  br i1 %cmp.zero.i.i.i.i.i.i.i.i, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
+
+vector.body.i.i.i.i.i.i.i.i:                      ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
+  %index.i.i.i.i.i.i.i.i = phi i64 [ %index.next.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ], [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ]
+  %next.gep.i.i.i.i.i.i.i.i = getelementptr float* %cond.i.i.i.i, i64 %index.i.i.i.i.i.i.i.i
+  %21 = bitcast float* %next.gep.i.i.i.i.i.i.i.i to <4 x float>*
+  store <4 x float> zeroinitializer, <4 x float>* %21, align 4
+  %next.gep.sum41.i.i.i.i.i.i.i.i = or i64 %index.i.i.i.i.i.i.i.i, 4
+  %22 = getelementptr float* %cond.i.i.i.i, i64 %next.gep.sum41.i.i.i.i.i.i.i.i
+  %23 = bitcast float* %22 to <4 x float>*
+  store <4 x float> zeroinitializer, <4 x float>* %23, align 4
+  %index.next.i.i.i.i.i.i.i.i = add i64 %index.i.i.i.i.i.i.i.i, 8
+  %24 = icmp eq i64 %index.next.i.i.i.i.i.i.i.i, %n.vec.i.i.i.i.i.i.i.i
+  br i1 %24, label %middle.block.i.i.i.i.i.i.i.i, label %vector.body.i.i.i.i.i.i.i.i
+
+middle.block.i.i.i.i.i.i.i.i:                     ; preds = %vector.body.i.i.i.i.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i
+  %resume.val.i.i.i.i.i.i.i.i = phi float* [ %cond.i.i.i.i, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %ptr.ind.end.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
+  %resume.val7.i.i.i.i.i.i.i.i = phi i64 [ %conv24, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.mod.vf.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
+  %new.indc.resume.val.i.i.i.i.i.i.i.i = phi i64 [ 0, %for.body.lr.ph.i.i.i.i.i.i.i.i ], [ %n.vec.i.i.i.i.i.i.i.i, %vector.body.i.i.i.i.i.i.i.i ]
+  %cmp.n.i.i.i.i.i.i.i.i = icmp eq i64 %new.indc.resume.val.i.i.i.i.i.i.i.i, %conv24
+  br i1 %cmp.n.i.i.i.i.i.i.i.i, label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit, label %for.body.i.i.i.i.i.i.i.i.preheader
+
+for.body.i.i.i.i.i.i.i.i.preheader:               ; preds = %middle.block.i.i.i.i.i.i.i.i
+  %resume.val.i.i.i.i.i.i.i.i101 = bitcast float* %resume.val.i.i.i.i.i.i.i.i to i8*
+  %25 = shl nsw i64 %resume.val7.i.i.i.i.i.i.i.i, 2
+  call void @llvm.memset.p0i8.i64(i8* %resume.val.i.i.i.i.i.i.i.i101, i8 0, i64 %25, i32 4, i1 false)
+  br label %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
+
+_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit:            ; preds = %for.body.i.i.i.i.i.i.i.i.preheader, %middle.block.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEEC2EmRKS0_.exit.i.i
+  store float* %add.ptr.i.i.i, float** %_M_finish.i.i.i, align 8, !tbaa !4
+  %_M_start.i.i = getelementptr inbounds %"class.std::vector"* %matA, i64 0, i32 0, i32 0, i32 0
+  %26 = load float** %_M_start.i.i, align 8, !tbaa !4
+  %27 = bitcast float* %26 to i8*
+  call void @llvm_visc_track_mem(i8* %27, i64 %mul19) #1
+  %_M_start.i.i82 = getelementptr inbounds %"class.std::vector"* %matBT, i64 0, i32 0, i32 0, i32 0
+  %28 = load float** %_M_start.i.i82, align 8, !tbaa !4
+  %29 = bitcast float* %28 to i8*
+  call void @llvm_visc_track_mem(i8* %29, i64 %mul22) #1
+  %30 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
+  %31 = bitcast float* %30 to i8*
+  call void @llvm_visc_track_mem(i8* %31, i64 %mul25) #1
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 6) #1
+  %32 = load float** %_M_finish.i.i.i, align 8, !tbaa !4
+  %33 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
+  %cmp3399 = icmp eq float* %32, %33
+  br i1 %cmp3399, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
+  %sub.ptr.lhs.cast.i = ptrtoint float* %32 to i64
+  %sub.ptr.rhs.cast.i = ptrtoint float* %33 to i64
+  %sub.ptr.sub.i = sub i64 %sub.ptr.lhs.cast.i, %sub.ptr.rhs.cast.i
+  %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %i.0100 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %add.ptr.i = getelementptr inbounds float* %33, i64 %i.0100
+  store float 0.000000e+00, float* %add.ptr.i, align 4, !tbaa !0
+  %inc = add i64 %i.0100, 1
+  %cmp33 = icmp ult i64 %inc, %sub.ptr.div.i
+  br i1 %cmp33, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %_ZNSt6vectorIfSaIfEEC1EmRKfRKS0_.exit
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
+  %34 = load i32* %matArow, align 4, !tbaa !5
+  %35 = load i32* %matBcol, align 4, !tbaa !5
+  %36 = load i32* %matAcol, align 4, !tbaa !5
+  %37 = load float** %_M_start.i.i, align 8, !tbaa !4
+  %38 = load float** %_M_start.i.i82, align 8, !tbaa !4
+  %39 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
+  call void @_Z10basicSgemmcciiifPfmiS_mifS_mi(i8 signext 78, i8 signext 84, i32 %34, i32 %35, i32 %36, float 1.000000e+00, float* %37, i64 %mul19, i32 %34, float* %38, i64 %mul22, i32 %35, float 0.000000e+00, float* %39, i64 %mul25, i32 %34)
+  %outFile = getelementptr inbounds %struct.pb_Parameters* %call, i64 0, i32 0
+  %40 = load i8** %outFile, align 8, !tbaa !4
+  %tobool = icmp eq i8* %40, null
+  br i1 %tobool, label %if.end42, label %if.then38
+
+if.then38:                                        ; preds = %for.end
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 3) #1
+  %41 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
+  %42 = bitcast float* %41 to i8*
+  call void @llvm_visc_request_mem(i8* %42, i64 %mul25) #1
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 1) #1
+  %43 = load i8** %outFile, align 8, !tbaa !4
+  %44 = load i32* %matArow, align 4, !tbaa !5
+  %45 = load i32* %matBcol, align 4, !tbaa !5
+  %call41 = call zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8* %43, i32 %44, i32 %45, %"class.std::vector"* %matC) #1
+  br label %if.end42
+
+if.end42:                                         ; preds = %if.then38, %for.end
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 16) #1
+  %46 = load float** %_M_start.i.i, align 8, !tbaa !4
+  %47 = bitcast float* %46 to i8*
+  call void @llvm_visc_untrack_mem(i8* %47) #1
+  %48 = load float** %_M_start.i.i82, align 8, !tbaa !4
+  %49 = bitcast float* %48 to i8*
+  call void @llvm_visc_untrack_mem(i8* %49) #1
+  %50 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
+  %51 = bitcast float* %50 to i8*
+  call void @llvm_visc_untrack_mem(i8* %51) #1
+  call void @pb_SwitchToTimer(%struct.pb_TimerSet* %timers, i32 0) #1
+  %arrayidx47 = getelementptr inbounds %struct.pb_TimerSet* %timers, i64 0, i32 4, i64 2
+  %call48 = call double @pb_GetElapsedTime(%struct.pb_Timer* %arrayidx47) #1
+  %call1.i88 = call %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0), i64 9) #1
+  %52 = load i32* %matArow, align 4, !tbaa !5
+  %conv50 = sitofp i32 %52 to double
+  %mul51 = fmul fast double %conv50, 2.000000e+00
+  %53 = load i32* %matBcol, align 4, !tbaa !5
+  %conv52 = sitofp i32 %53 to double
+  %mul53 = fmul fast double %mul51, %conv52
+  %54 = load i32* %matAcol, align 4, !tbaa !5
+  %conv54 = sitofp i32 %54 to double
+  %mul55 = fmul fast double %mul53, %conv54
+  %div = fdiv fast double %mul55, %call48
+  %div56 = fmul double %div, 1.000000e-09
+  %call.i = call %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* @_ZSt4cout, double %div56) #1
+  %55 = bitcast %"class.std::basic_ostream"* %call.i to i8**
+  %vtable.i = load i8** %55, align 8, !tbaa !3
+  %vbase.offset.ptr.i = getelementptr i8* %vtable.i, i64 -24
+  %56 = bitcast i8* %vbase.offset.ptr.i to i64*
+  %vbase.offset.i = load i64* %56, align 8
+  %57 = bitcast %"class.std::basic_ostream"* %call.i to i8*
+  %add.ptr.sum.i = add i64 %vbase.offset.i, 240
+  %_M_ctype.i.i = getelementptr inbounds i8* %57, i64 %add.ptr.sum.i
+  %58 = bitcast i8* %_M_ctype.i.i to %"class.std::ctype"**
+  %59 = load %"class.std::ctype"** %58, align 8, !tbaa !4
+  %tobool.i.i.i = icmp eq %"class.std::ctype"* %59, null
+  br i1 %tobool.i.i.i, label %if.then.i.i.i, label %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
+
+if.then.i.i.i:                                    ; preds = %if.end42
+  call void @_ZSt16__throw_bad_castv() #7
+  unreachable
+
+_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i: ; preds = %if.end42
+  %_M_widen_ok.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 6
+  %60 = load i8* %_M_widen_ok.i.i.i, align 1, !tbaa !1
+  %tobool.i3.i.i = icmp eq i8 %60, 0
+  br i1 %tobool.i3.i.i, label %if.end.i.i.i, label %if.then.i4.i.i
+
+if.then.i4.i.i:                                   ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
+  %arrayidx.i.i.i = getelementptr inbounds %"class.std::ctype"* %59, i64 0, i32 7, i64 10
+  %61 = load i8* %arrayidx.i.i.i, align 1, !tbaa !1
+  br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
+
+if.end.i.i.i:                                     ; preds = %_ZSt13__check_facetISt5ctypeIcEERKT_PS3_.exit.i.i
+  call void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"* %59) #1
+  %62 = bitcast %"class.std::ctype"* %59 to i8 (%"class.std::ctype"*, i8)***
+  %vtable.i.i.i = load i8 (%"class.std::ctype"*, i8)*** %62, align 8, !tbaa !3
+  %vfn.i.i.i = getelementptr inbounds i8 (%"class.std::ctype"*, i8)** %vtable.i.i.i, i64 6
+  %63 = load i8 (%"class.std::ctype"*, i8)** %vfn.i.i.i, align 8
+  %call.i.i.i = call signext i8 %63(%"class.std::ctype"* %59, i8 signext 10) #1
+  br label %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
+
+_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit: ; preds = %if.end.i.i.i, %if.then.i4.i.i
+  %retval.0.i.i.i = phi i8 [ %61, %if.then.i4.i.i ], [ %call.i.i.i, %if.end.i.i.i ]
+  %call1.i = call %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"* %call.i, i8 signext %retval.0.i.i.i) #1
+  %call.i.i = call %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"* %call1.i) #1
+  call void @pb_PrintTimerSet(%struct.pb_TimerSet* %timers) #1
+  %Ptr = getelementptr [14 x i8]* @0, i64 0, i64 0
+  call void @llvm_visc_printTimerSet(i8** @viscTimerSet_GenVISC, i8* %Ptr)
+  call void @llvm.visc.cleanup()
+  call void @pb_FreeParameters(%struct.pb_Parameters* %call) #1
+  %64 = load float** %_M_start.i.i.i81, align 8, !tbaa !4
+  %tobool.i.i.i.i78 = icmp eq float* %64, null
+  br i1 %tobool.i.i.i.i78, label %_ZNSt6vectorIfSaIfEED1Ev.exit80, label %if.then.i.i.i.i79
+
+if.then.i.i.i.i79:                                ; preds = %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
+  %65 = bitcast float* %64 to i8*
+  call void @_ZdlPv(i8* %65) #1
+  br label %_ZNSt6vectorIfSaIfEED1Ev.exit80
+
+_ZNSt6vectorIfSaIfEED1Ev.exit80:                  ; preds = %if.then.i.i.i.i79, %_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_.exit
+  %66 = load float** %_M_start.i.i82, align 8, !tbaa !4
+  %tobool.i.i.i.i74 = icmp eq float* %66, null
+  br i1 %tobool.i.i.i.i74, label %_ZNSt6vectorIfSaIfEED1Ev.exit76, label %if.then.i.i.i.i75
+
+if.then.i.i.i.i75:                                ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit80
+  %67 = bitcast float* %66 to i8*
+  call void @_ZdlPv(i8* %67) #1
+  br label %_ZNSt6vectorIfSaIfEED1Ev.exit76
+
+_ZNSt6vectorIfSaIfEED1Ev.exit76:                  ; preds = %if.then.i.i.i.i75, %_ZNSt6vectorIfSaIfEED1Ev.exit80
+  %68 = load float** %_M_start.i.i, align 8, !tbaa !4
+  %tobool.i.i.i.i = icmp eq float* %68, null
+  br i1 %tobool.i.i.i.i, label %_ZNSt6vectorIfSaIfEED1Ev.exit, label %if.then.i.i.i.i
+
+if.then.i.i.i.i:                                  ; preds = %_ZNSt6vectorIfSaIfEED1Ev.exit76
+  %69 = bitcast float* %68 to i8*
+  call void @_ZdlPv(i8* %69) #1
+  br label %_ZNSt6vectorIfSaIfEED1Ev.exit
+
+_ZNSt6vectorIfSaIfEED1Ev.exit:                    ; preds = %if.then.i.i.i.i, %_ZNSt6vectorIfSaIfEED1Ev.exit76
+  call void @llvm.lifetime.end(i64 800, i8* %0) #1
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare void @pb_InitializeTimerSet(%struct.pb_TimerSet*) #0
+
+declare %struct.pb_Parameters* @pb_ReadParameters(i32*, i8**) #0
+
+; Function Attrs: noreturn nounwind
+declare void @exit(i32) #4
+
+declare void @pb_SwitchToTimer(%struct.pb_TimerSet*, i32) #0
+
+declare zeroext i1 @_Z22readColMajorMatrixFilePKcRiS1_RSt6vectorIfSaIfEE(i8*, i32*, i32*, %"class.std::vector"*) #0
+
+declare void @llvm_visc_track_mem(i8*, i64) #0
+
+declare void @llvm_visc_request_mem(i8*, i64) #0
+
+declare zeroext i1 @_Z23writeColMajorMatrixFilePKciiRSt6vectorIfSaIfEE(i8*, i32, i32, %"class.std::vector"*) #0
+
+declare void @llvm_visc_untrack_mem(i8*) #0
+
+declare double @pb_GetElapsedTime(%struct.pb_Timer*) #0
+
+declare void @pb_PrintTimerSet(%struct.pb_TimerSet*) #0
+
+declare void @pb_FreeParameters(%struct.pb_Parameters*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+declare %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) #0
+
+; Function Attrs: noreturn
+declare void @_ZSt17__throw_bad_allocv() #5
+
+declare noalias i8* @_Znwm(i64) #0
+
+; Function Attrs: nounwind
+declare void @_ZdlPv(i8*) #6
+
+declare %"class.std::basic_ostream"* @_ZNSo3putEc(%"class.std::basic_ostream"*, i8 signext) #0
+
+declare void @_ZNKSt5ctypeIcE13_M_widen_initEv(%"class.std::ctype"*) #0
+
+; Function Attrs: noreturn
+declare void @_ZSt16__throw_bad_castv() #5
+
+declare %"class.std::basic_ostream"* @_ZNSo5flushEv(%"class.std::basic_ostream"*) #0
+
+declare %"class.std::basic_ostream"* @_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l(%"class.std::basic_ostream"*, i8*, i64) #0
+
+; Function Attrs: nounwind
+define internal void @_GLOBAL__I_a() #1 section ".text.startup" {
+entry:
+  tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* @_ZStL8__ioinit) #1
+  %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* @__dso_handle) #1
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
+declare i8* @llvm_visc_initializeTimerSet()
+
+declare void @llvm_visc_switchToTimer(i8**, i32)
+
+declare void @llvm_visc_printTimerSet(i8**, i8*)
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.getNode() #1
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.getParentNode(i8*) #1
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #1
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #1
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #1
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #1
+
+; Function Attrs: nounwind uwtable
+define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level1(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY) #2 {
+entry:
+  %_Z9mysgemmNTPfiS_iS_iiff.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float)* @_Z9mysgemmNTPfiS_iS_iiff to i8*), i32 %dimX, i32 %dimY)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 0, i32 0)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 1, i32 1)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 2, i32 2)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 3, i32 3)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 4, i32 4)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 5, i32 5)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 6, i32 6)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 7, i32 7)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 8, i32 8)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 9, i32 9)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 10, i32 10)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiff.node, i32 11, i32 11)
+  ret %rtype undef
+}
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #1
+
+; Function Attrs: nounwind
+declare void @llvm.visc.bind.input(i8*, i32, i32) #1
+
+; Function Attrs: nounwind uwtable
+define %rtype @_Z9mysgemmNTPfiS_iS_iiffInternal_level2(float* in %A, i64 %bytes_A, i32 %lda, float* in %B, i64 %bytes_B, i32 %ldb, float* in out %C, i64 %bytes_C, i32 %ldc, i32 %k, float %alpha, float %beta, i32 %dimX, i32 %dimY, i32 %dimX1, i32 %dimY2) #2 {
+entry:
+  %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, i32, float*, i64, i32, float*, i64, i32, i32, float, float, i32, i32)* @_Z9mysgemmNTPfiS_iS_iiffInternal_level1 to i8*), i32 %dimX1, i32 %dimY2)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 0, i32 0)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 1, i32 1)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 2, i32 2)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 3, i32 3)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 4, i32 4)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 5, i32 5)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 6, i32 6)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 7, i32 7)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 8, i32 8)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 9, i32 9)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 10, i32 10)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 11, i32 11)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 12, i32 12)
+  call void @llvm.visc.bind.input(i8* %_Z9mysgemmNTPfiS_iS_iiffInternal_level1.node, i32 13, i32 13)
+  ret %rtype undef
+}
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.launch(i8*, i8*) #1
+
+; Function Attrs: nounwind
+declare void @llvm.visc.wait(i8*) #1
+
+; Function Attrs: nounwind
+declare void @llvm.visc.init() #1
+
+; Function Attrs: nounwind
+declare void @llvm.visc.cleanup() #1
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #5 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #6 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #7 = { noreturn nounwind }
+
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"vtable pointer", metadata !2}
+!4 = metadata !{metadata !"any pointer", metadata !1}
+!5 = metadata !{metadata !"int", metadata !1}
+!6 = metadata !{metadata !"branch_weights", i32 4, i32 64}
diff --git a/llvm/test/VISC/parboil/driver/options.py b/llvm/test/VISC/parboil/driver/options.py
index 1b8ef5f79fd896011651395d72fee534908fe5e5..bdf4c45212f273634bc5f67fbf077266b5494e07 100644
--- a/llvm/test/VISC/parboil/driver/options.py
+++ b/llvm/test/VISC/parboil/driver/options.py
@@ -355,7 +355,7 @@ def time_options(progname, cmd, args):
                                     'TEST'      : [("small", 20), ("medium", 20), ("large", 10)]
                                 }
                     )
-                    ,('sgemm',   {   'VERSION'   : ["visc", "opencl_base"],
+                   ,('sgemm',   {   'VERSION'   : ["visc", "visc_tc", "visc_tc_vec", "opencl_base", "opencl_base_tc", "opencl_nvidia"],
                                     'TEST'      : [("small", 20), ("medium", 10)]
                                 }
                     )
@@ -396,7 +396,8 @@ def time_options(progname, cmd, args):
                     arguments.append("-v")
 
                 print "Cleaning " + app + " " + ver
-                action = clean_options(progname, 'clean', arguments).run()
+                #action = clean_options(progname, 'clean', arguments).run()
+                action = False
                 if action:
                     action()
                 else:
diff --git a/llvm/test/VISC/parboil/parboilParser.py b/llvm/test/VISC/parboil/parboilParser.py
index 58813a6d43e8e67099f84665fb4902e0aff77c17..0d1f10b6862c15fb8f591972dea8f13dfba45e30 100755
--- a/llvm/test/VISC/parboil/parboilParser.py
+++ b/llvm/test/VISC/parboil/parboilParser.py
@@ -94,71 +94,60 @@ def parseCSVFile(filename):
   # return dict
   return csvDict
 
-# returns the name of the opecl version of the given application
-def getOpenCLVersionName(app):
-  return 'opencl_base' if app=='stencil' or app=='sgemm' else 'opencl_nvidia'
 
 # returns a list of available tests for the given application
-# the tests are found based on the opecl version, because the visc
-# might not exist in the dict
+# the tests are found based on the visc version, because it exists
+# for all apps in the dict
 def getTests(app, csvDict):
-  v = getOpenCLVersionName(app)
-  return csvDict[app][v].keys()
+  return csvDict[app]["visc"].keys()
+
+
+def isViscVersion(version):
+  return version.startswith("visc")
+
+
+def getAllVersions(csvDict):
+  apps = csvDict.keys()
+
+  versions = set()
+  for app in apps:
+    versions = versions | set(csvDict[app].keys())
+
+  versionsList = sorted(list(versions))
+  return versionsList
+
 
 # print total execution time for all applications and corresponding
 # test sizes
 def printTotalExecutionTimeTable(csvDict):
-  print 'application,visc,opencl'
+  versions = getAllVersions(csvDict)
+
+  line = "application"
+  for v in versions:
+    line = line + "," + v
+  print line
+
   t = 'Final'
   cat = 'Timer Wall - IO'
   for app in csvDict.iterkeys():
-    v1 = 'visc'
-    v2 = getOpenCLVersionName(app)
     tests = getTests(app, csvDict)
     for test in tests:
-      print "{0:s}-{1:s},{2:s},{3:s}".format(app, test, csvDict[app][v1][test][t][cat], csvDict[app][v2][test][t][cat])
+      line = "{0:s}-{1:s}".format(app, test)
+      for v in versions:
+        line = line + "," + csvDict[app][v][test][t][cat]
+      print line
 
-def plotTotalExecutionTimeTable(csvDict):
-  global num_figs
-  t1 = 'GenVISC_Timer'
-  t2 = 'Final'
-  cat = 'Timer Wall Time'
-  visc_time = []
-  opencl_time = []
-  axis_values = []
-  n_groups = 0;
-  for app in csvDict.iterkeys():
-    v1 = 'visc'
-    v2 = getOpenCLVersionName(app)
-    tests = getTests(app, csvDict)
-    for test in tests:
-      n_groups += 1
-      visc_time.append(float(csvDict[app][v1][test][t1][cat]))
-      opencl_time.append(float(csvDict[app][v2][test][t2][cat]))
-      axis_values.append(app + '\n' + test);
-
-  bar_width = 0.35
-  opacity = 0.4
-  num_figs += 1
-  fig, ax = plt.subplots(num=num_figs)
-  index = np.arange(n_groups)
-  bars_visc = plt.bar(index, visc_time, bar_width, alpha=opacity, color='b', label='VISC')
-  bars_opencl = plt.bar(index + bar_width, opencl_time, bar_width, alpha=opacity, color='r', label='OpenCL')
-  plt.xlabel('Experiments')
-  plt.ylabel('Total Execution Time (s)')
-  plt.title('Total Execution Time - VISC and OpenCL')
-  plt.xticks(index + bar_width, axis_values)
-  plt.legend(loc='best')
-  plt.tight_layout()
-
-def printTimerDecomposition(csvDict, isVisc):
+
+def printTimerDecomposition(csvDict, version):
   # get apps
   apps = csvDict.keys()
 
+  isVisc = isViscVersion(version)
+
   # get tests for each app
   tests = dict()
   for app in apps:
-    tests[app] = getTests(app, csvDict)
+    tests[app] = csvDict[app][version].keys()
 
   # list of timer-category pairs
   if isVisc:
@@ -206,133 +195,26 @@ def printTimerDecomposition(csvDict, isVisc):
              ('Final', 'IO'), 
              ('Final', 'Timer Wall Time')]
 
-  line = "Category,"
+  print version
+  line = "Category"
   for app in apps:
     for test in tests[app]:
-      line = line + app + "-" + test + ","
+      line = line + "," + app + "-" + test
   print line
  
   for (t, cat) in timers:
-    line = cat + ","
+    line = cat
     for app in apps:
-      v = 'visc' if isVisc else getOpenCLVersionName(app)
       for test in tests[app]:
-        line = line + csvDict[app][v][test][t][cat] + ","
-    print line 
-
-def plotTimerDecomposition(csvDict, plotapp):
-  global num_figs
-
-  # get apps
-  if not plotapp:
-    apps = csvDict.keys()
-  else:
-    apps = [ plotapp ]
-
-  # get tests for each app
-  tests = dict()
-  for app in apps:
-    tests[app] = getTests(app, csvDict)
-
-  # list of timer-category pairs
-  visc_timers =[('Final', 'Kernel'), 
-                ('Final', 'Load Program Binary'), 
-                ('Final', 'Argument Unpack'), 
-                ('Final', 'Marshal Arguments'), 
-                ('Final', 'Free Memory'), 
-                ('Final', 'Memory Track'), 
-                ('Final', 'Clear Context'), 
-                ('Final', 'Total GPU Computation'), 
-                ('Final', 'Copy Pointer Arguments'), 
-                ('Final', 'Initialize Context'), 
-                ('Final', 'Read Output'), 
-                ('Final', 'Pthread Create'), 
-                ('Final', 'Copy Scalar Arguments'), 
-                ('Final', 'WorkGroup Size Calculation'), 
-                ('Final', 'IO'), 
-                ('Final', 'Output Pack'), 
-                ('Parboil', 'Mem_Untrack'), 
-                ('Parboil', 'Clear_Ctx'),
-                ('Final', 'Timer Wall - IO'),
-                ('Final', 'Timer Wall Time')]
-  opencl_timers =[('Final', 'Init_Ctx'),
-                  ('Final', 'Arg_Unpack'), 
-                  ('Final', 'Copy_Scalar'), 
-                  ('Final', 'Mem_Track'), 
-                  ('Final', 'Driver'), 
-                  ('Final', 'Output_Unpack'), 
-                  ('Final', 'Arg_Pack'), 
-                  ('Final', 'Copy'), 
-                  ('Final', 'Compute'), 
-                  ('Final', 'Setup'), 
-                  ('Final', 'Read_Output'), 
-                  ('Final', 'IO'), 
-                  ('Final', 'Pthread_Create'), 
-                  ('Final', 'Kernel'), 
-                  ('Final', 'Mem_Free'), 
-                  ('Final', 'Copy Async'), 
-                  ('Final', 'Copy_Ptr'),
-                  ('Final', 'Output_Pack'),
-                  ('Final', 'Mem_Untrack'),
-                  ('Final', 'Clear_Ctx'),
-                  ('Final', 'Timer Wall - IO'),
-                  ('Final', 'Timer Wall Time')]
-
-  for app in apps:
-    for test in tests[app]:
-# Plotting visc timers
-      v = 'visc'
-      n_vals = 0;
-      axis_values = []
-      visc_time_decomp = []
-      for (t, cat) in visc_timers:
-        n_vals += 1
-        axis_values.append(cat);
-        visc_time_decomp.append(float(csvDict[app][v][test][t][cat])) 
-
-      opacity = 0.4
-      num_figs += 1
-      fig = plt.figure(num=num_figs)
-      ax = fig.add_subplot(211);
-      index = np.arange(n_vals)
-      plt.barh(index, visc_time_decomp, alpha=opacity, color='b', label=v)
-      plt.xlabel('Time (s)')
-      plt.ylabel('Timers')
-      plt.title('Time Decomposition - ' + app + ' ' + test + ' ' + v)
-      plt.yticks(index, axis_values)
-      plt.tight_layout()
-
-      v = getOpenCLVersionName(app)
-      n_vals = 0;
-      axis_values = []
-      opencl_time_decomp = []
-      for (t, cat) in opencl_timers:
-        n_vals += 1
-        axis_values.append(cat);
-        opencl_time_decomp.append(float(csvDict[app][v][test][t][cat])) 
-
-      opacity = 0.4
-      ax = fig.add_subplot(212);
-      index = np.arange(n_vals)
-      plt.barh(index, opencl_time_decomp, alpha=opacity, color='r', label=v)
-      plt.xlabel('Time (s)')
-      plt.ylabel('Timers')
-      plt.title('Time Decomposition - ' + app + ' ' + test + ' ' + v)
-      plt.yticks(index, axis_values)
-      plt.tight_layout()
-
+        line = line + "," + csvDict[app][version][test][t][cat]
+    print line
 
 # command line options parser
 parser = OptionParser()
 parser.add_option("-f","--file",action="store",type="string",dest="filename")
 parser.add_option("--print-totals",action="store_true",default=False,dest="printTotals")
-parser.add_option("--print-timers-visc",action="store_true",default=False,dest="printTimersVisc")
-parser.add_option("--print-timers-opencl",action="store_true",default=False,dest="printTimersOpencl")
+parser.add_option("--print-timers",action="store",type="string",dest="printTimers")
 parser.add_option("--print-all",action="store_true",default=False,dest="printAll")
-parser.add_option("--plot-totals",action="store_true",default=False,dest="plotTotals")
-parser.add_option("--plot-timers",action="store_true",default=False,dest="plotTimers")
-parser.add_option("--plot-all",action="store_true",default=False,dest="plotAll")
-parser.add_option("--plot-app",action="store",type="string",dest="plotapp")
 
 # main
 def main():
@@ -357,29 +239,15 @@ def main():
     printTotalExecutionTimeTable(csvDict)
     print ''
 
-  if options.printTimersVisc or options.printAll:
-    printTimerDecomposition(csvDict, isVisc=True)
+  if options.printTimers == "all" or options.printAll:
+    versions = getAllVersions(csvDict)
+    for v in versions:
+      printTimerDecomposition(csvDict, v)
     print ''
-
-  if options.printTimersOpencl or options.printAll:
-    printTimerDecomposition(csvDict, isVisc=False)
+  elif options.printTimers:
+    printTimerDecomposition(csvDict, options.printTimers)
     print ''
 
-  # plot graphs
-
-  if options.plotTotals or options.plotAll:
-    plotTotalExecutionTimeTable(csvDict)
-
-  if options.plotapp and not options.plotapp in csvDict.keys():
-    print "parboilReader.py: Error: No input file was given!"
-    raise OSError
-  elif options.plotapp or options.plotTimers or options.plotAll:
-    plotTimerDecomposition(csvDict, options.plotapp)
-
-  plt.show()
-  for i in range(num_figs):
-    plt.close(i)
-
 
 if __name__ == '__main__':
   main()